From 15ecc6153bd36e03ed263b62c30ed4041055a884 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 3 Aug 2018 11:51:13 +0300 Subject: [PATCH 001/134] Initial WiP commit. --- hc2/external/elfio/elf_types.hpp | 125 +- hc2/external/elfio/elfio.hpp | 159 +- hc2/external/elfio/elfio_dump.hpp | 30 +- hc2/external/elfio/elfio_dynamic.hpp | 14 +- hc2/external/elfio/elfio_header.hpp | 8 +- hc2/external/elfio/elfio_note.hpp | 24 +- hc2/external/elfio/elfio_relocation.hpp | 14 +- hc2/external/elfio/elfio_section.hpp | 45 +- hc2/external/elfio/elfio_segment.hpp | 40 +- hc2/external/elfio/elfio_strings.hpp | 12 +- hc2/external/elfio/elfio_symbols.hpp | 22 +- hc2/external/elfio/elfio_utils.hpp | 2 +- include/amp.h | 1202 ++------------ include/atomics.hpp | 271 ++++ include/coordinate | 6 +- include/grid_launch.h | 69 - include/grid_launch.hpp | 50 - include/hc.hpp | 1416 ++--------------- include/hc_defines.h | 5 - include/hc_printf.hpp | 1 - include/hc_short_vector.inl | 72 - include/hsa_atomic.h | 143 -- include/kalmar_buffer.h | 12 +- include/kalmar_cpu_launch.h | 48 - include/kalmar_exception.h | 5 +- include/kalmar_index.h | 4 +- include/kalmar_launch.h | 368 ++++- include/kalmar_runtime.h | 130 +- include/kalmar_serialize.h | 4 +- lib/CMakeLists.txt | 3 +- lib/cpu/mcwamp_cpu.cpp | 51 +- lib/hsa/mcwamp_hsa.cpp | 764 +++------ lib/mcwamp.cpp | 23 +- lib/mcwamp_atomic.cpp | 183 --- lib/mcwamp_impl.hpp | 4 - tests/Unit/Codegen/deser_decl.cpp | 27 - .../deser_decl_support_inheritclass.cpp | 28 - tests/Unit/Codegen/deser_def.cpp | 23 - tests/Unit/Codegen/deser_def_body.cpp | 26 - .../Unit/Codegen/deser_def_body_compound.cpp | 40 - ...def_body_compound_support_inheritclass.cpp | 50 - tests/Unit/Codegen/deser_def_ref.cpp | 41 - tests/Unit/Codegen/ser_decl.cpp | 32 - tests/Unit/Codegen/ser_decl_ref.cpp | 42 - tests/Unit/Codegen/ser_def.cpp | 29 - tests/Unit/Codegen/ser_def_body.cpp | 58 - .../ser_def_body_support_inheritclass.cpp | 63 - .../Codegen/ser_def_body_support_scalar.cpp | 57 - tests/Unit/Codegen/signature.cpp | 46 - tests/Unit/Codegen/trampoline.cpp | 25 - tests/Unit/Codegen/trampoline_byref.cpp | 25 - tests/Unit/Codegen/trampoline_name.cpp | 21 - .../HC/capture_struct_with_carray_by_copy.cpp | 27 +- .../capture_struct_with_carray_by_copy2.cpp | 16 - .../capture_struct_with_carray_by_copy3.cpp | 84 +- .../capture_struct_with_carray_by_copy4.cpp | 92 +- 56 files changed, 1576 insertions(+), 4605 deletions(-) create mode 100644 include/atomics.hpp delete mode 100644 include/grid_launch.h delete mode 100644 include/grid_launch.hpp delete mode 100644 include/hsa_atomic.h delete mode 100644 include/kalmar_cpu_launch.h delete mode 100644 lib/mcwamp_atomic.cpp delete mode 100644 tests/Unit/Codegen/deser_decl.cpp delete mode 100644 tests/Unit/Codegen/deser_decl_support_inheritclass.cpp delete mode 100644 tests/Unit/Codegen/deser_def.cpp delete mode 100644 tests/Unit/Codegen/deser_def_body.cpp delete mode 100644 tests/Unit/Codegen/deser_def_body_compound.cpp delete mode 100644 tests/Unit/Codegen/deser_def_body_compound_support_inheritclass.cpp delete mode 100644 tests/Unit/Codegen/deser_def_ref.cpp delete mode 100644 tests/Unit/Codegen/ser_decl.cpp delete mode 100644 tests/Unit/Codegen/ser_decl_ref.cpp delete mode 100644 tests/Unit/Codegen/ser_def.cpp delete mode 100644 tests/Unit/Codegen/ser_def_body.cpp delete mode 100644 tests/Unit/Codegen/ser_def_body_support_inheritclass.cpp delete mode 100644 tests/Unit/Codegen/ser_def_body_support_scalar.cpp delete mode 100644 tests/Unit/Codegen/signature.cpp 
delete mode 100644 tests/Unit/Codegen/trampoline.cpp delete mode 100644 tests/Unit/Codegen/trampoline_byref.cpp delete mode 100644 tests/Unit/Codegen/trampoline_name.cpp diff --git a/hc2/external/elfio/elf_types.hpp b/hc2/external/elfio/elf_types.hpp index 1b90c4c57dd..1301cf4315f 100644 --- a/hc2/external/elfio/elf_types.hpp +++ b/hc2/external/elfio/elf_types.hpp @@ -460,53 +460,84 @@ typedef uint64_t Elf64_Off; #define STN_UNDEF 0 // Relocation types -#define R_386_NONE 0 -#define R_X86_64_NONE 0 -#define R_386_32 1 -#define R_X86_64_64 1 -#define R_386_PC32 2 -#define R_X86_64_PC32 2 -#define R_386_GOT32 3 -#define R_X86_64_GOT32 3 -#define R_386_PLT32 4 -#define R_X86_64_PLT32 4 -#define R_386_COPY 5 -#define R_X86_64_COPY 5 -#define R_386_GLOB_DAT 6 -#define R_X86_64_GLOB_DAT 6 -#define R_386_JMP_SLOT 7 -#define R_X86_64_JUMP_SLOT 7 -#define R_386_RELATIVE 8 -#define R_X86_64_RELATIVE 8 -#define R_386_GOTOFF 9 -#define R_X86_64_GOTPCREL 9 -#define R_386_GOTPC 10 -#define R_X86_64_32 10 -#define R_X86_64_32S 11 -#define R_X86_64_16 12 -#define R_X86_64_PC16 13 -#define R_X86_64_8 14 -#define R_X86_64_PC8 15 -#define R_X86_64_DTPMOD64 16 -#define R_X86_64_DTPOFF64 17 -#define R_X86_64_TPOFF64 18 -#define R_X86_64_TLSGD 19 -#define R_X86_64_TLSLD 20 -#define R_X86_64_DTPOFF32 21 -#define R_X86_64_GOTTPOFF 22 -#define R_X86_64_TPOFF32 23 -#define R_X86_64_PC64 24 -#define R_X86_64_GOTOFF64 25 -#define R_X86_64_GOTPC32 26 -#define R_X86_64_GOT64 27 -#define R_X86_64_GOTPCREL64 28 -#define R_X86_64_GOTPC64 29 -#define R_X86_64_GOTPLT64 30 -#define R_X86_64_PLTOFF64 31 -#define R_X86_64_GOTPC32_TLSDESC 34 -#define R_X86_64_TLSDESC_CALL 35 -#define R_X86_64_TLSDESC 36 -#define R_X86_64_IRELATIVE 37 +#define R_386_NONE 0 +#define R_X86_64_NONE 0 +#define R_386_32 1 +#define R_X86_64_64 1 +#define R_386_PC32 2 +#define R_X86_64_PC32 2 +#define R_386_GOT32 3 +#define R_X86_64_GOT32 3 +#define R_386_PLT32 4 +#define R_X86_64_PLT32 4 +#define R_386_COPY 5 +#define R_X86_64_COPY 5 +#define R_386_GLOB_DAT 6 +#define R_X86_64_GLOB_DAT 6 +#define R_386_JMP_SLOT 7 +#define R_X86_64_JUMP_SLOT 7 +#define R_386_RELATIVE 8 +#define R_X86_64_RELATIVE 8 +#define R_386_GOTOFF 9 +#define R_X86_64_GOTPCREL 9 +#define R_386_GOTPC 10 +#define R_X86_64_32 10 +#define R_386_32PLT 11 +#define R_X86_64_32S 11 +#define R_X86_64_16 12 +#define R_X86_64_PC16 13 +#define R_386_TLS_TPOFF 14 +#define R_X86_64_8 14 +#define R_386_TLS_IE 15 +#define R_X86_64_PC8 15 +#define R_386_TLS_GOTIE 16 +#define R_X86_64_DTPMOD64 16 +#define R_386_TLS_LE 17 +#define R_X86_64_DTPOFF64 17 +#define R_386_TLS_GD 18 +#define R_X86_64_TPOFF64 18 +#define R_386_TLS_LDM 19 +#define R_X86_64_TLSGD 19 +#define R_386_16 20 +#define R_X86_64_TLSLD 20 +#define R_386_PC16 21 +#define R_X86_64_DTPOFF32 21 +#define R_386_8 22 +#define R_X86_64_GOTTPOFF 22 +#define R_386_PC8 23 +#define R_X86_64_TPOFF32 23 +#define R_386_TLS_GD_32 24 +#define R_X86_64_PC64 24 +#define R_386_TLS_GD_PUSH 25 +#define R_X86_64_GOTOFF64 25 +#define R_386_TLS_GD_CALL 26 +#define R_X86_64_GOTPC32 26 +#define R_386_TLS_GD_POP 27 +#define R_X86_64_GOT64 27 +#define R_386_TLS_LDM_32 28 +#define R_X86_64_GOTPCREL64 28 +#define R_386_TLS_LDM_PUSH 29 +#define R_X86_64_GOTPC64 29 +#define R_386_TLS_LDM_CALL 30 +#define R_X86_64_GOTPLT64 30 +#define R_386_TLS_LDM_POP 31 +#define R_X86_64_PLTOFF64 31 +#define R_386_TLS_LDO_32 32 +#define R_386_TLS_IE_32 33 +#define R_386_TLS_LE_32 34 +#define R_X86_64_GOTPC32_TLSDESC 34 +#define R_386_TLS_DTPMOD32 35 +#define R_X86_64_TLSDESC_CALL 35 
+#define R_386_TLS_DTPOFF32 36 +#define R_X86_64_TLSDESC 36 +#define R_386_TLS_TPOFF32 37 +#define R_X86_64_IRELATIVE 37 +#define R_386_SIZE32 38 +#define R_386_TLS_GOTDESC 39 +#define R_386_TLS_DESC_CALL 40 +#define R_386_TLS_DESC 41 +#define R_386_IRELATIVE 42 +#define R_386_GOT32X 43 #define R_X86_64_GNU_VTINHERIT 250 #define R_X86_64_GNU_VTENTRY 251 diff --git a/hc2/external/elfio/elfio.hpp b/hc2/external/elfio/elfio.hpp index b59295b342c..508f8e77d03 100644 --- a/hc2/external/elfio/elfio.hpp +++ b/hc2/external/elfio/elfio.hpp @@ -51,19 +51,21 @@ THE SOFTWARE. TYPE \ get_##FNAME() const \ { \ - return header->get_##FNAME(); \ + return header? header->get_##FNAME() : 0; \ } #define ELFIO_HEADER_ACCESS_GET_SET( TYPE, FNAME ) \ TYPE \ get_##FNAME() const \ { \ - return header->get_##FNAME(); \ + return header? header->get_##FNAME() : 0; \ } \ void \ set_##FNAME( TYPE val ) \ -{ \ - header->set_##FNAME( val ); \ +{ \ + if (header) { \ + header->set_##FNAME( val ); \ + } \ } \ namespace ELFIO { @@ -112,11 +114,9 @@ class elfio { clean(); - unsigned char e_ident[EI_NIDENT]; - - // Read ELF file signature - stream.seekg( 0 ); - stream.read( reinterpret_cast( &e_ident ), sizeof( e_ident ) ); + unsigned char e_ident[EI_NIDENT]; + // Read ELF file signature + stream.read( reinterpret_cast( &e_ident ), sizeof( e_ident ) ); // Is it ELF file? if ( stream.gcount() != sizeof( e_ident ) || @@ -133,7 +133,6 @@ class elfio } convertor.setup( e_ident[EI_DATA] ); - header = create_header( e_ident[EI_CLASS], e_ident[EI_DATA] ); if ( 0 == header ) { return false; @@ -143,9 +142,8 @@ class elfio } load_sections( stream ); - load_segments( stream ); - - return true; + bool is_still_good = load_segments( stream ); + return is_still_good; } //------------------------------------------------------------------------------ @@ -153,12 +151,11 @@ class elfio { std::ofstream f( file_name.c_str(), std::ios::out | std::ios::binary ); - if ( !f ) { + if ( !f || !header) { return false; } bool is_still_good = true; - // Define layout specific header fields // The position of the segment table is fixed after the header. // The position of the section table is variable and needs to be fixed @@ -172,6 +169,8 @@ class elfio current_file_pos = header->get_header_size() + header->get_segment_entry_size() * header->get_segments_num(); + calc_segment_alignment(); + is_still_good = layout_segments_and_their_sections(); is_still_good = is_still_good && layout_sections_without_segments(); is_still_good = is_still_good && layout_section_table(); @@ -248,6 +247,45 @@ class elfio } } +//------------------------------------------------------------------------------ + private: + bool is_offset_in_section( Elf64_Off offset, const section* sec ) const { + return offset >= sec->get_offset() && offset < sec->get_offset()+sec->get_size(); + } + +//------------------------------------------------------------------------------ + public: + + //! returns an empty string if no problems are detected, + //! 
or a string containing an error message if problems are found + std::string validate() const { + + // check for overlapping sections in the file + for ( int i = 0; i < sections.size(); ++i) { + for ( int j = i+1; j < sections.size(); ++j ) { + const section* a = sections[i]; + const section* b = sections[j]; + if ( !(a->get_type() & SHT_NOBITS) + && !(b->get_type() & SHT_NOBITS) + && (a->get_size() > 0) + && (b->get_size() > 0) + && (a->get_offset() > 0) + && (b->get_offset() > 0)) { + if ( is_offset_in_section( a->get_offset(), b ) + || is_offset_in_section( a->get_offset()+a->get_size()-1, b ) + || is_offset_in_section( b->get_offset(), a ) + || is_offset_in_section( b->get_offset()+b->get_size()-1, a )) { + return "Sections " + a->get_name() + " and " + b->get_name() + " overlap in file"; + } + } + } + } + + // more checks to be added here... + + return ""; + } + //------------------------------------------------------------------------------ private: //------------------------------------------------------------------------------ @@ -382,6 +420,18 @@ class elfio return num; } +//------------------------------------------------------------------------------ + //! Checks whether the addresses of the section entirely fall within the given segment. + //! It doesn't matter if the addresses are memory addresses, or file offsets, + //! they just need to be in the same address space + bool is_sect_in_seg ( Elf64_Off sect_begin, Elf_Xword sect_size, Elf64_Off seg_begin, Elf64_Off seg_end ) { + return seg_begin <= sect_begin + && sect_begin + sect_size <= seg_end + && sect_begin < seg_end; // this is important criteria when sect_size == 0 + // Example: seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11) + // sect_begin=12, sect_size=0 -> shall return false! + } + //------------------------------------------------------------------------------ bool load_segments( std::istream& stream ) { @@ -417,14 +467,11 @@ class elfio // SHF_ALLOC sections are matched based on the virtual address // otherwise the file offset is matched if( psec->get_flags() & SHF_ALLOC - ? (segVBaseAddr <= psec->get_address() - && psec->get_address() + psec->get_size() - <= segVEndAddr) - : (segBaseOffset <= psec->get_offset() - && psec->get_offset() + psec->get_size() - <= segEndOffset)) { - seg->add_section_index( psec->get_index(), - psec->get_addr_align() ); + ? is_sect_in_seg( psec->get_address(), psec->get_size(), segVBaseAddr, segVEndAddr ) + : is_sect_in_seg( psec->get_offset(), psec->get_size(), segBaseOffset, segEndOffset )) { + // Alignment of segment shall not be updated, to preserve original value + // It will be re-calculated on saving. 
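                    // For example (illustrative): a PT_LOAD segment read with
                    // p_align == 0x1000 that holds a section with
                    // sh_addralign == 0x10000 keeps p_align == 0x1000 here;
                    // calc_segment_alignment() raises it to 0x10000 only when
                    // the file is saved.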
+ seg->add_section_index( psec->get_index(), 0 ); } } @@ -517,6 +564,9 @@ class elfio for( size_t i = 0; i < worklist.size(); ++i ) { if( i != nextSlot && worklist[i]->is_offset_initialized() && worklist[i]->get_offset() == 0 ) { + if (worklist[nextSlot]->get_offset() == 0) { + ++nextSlot; + } std::swap(worklist[i],worklist[nextSlot]); ++nextSlot; } @@ -570,6 +620,20 @@ class elfio } +//------------------------------------------------------------------------------ + void calc_segment_alignment( ) + { + for( std::vector::iterator s = segments_.begin(); s != segments_.end(); ++s ) { + segment* seg = *s; + for ( int i = 0; i < seg->get_sections_num(); ++i ) { + section* sect = sections_[ seg->get_section_index_at(i) ]; + if ( sect->get_addr_align() > seg->get_align() ) { + seg->set_align( sect->get_addr_align() ); + } + } + } + } + //------------------------------------------------------------------------------ bool layout_segments_and_their_sections( ) { @@ -606,11 +670,12 @@ class elfio // have to be aligned else if ( seg->get_sections_num() && !section_generated[seg->get_section_index_at( 0 )] ) { - Elf64_Off cur_page_alignment = current_file_pos % seg->get_align(); - Elf64_Off req_page_alignment = seg->get_virtual_address() % seg->get_align(); + Elf_Xword align = seg->get_align() > 0 ? seg->get_align() : 1; + Elf64_Off cur_page_alignment = current_file_pos % align; + Elf64_Off req_page_alignment = seg->get_virtual_address() % align; Elf64_Off error = req_page_alignment - cur_page_alignment; - current_file_pos += ( seg->get_align() + error ) % seg->get_align(); + current_file_pos += ( seg->get_align() + error ) % align; seg_start_pos = current_file_pos; } else if ( seg->get_sections_num() ) { @@ -633,14 +698,20 @@ class elfio // Fix up the alignment if ( !section_generated[index] && sec->is_address_initialized() && SHT_NOBITS != sec->get_type() - && SHT_NULL != sec->get_type() ) { + && SHT_NULL != sec->get_type() + && 0 != sec->get_size() ) { // Align the sections based on the virtual addresses // when possible (this is what matters for execution) Elf64_Off req_offset = sec->get_address() - seg->get_virtual_address(); Elf64_Off cur_offset = current_file_pos - seg_start_pos; + if ( req_offset < cur_offset) { + // something has gone awfully wrong, abort! + // secAlign would turn out negative, seeking backwards and overwriting previous data + return false; + } secAlign = req_offset - cur_offset; } - else if (!section_generated[index]) { + else if (!section_generated[index] && !sec->is_address_initialized() ) { // If no address has been specified then only the section // alignment constraint has to be matched Elf_Xword align = sec->get_addr_align(); @@ -650,7 +721,7 @@ class elfio Elf64_Off error = current_file_pos % align; secAlign = ( align - error ) % align; } - else { + else if (section_generated[index] ) { // Alignment for already generated sections secAlign = sec->get_offset() - seg_start_pos - segment_filesize; } @@ -685,7 +756,15 @@ class elfio } seg->set_file_size( segment_filesize ); - seg->set_memory_size( segment_memory ); + + // If we already have a memory size from loading an elf file (value > 0), + // it must not shrink! + // Memory size may be bigger than file size and it is the loader's job to do something + // with the surplus bytes in memory, like initializing them with a defined value. 
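            // For example (illustrative): a PT_LOAD carrying .data and .bss may
            // be loaded with p_filesz == 0x100 and p_memsz == 0x1100; even if
            // layout shrinks the in-file size, p_memsz stays 0x1100 and the
            // loader still zero-fills the trailing 0x1000 bytes at run time.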
+ if ( seg->get_memory_size() < segment_memory ) { + seg->set_memory_size( segment_memory ); + } + seg->set_offset(seg_start_pos); } @@ -775,6 +854,16 @@ class elfio return parent->sections_.end(); } +//------------------------------------------------------------------------------ + std::vector::const_iterator begin() const { + return parent->sections_.cbegin(); + } + +//------------------------------------------------------------------------------ + std::vector::const_iterator end() const { + return parent->sections_.cend(); + } + //------------------------------------------------------------------------------ private: elfio* parent; @@ -820,6 +909,16 @@ class elfio return parent->segments_.end(); } +//------------------------------------------------------------------------------ + std::vector::const_iterator begin() const { + return parent->segments_.cbegin(); + } + +//------------------------------------------------------------------------------ + std::vector::const_iterator end() const { + return parent->segments_.cend(); + } + //------------------------------------------------------------------------------ private: elfio* parent; diff --git a/hc2/external/elfio/elfio_dump.hpp b/hc2/external/elfio/elfio_dump.hpp index 04948529603..d98c1ff1881 100644 --- a/hc2/external/elfio/elfio_dump.hpp +++ b/hc2/external/elfio/elfio_dump.hpp @@ -429,18 +429,22 @@ class dump //------------------------------------------------------------------------------ static void header( std::ostream& out, const elfio& reader ) - { - out << "ELF Header" << std::endl << std::endl - << " Class: " << str_class( reader.get_class() ) << std::endl - << " Encoding: " << str_endian( reader.get_encoding() ) << std::endl - << " ELFVersion: " << str_version( reader.get_elf_version() ) << std::endl - << " Type: " << str_type( reader.get_type() ) << std::endl - << " Machine: " << str_machine( reader.get_machine() ) << std::endl - << " Version: " << str_version( reader.get_version() ) << std::endl - << " Entry: " << "0x" << std::hex << reader.get_entry() << std::endl - << " Flags: " << "0x" << std::hex << reader.get_flags() << std::endl - << std::endl; - } + { + if (!reader.get_header_size()) + { + return; + } + out << "ELF Header" << std::endl << std::endl + << " Class: " << str_class( reader.get_class() ) << std::endl + << " Encoding: " << str_endian( reader.get_encoding() ) << std::endl + << " ELFVersion: " << str_version( reader.get_elf_version() ) << std::endl + << " Type: " << str_type( reader.get_type() ) << std::endl + << " Machine: " << str_machine( reader.get_machine() ) << std::endl + << " Version: " << str_version( reader.get_version() ) << std::endl + << " Entry: " << "0x" << std::hex << reader.get_entry() << std::endl + << " Flags: " << "0x" << std::hex << reader.get_flags() << std::endl + << std::endl; + } //------------------------------------------------------------------------------ static void @@ -728,7 +732,7 @@ class dump if ( dyn_no > 0 ) { out << "Dynamic section (" << sec->get_name() << ")" << std::endl; out << "[ Nr ] Tag Name/Value" << std::endl; - for ( int i = 0; i < dyn_no; ++i ) { + for ( Elf_Xword i = 0; i < dyn_no; ++i ) { Elf_Xword tag = 0; Elf_Xword value = 0; std::string str; diff --git a/hc2/external/elfio/elfio_dynamic.hpp b/hc2/external/elfio/elfio_dynamic.hpp index 6f2d041e0fc..64f13b9ce7a 100644 --- a/hc2/external/elfio/elfio_dynamic.hpp +++ b/hc2/external/elfio/elfio_dynamic.hpp @@ -26,13 +26,14 @@ THE SOFTWARE. 
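// The accessor below is templated on the section type so that the same code
// can wrap both mutable and read-only (const) sections. A usage sketch
// (illustrative; assumes `reader` is an elfio instance that has already
// loaded a file, and the usual get_entries_num()/get_entry() accessors):
//
//   for ( section* sec : reader.sections ) {
//       if ( sec->get_type() == SHT_DYNAMIC ) {
//           dynamic_section_accessor dynamic( reader, sec );
//           for ( Elf_Xword i = 0; i < dynamic.get_entries_num(); ++i ) {
//               Elf_Xword tag = 0, value = 0;
//               std::string str;
//               dynamic.get_entry( i, tag, value, str );
//           }
//       }
//   }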
namespace ELFIO { //------------------------------------------------------------------------------ -class dynamic_section_accessor +template< class S > +class dynamic_section_accessor_template { public: //------------------------------------------------------------------------------ - dynamic_section_accessor( const elfio& elf_file_, section* section_ ) : - elf_file( elf_file_ ), - dynamic_section( section_ ) + dynamic_section_accessor_template( const elfio& elf_file_, S* section_ ) : + elf_file( elf_file_ ), + dynamic_section( section_ ) { } @@ -245,9 +246,12 @@ class dynamic_section_accessor //------------------------------------------------------------------------------ private: const elfio& elf_file; - section* dynamic_section; + S* dynamic_section; }; +using dynamic_section_accessor = dynamic_section_accessor_template
<section>
; +using const_dynamic_section_accessor = dynamic_section_accessor_template; + } // namespace ELFIO #endif // ELFIO_DYNAMIC_HPP diff --git a/hc2/external/elfio/elfio_header.hpp b/hc2/external/elfio/elfio_header.hpp index d689a8899f7..e8713cd7894 100644 --- a/hc2/external/elfio/elfio_header.hpp +++ b/hc2/external/elfio/elfio_header.hpp @@ -38,11 +38,11 @@ class elf_header ELFIO_GET_ACCESS_DECL( unsigned char, class ); ELFIO_GET_ACCESS_DECL( unsigned char, elf_version ); ELFIO_GET_ACCESS_DECL( unsigned char, encoding ); - ELFIO_GET_ACCESS_DECL( Elf_Word, version ); ELFIO_GET_ACCESS_DECL( Elf_Half, header_size ); ELFIO_GET_ACCESS_DECL( Elf_Half, section_entry_size ); ELFIO_GET_ACCESS_DECL( Elf_Half, segment_entry_size ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, version ); ELFIO_GET_SET_ACCESS_DECL( unsigned char, os_abi ); ELFIO_GET_SET_ACCESS_DECL( unsigned char, abi_version ); ELFIO_GET_SET_ACCESS_DECL( Elf_Half, type ); @@ -86,8 +86,6 @@ template< class T > class elf_header_impl : public elf_header header.e_ident[EI_CLASS] = elf_header_impl_types::file_class; header.e_ident[EI_DATA] = encoding; header.e_ident[EI_VERSION] = EV_CURRENT; - header.e_version = EV_CURRENT; - header.e_version = (*convertor)( header.e_version ); header.e_ehsize = ( sizeof( header ) ); header.e_ehsize = (*convertor)( header.e_ehsize ); header.e_shstrndx = (*convertor)( (Elf_Half)1 ); @@ -95,6 +93,8 @@ template< class T > class elf_header_impl : public elf_header header.e_shentsize = sizeof( typename elf_header_impl_types::Shdr_type ); header.e_phentsize = (*convertor)( header.e_phentsize ); header.e_shentsize = (*convertor)( header.e_shentsize ); + + set_version( EV_CURRENT ); } bool @@ -119,11 +119,11 @@ template< class T > class elf_header_impl : public elf_header ELFIO_GET_ACCESS( unsigned char, class, header.e_ident[EI_CLASS] ); ELFIO_GET_ACCESS( unsigned char, elf_version, header.e_ident[EI_VERSION] ); ELFIO_GET_ACCESS( unsigned char, encoding, header.e_ident[EI_DATA] ); - ELFIO_GET_ACCESS( Elf_Word, version, header.e_version ); ELFIO_GET_ACCESS( Elf_Half, header_size, header.e_ehsize ); ELFIO_GET_ACCESS( Elf_Half, section_entry_size, header.e_shentsize ); ELFIO_GET_ACCESS( Elf_Half, segment_entry_size, header.e_phentsize ); + ELFIO_GET_SET_ACCESS( Elf_Word, version, header.e_version); ELFIO_GET_SET_ACCESS( unsigned char, os_abi, header.e_ident[EI_OSABI] ); ELFIO_GET_SET_ACCESS( unsigned char, abi_version, header.e_ident[EI_ABIVERSION] ); ELFIO_GET_SET_ACCESS( Elf_Half, type, header.e_type ); diff --git a/hc2/external/elfio/elfio_note.hpp b/hc2/external/elfio/elfio_note.hpp index 35c6fe344cc..8619c7385db 100644 --- a/hc2/external/elfio/elfio_note.hpp +++ b/hc2/external/elfio/elfio_note.hpp @@ -38,12 +38,13 @@ namespace ELFIO { //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ -class note_section_accessor +template< class S > +class note_section_accessor_template { public: //------------------------------------------------------------------------------ - note_section_accessor( const elfio& elf_file_, section* section_ ) : - elf_file( elf_file_ ), note_section( section_ ) + note_section_accessor_template( const elfio& elf_file_, S* section_ ) : + elf_file( elf_file_ ), note_section( section_ ) { process_section(); } @@ -71,10 +72,10 @@ class note_section_accessor int align = sizeof( Elf_Word ); const endianess_convertor& convertor = elf_file.get_convertor(); - type = convertor( *(Elf_Word*)( pData + 
2*align ) ); - Elf_Word namesz = convertor( *(Elf_Word*)( pData ) ); - descSize = convertor( *(Elf_Word*)( pData + sizeof( namesz ) ) ); - Elf_Word max_name_size = note_section->get_size() - note_start_positions[index]; + type = convertor( *(const Elf_Word*)( pData + 2*align ) ); + Elf_Word namesz = convertor( *(const Elf_Word*)( pData ) ); + descSize = convertor( *(const Elf_Word*)( pData + sizeof( namesz ) ) ); + Elf_Xword max_name_size = note_section->get_size() - note_start_positions[index]; if ( namesz > max_name_size || namesz + descSize > max_name_size ) { return false; @@ -144,9 +145,9 @@ class note_section_accessor while ( current + 3*align <= size ) { note_start_positions.push_back( current ); Elf_Word namesz = convertor( - *(Elf_Word*)( data + current ) ); + *(const Elf_Word*)( data + current ) ); Elf_Word descsz = convertor( - *(Elf_Word*)( data + current + sizeof( namesz ) ) ); + *(const Elf_Word*)( data + current + sizeof( namesz ) ) ); current += 3*sizeof( Elf_Word ) + ( ( namesz + align - 1 ) / align ) * align + @@ -157,10 +158,13 @@ class note_section_accessor //------------------------------------------------------------------------------ private: const elfio& elf_file; - section* note_section; + S* note_section; std::vector note_start_positions; }; +using note_section_accessor = note_section_accessor_template
<section>
; +using const_note_section_accessor = note_section_accessor_template; + } // namespace ELFIO #endif // ELFIO_NOTE_HPP diff --git a/hc2/external/elfio/elfio_relocation.hpp b/hc2/external/elfio/elfio_relocation.hpp index d13d8b23c7f..238598e97ba 100644 --- a/hc2/external/elfio/elfio_relocation.hpp +++ b/hc2/external/elfio/elfio_relocation.hpp @@ -73,13 +73,14 @@ template<> struct get_sym_and_type< Elf64_Rela > //------------------------------------------------------------------------------ -class relocation_section_accessor +template< class S > +class relocation_section_accessor_template { public: //------------------------------------------------------------------------------ - relocation_section_accessor( const elfio& elf_file_, section* section_ ) : - elf_file( elf_file_ ), - relocation_section( section_ ) + relocation_section_accessor_template( const elfio& elf_file_, S* section_ ) : + elf_file( elf_file_ ), + relocation_section( section_ ) { } @@ -361,9 +362,12 @@ class relocation_section_accessor //------------------------------------------------------------------------------ private: const elfio& elf_file; - section* relocation_section; + S* relocation_section; }; +using relocation_section_accessor = relocation_section_accessor_template
<section>
; +using const_relocation_section_accessor = relocation_section_accessor_template; + } // namespace ELFIO #endif // ELFIO_RELOCATION_HPP diff --git a/hc2/external/elfio/elfio_section.hpp b/hc2/external/elfio/elfio_section.hpp index b2c9b456b55..cb188c14d08 100644 --- a/hc2/external/elfio/elfio_section.hpp +++ b/hc2/external/elfio/elfio_section.hpp @@ -45,6 +45,17 @@ class section ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, address ); ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, size ); ELFIO_GET_SET_ACCESS_DECL( Elf_Word, name_string_offset ); + ELFIO_GET_ACCESS_DECL ( Elf64_Off, offset ); + size_t stream_size; + size_t get_stream_size() const + { + return stream_size; + } + + void set_stream_size(size_t value) + { + stream_size = value; + } virtual const char* get_data() const = 0; virtual void set_data( const char* pData, Elf_Word size ) = 0; @@ -53,7 +64,7 @@ class section virtual void append_data( const std::string& data ) = 0; protected: - ELFIO_GET_SET_ACCESS_DECL( Elf64_Off, offset ); + ELFIO_SET_ACCESS_DECL( Elf64_Off, offset ); ELFIO_SET_ACCESS_DECL( Elf_Half, index ); virtual void load( std::istream& f, @@ -223,23 +234,29 @@ class section_impl : public section std::streampos header_offset ) { std::fill_n( reinterpret_cast( &header ), sizeof( header ), '\0' ); + + stream.seekg ( 0, stream.end ); + set_stream_size ( stream.tellg() ); + stream.seekg( header_offset ); stream.read( reinterpret_cast( &header ), sizeof( header ) ); + Elf_Xword size = get_size(); - if ( 0 == data && SHT_NULL != get_type() && SHT_NOBITS != get_type() ) { - try { - data = new char[size]; - } catch (const std::bad_alloc&) { - data = 0; - data_size = 0; - } - if ( 0 != size ) { - stream.seekg( (*convertor)( header.sh_offset ) ); - stream.read( data, size ); - data_size = size; - } - } + if ( 0 == data && SHT_NULL != get_type() && SHT_NOBITS != get_type() && size < get_stream_size()) { + try { + data = new char[size + 1]; + } catch (const std::bad_alloc&) { + data = 0; + data_size = 0; + } + if ( 0 != size ) { + stream.seekg( (*convertor)( header.sh_offset ) ); + stream.read( data, size ); + data[size] = 0; //ensure data is ended with 0 to avoid oob read + data_size = size; + } + } } //------------------------------------------------------------------------------ diff --git a/hc2/external/elfio/elfio_segment.hpp b/hc2/external/elfio/elfio_segment.hpp index 35f17e939bc..02d752a90b0 100644 --- a/hc2/external/elfio/elfio_segment.hpp +++ b/hc2/external/elfio/elfio_segment.hpp @@ -92,6 +92,21 @@ class segment_impl : public segment ELFIO_GET_SET_ACCESS( Elf_Xword, file_size, ph.p_filesz ); ELFIO_GET_SET_ACCESS( Elf_Xword, memory_size, ph.p_memsz ); ELFIO_GET_ACCESS( Elf64_Off, offset, ph.p_offset ); + size_t stream_size; + +//------------------------------------------------------------------------------ + size_t + get_stream_size() const + { + return stream_size; + } + +//------------------------------------------------------------------------------ + void + set_stream_size(size_t value) + { + stream_size = value; + } //------------------------------------------------------------------------------ Elf_Half @@ -176,6 +191,10 @@ class segment_impl : public segment load( std::istream& stream, std::streampos header_offset ) { + + stream.seekg ( 0, stream.end ); + set_stream_size ( stream.tellg() ); + stream.seekg( header_offset ); stream.read( reinterpret_cast( &ph ), sizeof( ph ) ); is_offset_set = true; @@ -183,14 +202,19 @@ class segment_impl : public segment if ( PT_NULL != get_type() && 0 != get_file_size() ) { stream.seekg( 
(*convertor)( ph.p_offset ) ); Elf_Xword size = get_file_size(); - try { - data = new char[size]; - } catch (const std::bad_alloc&) { - data = 0; - } - if ( 0 != data ) { - stream.read( data, size ); - } + if ( size > get_stream_size() ) { + data = 0; + } else { + try { + data = new char[size + 1]; + } catch (const std::bad_alloc&) { + data = 0; + } + if ( 0 != data ) { + stream.read( data, size ); + data[size] = 0; + } + } } } diff --git a/hc2/external/elfio/elfio_strings.hpp b/hc2/external/elfio/elfio_strings.hpp index df952a2145d..552f000294f 100644 --- a/hc2/external/elfio/elfio_strings.hpp +++ b/hc2/external/elfio/elfio_strings.hpp @@ -30,12 +30,13 @@ THE SOFTWARE. namespace ELFIO { //------------------------------------------------------------------------------ -class string_section_accessor +template< class S > +class string_section_accessor_template { public: //------------------------------------------------------------------------------ - string_section_accessor( section* section_ ) : - string_section( section_ ) + string_section_accessor_template( S* section_ ) : + string_section( section_ ) { } @@ -88,9 +89,12 @@ class string_section_accessor //------------------------------------------------------------------------------ private: - section* string_section; + S* string_section; }; +using string_section_accessor = string_section_accessor_template
<section>
; +using const_string_section_accessor = string_section_accessor_template; + } // namespace ELFIO #endif // ELFIO_STRINGS_HPP diff --git a/hc2/external/elfio/elfio_symbols.hpp b/hc2/external/elfio/elfio_symbols.hpp index 80e498d8d59..d18756a9af9 100644 --- a/hc2/external/elfio/elfio_symbols.hpp +++ b/hc2/external/elfio/elfio_symbols.hpp @@ -26,13 +26,14 @@ THE SOFTWARE. namespace ELFIO { //------------------------------------------------------------------------------ -class symbol_section_accessor +template< class S > +class symbol_section_accessor_template { public: //------------------------------------------------------------------------------ - symbol_section_accessor( const elfio& elf_file_, section* symbol_section_ ) : - elf_file( elf_file_ ), - symbol_section( symbol_section_ ) + symbol_section_accessor_template( const elfio& elf_file_, S* symbol_section_ ) : + elf_file( elf_file_ ), + symbol_section( symbol_section_ ) { find_hash_section(); } @@ -87,17 +88,17 @@ class symbol_section_accessor bool ret = false; if ( 0 != get_hash_table_index() ) { - Elf_Word nbucket = *(Elf_Word*)hash_section->get_data(); - Elf_Word nchain = *(Elf_Word*)( hash_section->get_data() + + Elf_Word nbucket = *(const Elf_Word*)hash_section->get_data(); + Elf_Word nchain = *(const Elf_Word*)( hash_section->get_data() + sizeof( Elf_Word ) ); Elf_Word val = elf_hash( (const unsigned char*)name.c_str() ); - Elf_Word y = *(Elf_Word*)( hash_section->get_data() + + Elf_Word y = *(const Elf_Word*)( hash_section->get_data() + ( 2 + val % nbucket ) * sizeof( Elf_Word ) ); std::string str; get_symbol( y, str, value, size, bind, type, section_index, other ); while ( str != name && STN_UNDEF != y && y < nchain ) { - y = *(Elf_Word*)( hash_section->get_data() + + y = *(const Elf_Word*)( hash_section->get_data() + ( 2 + nbucket + y ) * sizeof( Elf_Word ) ); get_symbol( y, str, value, size, bind, type, section_index, other ); } @@ -268,11 +269,14 @@ class symbol_section_accessor //------------------------------------------------------------------------------ private: const elfio& elf_file; - section* symbol_section; + S* symbol_section; Elf_Half hash_section_index; const section* hash_section; }; +using symbol_section_accessor = symbol_section_accessor_template
<section>
; +using const_symbol_section_accessor = symbol_section_accessor_template; + } // namespace ELFIO #endif // ELFIO_SYMBOLS_HPP diff --git a/hc2/external/elfio/elfio_utils.hpp b/hc2/external/elfio/elfio_utils.hpp index f8423bd1475..2baf5a77ccb 100644 --- a/hc2/external/elfio/elfio_utils.hpp +++ b/hc2/external/elfio/elfio_utils.hpp @@ -174,7 +174,7 @@ class endianess_convertor { get_host_encoding() const { static const int tmp = 1; - if ( 1 == *(char*)&tmp ) { + if ( 1 == *(const char*)&tmp ) { return ELFDATA2LSB; } else { diff --git a/include/amp.h b/include/amp.h index db59453179b..bddb29e1d5c 100644 --- a/include/amp.h +++ b/include/amp.h @@ -12,6 +12,7 @@ #pragma once +#include "atomics.hpp" #include "hc_defines.h" #include "kalmar_exception.h" #include "kalmar_index.h" @@ -19,7 +20,10 @@ #include "kalmar_buffer.h" #include "kalmar_serialize.h" #include "kalmar_launch.h" -#include "kalmar_cpu_launch.h" + +#include +#include +#include // forward declaration namespace Concurrency { @@ -58,6 +62,7 @@ using accelerator_view_removed = Kalmar::accelerator_view_removed; */ namespace Concurrency { +using namespace hc::atomics; using namespace Kalmar::enums; using namespace Kalmar::CLAMP; @@ -219,55 +224,37 @@ class accelerator_view { accelerator_view(std::shared_ptr pQueue) : pQueue(pQueue) {} std::shared_ptr pQueue; friend class accelerator; - - template friend - void Kalmar::mcw_cxxamp_launch_kernel(const std::shared_ptr&, size_t *, size_t *, const Kernel&); - template friend - std::shared_future* Kalmar::mcw_cxxamp_launch_kernel_async(const std::shared_ptr&, size_t *, size_t *, const Kernel&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - template friend - void launch_cpu_task(const std::shared_ptr&, Kernel const&, extent const&); -#endif + + template + friend + void Kalmar::launch_kernel( + const std::shared_ptr&, + const Domain&, + const Kernel&); + template + friend + std::shared_future Kalmar::launch_kernel_async( + const std::shared_ptr&, + const Domain&, + const Kernel&); template friend class array; template friend class array_view; - - template friend - void parallel_for_each(Concurrency::extent, const Kernel&); - template friend - void parallel_for_each(const accelerator_view&, Concurrency::extent, const Kernel&); - template friend - void parallel_for_each(const accelerator_view&, Concurrency::extent<1>, const Kernel&); - template friend - void parallel_for_each(const accelerator_view&, Concurrency::extent<2>, const Kernel&); - template friend - void parallel_for_each(const accelerator_view&, Concurrency::extent<3>, const Kernel&); - - template friend - void parallel_for_each(tiled_extent, const Kernel&); - template friend - void parallel_for_each(const accelerator_view&, tiled_extent, const Kernel&); - - template friend - void parallel_for_each(tiled_extent, const Kernel&); - template friend - void parallel_for_each(const accelerator_view&, tiled_extent, const Kernel&); - - template friend - void parallel_for_each(tiled_extent, const Kernel&); - template friend - void parallel_for_each(const accelerator_view&, tiled_extent, const Kernel&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -public: -#endif - __attribute__((annotate("user_deserialize"))) - accelerator_view() restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - throw runtime_exception("errorMsg_throw", 0); -#endif - } + template + friend + void parallel_for_each(const Concurrency::extent&, const Kernel&); + template + friend + void parallel_for_each( + const accelerator_view&, const 
Concurrency::extent&, const Kernel&); + + template + friend + void parallel_for_each(const tiled_extent&, const Kernel&); + template + friend + void parallel_for_each( + const accelerator_view&, const tiled_extent&, const Kernel&); }; // ------------------------------------------------------------------------ @@ -1100,36 +1087,6 @@ class extent { // utility class for tiled_barrier // ------------------------------------------------------------------------ -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -template -void bar_wrapper(Ker *f, Ti *t) -{ - (*f)(*t); -} - -struct barrier_t { - std::unique_ptr ctx; - int idx; - barrier_t (int a) : - ctx(new ucontext_t[a + 1]) {} - template - void setctx(int x, char *stack, Ker& f, Ti* tidx, int S) { - getcontext(&ctx[x]); - ctx[x].uc_stack.ss_sp = stack; - ctx[x].uc_stack.ss_size = S; - ctx[x].uc_link = &ctx[x - 1]; - makecontext(&ctx[x], (void (*)(void))bar_wrapper, 2, &f, tidx); - } - void swap(int a, int b) { - swapcontext(&ctx[a], &ctx[b]); - } - void wait() { - --idx; - swapcontext(&ctx[idx + 1], &ctx[idx]); - } -}; -#endif - #ifndef CLK_LOCAL_MEM_FENCE #define CLK_LOCAL_MEM_FENCE (1) #endif @@ -1151,20 +1108,6 @@ struct barrier_t { */ class tile_barrier { public: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - using pb_t = std::shared_ptr; - tile_barrier(pb_t pb) : pbar(pb) {} - - /** - * Copy constructor. Constructs a new tile_barrier from the supplied - * argument "other". - * - * @param[in] other An object of type tile_barrier from which to initialize - * this. - */ - tile_barrier(const tile_barrier& other) restrict(amp,cpu) : pbar(other.pbar) {} -#else - /** * Copy constructor. Constructs a new tile_barrier from the supplied * argument "other". @@ -1173,7 +1116,6 @@ class tile_barrier { * this. */ tile_barrier(const tile_barrier& other) restrict(amp,cpu) {} -#endif /** * Blocks execution of all threads in the thread tile until all threads in @@ -1186,11 +1128,7 @@ class tile_barrier { * wait_with_all_memory_fence(). */ void wait() const restrict(amp) { -#if __KALMAR_ACCELERATOR__ == 1 wait_with_all_memory_fence(); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif } /** @@ -1203,11 +1141,7 @@ class tile_barrier { * before hitting the barrier. This is identical to wait(). */ void wait_with_all_memory_fence() const restrict(amp) { -#if __KALMAR_ACCELERATOR__ == 1 amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif } /** @@ -1220,11 +1154,7 @@ class tile_barrier { * after the barrier are executed before hitting the barrier. */ void wait_with_global_memory_fence() const restrict(amp) { -#if __KALMAR_ACCELERATOR__ == 1 amp_barrier(CLK_GLOBAL_MEM_FENCE); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif } /** @@ -1238,20 +1168,11 @@ class tile_barrier { * hitting the barrier. 
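     *
     * A sketch of the typical pattern (illustrative; assumes a 64-wide tile,
     * a tiled_index<64> named tidx, and a placeholder compute() function):
     *
     *   tile_static int scratch[64];
     *   scratch[tidx.local[0]] = compute(tidx.global);
     *   tidx.barrier.wait_with_tile_static_memory_fence();
     *   int left = scratch[(tidx.local[0] + 63) % 64];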
*/ void wait_with_tile_static_memory_fence() const restrict(amp) { -#if __KALMAR_ACCELERATOR__ == 1 amp_barrier(CLK_LOCAL_MEM_FENCE); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif } private: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - tile_barrier() restrict(amp,cpu) = default; - pb_t pbar; -#else - tile_barrier() restrict(amp) {} -#endif + tile_barrier() restrict(amp) = default; template friend class tiled_index; @@ -1397,16 +1318,7 @@ class tiled_index { tiled_index(const index<3>& g) restrict(amp, cpu) : global(g) {} private: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - tiled_index(int a0, int a1, int a2, int b0, int b1, int b2, - int c0, int c1, int c2, tile_barrier& pb) restrict(amp,cpu) - : global(a2, a1, a0), local(b2, b1, b0), tile(c2, c1, c0), - tile_origin(a2 - b2, a1 - b1, a0 - b0), barrier(pb), tile_extent(D0, D1, D2) {} -#endif - - __attribute__((annotate("__cxxamp_opencl_index"))) -#if __KALMAR_ACCELERATOR__ == 1 - __attribute__((always_inline)) tiled_index() restrict(amp) + tiled_index() restrict(amp) : global(index<3>(amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0))), local(index<3>(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0))), tile(index<3>(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0))), @@ -1414,20 +1326,14 @@ class tiled_index { amp_get_global_id(1)-amp_get_local_id(1), amp_get_global_id(0)-amp_get_local_id(0))), tile_extent(D0, D1, D2) -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index() restrict(amp, cpu) -#else - __attribute__((always_inline)) tiled_index() restrict(amp) -#endif // __KALMAR_ACCELERATOR__ {} - template friend - void parallel_for_each(const accelerator_view&, tiled_extent, const K&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - template friend - void partitioned_task_tile(K const&, tiled_extent const&, int); -#endif + template + friend + void parallel_for_each( + const accelerator_view&, const tiled_extent&, const K&); + friend + struct Kalmar::Indexer; }; /** @@ -1535,33 +1441,19 @@ class tiled_index { tiled_index(const index<1>& g) restrict(amp, cpu) : global(g) {} private: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index(int a, int b, int c, tile_barrier& pb) restrict(amp, cpu) - : global(a), local(b), tile(c), tile_origin(a - b), barrier(pb), tile_extent(D0) {} -#endif - - __attribute__((annotate("__cxxamp_opencl_index"))) -#if __KALMAR_ACCELERATOR__ == 1 - __attribute__((always_inline)) tiled_index() restrict(amp) + tiled_index() restrict(amp) : global(index<1>(amp_get_global_id(0))), local(index<1>(amp_get_local_id(0))), tile(index<1>(amp_get_group_id(0))), tile_origin(index<1>(amp_get_global_id(0)-amp_get_local_id(0))), tile_extent(D0) -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index() restrict(amp,cpu) -#else - __attribute__((always_inline)) tiled_index() restrict(amp) -#endif // __KALMAR_ACCELERATOR__ {} - template friend - void parallel_for_each(const accelerator_view&, tiled_extent, const K&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - template friend - void partitioned_task_tile(K const&, tiled_extent const&, int); -#endif + template friend + void parallel_for_each( + const accelerator_view&, const tiled_extent&, const K&); + friend + struct Kalmar::Indexer; }; /** @@ -1671,34 +1563,21 @@ class tiled_index { tiled_index(const index<2>& g) 
restrict(amp, cpu) : global(g) {} private: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - tiled_index(int a0, int a1, int b0, int b1, int c0, int c1, tile_barrier& tbar) restrict(amp, cpu) - : global(a1, a0), local(b1, b0), tile(c1, c0), tile_origin(a1 - b1, a0 - b0), barrier(tbar), tile_extent(D0, D1) {} -#endif - - __attribute__((annotate("__cxxamp_opencl_index"))) -#if __KALMAR_ACCELERATOR__ == 1 - __attribute__((always_inline)) tiled_index() restrict(amp) + tiled_index() restrict(amp) : global(index<2>(amp_get_global_id(1), amp_get_global_id(0))), local(index<2>(amp_get_local_id(1), amp_get_local_id(0))), tile(index<2>(amp_get_group_id(1), amp_get_group_id(0))), tile_origin(index<2>(amp_get_global_id(1)-amp_get_local_id(1), amp_get_global_id(0)-amp_get_local_id(0))), tile_extent(D0, D1) -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index() restrict(amp,cpu) -#else - __attribute__((always_inline)) tiled_index() restrict(amp) -#endif // __KALMAR_ACCELERATOR__ {} - template friend - void parallel_for_each(const accelerator_view&, tiled_extent, const K&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - template friend - void partitioned_task_tile(K const&, tiled_extent const&, int); -#endif + template + friend + void parallel_for_each( + const accelerator_view&, const tiled_extent&, const K&); + friend + struct Kalmar::Indexer; }; // ------------------------------------------------------------------------ @@ -2033,182 +1912,6 @@ class tiled_extent : public extent<1> /** @} */ }; -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -#define SSIZE 1024 * 10 -template -struct cpu_helper -{ - static inline void call(const Kernel& k, index& idx, const extent& ext) restrict(amp,cpu) { - int i; - for (i = 0; i < ext[N]; ++i) { - idx[N] = i; - cpu_helper::call(k, idx, ext); - } - } -}; -template -struct cpu_helper -{ - static inline void call(const Kernel& k, const index& idx, const extent& ext) restrict(amp,cpu) { - (const_cast(k))(idx); - } -}; - -template -void partitioned_task(const Kernel& ker, const extent& ext, int part) { - index idx; - int start = ext[0] * part / Kalmar::NTHREAD; - int end = ext[0] * (part + 1) / Kalmar::NTHREAD; - for (int i = start; i < end; i++) { - idx[0] = i; - cpu_helper<1, Kernel, N>::call(ker, idx, ext); - } -} - -template -void partitioned_task_tile(Kernel const& f, tiled_extent const& ext, int part) { - int start = (ext[0] / D0) * part / Kalmar::NTHREAD; - int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD; - int stride = end - start; - if (stride == 0) - return; - char *stk = new char[D0 * SSIZE]; - tiled_index *tidx = new tiled_index[D0]; - tile_barrier::pb_t amp_bar = std::make_shared(D0); - tile_barrier tbar(amp_bar); - for (int tx = start; tx < end; tx++) { - int id = 0; - char *sp = stk; - tiled_index *tip = tidx; - for (int x = 0; x < D0; x++) { - new (tip) tiled_index(tx * D0 + x, x, tx, tbar); - amp_bar->setctx(++id, sp, f, tip, SSIZE); - sp += SSIZE; - ++tip; - } - amp_bar->idx = 0; - while (amp_bar->idx == 0) { - amp_bar->idx = id; - amp_bar->swap(0, id); - } - } - delete [] stk; - delete [] tidx; -} -template -void partitioned_task_tile(Kernel const& f, tiled_extent const& ext, int part) { - int start = (ext[0] / D0) * part / Kalmar::NTHREAD; - int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD; - int stride = end - start; - if (stride == 0) - return; - char *stk = new char[D1 * D0 * SSIZE]; - tiled_index *tidx = new tiled_index[D0 * D1]; - tile_barrier::pb_t amp_bar 
= std::make_shared(D0 * D1); - tile_barrier tbar(amp_bar); - - for (int tx = 0; tx < ext[1] / D1; tx++) - for (int ty = start; ty < end; ty++) { - int id = 0; - char *sp = stk; - tiled_index *tip = tidx; - for (int x = 0; x < D1; x++) - for (int y = 0; y < D0; y++) { - new (tip) tiled_index(D1 * tx + x, D0 * ty + y, x, y, tx, ty, tbar); - amp_bar->setctx(++id, sp, f, tip, SSIZE); - ++tip; - sp += SSIZE; - } - amp_bar->idx = 0; - while (amp_bar->idx == 0) { - amp_bar->idx = id; - amp_bar->swap(0, id); - } - } - delete [] stk; - delete [] tidx; -} - -template -void partitioned_task_tile(Kernel const& f, tiled_extent const& ext, int part) { - int start = (ext[0] / D0) * part / Kalmar::NTHREAD; - int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD; - int stride = end - start; - if (stride == 0) - return; - char *stk = new char[D2 * D1 * D0 * SSIZE]; - tiled_index *tidx = new tiled_index[D0 * D1 * D2]; - tile_barrier::pb_t amp_bar = std::make_shared(D0 * D1 * D2); - tile_barrier tbar(amp_bar); - - for (int i = 0; i < ext[2] / D2; i++) - for (int j = 0; j < ext[1] / D1; j++) - for(int k = start; k < end; k++) { - int id = 0; - char *sp = stk; - tiled_index *tip = tidx; - for (int x = 0; x < D2; x++) - for (int y = 0; y < D1; y++) - for (int z = 0; z < D0; z++) { - new (tip) tiled_index(D2 * i + x, - D1 * j + y, - D0 * k + z, - x, y, z, i, j, k, tbar); - amp_bar->setctx(++id, sp, f, tip, SSIZE); - ++tip; - sp += SSIZE; - } - amp_bar->idx = 0; - while (amp_bar->idx == 0) { - amp_bar->idx = id; - amp_bar->swap(0, id); - } - } - delete [] stk; - delete [] tidx; -} - -template -void launch_cpu_task(const std::shared_ptr& pQueue, Kernel const& f, - extent const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task, std::cref(f), std::cref(compute_domain), i); -} - -template -void launch_cpu_task(const std::shared_ptr& pQueue, Kernel const& f, - tiled_extent const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task_tile, - std::cref(f), std::cref(compute_domain), i); -} - -template -void launch_cpu_task(const std::shared_ptr& pQueue, Kernel const& f, - tiled_extent const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task_tile, - std::cref(f), std::cref(compute_domain), i); -} - -template -void launch_cpu_task(const std::shared_ptr& pQueue, Kernel const& f, - tiled_extent const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task_tile, - std::cref(f), std::cref(compute_domain), i); -} - -#endif - // ------------------------------------------------------------------------ // utility helper classes for array_view // ------------------------------------------------------------------------ @@ -5347,757 +5050,102 @@ completion_future copy_async(const array_view& src, const array& des } // ------------------------------------------------------------------------ -// atomic functions +// parallel_for_each // ------------------------------------------------------------------------ -/** @{ */ -/** - * Atomically read the value stored in dest , replace it with the value given - * in val and return the old value to the caller. This function provides - * overloads for int , unsigned int and float parameters. 
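 *
 * A minimal sketch (illustrative; assumes `flag` is an unsigned int visible
 * to all participating threads): claiming a flag so that exactly one thread
 * performs follow-up work:
 *
 *   if (atomic_exchange(&flag, 1u) == 0u) {
 *       // this thread observed the old value 0 and now owns the flag
 *   }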
- * - * @param[out] dest A pointer to the location which needs to be atomically - * modified. The location may reside within a - * concurrency::array or concurrency::array_view or within a - * tile_static variable. - * @param[in] val The new value to be stored in the location pointed to be dest - * @return These functions return the old value which was previously stored at - * dest, and that was atomically replaced. These functions always - * succeed. - */ -#if __KALMAR_ACCELERATOR__ == 1 -extern "C" unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val) restrict(amp); -extern "C" int atomic_exchange_int(int *p, int val) restrict(amp); -extern "C" float atomic_exchange_float(float *p, float val) restrict(amp); - -static inline unsigned int atomic_exchange(unsigned int * dest, unsigned int val) restrict(amp,cpu) { - return atomic_exchange_unsigned(dest, val); -} -static inline int atomic_exchange(int * dest, int val) restrict(amp,cpu) { - return atomic_exchange_int(dest, val); -} -static inline float atomic_exchange(float * dest, float val) restrict(amp,cpu) { - return atomic_exchange_float(dest, val); -} -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val); -int atomic_exchange_int(int *p, int val); -float atomic_exchange_float(float *p, float val); - -static inline unsigned int atomic_exchange(unsigned int *dest, unsigned int val) restrict(amp,cpu) { - return atomic_exchange_unsigned(dest, val); -} -static inline int atomic_exchange(int *dest, int val) restrict(amp,cpu) { - return atomic_exchange_int(dest, val); -} -static inline float atomic_exchange(float *dest, float val) restrict(amp,cpu) { - return atomic_exchange_float(dest, val); -} -#else -extern unsigned int atomic_exchange(unsigned int *dest, unsigned int val) restrict(amp,cpu); -extern int atomic_exchange(int *dest, int val) restrict(amp, cpu); -extern float atomic_exchange(float *dest, float val) restrict(amp, cpu); -#endif -/** @} */ - -/** @{ */ -/** - * These functions attempt to perform these three steps atomically: - * 1. Read the value stored in the location pointed to by dest - * 2. Compare the value read in the previous step with the value contained in - * the location pointed by expected_val - * 3. Carry the following operations depending on the result of the comparison - * of the previous step: - * a. If the values are identical, then the function tries to atomically - * change the value pointed by dest to the value in val. The function - * indicates by its return value whether this transformation has been - * successful or not. - * b. If the values are not identical, then the function stores the value - * read in step (1) into the location pointed to by expected_val, and - * returns false. - * - * @param[out] dest An pointer to the location which needs to be atomically - * modified. The location may reside within a - * concurrency::array or concurrency::array_view or within a - * tile_static variable. - * @param[out] expected_val A pointer to a local variable or function - * parameter. Upon calling the function, the location - * pointed by expected_val contains the value the - * caller expects dest to contain. Upon return from - * the function, expected_val will contain the most - * recent value read from dest. 
- * @param[in] val The new value to be stored in the location pointed to be dest - * @return The return value indicates whether the function has been successful - * in atomically reading, comparing and modifying the contents of the - * memory location. - */ -#if __KALMAR_ACCELERATOR__ == 1 -extern "C" unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val) restrict(amp); -extern "C" int atomic_compare_exchange_int(int *dest, int expected_val, int val) restrict(amp); - -static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) restrict(amp,cpu) { - *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val); - return (*dest == val); -} -static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) restrict(amp,cpu) { - *expected_val = atomic_compare_exchange_int(dest, *expected_val, val); - return (*dest == val); -} -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val); -int atomic_compare_exchange_int(int *dest, int expected_val, int val); - -static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) restrict(amp,cpu) { - *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val); - return (*dest == val); -} -static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) restrict(amp,cpu) { - *expected_val = atomic_compare_exchange_int(dest, *expected_val, val); - return (*dest == val); -} -#else -extern unsigned int atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) restrict(amp,cpu); -extern int atomic_compare_exchange(int *dest, int *expected_val, int val) restrict(amp, cpu); -#endif -/** @} */ - - -/** @{ */ -/** - * Atomically read the value stored in dest, apply the binary numerical - * operation specific to the function with the read value and val serving as - * input operands, and store the result back to the location pointed by dest. - * - * In terms of sequential semantics, the operation performed by any of the - * above function is described by the following piece of pseudo-code: - * - * *dest = *dest @f$\otimes@f$ val; - * - * Where the operation denoted by @f$\otimes@f$ is one of: addition - * (atomic_fetch_add), subtraction (atomic_fetch_sub), find maximum - * (atomic_fetch_max), find minimum (atomic_fetch_min), bit-wise AND - * (atomic_fetch_and), bit-wise OR (atomic_fetch_or), bit-wise XOR - * (atomic_fetch_xor). - * - * @param[out] dest An pointer to the location which needs to be atomically - * modified. The location may reside within a - * concurrency::array or concurrency::array_view or within a - * tile_static variable. - * @param[in] val The second operand which participates in the calculation of - * the binary operation whose result is stored into the - * location pointed to be dest. - * @return These functions return the old value which was previously stored at - * dest, and that was atomically replaced. These functions always - * succeed. 
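 *
 * For example (illustrative; assumes `hist` is an array_view of int and
 * `bin` an index computed per thread), a histogram update where many
 * threads may hit the same bin concurrently:
 *
 *   int old_count = atomic_fetch_add(&hist[bin], 1); // returns the prior value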
- */ -#if __KALMAR_ACCELERATOR__ == 1 -extern "C" unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val) restrict(amp); -extern "C" int atomic_add_int(int *p, int val) restrict(amp); -extern "C" float atomic_add_float(float *p, float val) restrict(amp); - -static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) restrict(amp,cpu) { - return atomic_add_unsigned(x, y); -} -static inline int atomic_fetch_add(int *x, int y) restrict(amp,cpu) { - return atomic_add_int(x, y); -} -static inline float atomic_fetch_add(float *x, float y) restrict(amp,cpu) { - return atomic_add_float(x, y); -} - -extern "C" unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val) restrict(amp); -extern "C" int atomic_sub_int(int *p, int val) restrict(amp); -extern "C" float atomic_sub_float(float *p, float val) restrict(amp); - -static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) restrict(amp,cpu) { - return atomic_sub_unsigned(x, y); -} -static inline int atomic_fetch_sub(int *x, int y) restrict(amp,cpu) { - return atomic_sub_int(x, y); -} -static inline int atomic_fetch_sub(float *x, float y) restrict(amp,cpu) { - return atomic_sub_float(x, y); -} - -extern "C" unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val) restrict(amp); -extern "C" int atomic_and_int(int *p, int val) restrict(amp); - -static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) restrict(amp,cpu) { - return atomic_and_unsigned(x, y); -} -static inline int atomic_fetch_and(int *x, int y) restrict(amp,cpu) { - return atomic_and_int(x, y); -} - -extern "C" unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val) restrict(amp); -extern "C" int atomic_or_int(int *p, int val) restrict(amp); - -static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) restrict(amp,cpu) { - return atomic_or_unsigned(x, y); -} -static inline int atomic_fetch_or(int *x, int y) restrict(amp,cpu) { - return atomic_or_int(x, y); -} - -extern "C" unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val) restrict(amp); -extern "C" int atomic_xor_int(int *p, int val) restrict(amp); - -static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) restrict(amp,cpu) { - return atomic_xor_unsigned(x, y); -} -static inline int atomic_fetch_xor(int *x, int y) restrict(amp,cpu) { - return atomic_xor_int(x, y); -} -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val); -int atomic_add_int(int *p, int val); -float atomic_add_float(float *p, float val); - -static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) restrict(amp,cpu) { - return atomic_add_unsigned(x, y); -} -static inline int atomic_fetch_add(int *x, int y) restrict(amp,cpu) { - return atomic_add_int(x, y); -} -static inline float atomic_fetch_add(float *x, float y) restrict(amp,cpu) { - return atomic_add_float(x, y); -} - -unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val); -int atomic_sub_int(int *p, int val); -float atomic_sub_float(float *p, float val); - -static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) restrict(amp,cpu) { - return atomic_sub_unsigned(x, y); -} -static inline int atomic_fetch_sub(int *x, int y) restrict(amp,cpu) { - return atomic_sub_int(x, y); -} -static inline float atomic_fetch_sub(float *x, float y) restrict(amp,cpu) { - return atomic_sub_float(x, y); -} - -unsigned int atomic_and_unsigned(unsigned int *p, 
unsigned int val);
-int atomic_and_int(int *p, int val);
-
-static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) restrict(amp,cpu) {
-    return atomic_and_unsigned(x, y);
-}
-static inline int atomic_fetch_and(int *x, int y) restrict(amp,cpu) {
-    return atomic_and_int(x, y);
-}
-
-unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val);
-int atomic_or_int(int *p, int val);
-
-static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) restrict(amp,cpu) {
-    return atomic_or_unsigned(x, y);
-}
-static inline int atomic_fetch_or(int *x, int y) restrict(amp,cpu) {
-    return atomic_or_int(x, y);
-}
-
-unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val);
-int atomic_xor_int(int *p, int val);
-
-static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) restrict(amp,cpu) {
-    return atomic_xor_unsigned(x, y);
-}
-static inline int atomic_fetch_xor(int *x, int y) restrict(amp,cpu) {
-    return atomic_xor_int(x, y);
-}
-#else
-extern unsigned atomic_fetch_add(unsigned *x, unsigned y) restrict(amp,cpu);
-extern int atomic_fetch_add(int *x, int y) restrict(amp, cpu);
-extern float atomic_fetch_add(float *x, float y) restrict(amp, cpu);
-
-extern unsigned atomic_fetch_sub(unsigned *x, unsigned y) restrict(amp,cpu);
-extern int atomic_fetch_sub(int *x, int y) restrict(amp, cpu);
-extern float atomic_fetch_sub(float *x, float y) restrict(amp, cpu);
-
-extern unsigned atomic_fetch_and(unsigned *x, unsigned y) restrict(amp,cpu);
-extern int atomic_fetch_and(int *x, int y) restrict(amp, cpu);
-
-extern unsigned atomic_fetch_or(unsigned *x, unsigned y) restrict(amp,cpu);
-extern int atomic_fetch_or(int *x, int y) restrict(amp, cpu);
-
-extern unsigned atomic_fetch_xor(unsigned *x, unsigned y) restrict(amp,cpu);
-extern int atomic_fetch_xor(int *x, int y) restrict(amp, cpu);
-#endif
-
-#if __KALMAR_ACCELERATOR__ == 1
-extern "C" unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val) restrict(amp);
-extern "C" int atomic_max_int(int *p, int val) restrict(amp);
-
-static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) restrict(amp) {
-    return atomic_max_unsigned(x, y);
-}
-static inline int atomic_fetch_max(int *x, int y) restrict(amp) {
-    return atomic_max_int(x, y);
-}
-
-extern "C" unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val) restrict(amp);
-extern "C" int atomic_min_int(int *p, int val) restrict(amp);
-
-static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) restrict(amp) {
-    return atomic_min_unsigned(x, y);
-}
-static inline int atomic_fetch_min(int *x, int y) restrict(amp) {
-    return atomic_min_int(x, y);
-}
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val);
-int atomic_max_int(int *p, int val);
-
-static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) restrict(amp) {
-    return atomic_max_unsigned(x, y);
-}
-static inline int atomic_fetch_max(int *x, int y) restrict(amp) {
-    return atomic_max_int(x, y);
-}
-
-unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val);
-int atomic_min_int(int *p, int val);
-
-static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) restrict(amp) {
-    return atomic_min_unsigned(x, y);
-}
-static inline int atomic_fetch_min(int *x, int y) restrict(amp) {
-    return atomic_min_int(x, y);
+template<int n, typename Kernel>
+inline
+void parallel_for_each(const extent<n>& compute_domain, const Kernel& f)
+{
+    parallel_for_each(
+        accelerator::get_auto_selection_view(), compute_domain, f);
+}
 }
-#else
-extern int atomic_fetch_max(int * dest, int val) restrict(amp, cpu);
-extern unsigned int atomic_fetch_max(unsigned int * dest, unsigned int val) restrict(amp, cpu);
-
-extern int atomic_fetch_min(int * dest, int val) restrict(amp, cpu);
-extern unsigned int atomic_fetch_min(unsigned int * dest, unsigned int val) restrict(amp, cpu);
-#endif
-
-/** @} */
-/** @{ */
-/**
- * Atomically increment or decrement the value stored at the location point to
- * by dest.
- *
- * @param[inout] dest An pointer to the location which needs to be atomically
- *                    modified. The location may reside within a
- *                    concurrency::array or concurrency::array_view or within a
- *                    tile_static variable.
- * @return These functions return the old value which was previously stored at
- *         dest, and that was atomically replaced. These functions always
- *         succeed.
- */
-#if __KALMAR_ACCELERATOR__ == 1
-extern "C" unsigned int atomic_inc_unsigned(unsigned int *p) restrict(amp);
-extern "C" int atomic_inc_int(int *p) restrict(amp);
-
-static inline unsigned int atomic_fetch_inc(unsigned int *x) restrict(amp,cpu) {
-    return atomic_inc_unsigned(x);
-}
-static inline int atomic_fetch_inc(int *x) restrict(amp,cpu) {
-    return atomic_inc_int(x);
+template<typename Kernel, int... dims>
+inline
+void parallel_for_each(
+    const tiled_extent<dims...>& compute_domain, const Kernel& f)
+{
+    parallel_for_each(
+        accelerator::get_auto_selection_view(), compute_domain, f);
 }
-extern "C" unsigned int atomic_dec_unsigned(unsigned int *p) restrict(amp);
-extern "C" int atomic_dec_int(int *p) restrict(amp);
+template<int n>
+inline
+void validate_compute_domain(const Concurrency::extent<n>& compute_domain)
+{
+    std::size_t sz{1};
+    for (auto i = 0; i != n; ++i) {
+        sz *= compute_domain[i];
-static inline unsigned int atomic_fetch_dec(unsigned int *x) restrict(amp,cpu) {
-    return atomic_dec_unsigned(x);
-}
-static inline int atomic_fetch_dec(int *x) restrict(amp,cpu) {
-    return atomic_dec_int(x);
+        if (sz < 1) throw invalid_compute_domain{"Extent is not positive."};
+        if (sz > UINT_MAX) throw invalid_compute_domain{"Extent is too large."};
+    }
 }
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-unsigned int atomic_inc_unsigned(unsigned int *p);
-int atomic_inc_int(int *p);
-static inline unsigned atomic_fetch_inc(unsigned *x) restrict(amp,cpu) {
-    return atomic_inc_unsigned(x);
-}
-static inline int atomic_fetch_inc(int *x) restrict(amp,cpu) {
-    return atomic_inc_int(x);
-}
+template<int n, typename Kernel>
+inline
+void parallel_for_each(
+    const accelerator_view& av,
+    const extent<n>& compute_domain,
+    const Kernel& f)
+{
+    if (av.get_accelerator().get_device_path() == L"cpu") {
+        throw runtime_exception{
+            Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL};
+    }
-unsigned int atomic_dec_unsigned(unsigned int *p);
-int atomic_dec_int(int *p);
+    validate_compute_domain(compute_domain);
-static inline unsigned atomic_fetch_dec(unsigned *x) restrict(amp,cpu) {
-    return atomic_dec_unsigned(x);
-}
-static inline int atomic_fetch_dec(int *x) restrict(amp,cpu) {
-    return atomic_dec_int(x);
+    Kalmar::launch_kernel(av.pQueue, compute_domain, f);
 }
-#else
-extern int atomic_fetch_inc(int * _Dest) restrict(amp, cpu);
-extern unsigned int atomic_fetch_inc(unsigned int * _Dest) restrict(amp, cpu);
-
-extern int atomic_fetch_dec(int * _Dest) restrict(amp, cpu);
-extern unsigned int atomic_fetch_dec(unsigned int * _Dest) restrict(amp, cpu);
-#endif
-
-/** @} */
-
-// ------------------------------------------------------------------------
-// parallel_for_each
-// ------------------------------------------------------------------------
-template <int N, typename Kernel>
-void parallel_for_each(const accelerator_view&, extent<N> compute_domain, const Kernel& f);
-template <int D0, int D1, int D2, typename Kernel>
-void parallel_for_each(const accelerator_view& accl_view,
-                       tiled_extent<D0,D1,D2> compute_domain, const Kernel& f);
+// parallel_for_each, tiled
+template<int...>
+inline
+void validate_tile_dims()
+{}
-template <int D0, int D1, typename Kernel>
-void parallel_for_each(const accelerator_view& accl_view,
-                       tiled_extent<D0,D1> compute_domain, const Kernel& f);
-
-template <int D0, typename Kernel>
-void parallel_for_each(const accelerator_view& accl_view,
-                       tiled_extent<D0> compute_domain, const Kernel& f);
-
-template <int N, typename Kernel>
-void parallel_for_each(extent<N> compute_domain, const Kernel& f){
-    auto que = Kalmar::get_availabe_que(f);
-    const accelerator_view av(que);
-    parallel_for_each(av, compute_domain, f);
-}
-
-template <int D0, int D1, int D2, typename Kernel>
-void parallel_for_each(tiled_extent<D0,D1,D2> compute_domain, const Kernel& f) {
-    auto que = Kalmar::get_availabe_que(f);
-    const accelerator_view av(que);
-    parallel_for_each(av, compute_domain, f);
-}
+template<int dim, int... dims>
+inline
+void validate_tile_dims()
+{
+    static_assert(
+        dim <= 1024, "The maximum number of threads in a tile is 1024.");
-template <int D0, int D1, typename Kernel>
-void parallel_for_each(tiled_extent<D0,D1> compute_domain, const Kernel& f) {
-    auto que = Kalmar::get_availabe_que(f);
-    const accelerator_view av(que);
-    parallel_for_each(av, compute_domain, f);
+    validate_tile_dims<dims...>();
 }
-template <int D0, typename Kernel>
-void parallel_for_each(tiled_extent<D0> compute_domain, const Kernel& f) {
-    auto que = Kalmar::get_availabe_que(f);
-    const accelerator_view av(que);
-    parallel_for_each(av, compute_domain, f);
-}
-template <int N, typename Kernel, typename _Tp>
-struct pfe_helper
+template<int... dims>
+inline
+void validate_tiled_compute_domain(const tiled_extent<dims...>& compute_domain)
 {
-    static inline void call(Kernel& k, _Tp& idx) restrict(amp,cpu) {
-        int i;
-        for (i = 0; i < k.ext[N - 1]; ++i) {
-            idx[N - 1] = i;
-            pfe_helper<N - 1, Kernel, _Tp>::call(k, idx);
+    constexpr int tmp[]{dims...};
+
+    for (auto i = 0u; i != compute_domain.rank; ++i) {
+        if (compute_domain[i] % tmp[i]) {
+            throw invalid_compute_domain{"Extent not divisible by tile size."};
         }
     }
-};
-template <typename Kernel, typename _Tp>
-struct pfe_helper<0, Kernel, _Tp>
-{
-    static inline void call(Kernel& k, _Tp& idx) restrict(amp,cpu) {
-#if __KALMAR_ACCELERATOR__ == 1
-        k.k(idx);
-#endif
-    }
-};
+}
-template <int N, typename Kernel>
-class pfe_wrapper
+template<typename Kernel, int... dims>
+inline
+void parallel_for_each(
+    const accelerator_view& av,
+    const tiled_extent<dims...>& compute_domain,
+    const Kernel& f)
 {
-public:
-    explicit pfe_wrapper(extent<N>& other, const Kernel& f) restrict(amp,cpu)
-        : ext(other), k(f) {}
-    void operator() (index<N> idx) restrict(amp,cpu) {
-        pfe_helper<N, pfe_wrapper<N, Kernel>, index<N>>::call(*this, idx);
-    }
-private:
-    const extent<N> ext;
-    const Kernel k;
-    template <int K, typename Ker, typename _Tp>
-        friend struct pfe_helper;
-};
+    validate_tile_dims<dims...>();
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunused-variable"
-template <int N, typename Kernel>
-__attribute__((noinline,used))
-void parallel_for_each(const accelerator_view& av, extent<N> compute_domain,
-                       const Kernel& f) restrict(cpu, amp) {
-#if __KALMAR_ACCELERATOR__ != 1
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    int* foo1 = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
-    auto bar = &pfe_wrapper<N, Kernel>::operator();
-    auto qq = &index<N>::__cxxamp_opencl_index;
-    int* foo = reinterpret_cast<int*>(&pfe_wrapper<N, Kernel>::__cxxamp_trampoline);
-#endif
-    size_t compute_domain_size = 1;
-    for(int i = 0 ; i < N ; i++)
-    {
-        if(compute_domain[i]<=0)
-            throw invalid_compute_domain("Extent is less or equal than 0.");
-        if
(static_cast(compute_domain[i]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - compute_domain_size *= static_cast(compute_domain[i]); - if (compute_domain_size > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - } - - size_t ext[3] = {static_cast(compute_domain[N - 1]), - static_cast(compute_domain[N - 2]), - static_cast(compute_domain[N - 3])}; -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - launch_cpu_task(av.pQueue, f, compute_domain); - return; - } -#endif if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); + throw runtime_exception{ + Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL}; } - const pfe_wrapper _pf(compute_domain, f); - Kalmar::mcw_cxxamp_launch_kernel, 3>(av.pQueue, ext, NULL, _pf); -#else -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - int* foo1 = reinterpret_cast(&Kernel::__cxxamp_trampoline); -#endif - auto bar = &pfe_wrapper::operator(); - auto qq = &index::__cxxamp_opencl_index; - int* foo = reinterpret_cast(&pfe_wrapper::__cxxamp_trampoline); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -//1D parallel_for_each, nontiled -template -__attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av, - extent<1> compute_domain, const Kernel& f) restrict(cpu,amp) { -#if __KALMAR_ACCELERATOR__ != 1 - if(compute_domain[0]<=0) { - throw invalid_compute_domain("Extent is less or equal than 0."); - } - if (static_cast(compute_domain[0]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - launch_cpu_task(av.pQueue, f, compute_domain); - return; - } -#endif - size_t ext = compute_domain[0]; - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - Kalmar::mcw_cxxamp_launch_kernel(av.pQueue, &ext, NULL, f); -#else //if __KALMAR_ACCELERATOR__ != 1 - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - auto foo = &Kernel::__cxxamp_trampoline; - auto bar = &Kernel::operator(); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -//2D parallel_for_each, nontiled -template -__attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av, - extent<2> compute_domain, const Kernel& f) restrict(cpu,amp) { -#if __KALMAR_ACCELERATOR__ != 1 - if(compute_domain[0]<=0 || compute_domain[1]<=0) { - throw invalid_compute_domain("Extent is less or equal than 0."); - } - if (static_cast(compute_domain[0]) * static_cast(compute_domain[1]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - launch_cpu_task(av.pQueue, f, compute_domain); - return; - } -#endif - size_t ext[2] = {static_cast(compute_domain[1]), - static_cast(compute_domain[0])}; - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - Kalmar::mcw_cxxamp_launch_kernel(av.pQueue, ext, NULL, f); -#else //if __KALMAR_ACCELERATOR__ != 1 - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - auto foo = &Kernel::__cxxamp_trampoline; 
- auto bar = &Kernel::operator(); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -//3D parallel_for_each, nontiled -template -__attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av, - extent<3> compute_domain, const Kernel& f) restrict(cpu,amp) { -#if __KALMAR_ACCELERATOR__ != 1 - if(compute_domain[0]<=0 || compute_domain[1]<=0 || compute_domain[2]<=0) { - throw invalid_compute_domain("Extent is less or equal than 0."); - } - if (static_cast(compute_domain[0]) * static_cast(compute_domain[1]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[1]) * static_cast(compute_domain[2]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[0]) * static_cast(compute_domain[2]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[0]) * static_cast(compute_domain[1]) * static_cast(compute_domain[2]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - launch_cpu_task(av.pQueue, f, compute_domain); - return; - } -#endif - size_t ext[3] = {static_cast(compute_domain[2]), - static_cast(compute_domain[1]), - static_cast(compute_domain[0])}; - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - Kalmar::mcw_cxxamp_launch_kernel(av.pQueue, ext, NULL, f); -#else //if __KALMAR_ACCELERATOR__ != 1 - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - auto foo = &Kernel::__cxxamp_trampoline; - auto bar = &Kernel::operator(); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -//1D parallel_for_each, tiled -template -__attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av, - tiled_extent compute_domain, const Kernel& f) restrict(cpu,amp) { -#if __KALMAR_ACCELERATOR__ != 1 - if(compute_domain[0]<=0) { - throw invalid_compute_domain("Extent is less or equal than 0."); - } - if (static_cast(compute_domain[0]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - size_t ext = compute_domain[0]; - size_t tile = compute_domain.tile_dim0; - static_assert( compute_domain.tile_dim0 <= 1024, "The maximum nuimber of threads in a tile is 1024"); - if(ext % tile != 0) { - throw invalid_compute_domain("Extent can't be evenly divisible by tile size."); - } -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - launch_cpu_task(av.pQueue, f, compute_domain); - } else -#endif - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - Kalmar::mcw_cxxamp_launch_kernel(av.pQueue, &ext, &tile, f); -#else //if __KALMAR_ACCELERATOR__ != 1 - tiled_index this_is_used_to_instantiate_the_right_index; - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - auto foo = &Kernel::__cxxamp_trampoline; - auto bar = &Kernel::operator(); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -//2D parallel_for_each, tiled -template -__attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av, - 
tiled_extent compute_domain, const Kernel& f) restrict(cpu,amp) { -#if __KALMAR_ACCELERATOR__ != 1 - if(compute_domain[0]<=0 || compute_domain[1]<=0) { - throw invalid_compute_domain("Extent is less or equal than 0."); - } - if (static_cast(compute_domain[0]) * static_cast(compute_domain[1]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - size_t ext[2] = { static_cast(compute_domain[1]), - static_cast(compute_domain[0])}; - size_t tile[2] = { compute_domain.tile_dim1, - compute_domain.tile_dim0}; - static_assert( (compute_domain.tile_dim1 * compute_domain.tile_dim0)<= 1024, "The maximum nuimber of threads in a tile is 1024"); - if((ext[0] % tile[0] != 0) || (ext[1] % tile[1] != 0)) { - throw invalid_compute_domain("Extent can't be evenly divisible by tile size."); - } -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - launch_cpu_task(av.pQueue, f, compute_domain); - } else -#endif - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - Kalmar::mcw_cxxamp_launch_kernel(av.pQueue, ext, tile, f); -#else //if __KALMAR_ACCELERATOR__ != 1 - tiled_index this_is_used_to_instantiate_the_right_index; - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - auto foo = &Kernel::__cxxamp_trampoline; - auto bar = &Kernel::operator(); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -//3D parallel_for_each, tiled -template -__attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av, - tiled_extent compute_domain, const Kernel& f) restrict(cpu,amp) { -#if __KALMAR_ACCELERATOR__ != 1 - if(compute_domain[0]<=0 || compute_domain[1]<=0 || compute_domain[2]<=0) { - throw invalid_compute_domain("Extent is less or equal than 0."); - } - if (static_cast(compute_domain[0]) * static_cast(compute_domain[1]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[1]) * static_cast(compute_domain[2]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[0]) * static_cast(compute_domain[2]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[0]) * static_cast(compute_domain[1]) * static_cast(compute_domain[2]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - size_t ext[3] = { static_cast(compute_domain[2]), - static_cast(compute_domain[1]), - static_cast(compute_domain[0])}; - size_t tile[3] = { compute_domain.tile_dim2, - compute_domain.tile_dim1, - compute_domain.tile_dim0}; - static_assert(( compute_domain.tile_dim2 * compute_domain.tile_dim1* compute_domain.tile_dim0)<= 1024, "The maximum nuimber of threads in a tile is 1024"); - if((ext[0] % tile[0] != 0) || (ext[1] % tile[1] != 0) || (ext[2] % tile[2] != 0)) { - throw invalid_compute_domain("Extent can't be evenly divisible by tile size."); - } -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - launch_cpu_task(av.pQueue, f, compute_domain); - } else -#endif - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - Kalmar::mcw_cxxamp_launch_kernel(av.pQueue, ext, tile, f); -#else //if __KALMAR_ACCELERATOR__ != 1 - tiled_index this_is_used_to_instantiate_the_right_index; - //to ensure functor has 
right operator() defined
-//this triggers the trampoline code being emitted
-auto foo = &Kernel::__cxxamp_trampoline;
-auto bar = &Kernel::operator();
-#endif
-}
-#pragma clang diagnostic pop
+    validate_compute_domain(compute_domain);
+    validate_tiled_compute_domain(compute_domain);
+
+    Kalmar::launch_kernel(av.pQueue, compute_domain, f);
+}
 } // namespace Concurrency
diff --git a/include/atomics.hpp b/include/atomics.hpp
new file mode 100644
index 00000000000..c7f541f032c
--- /dev/null
+++ b/include/atomics.hpp
@@ -0,0 +1,271 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+#include
+
+namespace hc
+{
+    namespace atomics
+    {
+        /** @{ */
+        /**
+         * Atomically read the value stored in dest, replace it with the value
+         * given in val and return the old value to the caller. This function
+         * provides overloads for int, unsigned int, int64_t, uint64_t, float
+         * and double parameters.
+         *
+         * @param[out] dest A pointer to the location which needs to be
+         *                  atomically modified. The location may reside within
+         *                  an array, an array_view, global or tile_static
+         *                  memory.
+         * @param[in] val The new value to be stored in the location pointed to
+         *                by dest.
+         * @return These functions return the old value which was previously
+         *         stored at dest, and that was atomically replaced. These
+         *         functions always succeed.
+         */
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_exchange(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_exchange_n(dest, val, __ATOMIC_RELAXED);
+        }
+        inline
+        float atomic_exchange(float* dest, float val) //[[cpu]][[hc]]
+        {
+            static_assert(sizeof(float) == sizeof(unsigned int), "");
+
+            unsigned int ui{};
+            __builtin_memcpy(&ui, &val, sizeof(val));
+
+            unsigned int tmp{
+                atomic_exchange(reinterpret_cast<unsigned int*>(dest), ui)};
+
+            float r{};
+            __builtin_memcpy(&r, &tmp, sizeof(tmp));
+
+            return r;
+        }
+        inline
+        double atomic_exchange(double* dest, double val) //[[cpu]][[hc]]
+        {
+            static_assert(sizeof(double) == sizeof(std::uint64_t), "");
+
+            std::uint64_t ui{};
+            __builtin_memcpy(&ui, &val, sizeof(val));
+
+            std::uint64_t tmp{
+                atomic_exchange(reinterpret_cast<std::uint64_t*>(dest), ui)};
+
+            double r{};
+            __builtin_memcpy(&r, &tmp, sizeof(tmp));
+
+            return r;
+        }
+        /** @} */
+
+        /** @{ */
+        /**
+         * These functions attempt to perform these three steps atomically:
+         * 1. Read the value stored in the location pointed to by dest
+         * 2. Compare the value read in the previous step with the value
+         *    contained in the location pointed to by expected_val
+         * 3. Carry out the following operations depending on the result of
+         *    the comparison of the previous step:
+         *    a. If the values are identical, then the function tries to
+         *       atomically change the value pointed by dest to the value in
+         *       val. The function indicates by its return value whether this
+         *       transformation has been successful or not.
+         *    b. If the values are not identical, then the function stores the
+         *       value read in step (1) into the location pointed to by
+         *       expected_val, and returns false.
+         *
+         * @param[out] dest A pointer to the location which needs to be
+         *                  atomically modified. The location may reside within
+         *                  an array, an array_view, global or tile_static
+         *                  memory.
+         * @param[out] expected_val A pointer to a local variable or function
+         *                          parameter. Upon calling the function, the
+         *                          location pointed by expected_val contains
+         *                          the value the caller expects dest to
+         *                          contain. Upon return from the function,
+         *                          expected_val will contain the most recent
+         *                          value read from dest.
+         * @param[in] val The new value to be stored in the location pointed to
+         *                by dest.
+         * @return The return value indicates whether the function has been
+         *         successful in atomically reading, comparing and modifying
+         *         the contents of the memory location.
+         */
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        bool atomic_compare_exchange(
+            T* dest, T* expected_val, T val) [[cpu]][[hc]]
+        {
+            return __atomic_compare_exchange_n(
+                dest,
+                expected_val,
+                val,
+                false,
+                __ATOMIC_RELAXED,
+                __ATOMIC_RELAXED);
+        }
+        /** @} */
+
+        /** @{ */
+        /**
+         * Atomically read the value stored in dest, apply the binary numerical
+         * operation specific to the function with the read value and val
+         * serving as input operands, and store the result back to the location
+         * pointed by dest.
+         *
+         * In terms of sequential semantics, the operation performed by any of
+         * the above function is described by the following piece of
+         * pseudo-code:
+         *
+         * *dest = *dest @f$\otimes@f$ val;
+         *
+         * Where the operation denoted by @f$\otimes@f$ is one of: addition
+         * (atomic_fetch_add), subtraction (atomic_fetch_sub), find maximum
+         * (atomic_fetch_max), find minimum (atomic_fetch_min), bit-wise AND
+         * (atomic_fetch_and), bit-wise OR (atomic_fetch_or), bit-wise XOR
+         * (atomic_fetch_xor).
+         *
+         * @param[out] dest A pointer to the location which needs to be
+         *                  atomically modified. The location may reside within
+         *                  an array, an array_view, global or tile_static
+         *                  memory.
+         * @param[in] val The second operand which participates in the
+         *                calculation of the binary operation whose result is
+         *                stored into the location pointed to by dest.
+         * @return These functions return the old value which was previously
+         *         stored at dest, and that was atomically replaced. These
+         *         functions always succeed.
+         */
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_add(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_add(dest, val, __ATOMIC_RELAXED);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_sub(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_sub(dest, val, __ATOMIC_RELAXED);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_max(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __sync_fetch_and_max(dest, val);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_min(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __sync_fetch_and_min(dest, val);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_and(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_and(dest, val, __ATOMIC_RELAXED);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_or(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_or(dest, val, __ATOMIC_RELAXED);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_xor(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_xor(dest, val, __ATOMIC_RELAXED);
+        }
+        /** @} */
+
+        /** @{ */
+        /**
+         * Atomically increment or decrement the value stored at the location
+         * pointed to by dest.
+         *
+         * @param[out] dest A pointer to the location which needs to be
+         *                  atomically modified. The location may reside within
+         *                  an array, an array_view, global or tile_static
+         *                  memory.
+         * @return These functions return the old value which was previously
+         *         stored at dest, and that was atomically replaced. These
+         *         functions always succeed.
+         */
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_inc(T* dest) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_add(dest, T{1}, __ATOMIC_RELAXED);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_dec(T* dest) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_sub(dest, T{1}, __ATOMIC_RELAXED);
+        }
+        /** @} */
+    } // Namespace atomics.
+} // Namespace hc.
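
For reference, a minimal usage sketch of the new hc::atomics entry points introduced above (a hypothetical host-side example, not part of the patch; the name "hits" and the kernel lambda are illustrative only):

    // Bump a shared counter from every work-item; atomic_fetch_add returns
    // the previous value, so concurrent updates are never lost.
    #include <hc.hpp>

    int main()
    {
        hc::array_view<int, 1> hits{hc::extent<1>{1}}; // one shared counter
        hits[0] = 0;

        hc::parallel_for_each(hc::extent<1>{256}, [=](hc::index<1>) [[hc]] {
            hc::atomic_fetch_add(&hits[0], 1); // relaxed read-modify-write
        });
        hits.synchronize();

        return hits[0] == 256 ? 0 : 1;
    }

Because the functions are exported into namespace hc via the using-directive this patch adds to hc.hpp, they can be named either as hc::atomic_fetch_add or as hc::atomics::atomic_fetch_add; both resolve to the relaxed __atomic builtins defined in this file.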
\ No newline at end of file diff --git a/include/coordinate b/include/coordinate index 959e6d624e0..f811cf57fe4 100644 --- a/include/coordinate +++ b/include/coordinate @@ -299,15 +299,11 @@ private: template friend struct offset_helper; public: - __attribute__((annotate("__cxxamp_opencl_index"))) - void __cxxamp_opencl_index() restrict(amp,cpu) + void __cxxamp_opencl_index() restrict(amp, cpu) #if __KALMAR_ACCELERATOR__ == 1 { offset_helper>::set(*this); } -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - { - } #else ; #endif diff --git a/include/grid_launch.h b/include/grid_launch.h deleted file mode 100644 index f91d23341a3..00000000000 --- a/include/grid_launch.h +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once - -#include - -#include - -#define GRID_LAUNCH_VERSION 20 - -// Extern definitions -namespace hc{ -class completion_future; -class accelerator_view; -} - - -// 3 dim structure for groups and grids. -typedef struct gl_dim3 -{ - int x,y,z; - gl_dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {}; -} gl_dim3; - -typedef enum gl_barrier_bit { - barrier_bit_queue_default, - barrier_bit_none, - barrier_bit_wait, -} gl_barrier_bit; - - -// grid_launch_parm contains information used to launch the kernel. -typedef struct grid_launch_parm -{ - //! Grid dimensions - gl_dim3 grid_dim; - - //! Group dimensions - gl_dim3 group_dim;; - - //! Amount of dynamic group memory to use with the kernel launch. - //! This memory is in addition to the amount used statically in the kernel. - unsigned int dynamic_group_mem_bytes;; - - //! Control setting of barrier bit on per-packet basis: - //! See gl_barrier_bit description. - //! Placeholder, is not used to control packet dispatch yet - enum gl_barrier_bit barrier_bit; - - //! Value of packet fences to apply to launch. - //! The correspond to the value of bits 9:14 in the AQL packet, - //! see HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE and hsa_fence_scope_t. - //! Set to -1 for conservative defaults. - //! Placeholder, is not used to control packet dispatch yet - unsigned int launch_fence; - - //! Pointer to the accelerator_view where the kernel should execute. - //! If NULL, the default view on the default accelerator is used. - hc::accelerator_view *av; - - //! Pointe to the completion_future used to track the status of the command. - //! If NULL, the command does not write status. In this case, - //! synchronization can be enforced with queue-level waits or - //! waiting on younger commands. 
-  hc::completion_future *cf;
-
-  grid_launch_parm() = default;
-} grid_launch_parm;
-
-
-extern void init_grid_launch(grid_launch_parm *gl);
diff --git a/include/grid_launch.hpp b/include/grid_launch.hpp
deleted file mode 100644
index 04ce7e03664..00000000000
--- a/include/grid_launch.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#pragma once
-
-#include "grid_launch.h"
-#include "hc.hpp"
-
-class grid_launch_parm_cxx : public grid_launch_parm
-{
-public:
-  grid_launch_parm_cxx() = default;
-
-  // customized serialization: don't need av and cf in kernel
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(int), &grid_dim.x);
-    s.Append(sizeof(int), &grid_dim.y);
-    s.Append(sizeof(int), &grid_dim.z);
-    s.Append(sizeof(int), &group_dim.x);
-    s.Append(sizeof(int), &group_dim.y);
-    s.Append(sizeof(int), &group_dim.z);
-  }
-
-  __attribute__((annotate("user_deserialize")))
-  grid_launch_parm_cxx(int grid_dim_x, int grid_dim_y, int grid_dim_z,
-                       int group_dim_x, int group_dim_y, int group_dim_z) {
-    grid_dim.x = grid_dim_x;
-    grid_dim.y = grid_dim_y;
-    grid_dim.z = grid_dim_z;
-    group_dim.x = group_dim_x;
-    group_dim.y = group_dim_y;
-    group_dim.z = group_dim_z;
-  }
-};
-
-
-extern inline void grid_launch_init(grid_launch_parm *lp) {
-  lp->grid_dim.x = lp->grid_dim.y = lp->grid_dim.z = 1;
-
-  lp->group_dim.x = lp->group_dim.y = lp->group_dim.z = 1;
-
-  lp->dynamic_group_mem_bytes = 0;
-
-  lp->barrier_bit = barrier_bit_queue_default;
-  lp->launch_fence = -1;
-
-  // TODO - set to NULL?
-  static hc::accelerator_view av = hc::accelerator().get_default_view();
-  lp->av = &av;
-  lp->cf = NULL;
-}
-
diff --git a/include/hc.hpp b/include/hc.hpp
index 4f4ee5f0c79..b80944897df 100644
--- a/include/hc.hpp
+++ b/include/hc.hpp
@@ -12,6 +12,7 @@
 
 #pragma once
 
+#include "atomics.hpp"
 #include "hc_defines.h"
 #include "kalmar_exception.h"
 #include "kalmar_index.h"
@@ -21,10 +22,12 @@
 #include "kalmar_buffer.h"
 #include "kalmar_math.h"
 
-#include "hsa_atomic.h"
-#include "kalmar_cpu_launch.h"
 #include "hcc_features.hpp"
 
+#include
+#include
+#include
+
 #ifndef __HC__
 # define __HC__ [[hc]]
 #endif
@@ -47,6 +50,7 @@ namespace hc {
 
 class AmPointerInfo;
 
+using namespace atomics;
 using namespace Kalmar::enums;
 using namespace Kalmar::CLAMP;
 
@@ -630,57 +634,47 @@ class accelerator_view {
     friend class accelerator;
     template <typename T, int N> friend class array;
     template <typename T, int N> friend class array_view;
-
-    template <typename Kernel, int dim_ext> friend
-        void* Kalmar::mcw_cxxamp_get_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&, const Kernel&);
-    template <typename Kernel, int dim_ext> friend
-        void Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&, void*, size_t);
-    template <typename Kernel, int dim_ext> friend
-        std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&, void*, size_t);
-    template <typename Kernel, int dim_ext> friend
-        void Kalmar::mcw_cxxamp_launch_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
-    template <typename Kernel, int dim_ext> friend
-        std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_launch_kernel_async(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
-
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    template <typename Kernel, int N> friend
-        completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>&, Kernel const&, extent<N> const&);
-#endif
+
+    template<typename Domain, typename Kernel>
+    friend
+    void Kalmar::launch_kernel_with_dynamic_group_memory(
+        const std::shared_ptr<Kalmar::KalmarQueue>&,
+        const Domain&,
+        const Kernel&);
+    template<typename Domain, typename Kernel>
+    friend
+    std::shared_ptr<Kalmar::KalmarAsyncOp>
+    Kalmar::launch_kernel_with_dynamic_group_memory_async(
+        const std::shared_ptr<Kalmar::KalmarQueue>&,
+        const Domain&,
+        const Kernel&);
+    template<typename Domain, typename Kernel>
+    friend
+    void Kalmar::launch_kernel(
+        const std::shared_ptr<Kalmar::KalmarQueue>&,
+        const Domain&,
+        const Kernel&);
+    template<typename Domain, typename Kernel>
+    friend
+    std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::launch_kernel_async(
+        const std::shared_ptr<Kalmar::KalmarQueue>&,
+        const Domain&,
+        const Kernel&);
 
     // non-tiled parallel_for_each
     // generic version
-    template <int N, typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
-
-    // 1D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
-
-    // 2D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
-
-    // 3D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
-
-    // tiled parallel_for_each, 3D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
-
-    // tiled parallel_for_each, 2D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
-
-    // tiled parallel_for_each, 1D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
+    template<int n, typename Kernel>
+    friend
+    completion_future parallel_for_each(
+        const accelerator_view&, const extent<n>&, const Kernel&);
 
+    // tiled parallel_for_each
+    // generic version
+    template<int n, typename Kernel>
+    friend
+    completion_future parallel_for_each(
+        const accelerator_view&, const tiled_extent<n>&, const Kernel&);
 
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-public:
-#endif
-    __attribute__((annotate("user_deserialize")))
     accelerator_view() __CPU__ __HC__ {
 #if __KALMAR_ACCELERATOR__ != 1
         throw runtime_exception("errorMsg_throw", 0);
@@ -1391,35 +1385,20 @@ class completion_future {
         : __amp_future(__future), __thread_then(nullptr), __asyncOp(nullptr) {}
 
     friend class Kalmar::HSAQueue;
-
+
     // non-tiled parallel_for_each
     // generic version
-    template <int N, typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
-
-    // 1D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
-
-    // 2D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
-
-    // 3D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
-
-    // tiled parallel_for_each, 3D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
+    template<int n, typename Kernel>
+    friend
+    completion_future parallel_for_each(
+        const accelerator_view&, const extent<n>&, const Kernel&);
 
-    // tiled parallel_for_each, 2D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
-
-    // tiled parallel_for_each, 1D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
+    // tiled parallel_for_each
+    // generic version
+    template<int n, typename Kernel>
+    friend
+    completion_future parallel_for_each(
+        const accelerator_view&, const tiled_extent<n>&, const Kernel&);
 
     // copy_async
     template <typename T, int N> friend
@@ -3252,41 +3231,6 @@ extern "C" void* get_group_segment_base_pointer() __HC__;
  */
 extern "C" void*
get_dynamic_group_segment_base_pointer() __HC__; -// ------------------------------------------------------------------------ -// utility class for tiled_barrier -// ------------------------------------------------------------------------ - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -template -void bar_wrapper(Ker *f, Ti *t) -{ - (*f)(*t); -} - -struct barrier_t { - std::unique_ptr ctx; - int idx; - barrier_t (int a) : - ctx(new ucontext_t[a + 1]) {} - template - void setctx(int x, char *stack, Ker& f, Ti* tidx, int S) { - getcontext(&ctx[x]); - ctx[x].uc_stack.ss_sp = stack; - ctx[x].uc_stack.ss_size = S; - ctx[x].uc_link = &ctx[x - 1]; - makecontext(&ctx[x], (void (*)(void))bar_wrapper, 2, &f, tidx); - } - void swap(int a, int b) { - swapcontext(&ctx[a], &ctx[b]); - } - void wait() __HC__ { - --idx; - swapcontext(&ctx[idx + 1], &ctx[idx]); - } -}; -#endif - - // ------------------------------------------------------------------------ // tiled_barrier // ------------------------------------------------------------------------ @@ -3300,20 +3244,6 @@ struct barrier_t { */ class tile_barrier { public: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - using pb_t = std::shared_ptr; - tile_barrier(pb_t pb) : pbar(pb) {} - - /** - * Copy constructor. Constructs a new tile_barrier from the supplied - * argument "other". - * - * @param[in] other An object of type tile_barrier from which to initialize - * this. - */ - tile_barrier(const tile_barrier& other) __CPU__ __HC__ : pbar(other.pbar) {} -#else - /** * Copy constructor. Constructs a new tile_barrier from the supplied * argument "other". @@ -3322,7 +3252,6 @@ class tile_barrier { * this. */ tile_barrier(const tile_barrier& other) __CPU__ __HC__ {} -#endif /** * Blocks execution of all threads in the thread tile until all threads in @@ -3335,11 +3264,7 @@ class tile_barrier { * wait_with_all_memory_fence(). */ void wait() const __HC__ { -#if __KALMAR_ACCELERATOR__ == 1 wait_with_all_memory_fence(); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif } /** @@ -3352,11 +3277,7 @@ class tile_barrier { * before hitting the barrier. This is identical to wait(). */ void wait_with_all_memory_fence() const __HC__ { -#if __KALMAR_ACCELERATOR__ == 1 amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif } /** @@ -3369,11 +3290,7 @@ class tile_barrier { * after the barrier are executed before hitting the barrier. */ void wait_with_global_memory_fence() const __HC__ { -#if __KALMAR_ACCELERATOR__ == 1 amp_barrier(CLK_GLOBAL_MEM_FENCE); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif } /** @@ -3387,20 +3304,11 @@ class tile_barrier { * hitting the barrier. 
*/ void wait_with_tile_static_memory_fence() const __HC__ { -#if __KALMAR_ACCELERATOR__ == 1 amp_barrier(CLK_LOCAL_MEM_FENCE); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif } private: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - tile_barrier() __CPU__ __HC__ = default; - pb_t pbar; -#else - tile_barrier() __HC__ {} -#endif + tile_barrier() __HC__ = default; template friend class tiled_index; @@ -3508,14 +3416,7 @@ class tiled_index { tiled_index(const index<3>& g) __CPU__ __HC__ : global(g) {} private: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index(int a0, int a1, int a2, int b0, int b1, int b2, int c0, int c1, int c2, tile_barrier& pb, int D0, int D1, int D2) __CPU__ __HC__ - : global(a2, a1, a0), local(b2, b1, b0), tile(c2, c1, c0), tile_origin(a2 - b2, a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1, D2) {} -#endif - - __attribute__((annotate("__cxxamp_opencl_index"))) -#if __KALMAR_ACCELERATOR__ == 1 - __attribute__((always_inline)) tiled_index() __HC__ + tiled_index() __HC__ : global(index<3>(amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0))), local(index<3>(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0))), tile(index<3>(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0))), @@ -3523,20 +3424,12 @@ class tiled_index { amp_get_global_id(1) - amp_get_local_id(1), amp_get_global_id(0) - amp_get_local_id(0))), tile_dim(index<3>(amp_get_local_size(2), amp_get_local_size(1), amp_get_local_size(0))) -#elif __KALMAR__ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index() __CPU__ __HC__ -#else - __attribute__((always_inline)) tiled_index() __HC__ -#endif // __KALMAR_ACCELERATOR__ {} template friend completion_future parallel_for_each(const accelerator_view&, const tiled_extent&, const Kernel&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - template friend - void partitioned_task_tile_3D(K const&, tiled_extent<3> const&, int); -#endif + friend + struct Kalmar::Indexer; }; @@ -3609,33 +3502,18 @@ class tiled_index<1> { tiled_index(const index<1>& g) __CPU__ __HC__ : global(g) {} private: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index(int a, int b, int c, tile_barrier& pb, int D0) __CPU__ __HC__ - : global(a), local(b), tile(c), tile_origin(a - b), barrier(pb), tile_dim(D0) {} -#endif - - __attribute__((annotate("__cxxamp_opencl_index"))) -#if __KALMAR_ACCELERATOR__ == 1 - __attribute__((always_inline)) tiled_index() __HC__ + tiled_index() __HC__ : global(index<1>(amp_get_global_id(0))), local(index<1>(amp_get_local_id(0))), tile(index<1>(amp_get_group_id(0))), tile_origin(index<1>(amp_get_global_id(0) - amp_get_local_id(0))), tile_dim(index<1>(amp_get_local_size(0))) -#elif __KALMAR__ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index() __CPU__ __HC__ -#else - __attribute__((always_inline)) tiled_index() __HC__ -#endif // __KALMAR_ACCELERATOR__ {} template friend completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - template friend - void partitioned_task_tile_1D(K const&, tiled_extent<1> const&, int); -#endif + friend + struct Kalmar::Indexer; }; /** @@ -3707,227 +3585,21 @@ class tiled_index<2> { tiled_index(const index<2>& g) __CPU__ __HC__ : global(g) {} private: -#if __KALMAR_ACCELERATOR__ == 2 || 
__KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index(int a0, int a1, int b0, int b1, int c0, int c1, tile_barrier& pb, int D0, int D1) __CPU__ __HC__ - : global(a1, a0), local(b1, b0), tile(c1, c0), tile_origin(a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1) {} -#endif - - __attribute__((annotate("__cxxamp_opencl_index"))) -#if __KALMAR_ACCELERATOR__ == 1 - __attribute__((always_inline)) tiled_index() __HC__ + tiled_index() __HC__ : global(index<2>(amp_get_global_id(1), amp_get_global_id(0))), local(index<2>(amp_get_local_id(1), amp_get_local_id(0))), tile(index<2>(amp_get_group_id(1), amp_get_group_id(0))), tile_origin(index<2>(amp_get_global_id(1) - amp_get_local_id(1), amp_get_global_id(0) - amp_get_local_id(0))), tile_dim(index<2>(amp_get_local_size(1), amp_get_local_size(0))) -#elif __KALMAR__ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index() __CPU__ __HC__ -#else - __attribute__((always_inline)) tiled_index() __HC__ -#endif // __KALMAR_ACCELERATOR__ {} template friend completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - template friend - void partitioned_task_tile_2D(K const&, tiled_extent<2> const&, int); -#endif -}; - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -#define SSIZE 1024 * 10 -template -struct cpu_helper -{ - static inline void call(const Kernel& k, index& idx, const extent& ext) __CPU__ __HC__ { - int i; - for (i = 0; i < ext[N]; ++i) { - idx[N] = i; - cpu_helper::call(k, idx, ext); - } - } -}; -template -struct cpu_helper -{ - static inline void call(const Kernel& k, const index& idx, const extent& ext) __CPU__ __HC__ { - (const_cast(k))(idx); - } + friend + struct Kalmar::Indexer; }; -template -void partitioned_task(const Kernel& ker, const extent& ext, int part) { - index idx; - int start = ext[0] * part / Kalmar::NTHREAD; - int end = ext[0] * (part + 1) / Kalmar::NTHREAD; - for (int i = start; i < end; i++) { - idx[0] = i; - cpu_helper<1, Kernel, N>::call(ker, idx, ext); - } -} - -template -void partitioned_task_tile_1D(Kernel const& f, tiled_extent<1> const& ext, int part) { - int D0 = ext.tile_dim[0]; - int start = (ext[0] / D0) * part / Kalmar::NTHREAD; - int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD; - int stride = end - start; - if (stride == 0) - return; - char *stk = new char[D0 * SSIZE]; - tiled_index<1> *tidx = new tiled_index<1>[D0]; - tile_barrier::pb_t hc_bar = std::make_shared(D0); - tile_barrier tbar(hc_bar); - for (int tx = start; tx < end; tx++) { - int id = 0; - char *sp = stk; - tiled_index<1> *tip = tidx; - for (int x = 0; x < D0; x++) { - new (tip) tiled_index<1>(tx * D0 + x, x, tx, tbar, D0); - hc_bar->setctx(++id, sp, f, tip, SSIZE); - sp += SSIZE; - ++tip; - } - hc_bar->idx = 0; - while (hc_bar->idx == 0) { - hc_bar->idx = id; - hc_bar->swap(0, id); - } - } - delete [] stk; - delete [] tidx; -} - -template -void partitioned_task_tile_2D(Kernel const& f, tiled_extent<2> const& ext, int part) { - int D0 = ext.tile_dim[0]; - int D1 = ext.tile_dim[1]; - int start = (ext[0] / D0) * part / Kalmar::NTHREAD; - int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD; - int stride = end - start; - if (stride == 0) - return; - char *stk = new char[D1 * D0 * SSIZE]; - tiled_index<2> *tidx = new tiled_index<2>[D0 * D1]; - tile_barrier::pb_t hc_bar = std::make_shared(D0 * D1); - tile_barrier tbar(hc_bar); - - for (int tx = 0; tx < ext[1] / D1; tx++) - for (int ty = start; ty < end; 
ty++) { - int id = 0; - char *sp = stk; - tiled_index<2> *tip = tidx; - for (int x = 0; x < D1; x++) - for (int y = 0; y < D0; y++) { - new (tip) tiled_index<2>(D1 * tx + x, D0 * ty + y, x, y, tx, ty, tbar, D0, D1); - hc_bar->setctx(++id, sp, f, tip, SSIZE); - ++tip; - sp += SSIZE; - } - hc_bar->idx = 0; - while (hc_bar->idx == 0) { - hc_bar->idx = id; - hc_bar->swap(0, id); - } - } - delete [] stk; - delete [] tidx; -} - -template -void partitioned_task_tile_3D(Kernel const& f, tiled_extent<3> const& ext, int part) { - int D0 = ext.tile_dim[0]; - int D1 = ext.tile_dim[1]; - int D2 = ext.tile_dim[2]; - int start = (ext[0] / D0) * part / Kalmar::NTHREAD; - int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD; - int stride = end - start; - if (stride == 0) - return; - char *stk = new char[D2 * D1 * D0 * SSIZE]; - tiled_index<3> *tidx = new tiled_index<3>[D0 * D1 * D2]; - tile_barrier::pb_t hc_bar = std::make_shared(D0 * D1 * D2); - tile_barrier tbar(hc_bar); - - for (int i = 0; i < ext[2] / D2; i++) - for (int j = 0; j < ext[1] / D1; j++) - for(int k = start; k < end; k++) { - int id = 0; - char *sp = stk; - tiled_index<3> *tip = tidx; - for (int x = 0; x < D2; x++) - for (int y = 0; y < D1; y++) - for (int z = 0; z < D0; z++) { - new (tip) tiled_index<3>(D2 * i + x, - D1 * j + y, - D0 * k + z, - x, y, z, i, j, k, tbar, D0, D1, D2); - hc_bar->setctx(++id, sp, f, tip, SSIZE); - ++tip; - sp += SSIZE; - } - hc_bar->idx = 0; - while (hc_bar->idx == 0) { - hc_bar->idx = id; - hc_bar->swap(0, id); - } - } - delete [] stk; - delete [] tidx; -} - -template -completion_future launch_cpu_task_async(const std::shared_ptr& pQueue, Kernel const& f, - extent const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task, std::cref(f), std::cref(compute_domain), i); - // FIXME wrap the above operation into the completion_future object - return completion_future(); -} - -template -completion_future launch_cpu_task_async(const std::shared_ptr& pQueue, Kernel const& f, - tiled_extent<1> const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task_tile_1D, - std::cref(f), std::cref(compute_domain), i); - // FIXME wrap the above operation into the completion_future object - return completion_future(); -} - -template -completion_future launch_cpu_task_async(const std::shared_ptr& pQueue, Kernel const& f, - tiled_extent<2> const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task_tile_2D, - std::cref(f), std::cref(compute_domain), i); - // FIXME wrap the above operation into the completion_future object - return completion_future(); -} - -template -completion_future launch_cpu_task_async(const std::shared_ptr& pQueue, Kernel const& f, - tiled_extent<3> const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task_tile_3D, - std::cref(f), std::cref(compute_domain), i); - // FIXME wrap the above operation into the completion_future object - return completion_future(); -} - -#endif - // ------------------------------------------------------------------------ // utility helper classes for array_view // ------------------------------------------------------------------------ @@ -4268,7 +3940,7 @@ class array { * * @param[in] ext The extent in each dimension of this 
array. */ - explicit array(const extent& ext) + explicit array(const hc::extent& ext) : array(ext, accelerator(L"default").get_default_view()) {} /** @{ */ @@ -4302,10 +3974,10 @@ class array { * @param[in] srcEnd An ending iterator into the source container. */ template - array(const extent& ext, InputIter srcBegin) + array(const hc::extent& ext, InputIter srcBegin) : array(ext, srcBegin, accelerator(L"default").get_default_view()) {} template - array(const extent& ext, InputIter srcBegin, InputIter srcEnd) + array(const hc::extent& ext, InputIter srcBegin, InputIter srcEnd) : array(ext, srcBegin, srcEnd, accelerator(L"default").get_default_view()) {} /** @} */ @@ -4378,7 +4050,7 @@ class array { * this array. * @param[in] access_type The type of CPU access desired for this array. */ - array(const extent& ext, accelerator_view av, access_type cpu_access_type = access_type_auto) + array(const hc::extent& ext, accelerator_view av, access_type cpu_access_type = access_type_auto) #if __KALMAR_ACCELERATOR__ == 1 : m_device(ext.size()), extent(ext) {} #else @@ -4396,7 +4068,7 @@ class array { explicit array(int e0, int e1, int e2, void* accelerator_pointer) : array(hc::extent(e0, e1, e2), accelerator(L"default").get_default_view(), accelerator_pointer) {} - explicit array(const extent& ext, void* accelerator_pointer) + explicit array(const hc::extent& ext, void* accelerator_pointer) : array(ext, accelerator(L"default").get_default_view(), accelerator_pointer) {} /** @} */ @@ -4464,11 +4136,11 @@ class array { * @param[in] access_type The type of CPU access desired for this array. */ template - array(const extent& ext, InputIter srcBegin, accelerator_view av, + array(const hc::extent& ext, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) : array(ext, av, cpu_access_type) { copy(srcBegin, *this); } template - array(const extent& ext, InputIter srcBegin, InputIter srcEnd, + array(const hc::extent& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto) : array(ext, av, cpu_access_type) { if (ext.size() < std::distance(srcBegin, srcEnd)) @@ -4522,10 +4194,10 @@ class array { */ template array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(extent(e0), srcBegin, av, cpu_access_type) {} + : array(hc::extent(e0), srcBegin, av, cpu_access_type) {} template array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(extent(e0), srcBegin, srcEnd, av, cpu_access_type) {} + : array(hc::extent(e0), srcBegin, srcEnd, av, cpu_access_type) {} template array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) : array(hc::extent(e0, e1), srcBegin, av, cpu_access_type) {} @@ -4553,7 +4225,7 @@ class array { * @param[in] associated_av An accelerator_view object which specifies a * target device accelerator. */ - array(const extent& ext, accelerator_view av, accelerator_view associated_av) + array(const hc::extent& ext, accelerator_view av, accelerator_view associated_av) #if __KALMAR_ACCELERATOR__ == 1 : m_device(ext.size()), extent(ext) {} #else @@ -4597,10 +4269,10 @@ class array { * target device accelerator. 
*/ template - array(const extent& ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) + array(const hc::extent& ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) : array(ext, av, associated_av) { copy(srcBegin, *this); } template - array(const extent& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) + array(const hc::extent& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) : array(ext, av, associated_av) { if (ext.size() < std::distance(srcBegin, srcEnd)) throw runtime_exception("errorMsg_throw", 0); @@ -4645,10 +4317,10 @@ class array { */ template array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(extent(e0), srcBegin, av, associated_av) {} + : array(hc::extent(e0), srcBegin, av, associated_av) {} template array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(extent(e0), srcBegin, srcEnd, av, associated_av) {} + : array(hc::extent(e0), srcBegin, srcEnd, av, associated_av) {} template array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) : array(hc::extent(e0, e1), srcBegin, av, associated_av) {} @@ -4667,7 +4339,7 @@ class array { /** * Access the extent that defines the shape of this array. */ - extent get_extent() const __CPU__ __HC__ { return extent; } + hc::extent get_extent() const __CPU__ __HC__ { return extent; } /** * This property returns the accelerator_view representing the location @@ -4792,7 +4464,7 @@ class array { */ operator std::vector() const { std::vector vec(extent.size()); - copy(*this, vec.data()); + hc::copy(*this, vec.data()); return std::move(vec); } @@ -4934,7 +4606,7 @@ class array { * @return Returns a subsection of the source array at specified origin, * and with the specified extent. */ - array_view section(const index& origin, const extent& ext) __CPU__ __HC__ { + array_view section(const index& origin, const hc::extent& ext) __CPU__ __HC__ { #if __KALMAR_ACCELERATOR__ != 1 if ( !Kalmar::amp_helper, hc::extent>::contains(origin, ext ,this->extent) ) throw runtime_exception("errorMsg_throw", 0); @@ -4942,7 +4614,7 @@ class array { array_view av(*this); return av.section(origin, ext); } - array_view section(const index& origin, const extent& ext) const __CPU__ __HC__ { + array_view section(const index& origin, const hc::extent& ext) const __CPU__ __HC__ { array_view av(*this); return av.section(origin, ext); } @@ -4972,11 +4644,11 @@ class array { /** * Equivalent to "section(index(), ext)". */ - array_view section(const extent& ext) __CPU__ __HC__ { + array_view section(const hc::extent& ext) __CPU__ __HC__ { array_view av(*this); return av.section(ext); } - array_view section(const extent& ext) const __CPU__ __HC__ { + array_view section(const hc::extent& ext) const __CPU__ __HC__ { array_view av(*this); return av.section(ext); } @@ -5082,7 +4754,7 @@ class array { * to K from N. 
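 *
 * Example (illustrative sketch; the element type and extents are
 * hypothetical):
 * @code
 * hc::array<float, 1> a(hc::extent<1>(64));
 * hc::array_view<float, 2> grid = a.view_as<2>(hc::extent<2>(8, 8));
 * @endcode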
*/ template array_view - view_as(const extent& viewExtent) __CPU__ __HC__ { + view_as(const hc::extent& viewExtent) __CPU__ __HC__ { #if __KALMAR_ACCELERATOR__ != 1 if( viewExtent.size() > extent.size()) throw runtime_exception("errorMsg_throw", 0); @@ -5091,7 +4763,7 @@ class array { return av; } template array_view - view_as(const extent& viewExtent) const __CPU__ __HC__ { + view_as(const hc::extent& viewExtent) const __CPU__ __HC__ { #if __KALMAR_ACCELERATOR__ != 1 if( viewExtent.size() > extent.size()) throw runtime_exception("errorMsg_throw", 0); @@ -5102,7 +4774,7 @@ class array { /** @} */ - ~array() {} + ~array() = default; // FIXME: functions below may be considered to move to private const acc_buffer_t& internal() const __CPU__ __HC__ { return m_device; } @@ -5164,7 +4836,7 @@ class array_view * @param[in] src An array which contains the data that this array_view is * bound to. */ - array_view(array& src) __CPU__ __HC__ + array_view(hc::array& src) __CPU__ __HC__ : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {} // FIXME: following interfaces were not implemented yet @@ -5184,7 +4856,7 @@ class array_view * @param[in] extent The extent of this array_view. */ template ::value>::type> - array_view(const extent& extent, Container& src) + array_view(const hc::extent& extent, Container& src) : array_view(extent, src.data()) { static_assert( std::is_same::value, "container element type and array view element type must match"); } @@ -5198,7 +4870,7 @@ class array_view * size of extent, the behavior is undefined. * @param[in] ext The extent of this array_view. */ - array_view(const extent& ext, value_type* src) __CPU__ __HC__ + array_view(const hc::extent& ext, value_type* src) __CPU__ __HC__ #if __KALMAR_ACCELERATOR__ == 1 : cache((T *)(src)), extent(ext), extent_base(ext), offset(0) {} #else @@ -5215,7 +4887,7 @@ class array_view * * @param[in] ext The extent of this array_view. */ - explicit array_view(const extent& ext) + explicit array_view(const hc::extent& ext) : cache(ext.size()), extent(ext), extent_base(ext), offset(0) {} /** @@ -5282,7 +4954,7 @@ class array_view /** * Access the extent that defines the shape of this array_view. */ - extent get_extent() const __CPU__ __HC__ { return extent; } + hc::extent get_extent() const __CPU__ __HC__ { return extent; } /** * Access the accelerator_view where the data source of the array_view is @@ -5607,7 +5279,7 @@ class array_view * and with the specified extent. */ array_view section(const index& idx, - const extent& ext) const __CPU__ __HC__ { + const hc::extent& ext) const __CPU__ __HC__ { #if __KALMAR_ACCELERATOR__ != 1 if ( !Kalmar::amp_helper, hc::extent>::contains(idx, ext,this->extent ) ) throw runtime_exception("errorMsg_throw", 0); @@ -5628,7 +5300,7 @@ class array_view /** * Equivalent to "section(index(), ext)". */ - array_view section(const extent& ext) const __CPU__ __HC__ { + array_view section(const hc::extent& ext) const __CPU__ __HC__ { index idx; return section(idx, ext); } @@ -5697,7 +5369,7 @@ class array_view * changed to K from 1. 
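 *
 * Example (illustrative sketch; note the rank-1 restriction enforced by the
 * static_assert below — names and sizes are assumptions):
 * @code
 * std::vector<float> host(64);
 * hc::array_view<float, 1> flat(hc::extent<1>(64), host.data());
 * hc::array_view<float, 2> grid = flat.view_as<2>(hc::extent<2>(8, 8));
 * @endcode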
*/ template - array_view view_as(extent viewExtent) const __CPU__ __HC__ { + array_view view_as(hc::extent viewExtent) const __CPU__ __HC__ { static_assert(N == 1, "view_as is only permissible on array views of rank 1"); #if __KALMAR_ACCELERATOR__ != 1 if ( viewExtent.size() > extent.size()) @@ -5707,7 +5379,7 @@ class array_view return av; } - ~array_view() __CPU__ __HC__ {} + ~array_view() __CPU__ __HC__ = default; // FIXME: the following functions could be considered to move to private const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; } @@ -5834,7 +5506,7 @@ class array_view * size of extent, the behavior is undefined. * @param[in] ext The extent of this array_view. */ - array_view(const extent& ext, const value_type* src) __CPU__ __HC__ + array_view(const hc::extent& ext, const value_type* src) __CPU__ __HC__ #if __KALMAR_ACCELERATOR__ == 1 : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {} #else @@ -5901,7 +5573,7 @@ class array_view /** * Access the extent that defines the shape of this array_view. */ - extent get_extent() const __CPU__ __HC__ { return extent; } + hc::extent get_extent() const __CPU__ __HC__ { return extent; } /** * Access the accelerator_view where the data source of the array_view is @@ -6192,7 +5864,7 @@ class array_view * and with the specified extent. */ array_view section(const index& idx, - const extent& ext) const __CPU__ __HC__ { + const hc::extent& ext) const __CPU__ __HC__ { array_view av(cache, ext, extent_base, idx + index_base, offset); return av; } @@ -6209,7 +5881,7 @@ class array_view /** * Equivalent to "section(index(), ext)". */ - array_view section(const extent& ext) const __CPU__ __HC__ { + array_view section(const hc::extent& ext) const __CPU__ __HC__ { index idx; return section(idx, ext); } @@ -6262,7 +5934,7 @@ class array_view int size = extent.size() * sizeof(T) / sizeof(ElementType); using buffer_type = typename array_view::acc_buffer_t; array_view av(buffer_type(cache), - extent<1>(size), + hc::extent<1>(size), (offset + index_base[0])* sizeof(T) / sizeof(ElementType)); return av; } @@ -6276,7 +5948,7 @@ class array_view * changed to K from 1. 
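 *
 * Example (illustrative sketch; the read-only specialization wraps host data
 * without allowing writes — names are assumptions):
 * @code
 * const float host[16] = {};
 * hc::array_view<const float, 1> ro(hc::extent<1>(16), host);
 * auto quad = ro.view_as<2>(hc::extent<2>(4, 4));
 * @endcode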
*/ template - array_view view_as(extent viewExtent) const __CPU__ __HC__ { + array_view view_as(hc::extent viewExtent) const __CPU__ __HC__ { static_assert(N == 1, "view_as is only permissible on array views of rank 1"); #if __KALMAR_ACCELERATOR__ != 1 if ( viewExtent.size() > extent.size()) @@ -6286,7 +5958,7 @@ class array_view return av; } - ~array_view() __CPU__ __HC__ {} + ~array_view() __CPU__ __HC__ = default; // FIXME: the following functions may be considered to move to private const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; } @@ -6305,7 +5977,7 @@ class array_view bool is_flat(const array_view&) noexcept; template friend void copy(const array&, const array_view&); - template + template friend void copy(InputIter, InputIter, const array_view&); template friend void copy(const array_view&, array&); @@ -6321,7 +5993,7 @@ class array_view // used by section and projection array_view(const acc_buffer_t& cache, const hc::extent& ext_now, - const extent& ext_b, + const hc::extent& ext_b, const index& idx_b, int off) __CPU__ __HC__ : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), offset(off) {} @@ -6734,7 +6406,7 @@ template void copy(InputIter srcBegin, array& dest) { InputIter srcEnd = srcBegin; std::advance(srcEnd, dest.get_extent().size()); - copy(srcBegin, srcEnd, dest); + hc::copy(srcBegin, srcEnd, dest); } /** @} */ @@ -6821,7 +6493,6 @@ void copy(const array_view &src, OutputIter destBegin) { // utility function for copy_async // ------------------------------------------------------------------------ - // ------------------------------------------------------------------------ // copy_async // ------------------------------------------------------------------------ @@ -7007,850 +6678,71 @@ completion_future copy_async(const array_view& src, const array& des return completion_future(fut.share()); } -// ------------------------------------------------------------------------ -// atomic functions -// ------------------------------------------------------------------------ - -/** @{ */ -/** - * Atomically read the value stored in dest , replace it with the value given - * in val and return the old value to the caller. This function provides - * overloads for int , unsigned int and float parameters. - * - * @param[out] dest A pointer to the location which needs to be atomically - * modified. The location may reside within a - * hc::array or hc::array_view or within a - * tile_static variable. - * @param[in] val The new value to be stored in the location pointed to be dest - * @return These functions return the old value which was previously stored at - * dest, and that was atomically replaced. These functions always - * succeed. 
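 *
 * Example (illustrative sketch of a device-side slot claim built on these
 * overloads; the buffer and the zero "free" sentinel are assumptions):
 * @code
 * // Returns the previous owner of the slot; 0 acts as the "free" sentinel.
 * unsigned int claim(unsigned int* slot, unsigned int id) [[hc]] {
 *     return hc::atomic_exchange(slot, id);
 * }
 * @endcode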
- */ -#if __KALMAR_ACCELERATOR__ == 1 -extern "C" unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val) __HC__; -extern "C" int atomic_exchange_int(int *p, int val) __HC__; -extern "C" float atomic_exchange_float(float *p, float val) __HC__; -extern "C" uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val) __HC__; - -static inline unsigned int atomic_exchange(unsigned int * dest, unsigned int val) __CPU__ __HC__ { - return atomic_exchange_unsigned(dest, val); -} -static inline int atomic_exchange(int * dest, int val) __CPU__ __HC__ { - return atomic_exchange_int(dest, val); -} -static inline float atomic_exchange(float * dest, float val) __CPU__ __HC__ { - return atomic_exchange_float(dest, val); -} -static inline uint64_t atomic_exchange(uint64_t * dest, uint64_t val) __CPU__ __HC__ { - return atomic_exchange_uint64(dest, val); -} -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val); -int atomic_exchange_int(int *p, int val); -float atomic_exchange_float(float *p, float val); -uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val); - -static inline unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__ { - return atomic_exchange_unsigned(dest, val); -} -static inline int atomic_exchange(int *dest, int val) __CPU__ __HC__ { - return atomic_exchange_int(dest, val); -} -static inline float atomic_exchange(float *dest, float val) __CPU__ __HC__ { - return atomic_exchange_float(dest, val); -} -static inline uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__ { - return atomic_exchange_uint64(dest, val); -} -#else -extern unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__; -extern int atomic_exchange(int *dest, int val) __CPU__ __HC__; -extern float atomic_exchange(float *dest, float val) __CPU__ __HC__; -extern uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__; -#endif -/** @} */ - -/** @{ */ -/** - * These functions attempt to perform these three steps atomically: - * 1. Read the value stored in the location pointed to by dest - * 2. Compare the value read in the previous step with the value contained in - * the location pointed by expected_val - * 3. Carry the following operations depending on the result of the comparison - * of the previous step: - * a. If the values are identical, then the function tries to atomically - * change the value pointed by dest to the value in val. The function - * indicates by its return value whether this transformation has been - * successful or not. - * b. If the values are not identical, then the function stores the value - * read in step (1) into the location pointed to by expected_val, and - * returns false. - * - * @param[out] dest An pointer to the location which needs to be atomically - * modified. The location may reside within a - * concurrency::array or concurrency::array_view or within a - * tile_static variable. - * @param[out] expected_val A pointer to a local variable or function - * parameter. Upon calling the function, the location - * pointed by expected_val contains the value the - * caller expects dest to contain. Upon return from - * the function, expected_val will contain the most - * recent value read from dest. 
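 *
 * Example (illustrative sketch of the retry idiom these semantics enable;
 * the multiply operation is an arbitrary stand-in):
 * @code
 * void fetch_mul(unsigned int* dest, unsigned int m) [[hc]] {
 *     unsigned int expected = *dest;
 *     // On failure, expected is refreshed with the latest value read from
 *     // *dest, so the loop needs no explicit re-read before retrying.
 *     while (!hc::atomic_compare_exchange(dest, &expected, expected * m)) {}
 * }
 * @endcode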
- * @param[in] val The new value to be stored in the location pointed to be dest - * @return The return value indicates whether the function has been successful - * in atomically reading, comparing and modifying the contents of the - * memory location. - */ -#if __KALMAR_ACCELERATOR__ == 1 -extern "C" unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val) __HC__; -extern "C" int atomic_compare_exchange_int(int *dest, int expected_val, int val) __HC__; -extern "C" uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val) __HC__; - -static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ { - *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val); - return (*dest == val); -} -static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ { - *expected_val = atomic_compare_exchange_int(dest, *expected_val, val); - return (*dest == val); -} -static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ { - *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val); - return (*dest == val); -} -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val); -int atomic_compare_exchange_int(int *dest, int expected_val, int val); -uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val); - -static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ { - *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val); - return (*dest == val); -} -static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ { - *expected_val = atomic_compare_exchange_int(dest, *expected_val, val); - return (*dest == val); -} -static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ { - *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val); - return (*dest == val); -} -#else -extern bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__; -extern bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__; -extern bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__; -#endif -/** @} */ - -/** @{ */ -/** - * Atomically read the value stored in dest, apply the binary numerical - * operation specific to the function with the read value and val serving as - * input operands, and store the result back to the location pointed by dest. - * - * In terms of sequential semantics, the operation performed by any of the - * above function is described by the following piece of pseudo-code: - * - * *dest = *dest @f$\otimes@f$ val; - * - * Where the operation denoted by @f$\otimes@f$ is one of: addition - * (atomic_fetch_add), subtraction (atomic_fetch_sub), find maximum - * (atomic_fetch_max), find minimum (atomic_fetch_min), bit-wise AND - * (atomic_fetch_and), bit-wise OR (atomic_fetch_or), bit-wise XOR - * (atomic_fetch_xor). - * - * @param[out] dest An pointer to the location which needs to be atomically - * modified. 
The location may reside within a - * concurrency::array or concurrency::array_view or within a - * tile_static variable. - * @param[in] val The second operand which participates in the calculation of - * the binary operation whose result is stored into the - * location pointed to be dest. - * @return These functions return the old value which was previously stored at - * dest, and that was atomically replaced. These functions always - * succeed. - */ -#if __KALMAR_ACCELERATOR__ == 1 -extern "C" unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val) __HC__; -extern "C" int atomic_add_int(int *p, int val) __HC__; -extern "C" float atomic_add_float(float *p, float val) __HC__; -extern "C" uint64_t atomic_add_uint64(uint64_t *p, uint64_t val) __HC__; - -static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ { - return atomic_add_unsigned(x, y); -} -static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ { - return atomic_add_int(x, y); -} -static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ { - return atomic_add_float(x, y); -} -static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ { - return atomic_add_uint64(x, y); -} - -extern "C" unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val) __HC__; -extern "C" int atomic_sub_int(int *p, int val) __HC__; -extern "C" float atomic_sub_float(float *p, float val) __HC__; - -static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ { - return atomic_sub_unsigned(x, y); -} -static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ { - return atomic_sub_int(x, y); -} -static inline int atomic_fetch_sub(float *x, float y) __CPU__ __HC__ { - return atomic_sub_float(x, y); -} - -extern "C" unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val) __HC__; -extern "C" int atomic_and_int(int *p, int val) __HC__; -extern "C" uint64_t atomic_and_uint64(uint64_t *p, uint64_t val) __HC__; - -static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ { - return atomic_and_unsigned(x, y); -} -static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ { - return atomic_and_int(x, y); -} -static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ { - return atomic_and_uint64(x, y); -} - -extern "C" unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val) __HC__; -extern "C" int atomic_or_int(int *p, int val) __HC__; -extern "C" uint64_t atomic_or_uint64(uint64_t *p, uint64_t val) __HC__; - -static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ { - return atomic_or_unsigned(x, y); -} -static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ { - return atomic_or_int(x, y); -} -static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ { - return atomic_or_uint64(x, y); -} - -extern "C" unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val) __HC__; -extern "C" int atomic_xor_int(int *p, int val) __HC__; -extern "C" uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val) __HC__; - -static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ { - return atomic_xor_unsigned(x, y); -} -static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ { - return atomic_xor_int(x, y); -} -static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ { - return atomic_xor_uint64(x, y); -} -#elif __KALMAR_ACCELERATOR__ == 2 
|| __KALMAR_CPU__ == 2 -unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val); -int atomic_add_int(int *p, int val); -float atomic_add_float(float *p, float val); -uint64_t atomic_add_uint64(uint64_t *p, uint64_t val); - -static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ { - return atomic_add_unsigned(x, y); -} -static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ { - return atomic_add_int(x, y); -} -static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ { - return atomic_add_float(x, y); -} -static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ { - return atomic_add_uint64(x, y); -} - -unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val); -int atomic_sub_int(int *p, int val); -float atomic_sub_float(float *p, float val); - -static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ { - return atomic_sub_unsigned(x, y); -} -static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ { - return atomic_sub_int(x, y); -} -static inline float atomic_fetch_sub(float *x, float y) __CPU__ __HC__ { - return atomic_sub_float(x, y); -} - -unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val); -int atomic_and_int(int *p, int val); -uint64_t atomic_and_uint64(uint64_t *p, uint64_t val); - -static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ { - return atomic_and_unsigned(x, y); -} -static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ { - return atomic_and_int(x, y); -} -static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ { - return atomic_and_uint64(x, y); -} - -unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val); -int atomic_or_int(int *p, int val); -uint64_t atomic_or_uint64(uint64_t *p, uint64_t val); - -static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ { - return atomic_or_unsigned(x, y); -} -static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ { - return atomic_or_int(x, y); -} -static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ { - return atomic_or_uint64(x, y); -} - -unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val); -int atomic_xor_int(int *p, int val); -uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val); - -static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ { - return atomic_xor_unsigned(x, y); -} -static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ { - return atomic_xor_int(x, y); -} -static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ { - return atomic_xor_uint64(x, y); -} -#else -extern unsigned atomic_fetch_add(unsigned *x, unsigned y) __CPU__ __HC__; -extern int atomic_fetch_add(int *x, int y) __CPU__ __HC__; -extern float atomic_fetch_add(float *x, float y) __CPU__ __HC__; -extern uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__; - -extern unsigned atomic_fetch_sub(unsigned *x, unsigned y) __CPU__ __HC__; -extern int atomic_fetch_sub(int *x, int y) __CPU__ __HC__; -extern float atomic_fetch_sub(float *x, float y) __CPU__ __HC__; - -extern unsigned atomic_fetch_and(unsigned *x, unsigned y) __CPU__ __HC__; -extern int atomic_fetch_and(int *x, int y) __CPU__ __HC__; -extern uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__; - -extern unsigned atomic_fetch_or(unsigned *x, unsigned y) __CPU__ __HC__; -extern int 
atomic_fetch_or(int *x, int y) __CPU__ __HC__; -extern uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__; - -extern unsigned atomic_fetch_xor(unsigned *x, unsigned y) __CPU__ __HC__; -extern int atomic_fetch_xor(int *x, int y) __CPU__ __HC__; -extern uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__; -#endif - -#if __KALMAR_ACCELERATOR__ == 1 -extern "C" unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val) __HC__; -extern "C" int atomic_max_int(int *p, int val) __HC__; -extern "C" uint64_t atomic_max_uint64(uint64_t *p, uint64_t val) __HC__; - -static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ { - return atomic_max_unsigned(x, y); -} -static inline int atomic_fetch_max(int *x, int y) __HC__ { - return atomic_max_int(x, y); -} -static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ { - return atomic_max_uint64(x, y); -} - -extern "C" unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val) __HC__; -extern "C" int atomic_min_int(int *p, int val) __HC__; -extern "C" uint64_t atomic_min_uint64(uint64_t *p, uint64_t val) __HC__; - -static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ { - return atomic_min_unsigned(x, y); -} -static inline int atomic_fetch_min(int *x, int y) __HC__ { - return atomic_min_int(x, y); -} -static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ { - return atomic_min_uint64(x, y); -} -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val); -int atomic_max_int(int *p, int val); -uint64_t atomic_max_uint64(uint64_t *p, uint64_t val); - -static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ { - return atomic_max_unsigned(x, y); -} -static inline int atomic_fetch_max(int *x, int y) __HC__ { - return atomic_max_int(x, y); -} -static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ { - return atomic_max_uint64(x, y); -} - -unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val); -int atomic_min_int(int *p, int val); -uint64_t atomic_min_uint64(uint64_t *p, uint64_t val); - -static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ { - return atomic_min_unsigned(x, y); -} -static inline int atomic_fetch_min(int *x, int y) __HC__ { - return atomic_min_int(x, y); -} -static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ { - return atomic_min_uint64(x, y); -} -#else -extern int atomic_fetch_max(int * dest, int val) __CPU__ __HC__; -extern unsigned int atomic_fetch_max(unsigned int * dest, unsigned int val) __CPU__ __HC__; -extern uint64_t atomic_fetch_max(uint64_t * dest, uint64_t val) __CPU__ __HC__; - -extern int atomic_fetch_min(int * dest, int val) __CPU__ __HC__; -extern unsigned int atomic_fetch_min(unsigned int * dest, unsigned int val) __CPU__ __HC__; -extern uint64_t atomic_fetch_min(uint64_t * dest, uint64_t val) __CPU__ __HC__; -#endif - -/** @} */ - -/** @{ */ -/** - * Atomically increment or decrement the value stored at the location point to - * by dest. - * - * @param[inout] dest An pointer to the location which needs to be atomically - * modified. The location may reside within a - * concurrency::array or concurrency::array_view or within a - * tile_static variable. - * @return These functions return the old value which was previously stored at - * dest, and that was atomically replaced. These functions always - * succeed. 
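 *
 * Example (illustrative sketch of a shared work-queue cursor; the pointer is
 * assumed to reference a single int visible to all work-items):
 * @code
 * int next_work_item(int* cursor) [[hc]] {
 *     return hc::atomic_fetch_inc(cursor); // yields the pre-increment value
 * }
 * @endcode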
- */ -#if __KALMAR_ACCELERATOR__ == 1 -extern "C" unsigned int atomic_inc_unsigned(unsigned int *p) __HC__; -extern "C" int atomic_inc_int(int *p) __HC__; - -static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ { - return atomic_inc_unsigned(x); -} -static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ { - return atomic_inc_int(x); -} - -extern "C" unsigned int atomic_dec_unsigned(unsigned int *p) __HC__; -extern "C" int atomic_dec_int(int *p) __HC__; - -static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ { - return atomic_dec_unsigned(x); -} -static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ { - return atomic_dec_int(x); -} -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -unsigned int atomic_inc_unsigned(unsigned int *p); -int atomic_inc_int(int *p); - -static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ { - return atomic_inc_unsigned(x); -} -static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ { - return atomic_inc_int(x); -} - -unsigned int atomic_dec_unsigned(unsigned int *p); -int atomic_dec_int(int *p); - -static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ { - return atomic_dec_unsigned(x); -} -static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ { - return atomic_dec_int(x); -} -#else -extern int atomic_fetch_inc(int * _Dest) __CPU__ __HC__; -extern unsigned int atomic_fetch_inc(unsigned int * _Dest) __CPU__ __HC__; - -extern int atomic_fetch_dec(int * _Dest) __CPU__ __HC__; -extern unsigned int atomic_fetch_dec(unsigned int * _Dest) __CPU__ __HC__; -#endif - -/** @} */ - -/** - * Atomically do the following operations: - * - reads the 32-bit value (original) from address pointer in global or group segment - * - computes ((original >= val) ? 0 : (original + 1)) - * - stores the result back to the address - * - * @return The original value retrieved from address pointer. - * - * Please refer to atomic_wrapinc in HSA PRM 6.6 for more detailed specification of the function. - */ -extern "C" unsigned int __atomic_wrapinc(unsigned int* address, unsigned int val) __HC__; - -/** - * Atomically do the following operations: - * - reads the 32-bit value (original) from address pointer in global or group segment - * - computes ((original == 0) || (original > val)) ? val : (original - 1) - * - stores the result back to the address - * - * @return The original value retrieved from address pointer. - * - * Please refer to atomic_wrapdec in HSA PRM 6.6 for more detailed specification of the function. 
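 *
 * Sequential equivalent, for exposition only (the builtin performs all three
 * steps as one atomic operation):
 * @code
 * unsigned int wrapdec_seq(unsigned int* address, unsigned int val) {
 *     unsigned int original = *address;
 *     *address = (original == 0 || original > val) ? val : original - 1;
 *     return original;
 * }
 * @endcode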
- */ -extern "C" unsigned int __atomic_wrapdec(unsigned int* address, unsigned int val) __HC__; - - // ------------------------------------------------------------------------ // parallel_for_each // ------------------------------------------------------------------------ -template -completion_future parallel_for_each(const accelerator_view&, const extent&, const Kernel&); +template +completion_future parallel_for_each( + const accelerator_view&, const hc::extent&, const Kernel&); -template -completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&); +template +completion_future parallel_for_each( + const accelerator_view&, const tiled_extent&, const Kernel&); -template -completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&); - -template -completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&); - -template -completion_future parallel_for_each(const extent& compute_domain, const Kernel& f) { - return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f); -} - -template -completion_future parallel_for_each(const tiled_extent<3>& compute_domain, const Kernel& f) { - return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f); +template +inline +completion_future parallel_for_each( + const hc::extent& compute_domain, const Kernel& f) +{ + return parallel_for_each( + accelerator::get_auto_selection_view(), compute_domain, f); } -template -completion_future parallel_for_each(const tiled_extent<2>& compute_domain, const Kernel& f) { - return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f); +template +inline +completion_future parallel_for_each( + const tiled_extent& compute_domain, const Kernel& f) { + return parallel_for_each( + accelerator::get_auto_selection_view(), compute_domain, f); } -template -completion_future parallel_for_each(const tiled_extent<1>& compute_domain, const Kernel& f) { - return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f); -} -template -struct pfe_helper -{ - static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ { - int i; - for (i = 0; i < k.ext[N - 1]; ++i) { - idx[N - 1] = i; - pfe_helper::call(k, idx); - } - } -}; -template -struct pfe_helper<0, Kernel, _Tp> +//ND parallel_for_each, nontiled +template +inline +completion_future parallel_for_each( + const accelerator_view& av, + const hc::extent& compute_domain, + const Kernel& f) { - static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ == 1 - k.k(idx); -#endif - } -}; + if (compute_domain.size() == 0) return completion_future{}; -template -class pfe_wrapper -{ -public: - explicit pfe_wrapper(const extent& other, const Kernel& f) __CPU__ __HC__ - : ext(other), k(f) {} - void operator() (index idx) __CPU__ __HC__ { - pfe_helper, index>::call(*this, idx); + if (av.get_accelerator().get_device_path() == L"cpu") { + throw hc::runtime_exception{ + Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL}; } -private: - const extent ext; - const Kernel k; - template - friend struct pfe_helper; -}; -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wreturn-type" -#pragma clang diagnostic ignored "-Wunused-variable" -//ND parallel_for_each, nontiled -template -__attribute__((noinline,used)) completion_future parallel_for_each( + return completion_future{ + Kalmar::launch_kernel_async(av.pQueue, compute_domain, f)}; +} + +//ND 
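// Usage sketch for the non-tiled launch above (illustrative only; the
// array_view, the extent sizes, and the lambda body are assumptions, not
// part of this header):
//
//   hc::array_view<int, 2> av(hc::extent<2>(64, 64));
//   hc::parallel_for_each(av.get_extent(), [=](hc::index<2> idx) [[hc]] {
//       av[idx] = idx[0] + idx[1];
//   }).wait();
//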
parallel_for_each, tiled +template +completion_future parallel_for_each( const accelerator_view& av, - const extent& compute_domain, const Kernel& f) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - for(int i = 0 ; i < N ; i++) - { - // silently return in case the any dimension of the extent is 0 - if (compute_domain[i] == 0) - return completion_future(); - if (compute_domain[i] < 0) - throw invalid_compute_domain("Extent is less than 0."); - if (static_cast(compute_domain[i]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - } - size_t ext[3] = {static_cast(compute_domain[N - 1]), - static_cast(compute_domain[N - 2]), - static_cast(compute_domain[N - 3])}; -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - return launch_cpu_task_async(av.pQueue, f, compute_domain); - } -#endif + const tiled_extent& compute_domain, + const Kernel& f) +{ + if (compute_domain.size() == 0) return completion_future{}; + if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); + throw hc::runtime_exception{ + Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL}; } - const pfe_wrapper _pf(compute_domain, f); - return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async, 3>(av.pQueue, ext, NULL, _pf)); -#else -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - int* foo1 = reinterpret_cast(&Kernel::__cxxamp_trampoline); -#endif - auto bar = &pfe_wrapper::operator(); - auto qq = &index::__cxxamp_opencl_index; - int* foo = reinterpret_cast(&pfe_wrapper::__cxxamp_trampoline); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wreturn-type" -#pragma clang diagnostic ignored "-Wunused-variable" -//1D parallel_for_each, nontiled -template -__attribute__((noinline,used)) completion_future parallel_for_each( - const accelerator_view& av, const extent<1>& compute_domain, const Kernel& f) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - // silently return in case the any dimension of the extent is 0 - if (compute_domain[0] == 0) - return completion_future(); - if (compute_domain[0] < 0) { - throw invalid_compute_domain("Extent is less than 0."); - } - if (static_cast(compute_domain[0]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - return launch_cpu_task_async(av.pQueue, f, compute_domain); - } -#endif - size_t ext = compute_domain[0]; - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async(av.pQueue, &ext, NULL, f)); -#else //if __KALMAR_ACCELERATOR__ != 1 - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - auto foo = &Kernel::__cxxamp_trampoline; - auto bar = &Kernel::operator(); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wreturn-type" -#pragma clang diagnostic ignored "-Wunused-variable" -//2D parallel_for_each, nontiled -template -__attribute__((noinline,used)) completion_future parallel_for_each( - const accelerator_view& av, const extent<2>& compute_domain, const Kernel& f) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - // silently return in case the any dimension of the extent is 0 - if (compute_domain[0] == 0 || compute_domain[1] == 0) - return 
completion_future(); - if (compute_domain[0] < 0 || compute_domain[1] < 0) { - throw invalid_compute_domain("Extent is less than 0."); - } - if (static_cast(compute_domain[0]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[1]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - return launch_cpu_task_async(av.pQueue, f, compute_domain); - } -#endif - size_t ext[2] = {static_cast(compute_domain[1]), - static_cast(compute_domain[0])}; - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async(av.pQueue, ext, NULL, f)); -#else //if __KALMAR_ACCELERATOR__ != 1 - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - auto foo = &Kernel::__cxxamp_trampoline; - auto bar = &Kernel::operator(); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wreturn-type" -#pragma clang diagnostic ignored "-Wunused-variable" -//3D parallel_for_each, nontiled -template -__attribute__((noinline,used)) completion_future parallel_for_each( - const accelerator_view& av, const extent<3>& compute_domain, const Kernel& f) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - // silently return in case the any dimension of the extent is 0 - if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0) - return completion_future(); - if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) { - throw invalid_compute_domain("Extent is less than 0."); - } - if (static_cast(compute_domain[0]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[1]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[2]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - return launch_cpu_task_async(av.pQueue, f, compute_domain); - } -#endif - size_t ext[3] = {static_cast(compute_domain[2]), - static_cast(compute_domain[1]), - static_cast(compute_domain[0])}; - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async(av.pQueue, ext, NULL, f)); -#else //if __KALMAR_ACCELERATOR__ != 1 - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - auto foo = &Kernel::__cxxamp_trampoline; - auto bar = &Kernel::operator(); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wreturn-type" -#pragma clang diagnostic ignored "-Wunused-variable" -//1D parallel_for_each, tiled -template -__attribute__((noinline,used)) completion_future parallel_for_each( - const accelerator_view& av, const tiled_extent<1>& compute_domain, const Kernel& f) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - // silently return in case the any dimension of the extent is 0 - if (compute_domain[0] == 0) - return completion_future(); - if (compute_domain[0] < 0) { - throw invalid_compute_domain("Extent is less than 0."); - } - if (static_cast(compute_domain[0]) > 4294967295L) - throw 
invalid_compute_domain("Extent size too large."); - size_t ext = compute_domain[0]; - size_t tile = compute_domain.tile_dim[0]; -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - return launch_cpu_task_async(av.pQueue, f, compute_domain); - } else -#endif - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - void *kernel = Kalmar::mcw_cxxamp_get_kernel(av.pQueue, f); - return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async(av.pQueue, &ext, &tile, f, kernel, compute_domain.get_dynamic_group_segment_size())); -#else //if __KALMAR_ACCELERATOR__ != 1 - tiled_index<1> this_is_used_to_instantiate_the_right_index; - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - auto foo = &Kernel::__cxxamp_trampoline; - auto bar = &Kernel::operator(); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wreturn-type" -#pragma clang diagnostic ignored "-Wunused-variable" -//2D parallel_for_each, tiled -template -__attribute__((noinline,used)) completion_future parallel_for_each( - const accelerator_view& av, const tiled_extent<2>& compute_domain, const Kernel& f) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - // silently return in case the any dimension of the extent is 0 - if (compute_domain[0] == 0 || compute_domain[1] == 0) - return completion_future(); - if (compute_domain[0] < 0 || compute_domain[1] < 0) { - throw invalid_compute_domain("Extent is less than 0."); - } - if (static_cast(compute_domain[0]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[1]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - size_t ext[2] = { static_cast(compute_domain[1]), - static_cast(compute_domain[0])}; - size_t tile[2] = { static_cast(compute_domain.tile_dim[1]), - static_cast(compute_domain.tile_dim[0]) }; -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - return launch_cpu_task_async(av.pQueue, f, compute_domain); - } else -#endif - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - void *kernel = Kalmar::mcw_cxxamp_get_kernel(av.pQueue, f); - return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async(av.pQueue, ext, tile, f, kernel, compute_domain.get_dynamic_group_segment_size())); -#else //if __KALMAR_ACCELERATOR__ != 1 - tiled_index<2> this_is_used_to_instantiate_the_right_index; - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - auto foo = &Kernel::__cxxamp_trampoline; - auto bar = &Kernel::operator(); -#endif -} -#pragma clang diagnostic pop - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wreturn-type" -#pragma clang diagnostic ignored "-Wunused-variable" -//3D parallel_for_each, tiled -template -__attribute__((noinline,used)) completion_future parallel_for_each( - const accelerator_view& av, const tiled_extent<3>& compute_domain, const Kernel& f) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - // silently return in case the any dimension of the extent is 0 - if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0) - return completion_future(); - if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) { - 
throw invalid_compute_domain("Extent is less than 0."); - } - if (static_cast(compute_domain[0]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[1]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - if (static_cast(compute_domain[2]) > 4294967295L) - throw invalid_compute_domain("Extent size too large."); - size_t ext[3] = { static_cast(compute_domain[2]), - static_cast(compute_domain[1]), - static_cast(compute_domain[0])}; - size_t tile[3] = { static_cast(compute_domain.tile_dim[2]), - static_cast(compute_domain.tile_dim[1]), - static_cast(compute_domain.tile_dim[0]) }; -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (is_cpu()) { - return launch_cpu_task_async(av.pQueue, f, compute_domain); - } else -#endif - if (av.get_accelerator().get_device_path() == L"cpu") { - throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL); - } - void *kernel = Kalmar::mcw_cxxamp_get_kernel(av.pQueue, f); - return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async(av.pQueue, ext, tile, f, kernel, compute_domain.get_dynamic_group_segment_size())); -#else //if __KALMAR_ACCELERATOR__ != 1 - tiled_index<3> this_is_used_to_instantiate_the_right_index; - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - auto foo = &Kernel::__cxxamp_trampoline; - auto bar = &Kernel::operator(); -#endif -} -#pragma clang diagnostic pop -} // namespace hc + return completion_future{ + Kalmar::launch_kernel_with_dynamic_group_memory_async( + av.pQueue, compute_domain, f)}; +} +} // namespace hc \ No newline at end of file diff --git a/include/hc_defines.h b/include/hc_defines.h index 15e933aaf22..2ec34e0e712 100644 --- a/include/hc_defines.h +++ b/include/hc_defines.h @@ -18,11 +18,6 @@ #include #include -// CPU execution path -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -#include -#endif - namespace hc { typedef __fp16 half; } diff --git a/include/hc_printf.hpp b/include/hc_printf.hpp index 954fa3490e6..b03e645da5c 100644 --- a/include/hc_printf.hpp +++ b/include/hc_printf.hpp @@ -11,7 +11,6 @@ #include #include "hc_am_internal.hpp" -#include "hsa_atomic.h" // The printf on the accelerator is only enabled when // The HCC_ENABLE_ACCELERATOR_PRINTF is defined diff --git a/include/hc_short_vector.inl b/include/hc_short_vector.inl index 37d97adac7d..64125bde1d5 100644 --- a/include/hc_short_vector.inl +++ b/include/hc_short_vector.inl @@ -235,17 +235,9 @@ public: data = v; } - __attribute__((annotate("user_deserialize"))) __vector_data_container(const SCALAR_TYPE x) __CPU_GPU__ { data = { x }; } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - for (auto &component : ar) { - s.Append(sizeof(SCALAR_TYPE), &component); - } - } }; @@ -271,18 +263,6 @@ public: __vector_data_container(vector_value_type v) __CPU_GPU__ { data = v; } - - __attribute__((annotate("user_deserialize"))) - __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y) __CPU_GPU__ { - data = { x, y }; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - for (auto &component : ar) { - s.Append(sizeof(SCALAR_TYPE), &component); - } - } }; @@ -308,18 +288,6 @@ public: __vector_data_container(vector_value_type v) __CPU_GPU__ { data = v; } - - __attribute__((annotate("user_deserialize"))) - __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, 
const SCALAR_TYPE z) __CPU_GPU__ { - data = { x, y, z }; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - for (auto &component : ar) { - s.Append(sizeof(SCALAR_TYPE), &component); - } - } }; @@ -345,18 +313,6 @@ public: __vector_data_container(vector_value_type v) __CPU_GPU__ { data = v; } - - __attribute__((annotate("user_deserialize"))) - __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, const SCALAR_TYPE z, const SCALAR_TYPE w) __CPU_GPU__ { - data = { x,y,z,w }; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - for (auto &component : ar) { - s.Append(sizeof(SCALAR_TYPE), &component); - } - } }; @@ -382,19 +338,6 @@ public: __vector_data_container(vector_value_type v) __CPU_GPU__ { data = v; } - - __attribute__((annotate("user_deserialize"))) - __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, const SCALAR_TYPE z, const SCALAR_TYPE w - , const SCALAR_TYPE s4, const SCALAR_TYPE s5, const SCALAR_TYPE s6, const SCALAR_TYPE s7) __CPU_GPU__ { - data = { x,y,z,w,s4,s5,s6,s7 }; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - for (auto &component : ar) { - s.Append(sizeof(SCALAR_TYPE), &component); - } - } }; @@ -420,21 +363,6 @@ public: __vector_data_container(vector_value_type v) __CPU_GPU__ { data = v; } - - __attribute__((annotate("user_deserialize"))) - __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, const SCALAR_TYPE z, const SCALAR_TYPE w - , const SCALAR_TYPE s4, const SCALAR_TYPE s5, const SCALAR_TYPE s6, const SCALAR_TYPE s7 - , const SCALAR_TYPE s8, const SCALAR_TYPE s9, const SCALAR_TYPE sA, const SCALAR_TYPE sB - , const SCALAR_TYPE sC, const SCALAR_TYPE sD, const SCALAR_TYPE sE, const SCALAR_TYPE sF) __CPU_GPU__ { - data = { x,y,z,w,s4,s5,s6,s7,s8,s9,sA,sB,sC,sD,sE,sF }; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - for (auto &component : ar) { - s.Append(sizeof(SCALAR_TYPE), &component); - } - } }; diff --git a/include/hsa_atomic.h b/include/hsa_atomic.h deleted file mode 100644 index 599dc2be568..00000000000 --- a/include/hsa_atomic.h +++ /dev/null @@ -1,143 +0,0 @@ -#pragma once - -#define HSAIL_BUILTIN_GPU __attribute__((hc)) -#define HSAIL_BUILTIN_CPU __attribute__((cpu)) inline - -#ifdef __KALMAR_ACCELERATOR__ - -// fetch_add -extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_fetch_add_int(int* dest, int val); -extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_fetch_add_unsigned(unsigned int* dest, unsigned int val); -extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_fetch_add_int64(int64_t* dest, int64_t val); -extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_fetch_add_uint64(uint64_t* dest, uint64_t val); - -// fetch_sub -extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_fetch_sub_int(int* dest, int val); -extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_fetch_sub_unsigned(unsigned int* dest, unsigned int val); -extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_fetch_sub_int64(int64_t* dest, int64_t val); -extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_fetch_sub_uint64(uint64_t* dest, uint64_t val); - -// fetch_and -extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_fetch_and_int(int* dest, int val); -extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_fetch_and_unsigned(unsigned int* dest, unsigned int val); -extern "C" HSAIL_BUILTIN_GPU int64_t 
__hsail_atomic_fetch_and_int64(int64_t* dest, int64_t val); -extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_fetch_and_uint64(uint64_t* dest, uint64_t val); - -// fetch_or -extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_fetch_or_int(int* dest, int val); -extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_fetch_or_unsigned(unsigned int* dest, unsigned int val); -extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_fetch_or_int64(int64_t* dest, int64_t val); -extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_fetch_or_uint64(uint64_t* dest, uint64_t val); - -// fetch_xor -extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_fetch_xor_int(int* dest, int val); -extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_fetch_xor_unsigned(unsigned int* dest, unsigned int val); -extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_fetch_xor_int64(int64_t* dest, int64_t val); -extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_fetch_xor_uint64(uint64_t* dest, uint64_t val); - -// exchange -extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_exchange_int(int* dest, int val); -extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_exchange_unsigned(unsigned int* dest, unsigned int val); -extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_exchange_int64(int64_t* dest, int64_t val); -extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_exchange_uint64(uint64_t* dest, uint64_t val); - -// compare_exchange -extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_compare_exchange_int(int* dest, int compare, int val); -extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_compare_exchange_unsigned(unsigned int* dest, unsigned int compare, unsigned int val); -extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_compare_exchange_int64(int64_t* dest, int64_t compare, int64_t val); -extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_compare_exchange_uint64(uint64_t* dest, uint64_t compare, uint64_t val); - -#else - -// fetch_add -extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_fetch_add_int(int* dest, int val) -{ return __sync_fetch_and_add(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_fetch_add_unsigned(unsigned int* dest, unsigned int val) -{ return __sync_fetch_and_add(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_fetch_add_int64(int64_t* dest, int64_t val) -{ return __sync_fetch_and_add(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_fetch_add_uint64(uint64_t* dest, uint64_t val) -{ return __sync_fetch_and_add(dest, val); } - -// fetch_sub -extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_fetch_sub_int(int* dest, int val) -{ return __sync_fetch_and_sub(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_fetch_sub_unsigned(unsigned int* dest, unsigned int val) -{ return __sync_fetch_and_sub(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_fetch_sub_int64(int64_t* dest, int64_t val) -{ return __sync_fetch_and_sub(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_fetch_sub_uint64(uint64_t* dest, uint64_t val) -{ return __sync_fetch_and_sub(dest, val); } - -// fetch_and -extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_fetch_and_int(int* dest, int val) -{ return __sync_fetch_and_and(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_fetch_and_unsigned(unsigned int* dest, unsigned int val) -{ return __sync_fetch_and_and(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_fetch_and_int64(int64_t* dest, int64_t val) -{ return 
__sync_fetch_and_and(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_fetch_and_uint64(uint64_t* dest, uint64_t val) -{ return __sync_fetch_and_and(dest, val); } - -// fetch_or -extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_fetch_or_int(int* dest, int val) -{ return __sync_fetch_and_or(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_fetch_or_unsigned(unsigned int* dest, unsigned int val) -{ return __sync_fetch_and_or(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_fetch_or_int64(int64_t* dest, int64_t val) -{ return __sync_fetch_and_or(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_fetch_or_uint64(uint64_t* dest, uint64_t val) -{ return __sync_fetch_and_or(dest, val); } - -// fetch_xor -extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_fetch_xor_int(int* dest, int val) -{ return __sync_fetch_and_xor(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_fetch_xor_unsigned(unsigned int* dest, unsigned int val) -{ return __sync_fetch_and_xor(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_fetch_xor_int64(int64_t* dest, int64_t val) -{ return __sync_fetch_and_xor(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_fetch_xor_uint64(uint64_t* dest, uint64_t val) -{ return __sync_fetch_and_xor(dest, val); } - -// exchange -extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_exchange_int(int* dest, int val) -{ return __sync_swap(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_exchange_unsigned(unsigned int* dest, unsigned int val) -{ return __sync_swap(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_exchange_int64(int64_t* dest, int64_t val) -{ return __sync_swap(dest, val); } - -extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_exchange_uint64(uint64_t* dest, uint64_t val) -{ return __sync_swap(dest, val); } - -// compare_exchange -extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_compare_exchange_int(int* dest, int compare, int val) -{ return __sync_val_compare_and_swap(dest, compare, val); } - -extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_compare_exchange_unsigned(unsigned int* dest, unsigned int compare, unsigned int val) -{ return __sync_val_compare_and_swap(dest, compare, val); } - -extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_compare_exchange_int64(int64_t* dest, int64_t compare, int64_t val) -{ return __sync_val_compare_and_swap(dest, compare, val); } - -extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_compare_exchange_uint64(uint64_t* dest, uint64_t compare, uint64_t val) -{ return __sync_val_compare_and_swap(dest, compare, val); } - -#endif diff --git a/include/kalmar_buffer.h b/include/kalmar_buffer.h index 842c589f5d1..78f2a6e9e82 100644 --- a/include/kalmar_buffer.h +++ b/include/kalmar_buffer.h @@ -24,10 +24,9 @@ class _data { _data(int count, void* d) restrict(cpu, amp) : p_(static_cast(d)) {} template - _data(const _data& d) restrict(cpu, amp) + _data(const _data& d) restrict(cpu, amp) : p_(reinterpret_cast(d.get())) {} - __attribute__((annotate("user_deserialize"))) - explicit _data(T* t) restrict(cpu, amp) { p_ = t; } + explicit _data(T* t) restrict(cpu, amp) { p_ = t; } T* get(void) const restrict(cpu, amp) { return p_; } T* get_device_pointer() const restrict(cpu, amp) { return p_; } std::shared_ptr get_av() const { return nullptr; } @@ -98,12 +97,7 @@ class _data_host { void unmap_ptr(const void* addr, bool modify, size_t count, size_t offset) const { return 
mm->unmap(const_cast(addr), count * sizeof(T), offset * sizeof(T), modify); } void sync_to(std::shared_ptr pQueue) const { mm->sync(pQueue, false); } - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Serialize& s) const { - s.visit_buffer(mm.get(), !std::is_const::value, isArray); - } - __attribute__((annotate("user_deserialize"))) - explicit _data_host(typename std::remove_const::type* t) {} + explicit _data_host(typename std::remove_const::type* t) {} }; } // namespace Kalmar diff --git a/include/kalmar_cpu_launch.h b/include/kalmar_cpu_launch.h deleted file mode 100644 index b442d14b23f..00000000000 --- a/include/kalmar_cpu_launch.h +++ /dev/null @@ -1,48 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "hc_defines.h" -#include "kalmar_runtime.h" -#include "kalmar_serialize.h" - -namespace Kalmar { -template class tiled_extent; - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -static const unsigned int NTHREAD = std::thread::hardware_concurrency(); - -template -class CPUKernelRAII -{ - const std::shared_ptr pQueue; - const Kernel& f; - std::vector th; -public: - CPUKernelRAII(const std::shared_ptr pQueue, const Kernel& f) - : pQueue(pQueue), f(f), th(NTHREAD) { - CPUVisitor vis(pQueue); - Serialize s(&vis); - f.__cxxamp_serialize(s); - CLAMP::enter_kernel(); - } - std::thread& operator[](int i) { return th[i]; } - ~CPUKernelRAII() { - for (auto& t : th) - if (t.joinable()) - t.join(); - CPUVisitor vis(pQueue); - Serialize ss(&vis); - f.__cxxamp_serialize(ss); - CLAMP::leave_kernel(); - } -}; - -#endif - -} diff --git a/include/kalmar_exception.h b/include/kalmar_exception.h index b865540f9fc..55d7cb82c2f 100644 --- a/include/kalmar_exception.h +++ b/include/kalmar_exception.h @@ -16,9 +16,10 @@ namespace Kalmar { #define E_FAIL 0x80004005 #endif -static const char *__errorMsg_UnsupportedAccelerator = "concurrency::parallel_for_each is not supported on the selected accelerator \"CPU accelerator\"."; +static constexpr const char __errorMsg_UnsupportedAccelerator[]{ + "concurrency::parallel_for_each is not supported on the selected accelerator \"CPU accelerator\"."}; -typedef int HRESULT; +typedef decltype(E_FAIL) HRESULT; class runtime_exception : public std::exception { public: diff --git a/include/kalmar_index.h b/include/kalmar_index.h index c5ef82bb9c7..c5e77478c46 100644 --- a/include/kalmar_index.h +++ b/include/kalmar_index.h @@ -451,13 +451,11 @@ class index { public: __attribute__((annotate("__cxxamp_opencl_index"))) - void __cxxamp_opencl_index() restrict(amp,cpu) + void __cxxamp_opencl_index() restrict(amp, cpu) #if __KALMAR_ACCELERATOR__ == 1 { index_helper>::set(*this); } -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - {} #else ; #endif diff --git a/include/kalmar_launch.h b/include/kalmar_launch.h index e06f501d50b..28cf1134dd0 100644 --- a/include/kalmar_launch.h +++ b/include/kalmar_launch.h @@ -7,120 +7,312 @@ #pragma once -#include "hc_defines.h" #include "kalmar_runtime.h" #include "kalmar_serialize.h" +#include "../hc2/external/elfio/elfio.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace Concurrency +{ + template class tiled_extent; + template class tiled_index; +} + +namespace hc +{ + template class 
tiled_extent; + template class tiled_index; +} + /** \cond HIDDEN_SYMBOLS */ namespace Kalmar { template -static void append_kernel(const std::shared_ptr& pQueue, const Kernel& f, void* kernel) +inline +void append_kernel( + const std::shared_ptr& pQueue, const Kernel& f, void* kernel) { Kalmar::BufferArgumentsAppender vis(pQueue, kernel); Kalmar::Serialize s(&vis); - f.__cxxamp_serialize(s); + //f.__cxxamp_serialize(s); } -template -static inline std::shared_ptr get_availabe_que(const Kernel& f) +// template +// inline +// std::shared_ptr get_available_que(const Kernel& f) +// { +// Kalmar::QueueSearcher ser; +// Kalmar::Serialize s(&ser); +// f.__cxxamp_serialize(s); +// if (ser.get_que()) +// return ser.get_que(); +// else +// return getContext()->auto_select(); +// } + +struct Indexer { + template + operator index() const [[hc]] + { + int tmp[n]{}; + for (auto i = 0; i != n; ++i) tmp[i] = amp_get_global_id(i); + + return index{tmp}; + } + + template + operator Concurrency::tiled_index() const [[hc]] + { + return {}; + } + + template + operator hc::tiled_index() const [[hc]] + { + return {}; + } +}; + +template +struct Kernel_emitter { + static + __attribute__((used, annotate("__HCC_KERNEL__"))) + void entry_point(Kernel f) restrict(cpu, amp) + { + #if __KALMAR_ACCELERATOR__ != 0 + Index tmp = Indexer{}; + f(tmp); + #endif + } +}; + +template +inline +const char* linker_name_for() { - Kalmar::QueueSearcher ser; - Kalmar::Serialize s(&ser); - f.__cxxamp_serialize(s); - if (ser.get_que()) - return ser.get_que(); - else - return getContext()->auto_select(); + static std::once_flag f{}; + static std::string r{}; + + // TODO: this should be fused with the one used in mcwamp_hsa.cpp as a + // for_each_elf(...) function. + std::call_once(f, [&]() { + dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* pr) { + const auto base = info->dlpi_addr; + ELFIO::elfio elf; + + if (!elf.load(base ? 
info->dlpi_name : "/proc/self/exe")) return 0; + + struct Symbol { + std::string name; + ELFIO::Elf64_Addr value; + ELFIO::Elf_Xword size; + unsigned char bind; + unsigned char type; + ELFIO::Elf_Half section_index; + unsigned char other; + } tmp{}; + for (auto&& section : elf.sections) { + if (section->get_type() != SHT_SYMTAB) continue; + + ELFIO::symbol_section_accessor fn{elf, section}; + + auto n = fn.get_symbols_num(); + while (n--) { + fn.get_symbol( + n, + tmp.name, + tmp.value, + tmp.size, + tmp.bind, + tmp.type, + tmp.section_index, + tmp.other); + + if (tmp.type != STT_FUNC) continue; + + static const auto k_addr = + reinterpret_cast(&Kernel::entry_point); + if (tmp.value + base == k_addr) { + *static_cast(pr) = tmp.name; + + return 1; + } + } + } + + return 0; + }, &r); + }); + + if (r.empty()) { + throw std::runtime_error{ + std::string{"Kernel: "} + + typeid(&Kernel::entry_point).name() + + " is not available."}; + } + + return r.c_str(); } -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -template -inline std::shared_ptr -mcw_cxxamp_launch_kernel_async(const std::shared_ptr& pQueue, size_t *ext, - size_t *local_size, const Kernel& f) restrict(cpu,amp) { -#if __KALMAR_ACCELERATOR__ != 1 - //Invoke Kernel::__cxxamp_trampoline as an kernel - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - // FIXME: implicitly casting to avoid pointer to int error - int* foo = reinterpret_cast(&Kernel::__cxxamp_trampoline); - void *kernel = NULL; - { - std::string kernel_name(f.__cxxamp_trampoline_name()); - kernel = CLAMP::CreateKernel(kernel_name, pQueue.get()); - } - append_kernel(pQueue, f, kernel); - return pQueue->LaunchKernelAsync(kernel, dim_ext, ext, local_size); -#endif +template +struct Index_type; + +template +struct Index_type> { + using index_type = index; +}; + +template +struct Index_type> { + using index_type = Concurrency::tiled_index; +}; + +template +struct Index_type> { + using index_type = index; +}; + +template +struct Index_type> { + using index_type = hc::tiled_index; +}; + +template +using IndexType = typename Index_type::index_type; + +template +inline +void* make_registered_kernel( + const std::shared_ptr& q, const Kernel& f) +{ + using K = Kalmar::Kernel_emitter, Kernel>; + + void *kernel{CLAMP::CreateKernel( + linker_name_for(), q.get(), &f, sizeof(Kernel))}; + append_kernel(q, f, kernel); + + return kernel; } -#pragma clang diagnostic pop -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -template +template +constexpr inline -void mcw_cxxamp_launch_kernel(const std::shared_ptr& pQueue, size_t *ext, - size_t *local_size, const Kernel& f) restrict(cpu,amp) { -#if __KALMAR_ACCELERATOR__ != 1 - //Invoke Kernel::__cxxamp_trampoline as an kernel - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - // FIXME: implicitly casting to avoid pointer to int error - int* foo = reinterpret_cast(&Kernel::__cxxamp_trampoline); - void *kernel = NULL; - { - std::string kernel_name(f.__cxxamp_trampoline_name()); - kernel = CLAMP::CreateKernel(kernel_name, pQueue.get()); - } - append_kernel(pQueue, f, kernel); - pQueue->LaunchKernel(kernel, dim_ext, ext, local_size); -#endif // __KALMAR_ACCELERATOR__ +std::array local_dimensions(const T&) +{ + return std::array{}; } -#pragma clang diagnostic pop -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -template -inline void* 
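Context for linker_name_for above: dladdr can map a function address to a symbol name in one call, but only for symbols the dynamic linker can see; the ELFIO walk over SHT_SYMTAB sections is what lets the runtime resolve kernel entry points whose symbols are not dynamically exported. A sketch of the dladdr-based variant, for contrast only (not what the patch can rely on, for that reason):

    #include <dlfcn.h>  // link with -ldl

    #include <string>

    // Returns the (mangled) symbol name for fn, if it is dynamically visible.
    inline std::string dynamic_symbol_name(void* fn)
    {
        Dl_info info{};
        return (dladdr(fn, &info) && info.dli_sname) ? info.dli_sname
                                                     : std::string{};
    }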
mcw_cxxamp_get_kernel(const std::shared_ptr& pQueue, const Kernel& f) restrict(cpu,amp) { -#if __KALMAR_ACCELERATOR__ != 1 - //Invoke Kernel::__cxxamp_trampoline as an kernel - //to ensure functor has right operator() defined - //this triggers the trampoline code being emitted - // FIXME: implicitly casting to avoid pointer to int error - int* foo = reinterpret_cast(&Kernel::__cxxamp_trampoline); - void *kernel = NULL; - std::string kernel_name (f.__cxxamp_trampoline_name()); - kernel = CLAMP::CreateKernel(kernel_name, pQueue.get()); - return kernel; -#else - return NULL; -#endif +template +constexpr +inline +std::array local_dimensions( + const Concurrency::tiled_extent&) +{ + return std::array{dims...}; +} + +template +inline +std::array local_dimensions(const hc::tiled_extent& domain) +{ + std::array r{}; + for (auto i = 0; i != n; ++i) r[i] = domain.tile_dim[i]; + + return r; +} + +template +inline +std::pair< + std::array, + std::array> dimensions(const Domain& domain) +{ + using R = std::pair< + std::array, + std::array>; + + R r{}; + for (auto i = 0; i != domain.rank; ++i) r.first[i] = domain[i]; + r.second = local_dimensions(domain); + + return r; } -#pragma clang diagnostic pop -template +template inline -void mcw_cxxamp_execute_kernel_with_dynamic_group_memory( - const std::shared_ptr& pQueue, size_t *ext, size_t *local_size, - const Kernel& f, void *kernel, size_t dynamic_group_memory_size) restrict(cpu,amp) { -#if __KALMAR_ACCELERATOR__ != 1 - append_kernel(pQueue, f, kernel); - pQueue->LaunchKernelWithDynamicGroupMemory(kernel, dim_ext, ext, local_size, dynamic_group_memory_size); -#endif // __KALMAR_ACCELERATOR__ +std::shared_ptr launch_kernel_async( + const std::shared_ptr& q, + const Domain& domain, + const Kernel& f) +{ + const auto dims{dimensions(domain)}; + + return q->LaunchKernelAsync( + make_registered_kernel(q, f), + Domain::rank, + dims.first.data(), + dims.second.data()); } -template -inline std::shared_ptr -mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async( - const std::shared_ptr& pQueue, size_t *ext, size_t *local_size, - const Kernel& f, void *kernel, size_t dynamic_group_memory_size) restrict(cpu,amp) { -#if __KALMAR_ACCELERATOR__ != 1 - append_kernel(pQueue, f, kernel); - return pQueue->LaunchKernelWithDynamicGroupMemoryAsync(kernel, dim_ext, ext, local_size, dynamic_group_memory_size); -#endif // __KALMAR_ACCELERATOR__ +template +inline +void launch_kernel( + const std::shared_ptr& q, + const Domain& domain, + const Kernel& f) +{ + const auto dims{dimensions(domain)}; + + q->LaunchKernel( + make_registered_kernel(q, f), + Domain::rank, + dims.first.data(), + dims.second.data()); } +template +inline +void launch_kernel_with_dynamic_group_memory( + const std::shared_ptr& q, + const Domain& domain, + const Kernel& f, + std::size_t dynamic_group_memory_size) +{ + const auto dims{dimensions(domain)}; + + q->LaunchKernelWithDynamicGroupMemory( + make_registered_kernel(q, f), + Domain::rank, + dims.first.data(), + dims.second.data(), + domain.dynamic_group_segment_size()); +} + +template +inline +std::shared_ptr launch_kernel_with_dynamic_group_memory_async( + const std::shared_ptr& q, + const Domain& domain, + const Kernel& f) +{ + const auto dims{dimensions(domain)}; + + return q->LaunchKernelWithDynamicGroupMemoryAsync( + make_registered_kernel(q, f), + Domain::rank, + dims.first.data(), + dims.second.data(), + domain.get_dynamic_group_segment_size()); +} } // namespace Kalmar /** \endcond */ diff --git a/include/kalmar_runtime.h 
b/include/kalmar_runtime.h index 7a9438de578..193d3eba456 100644 --- a/include/kalmar_runtime.h +++ b/include/kalmar_runtime.h @@ -3,6 +3,8 @@ #include "hc_defines.h" #include "kalmar_aligned_alloc.h" +#include + namespace hc { class AmPointerInfo; class completion_future; @@ -200,16 +202,38 @@ class KalmarQueue virtual void wait(hcWaitMode mode = hcWaitModeBlocked) {} // sync kernel launch with dynamic group memory - virtual void LaunchKernelWithDynamicGroupMemory(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size, size_t dynamic_group_size) {} + virtual + void LaunchKernelWithDynamicGroupMemory( + void* kernel, + size_t dim_ext, + const size_t* ext, + const size_t* local_size, + size_t dynamic_group_size) = 0; // async kernel launch with dynamic group memory - virtual std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size, size_t dynamic_group_size) { return nullptr; } + virtual + std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync( + void* kernel, + std::size_t dim_ext, + const std::size_t* ext, + const std::size_t* local_size, + std::size_t dynamic_group_size) = 0; // sync kernel launch - virtual void LaunchKernel(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size) {} + virtual + void LaunchKernel( + void* kernel, + size_t dim_ext, + const size_t* ext, + const size_t* local_size) = 0; // async kernel launch - virtual std::shared_ptr LaunchKernelAsync(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size) { return LaunchKernelWithDynamicGroupMemoryAsync(kernel, dim_ext, ext, local_size, 0); } + virtual + std::shared_ptr LaunchKernelAsync( + void* kernel, + std::size_t dim_ext, + const std::size_t* ext, + const std::size_t* local_size) = 0; /// read data from device to host virtual void read(void* device, void* dst, size_t count, size_t offset) = 0; @@ -376,7 +400,12 @@ class KalmarDevice virtual void BuildProgram(void* size, void* source) {} /// create kernel - virtual void* CreateKernel(const char* fun, KalmarQueue *queue) { return nullptr; } + virtual + void* CreateKernel( + const char* fun, + KalmarQueue *queue, + const void* callable = nullptr, + std::size_t callable_size = 0u) = 0; /// check if a given kernel is compatible with the device virtual bool IsCompatibleKernel(void* size, void* source) { return true; } @@ -457,6 +486,48 @@ class CPUQueue final : public KalmarQueue memmove((char*)dst + dst_offset, (char*)src + src_offset, count); } + void* CreateKernel( + const char*, KalmarQueue*, const void*, std::size_t) override + { + return nullptr; + } + void LaunchKernel( + void*, + std::size_t, + const std::size_t*, + const std::size_t*) override + { + throw std::runtime_error{"Unsupported."}; + } + [[noreturn]] + std::shared_ptr LaunchKernelAsync( + void*, + std::size_t, + const std::size_t*, + const std::size_t*) override + { + throw std::runtime_error{"Unsupported."}; + } + void LaunchKernelWithDynamicGroupMemory( + void*, + std::size_t, + const std::size_t*, + const std::size_t*, + std::size_t) override + { + throw std::runtime_error{"Unsupported."}; + } + [[noreturn]] + std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync( + void*, + std::size_t, + const std::size_t*, + const std::size_t*, + std::size_t) override + { + throw std::runtime_error{"Unimplemented."}; + } + void* map(void* device, size_t count, size_t offset, bool modify) override { return (char*)device + offset; } @@ -482,7 +553,14 @@ class CPUDevice final : public KalmarDevice std::shared_ptr 
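On the kalmar_runtime.h hunk above: turning the LaunchKernel* entry points from silently-succeeding default implementations into pure virtuals forces every backend to state explicitly whether it can launch, which is what the new CPUQueue stubs that throw "Unsupported." are doing. Reduced to a hypothetical two-class sketch (names are illustrative, not the real interface):

    #include <cstddef>
    #include <stdexcept>

    struct SketchQueue {
        virtual ~SketchQueue() = default;
        // Pure virtual: a backend can no longer inherit a silent no-op.
        virtual void LaunchKernel(void* kernel,
                                  std::size_t dim_ext,
                                  const std::size_t* ext,
                                  const std::size_t* local_size) = 0;
    };

    struct SketchCpuQueue final : SketchQueue {
        void LaunchKernel(void*, std::size_t,
                          const std::size_t*, const std::size_t*) override
        {
            throw std::runtime_error{"Unsupported."};  // fail loudly
        }
    };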
createQueue(execute_order order = execute_in_order) override { return std::shared_ptr(new CPUQueue(this)); } void* create(size_t count, struct rw_info* /* not used */ ) override { return kalmar_aligned_alloc(0x1000, count); } void release(void* ptr, struct rw_info* /* nout used */) override { kalmar_aligned_free(ptr); } - void* CreateKernel(const char* fun, KalmarQueue *queue) { return nullptr; } + void* CreateKernel( + const char*, + KalmarQueue*, + const void* = nullptr, + std::size_t = 0u) + { + return nullptr; + } }; /// KalmarContext @@ -567,19 +645,8 @@ class KalmarContext KalmarContext *getContext(); namespace CLAMP { -// used in parallel_for_each.h -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -extern bool is_cpu(); -extern bool in_cpu_kernel(); -extern void enter_kernel(); -extern void leave_kernel(); -#endif - -extern void *CreateKernel(std::string, KalmarQueue*); - -extern void PushArg(void *, int, size_t, const void *); -extern void PushArgPtr(void *, int, size_t, const void *); - +void* CreateKernel( + const char*, KalmarQueue*, const void* = nullptr, std::size_t = 0u); } // namespace CLAMP static inline const std::shared_ptr get_cpu_queue() { @@ -681,14 +748,6 @@ struct rw_info rw_info(const size_t count, void* ptr) : data(ptr), count(count), curr(nullptr), master(nullptr), stage(nullptr), devs(), mode(access_type_none), HostPtr(ptr != nullptr), toReleaseDevPointer(true) { -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - /// if array_view is constructed in cpu path kernel - /// allocate memory for it and do nothing - if (CLAMP::in_cpu_kernel() && ptr == nullptr) { - data = kalmar_aligned_alloc(0x1000, count); - return; - } -#endif if (ptr) { mode = access_type_read_write; curr = master = get_cpu_queue(); @@ -705,12 +764,6 @@ struct rw_info rw_info(const std::shared_ptr& Queue, const std::shared_ptr& Stage, const size_t count, access_type mode_) : data(nullptr), count(count), curr(Queue), master(Queue), stage(nullptr), devs(), mode(mode_), HostPtr(false), toReleaseDevPointer(true) { -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (CLAMP::in_cpu_kernel() && data == nullptr) { - data = kalmar_aligned_alloc(0x1000, count); - return; - } -#endif if (mode == access_type_auto) mode = curr->getDev()->get_access(); devs[curr->getDev()] = {curr->getDev()->create(count, this), modified}; @@ -789,10 +842,6 @@ struct rw_info /// @blcok: this call will be blocking or not /// none blocking occurs in serialization stage void sync(std::shared_ptr pQueue, bool modify, bool block = true) { -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (CLAMP::in_cpu_kernel()) - return; -#endif if (!curr) { /// This can only happen if array_view is constructed with size and /// is not accessed before @@ -928,13 +977,6 @@ struct rw_info } ~rw_info() { -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - if (CLAMP::in_cpu_kernel()) { - if (data && !HostPtr) - kalmar_aligned_free(data); - return; - } -#endif /// If this rw_info is constructed by host pointer /// 1. synchronize latest data to host pointer /// 2. 
Because the data pointer cannot be released, erase itself from devs diff --git a/include/kalmar_serialize.h b/include/kalmar_serialize.h index 5cc2d932f99..f5e8a40c248 100644 --- a/include/kalmar_serialize.h +++ b/include/kalmar_serialize.h @@ -66,12 +66,10 @@ class BufferArgumentsAppender : public FunctorBufferWalker BufferArgumentsAppender(std::shared_ptr pQueue, void* k) : pQueue(pQueue), k_(k), current_idx_(0) {} void Append(size_t sz, const void *s) override { - CLAMP::PushArg(k_, current_idx_++, sz, s); } void AppendPtr(size_t sz, const void *s) override { - CLAMP::PushArgPtr(k_, current_idx_++, sz, s); } - void visit_buffer(struct rw_info* rw, bool modify, bool isArray) override { + void visit_buffer(rw_info* rw, bool modify, bool isArray) override { if (isArray) { auto curr = pQueue->getDev()->get_path(); auto path = rw->master->getDev()->get_path(); diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 6e31089a17d..01f19fdad34 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -16,7 +16,6 @@ set( CONFIG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/hcc ) # C++AMP runtime (mcwamp) #################### add_mcwamp_library(mcwamp mcwamp.cpp) -add_mcwamp_library(mcwamp_atomic mcwamp_atomic.cpp) # Library interface to use runtime add_library(hccrt INTERFACE) @@ -79,7 +78,7 @@ add_subdirectory(cpu) #################### # install targets #################### -install(TARGETS mcwamp mcwamp_atomic hccrt hccshared +install(TARGETS mcwamp hccrt hccshared EXPORT hcc-targets RUNTIME DESTINATION bin LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/lib/cpu/mcwamp_cpu.cpp b/lib/cpu/mcwamp_cpu.cpp index 47d1bda60fd..946428a61b0 100644 --- a/lib/cpu/mcwamp_cpu.cpp +++ b/lib/cpu/mcwamp_cpu.cpp @@ -5,17 +5,16 @@ // //===----------------------------------------------------------------------===// +#include +#include + #include #include #include #include +#include #include -#include -#include - -extern "C" void PushArgImpl(void *ker, int idx, size_t sz, const void *v) {} - namespace Kalmar { class CPUFallbackQueue final : public KalmarQueue @@ -24,6 +23,40 @@ class CPUFallbackQueue final : public KalmarQueue CPUFallbackQueue(KalmarDevice* pDev) : KalmarQueue(pDev) {} + void LaunchKernel( + void*, std::size_t, const std::size_t*, const std::size_t*) override + { + throw std::runtime_error{"Unsupported."}; + } + [[noreturn]] + std::shared_ptr LaunchKernelAsync( + void*, + std::size_t, + const std::size_t*, + const std::size_t*) override + { + throw std::runtime_error{"Unsupported."}; + } + void LaunchKernelWithDynamicGroupMemory( + void*, + std::size_t, + const std::size_t*, + const std::size_t*, + std::size_t) override + { + throw std::runtime_error{"Unsupported."}; + } + [[noreturn]] + std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync( + void*, + std::size_t, + const std::size_t*, + const std::size_t*, + std::size_t) override + { + throw std::runtime_error{"Unimplemented."}; + } + void read(void* device, void* dst, size_t count, size_t offset) override { if (dst != device) memmove(dst, (char*)device + offset, count); @@ -71,6 +104,12 @@ class CPUFallbackDevice final : public KalmarDevice std::shared_ptr createQueue(execute_order order = execute_in_order) override { return std::shared_ptr(new CPUFallbackQueue(this)); } + + void* CreateKernel( + const char*, KalmarQueue*, const void* = nullptr, std::size_t = 0u) + { + return nullptr; + } }; template inline void deleter(T* ptr) { delete ptr; } @@ -79,7 +118,7 @@ class CPUContext final : public KalmarContext { public: CPUContext() 
{ Devices.push_back(new CPUFallbackDevice); } - ~CPUContext() { std::for_each(std::begin(Devices), std::end(Devices), deleter); } + ~CPUContext() { for (auto&& x : Devices) deleter(x); } }; diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index 87269ce76af..71ab2060c2a 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -6,18 +6,35 @@ // Kalmar Runtime implementation (HSA version) +#include "kalmar_runtime.h" +#include "kalmar_aligned_alloc.h" + +#include "hc_am_internal.hpp" +#include "unpinned_copy_engine.h" +#include "hc_rt_debug.h" +#include "hc_printf.hpp" + #include "../hc2/headers/types/program_state.hpp" +#include +#include +#include +#include +#include + #include #include #include #include #include #include +#include #include #include +#include #include #include +#include #include #include #include @@ -27,28 +44,11 @@ #include #ifndef USE_LIBCXX -#include + #include #endif -#include -#include -#include -#include -#include - -#include "kalmar_runtime.h" -#include "kalmar_aligned_alloc.h" - -#include "hc_am_internal.hpp" -#include "unpinned_copy_engine.h" -#include "hc_rt_debug.h" -#include "hc_printf.hpp" - -#include -#include - #ifndef KALMAR_DEBUG -#define KALMAR_DEBUG (0) + #define KALMAR_DEBUG (0) #endif #define CHECK_OLDER_COMPLETE 0 @@ -550,11 +550,6 @@ inline static void checkHCCRuntimeStatus(const HCCRuntimeStatus status, const un } // namespace Kalmar - - -extern "C" void PushArgImpl(void *ker, int idx, size_t sz, const void *v); -extern "C" void PushArgPtrImpl(void *ker, int idx, size_t sz, const void *v); - // forward declaration namespace Kalmar { class HSAQueue; @@ -983,87 +978,95 @@ class HSABarrier : public HSAOp { }; // end of HSABarrier class HSADispatch : public HSAOp { -private: - Kalmar::HSADevice* device; + Kalmar::HSADevice* device_{nullptr}; - const char *kernel_name; - const HSAKernel* kernel; + const char* kernel_name_{nullptr}; + const HSAKernel* kernel_{nullptr}; - std::vector arg_vec; - uint32_t arg_count; - size_t prevArgVecCapacity; - void* kernargMemory; - int kernargMemoryIndex; + std::unique_ptr kernargMemory_{ + nullptr, hsa_amd_memory_unlock}; + hsa_kernel_dispatch_packet_t aql_{}; + bool isDispatched_{false}; + hsa_wait_state_t waitMode_{}; - hsa_kernel_dispatch_packet_t aql; - bool isDispatched; - hsa_wait_state_t waitMode; - - - std::shared_future* future; - + std::unique_ptr> future_{}; public: - std::shared_future* getFuture() override { return future; } - - void setKernelName(const char *x_kernel_name) { kernel_name = x_kernel_name;}; - const char *getKernelName() { return kernel_name ? kernel_name : (kernel ? kernel->shortKernelName.c_str() : ""); }; - const char *getLongKernelName() { return (kernel ? kernel->getLongKernelName().c_str() : ""); }; + std::shared_future* getFuture() override { return future_.get(); } + void setKernelName(const char* name) { kernel_name_ = name; } + const char* getKernelName() const + { + return kernel_name_ ? kernel_name_ : + (kernel_ ? kernel_->shortKernelName.c_str() : ""); + } + const char* getLongKernelName() const + { + return kernel_ ? 
+ kernel_->getLongKernelName().c_str() : ""; + } void setWaitMode(Kalmar::hcWaitMode mode) override { switch (mode) { case Kalmar::hcWaitModeBlocked: - waitMode = HSA_WAIT_STATE_BLOCKED; + waitMode_ = HSA_WAIT_STATE_BLOCKED; break; case Kalmar::hcWaitModeActive: - waitMode = HSA_WAIT_STATE_ACTIVE; + waitMode_ = HSA_WAIT_STATE_ACTIVE; break; } } - ~HSADispatch() { - - if (isDispatched) { - hsa_status_t status = HSA_STATUS_SUCCESS; - status = waitComplete(); + if (isDispatched_) { + auto status = waitComplete(); STATUS_CHECK(status, __LINE__); } dispose(); } - HSADispatch(Kalmar::HSADevice* _device, Kalmar::KalmarQueue* _queue, HSAKernel* _kernel, - const hsa_kernel_dispatch_packet_t *aql=nullptr); + HSADispatch( + Kalmar::HSADevice* device, + Kalmar::KalmarQueue* queue, + HSAKernel* kernel, + const hsa_kernel_dispatch_packet_t* aql = nullptr); + HSADispatch( + Kalmar::HSADevice* device, + Kalmar::KalmarQueue* queue, + HSAKernel* kernel, + const void* callable, + std::size_t callable_size, + const hsa_kernel_dispatch_packet_t* aql = nullptr) + : HSADispatch{device, queue, kernel, aql} + { + void* tmp{nullptr}; + auto r = hsa_amd_memory_lock( + const_cast(callable), callable_size, nullptr, 0, &tmp); - hsa_status_t pushFloatArg(float f) { return pushArgPrivate(f); } - hsa_status_t pushIntArg(int i) { return pushArgPrivate(i); } - hsa_status_t pushBooleanArg(unsigned char z) { return pushArgPrivate(z); } - hsa_status_t pushByteArg(char b) { return pushArgPrivate(b); } - hsa_status_t pushLongArg(long j) { return pushArgPrivate(j); } - hsa_status_t pushDoubleArg(double d) { return pushArgPrivate(d); } - hsa_status_t pushShortArg(short s) { return pushArgPrivate(s); } - hsa_status_t pushPointerArg(void *addr) { return pushArgPrivate(addr); } + STATUS_CHECK(r, __LINE__); - hsa_status_t clearArgs() { - arg_count = 0; - arg_vec.clear(); - return HSA_STATUS_SUCCESS; + kernargMemory_.reset(tmp); } - void overrideAcquireFenceIfNeeded(); - hsa_status_t setLaunchConfiguration(const int dims, size_t *globalDims, size_t *localDims, - const int dynamicGroupSize); + hsa_status_t setLaunchConfiguration( + int dims, + const std::size_t* globalDims, + const std::size_t* localDims, + int dynamicGroupSize); hsa_status_t dispatchKernelWaitComplete(); hsa_status_t dispatchKernelAsyncFromOp(); - hsa_status_t dispatchKernelAsync(const void *hostKernarg, int hostKernargSize, bool allocSignal); + hsa_status_t dispatchKernelAsync( + void *hostKernarg, std::size_t hostKernargSize, bool allocSignal); // dispatch a kernel asynchronously - hsa_status_t dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg, - int hostKernargSize, bool allocSignal); + hsa_status_t dispatchKernel( + hsa_queue_t* lockedHsaQueue, + void *hostKernarg, + std::size_t hostKernargSize, + bool allocSignal); // wait for the kernel to finish execution hsa_status_t waitComplete(); @@ -1081,36 +1084,7 @@ class HSADispatch : public HSAOp { uint64_t getEndTimestamp() override; - const hsa_kernel_dispatch_packet_t &getAql() const { return aql; }; - -private: - template - hsa_status_t pushArgPrivate(T val) { - /* add padding if necessary */ - int padding_size = (arg_vec.size() % sizeof(T)) ? 
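On the new constructor above that takes the callable: it pins the host-side copy of the functor with hsa_amd_memory_lock and hands the locked pointer to kernargMemory_, a unique_ptr whose deleter is hsa_amd_memory_unlock, so unpinning happens automatically when the dispatch is destroyed. The general shape of that idiom, with hypothetical stand-ins for the two HSA calls (illustration only):

    #include <cstddef>
    #include <memory>

    // Stand-ins for hsa_amd_memory_lock / hsa_amd_memory_unlock.
    inline void* sketch_lock(void* p, std::size_t) { return p; }
    inline int sketch_unlock(void* p) { (void)p; return 0; }

    // A deleter may return non-void; unique_ptr discards the result.
    using Pinned = std::unique_ptr<void, int (*)(void*)>;

    inline Pinned pin(void* host_ptr, std::size_t size)
    {
        return Pinned{sketch_lock(host_ptr, size), sketch_unlock};
    }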
(sizeof(T) - (arg_vec.size() % sizeof(T))) : 0;
-#if KALMAR_DEBUG && HCC_DEBUG_KARG
-        printf("push %lu bytes into kernarg: ", sizeof(T) + padding_size);
-#endif
-        for (size_t i = 0; i < padding_size; ++i) {
-            arg_vec.push_back((uint8_t)0x00);
-#if KALMAR_DEBUG && HCC_DEBUG_KARG
-            printf("%02X ", (uint8_t)0x00);
-#endif
-        }
-        uint8_t* ptr = static_cast<uint8_t*>(static_cast<void*>(&val));
-        for (size_t i = 0; i < sizeof(T); ++i) {
-            arg_vec.push_back(ptr[i]);
-#if KALMAR_DEBUG && HCC_DEBUG_KARG
-            printf("%02X ", ptr[i]);
-#endif
-        }
-#if KALMAR_DEBUG && HCC_DEBUG_KARG
-        printf("\n");
-#endif
-        arg_count++;
-        return HSA_STATUS_SUCCESS;
-    }
-
+    const hsa_kernel_dispatch_packet_t& getAql() const { return aql_; };
 }; // end of HSADispatch

//-----
@@ -1603,27 +1577,29 @@ class HSAQueue final : public KalmarQueue
         drainingQueue_ = false;
     }

-    void LaunchKernel(void *ker, size_t nr_dim, size_t *global, size_t *local) override {
+    void LaunchKernel(
+        void* ker,
+        size_t nr_dim,
+        const size_t* global,
+        const size_t* local) override
+    {
         LaunchKernelWithDynamicGroupMemory(ker, nr_dim, global, local, 0);
     }

-    void LaunchKernelWithDynamicGroupMemory(void *ker, size_t nr_dim, size_t *global, size_t *local, size_t dynamic_group_size) override {
-        HSADispatch *dispatch =
-            reinterpret_cast<HSADispatch*>(ker);
-        size_t tmp_local[] = {0, 0, 0};
-        if (!local)
-            local = tmp_local;
-        dispatch->setLaunchConfiguration(nr_dim, global, local, dynamic_group_size);
+    void LaunchKernelWithDynamicGroupMemory(
+        void* ker,
+        size_t nr_dim,
+        const size_t* global,
+        const size_t* local,
+        size_t dynamic_group_size) override
+    {
+        std::unique_ptr<HSADispatch> dispatch{static_cast<HSADispatch*>(ker)};
+        dispatch->setLaunchConfiguration(
+            nr_dim, global, local, dynamic_group_size);

         // wait for previous kernel dispatches to be completed
-        std::for_each(std::begin(kernelBufferMap[ker]), std::end(kernelBufferMap[ker]),
-                      [&] (void* buffer) {
-                        waitForDependentAsyncOps(buffer);
-                      });
-
-        waitForStreamDeps(dispatch);
-
-
+        for (auto&& buf : kernelBufferMap[ker]) waitForDependentAsyncOps(buf);
+        waitForStreamDeps(dispatch.get());

         // dispatch the kernel
         // and wait for its completion
@@ -1632,61 +1608,60 @@ class HSAQueue final : public KalmarQueue
         // clear data in kernelBufferMap
         kernelBufferMap[ker].clear();
         kernelBufferMap.erase(ker);
-
-        delete(dispatch);
     }

-    std::shared_ptr<KalmarAsyncOp> LaunchKernelAsync(void *ker, size_t nr_dim, size_t *global, size_t *local) override {
-        return LaunchKernelWithDynamicGroupMemoryAsync(ker, nr_dim, global, local, 0);
+    std::shared_ptr<KalmarAsyncOp> LaunchKernelAsync(
+        void* ker,
+        std::size_t nr_dim,
+        const std::size_t* global,
+        const std::size_t* local) override
+    {
+        return LaunchKernelWithDynamicGroupMemoryAsync(
+            ker, nr_dim, global, local, 0);
     }

-    std::shared_ptr<KalmarAsyncOp> LaunchKernelWithDynamicGroupMemoryAsync(void *ker, size_t nr_dim, size_t *global, size_t *local, size_t dynamic_group_size) override {
-        hsa_status_t status = HSA_STATUS_SUCCESS;
-
+    std::shared_ptr<KalmarAsyncOp> LaunchKernelWithDynamicGroupMemoryAsync(
+        void* ker,
+        size_t nr_dim,
+        const size_t* global,
+        const size_t* local,
+        size_t dynamic_group_size) override
+    {
         HSADispatch *dispatch =
             reinterpret_cast<HSADispatch*>(ker);
-
-
-        bool hasArrayViewBufferDeps = (kernelBufferMap.find(ker) != kernelBufferMap.end());
-
+        bool hasArrayViewBufferDeps =
+            (kernelBufferMap.find(ker) != kernelBufferMap.end());
         if (hasArrayViewBufferDeps) {
-            // wait for previous kernel dispatches to be completed
-            std::for_each(std::begin(kernelBufferMap[ker]), std::end(kernelBufferMap[ker]),
-                          [&] (void* buffer) {
-                            waitForDependentAsyncOps(buffer);
-                          });
+
for (auto&& buffer : kernelBufferMap[ker]) { + waitForDependentAsyncOps(buffer); + } } waitForStreamDeps(dispatch); - // create a shared_ptr instance std::shared_ptr sp_dispatch(dispatch); // associate the kernel dispatch with this queue pushAsyncOp(std::static_pointer_cast (sp_dispatch)); - size_t tmp_local[] = {0, 0, 0}; - if (!local) - local = tmp_local; - dispatch->setLaunchConfiguration(nr_dim, global, local, dynamic_group_size); + dispatch->setLaunchConfiguration( + nr_dim, global, local, dynamic_group_size); // dispatch the kernel - status = dispatch->dispatchKernelAsyncFromOp(); + auto status = dispatch->dispatchKernelAsyncFromOp(); STATUS_CHECK(status, __LINE__); - if (hasArrayViewBufferDeps) { - // associate all buffers used by the kernel with the kernel dispatch instance - std::for_each(std::begin(kernelBufferMap[ker]), std::end(kernelBufferMap[ker]), - [&] (void* buffer) { - bufferKernelMap[buffer].push_back(sp_dispatch); - }); - - // clear data in kernelBufferMap - kernelBufferMap[ker].clear(); - kernelBufferMap.erase(ker); + // associate all buffers used by the kernel with the kernel dispatch + // instance + for (auto&& buffer : kernelBufferMap[ker]) { + bufferKernelMap[buffer].emplace_back(sp_dispatch); + } + + // clear data in kernelBufferMap + kernelBufferMap.erase(ker); } return sp_dispatch; @@ -1957,8 +1932,6 @@ class HSAQueue final : public KalmarQueue } void Push(void *kernel, int idx, void *device, bool modify) override { - PushArgImpl(kernel, idx, sizeof(void*), &device); - // register the buffer with the kernel // when the buffer may be read/written by the kernel // the buffer is not registered if it's only read by the kernel @@ -1992,9 +1965,12 @@ class HSAQueue final : public KalmarQueue return true; } - void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, - const void * args, size_t argsize, - hc::completion_future *cf, const char *kernelName) override ; + void dispatch_hsa_kernel( + const hsa_kernel_dispatch_packet_t* aql, + void* args, + size_t argsize, + hc::completion_future* cf, + const char* kernelName) override; bool set_cu_mask(const std::vector& cu_mask) override { // get device's total compute unit count @@ -2762,7 +2738,11 @@ class HSADevice final : public KalmarDevice return isCompatible; } - void* CreateKernel(const char* fun, Kalmar::KalmarQueue *queue) override { + void* CreateKernel( + const char* fun, + Kalmar::KalmarQueue *queue, + const void* callable, + std::size_t callable_size) override { // try load kernels lazily in case it was not done so at bootstrap // due to HCC_LAZYINIT env var if (executables.size() == 0) { @@ -2786,64 +2766,14 @@ class HSADevice final : public KalmarDevice #endif shortName = demangleStatus ? 
fun : std::string(demangled); try { - if (demangleStatus == 0) { - - if (kernelNameFormat == 2) { - shortName = demangled; - } else { - // kernelNameFormat == 0 or unspecified: - - // Example: HIP_kernel_functor_name_begin_unnamed_HIP_kernel_functor_name_end_5::__cxxamp_trampoline(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, float*, long long)" - - std::string hip_begin_str ("::HIP_kernel_functor_name_begin_"); - std::string hip_end_str ("_HIP_kernel_functor_name_end"); - int hip_begin = shortName.find(hip_begin_str); - int hip_end = shortName.find(hip_end_str); - - if ((hip_begin != -1) && (hip_end != -1) && (hip_end > hip_begin)) { - // HIP kernel with markers - int start_pos = hip_begin + hip_begin_str.length(); - std::string hipname = shortName.substr(start_pos, hip_end - start_pos) ; - DBOUTL(DB_CODE, "hipname=" << hipname); - if (hipname == "unnamed") { - shortName = shortName.substr(0, hip_begin); - } else { - shortName = hipname; - } - - } else { - // PFE not from HIP: - - // strip off hip launch template wrapper: - std::string hipImplString ("void hip_impl::grid_launch_hip_impl_<"); - int begin = shortName.find(hipImplString); - if ((begin != std::string::npos)) { - begin += hipImplString.length() ; - } else { - begin = 0; - } - - shortName = shortName.substr(begin); - - // Strip off any leading return type: - begin = shortName.find(" ", 0); - if (begin == std::string::npos) { - begin = 0; - } else { - begin +=1; // skip the space - } - shortName = shortName.substr(begin); - - DBOUTL(DB_CODE, "shortKernel processing demangled non-hip. beginChar=" << begin << " shortName=" << shortName); - } - - } + if (demangleStatus == 0 && kernelNameFormat == 2) { + shortName = demangled; + } - if (HCC_DB_SYMBOL_FORMAT & 0x10) { - // trim everything after first ( - int begin = shortName.find("("); - shortName = shortName.substr(0, begin); - } + if (HCC_DB_SYMBOL_FORMAT & 0x10) { + // trim everything after first ( + int begin = shortName.find("("); + shortName = shortName.substr(0, begin); } } catch (std::out_of_range& exception) { // Do something sensible if string pattern is not what we expect @@ -2855,13 +2785,16 @@ class HSADevice final : public KalmarDevice DBOUT (DB_CODE, "CreateKernel_raw= " << fun << "\n"); if (executables.size() != 0) { - for (auto executable_iterator : executables) { + for (auto&& executable_iterator : executables) { HSAExecutable *executable = executable_iterator.second; // Get symbol handle. - hsa_status_t status; hsa_executable_symbol_t kernelSymbol; - status = hsa_executable_get_symbol_by_name(executable->hsaExecutable, fun, const_cast(&agent), &kernelSymbol); + auto status = hsa_executable_get_symbol_by_name( + executable->hsaExecutable, + fun, + const_cast(&agent), + &kernelSymbol); if (status == HSA_STATUS_SUCCESS) { // Get code handle. 
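For reference, shortName above is produced by the Itanium ABI demangler; the canonical call shape (a sketch of typical usage, not the patch's exact code) is:

    #include <cxxabi.h>

    #include <cstdlib>
    #include <string>

    // Status 0 means success; the returned buffer is malloc'd, so free it.
    inline std::string sketch_demangle(const char* mangled)
    {
        int status = 0;
        char* s = abi::__cxa_demangle(mangled, nullptr, nullptr, &status);
        std::string r = (status == 0 && s) ? s : mangled;
        std::free(s);
        return r;
    }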
uint64_t kernelCodeHandle; @@ -2915,8 +2848,7 @@ class HSADevice final : public KalmarDevice // HSAQueue::LaunchKernel() // or it will be created as a shared_ptr in: // HSAQueue::LaunchKernelAsync() - HSADispatch *dispatch = new HSADispatch(this, queue, kernel); - return dispatch; + return new HSADispatch{this, queue, kernel, callable, callable_size}; } std::shared_ptr createQueue(execute_order order = execute_in_order) override { @@ -2936,7 +2868,7 @@ class HSADevice final : public KalmarDevice std::vector< std::shared_ptr > get_all_queues() override { std::vector< std::shared_ptr > result; queues_mutex.lock(); - for (auto queue : queues) { + for (auto&& queue : queues) { if (!queue.expired()) { result.push_back(queue.lock()); } @@ -3048,137 +2980,6 @@ class HSADevice final : public KalmarDevice } } - void growKernargBuffer() - { - uint8_t * kernargMemory = nullptr; - // increase kernarg pool on demand by KERNARG_POOL_SIZE - hsa_amd_memory_pool_t kernarg_region = getHSAKernargRegion(); - - hsa_status_t status = hsa_amd_memory_pool_allocate(kernarg_region, KERNARG_POOL_SIZE * KERNARG_BUFFER_SIZE, 0, (void**)(&kernargMemory)); - STATUS_CHECK(status, __LINE__); - - status = hsa_amd_agents_allow_access(1, &agent, NULL, kernargMemory); - STATUS_CHECK(status, __LINE__); - - for (size_t i = 0; i < KERNARG_POOL_SIZE * KERNARG_BUFFER_SIZE; i+=KERNARG_BUFFER_SIZE) { - kernargPool.push_back(kernargMemory+i); - kernargPoolFlag.push_back(false); - }; - } - - std::pair getKernargBuffer(int size) { - void* ret = nullptr; - int cursor = 0; - - // find an available buffer in the pool in case - // - kernarg pool is available - // - requested size is smaller than KERNARG_BUFFER_SIZE - if ( (KERNARG_POOL_SIZE > 0) && (size <= KERNARG_BUFFER_SIZE) ) { - kernargPoolMutex.lock(); - cursor = kernargCursor; - - if (kernargPoolFlag[cursor] == false) { - // the cursor is valid, use it - ret = kernargPool[cursor]; - - // set the kernarg buffer as used - kernargPoolFlag[cursor] = true; - - // simply move the cursor to the next index - ++kernargCursor; - if (kernargCursor == kernargPool.size()) kernargCursor = 0; - } else { - // the cursor is not valid, sequentially find the next available slot - bool found = false; - - int startingCursor = cursor; - do { - ++cursor; - if (cursor == kernargPool.size()) cursor = 0; - - if (kernargPoolFlag[cursor] == false) { - // the cursor is valid, use it - ret = kernargPool[cursor]; - - // set the kernarg buffer as used - kernargPoolFlag[cursor] = true; - - // simply move the cursor to the next index - kernargCursor = cursor + 1; - if (kernargCursor == kernargPool.size()) kernargCursor = 0; - - // break from the loop - found = true; - break; - } - } while(cursor != startingCursor); // ensure we at most scan the vector once - - if (found == false) { - hsa_status_t status = HSA_STATUS_SUCCESS; - - // increase kernarg pool on demand by KERNARG_POOL_SIZE - hsa_amd_memory_pool_t kernarg_region = getHSAKernargRegion(); - - // keep track of the size of kernarg pool before increasing it - int oldKernargPoolSize = kernargPool.size(); - int oldKernargPoolFlagSize = kernargPoolFlag.size(); - assert(oldKernargPoolSize == oldKernargPoolFlagSize); - - - growKernargBuffer(); - assert(kernargPool.size() == oldKernargPoolSize + KERNARG_POOL_SIZE); - assert(kernargPoolFlag.size() == oldKernargPoolFlagSize + KERNARG_POOL_SIZE); - - // set return values, after the pool has been increased - - // use the first item in the newly allocated pool - cursor = oldKernargPoolSize; - - // access the new item 
through the newly assigned cursor - ret = kernargPool[cursor]; - - // mark the item as used - kernargPoolFlag[cursor] = true; - - // simply move the cursor to the next index - kernargCursor = cursor + 1; - if (kernargCursor == kernargPool.size()) kernargCursor = 0; - - found = true; - } - - } - - kernargPoolMutex.unlock(); - memset (ret, 0x00, KERNARG_BUFFER_SIZE); - } else { - // allocate new buffers in case: - // - the kernarg pool is set at compile-time - // - requested kernarg buffer size is larger than KERNARG_BUFFER_SIZE - // - - hsa_status_t status = HSA_STATUS_SUCCESS; - hsa_amd_memory_pool_t kernarg_region = getHSAKernargRegion(); - - status = hsa_amd_memory_pool_allocate(kernarg_region, size, 0, &ret); - STATUS_CHECK(status, __LINE__); - - status = hsa_amd_agents_allow_access(1, &agent, NULL, ret); - STATUS_CHECK(status, __LINE__); - - DBOUTL(DB_RESOURCE, "Allocating non-pool kernarg buffer size=" << size ); - - // set cursor value as -1 to notice the buffer would be deallocated - // instead of recycled back into the pool - cursor = -1; - memset (ret, 0x00, size); - } - - - - return std::make_pair(ret, cursor); - } - void* getSymbolAddress(const char* symbolName) override { hsa_status_t status; @@ -3898,13 +3699,6 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) : KalmarD } useCoarseGrainedRegion = result; - /// pre-allocate a pool of kernarg buffers in case: - /// - kernarg region is available - /// - compile-time macro KERNARG_POOL_SIZE is larger than 0 -#if KERNARG_POOL_SIZE > 0 - growKernargBuffer(); -#endif - // Setup AM pool. ri._am_memory_pool = (ri._found_local_memory_pool) ? ri._local_memory_pool @@ -4254,9 +4048,12 @@ std::shared_ptr HSAQueue::EnqueueAsyncCopy(const void *src, void void -HSAQueue::dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, - const void * args, size_t argSize, - hc::completion_future *cf, const char *kernelName) override +HSAQueue::dispatch_hsa_kernel( + const hsa_kernel_dispatch_packet_t *aql, + void * args, + size_t argSize, + hc::completion_future* cf, + const char *kernelName) { uint16_t dims = (aql->setup >> HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS) & ((1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS) - 1); @@ -4310,25 +4107,21 @@ HSAQueue::dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, // member function implementation of HSADispatch // ---------------------------------------------------------------------- -HSADispatch::HSADispatch(Kalmar::HSADevice* _device, Kalmar::KalmarQueue *queue, HSAKernel* _kernel, - const hsa_kernel_dispatch_packet_t *aql) : - HSAOp(queue, Kalmar::hcCommandKernel), - device(_device), - kernel_name(nullptr), - kernel(_kernel), - isDispatched(false), - waitMode(HSA_WAIT_STATE_BLOCKED), - future(nullptr), - kernargMemory(nullptr) -{ - if (aql) { - this->aql = *aql; - } - clearArgs(); -} - - - +HSADispatch::HSADispatch( + Kalmar::HSADevice* device, + Kalmar::KalmarQueue *queue, + HSAKernel* kernel, + const hsa_kernel_dispatch_packet_t *aql) : + HSAOp{queue, Kalmar::hcCommandKernel}, + device_{device}, + kernel_name_{nullptr}, + kernel_{kernel}, + aql_{aql ? 
*aql : hsa_kernel_dispatch_packet_t{}},
+    isDispatched_{false},
+    waitMode_{HSA_WAIT_STATE_BLOCKED},
+    future_{},
+    kernargMemory_{nullptr, hsa_amd_memory_unlock}
+{}

 static std::ostream& PrintHeader(std::ostream& os, uint16_t h)
 {
@@ -4430,13 +4223,15 @@ static void printKernarg(const void *kernarg_address, int bytesToPrint)

 // dispatch a kernel asynchronously
-// - allocates signal, copies arguments into kernarg buffer, and places aql packet into queue.
-hsa_status_t
-HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg,
-                            int hostKernargSize, bool allocSignal) {
-
+// - allocates signal and places aql packet into queue.
+hsa_status_t HSADispatch::dispatchKernel(
+    hsa_queue_t* lockedHsaQueue,
+    void *hostKernarg,
+    std::size_t hostKernargSize,
+    bool allocSignal)
+{
     hsa_status_t status = HSA_STATUS_SUCCESS;
-    if (isDispatched) {
+    if (isDispatched_) {
         return HSA_STATUS_ERROR_INVALID_ARGUMENT;
     }
@@ -4446,7 +4241,7 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
      */
     // set dispatch fences
     // The fence bits must be set on entry into this function.
-    uint16_t header = aql.header;
+    uint16_t header = aql_.header;
     if (hsaQueue()->get_execute_order() == Kalmar::execute_in_order) {
         //std::cout << "barrier bit on\n";
         // set AQL header with barrier bit on if execute in order
@@ -4458,25 +4253,9 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
         header |= (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE);
     }

+    aql_.kernarg_address = kernargMemory_.get();
-    // bind kernel arguments
-    //printf("hostKernargSize size: %d in bytesn", hostKernargSize);
-
-    if (hostKernargSize > 0) {
-        hsa_amd_memory_pool_t kernarg_region = device->getHSAKernargRegion();
-        std::pair<void*, int> ret = device->getKernargBuffer(hostKernargSize);
-        kernargMemory = ret.first;
-        kernargMemoryIndex = ret.second;
-        //std::cerr << "op #" << getSeqNum() << " allocated kernarg cursor=" << kernargMemoryIndex << "\n";
-
-        // as kernarg buffers are fine-grained, we can directly use memcpy
-        memcpy(kernargMemory, hostKernarg, hostKernargSize);
-
-        aql.kernarg_address = kernargMemory;
-    } else {
-        aql.kernarg_address = nullptr;
-    }
-
+
     // write packet
     uint32_t queueMask = lockedHsaQueue->size - 1;
@@ -4492,7 +4271,7 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
         &(((hsa_kernel_dispatch_packet_t*)(lockedHsaQueue->base_address))[index & queueMask]);

     // Copy mostly-finished AQL packet into the queue
-    *q_aql = aql;
+    *q_aql = aql_;

     // Set some specific fields:
     if (allocSignal) {
@@ -4523,7 +4302,7 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
     // Ring door bell
     hsa_signal_store_relaxed(lockedHsaQueue->doorbell_signal, index);

-    isDispatched = true;
+    isDispatched_ = true;

    return status;
}
@@ -4533,18 +4312,15 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg

 // wait for the kernel to finish execution
 inline hsa_status_t
 HSADispatch::waitComplete() {
-    hsa_status_t status = HSA_STATUS_SUCCESS;
-    if (!isDispatched) {
+    if (!isDispatched_) {
         return HSA_STATUS_ERROR_INVALID_ARGUMENT;
     }
-
-    if (_signal.handle) {
-        DBOUT(DB_MISC, "wait for kernel dispatch op#" << *this << " completion with wait flag: " << waitMode << " signal="<< std::hex << _signal.handle << std::dec << "\n");
+    DBOUT(DB_MISC, "wait for kernel dispatch op#" << *this << " completion with wait flag: " << waitMode_ << " signal="<< std::hex << _signal.handle << std::dec << "\n");

     // wait for completion
-    if (hsa_signal_wait_scacquire(_signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), waitMode)!=0) {
+    if (hsa_signal_wait_scacquire(_signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), waitMode_) != 0) {
         throw Kalmar::runtime_exception("Signal wait returned unexpected value\n", 0);
     }
@@ -4563,35 +4339,31 @@ HSADispatch::waitComplete() {
         this->hsaQueue()->removeAsyncOp(this);
     }

-    isDispatched = false;
-    return status;
+    isDispatched_ = false;
+    return HSA_STATUS_SUCCESS;
 }

-inline hsa_status_t
-HSADispatch::dispatchKernelWaitComplete() {
-    hsa_status_t status = HSA_STATUS_SUCCESS;
-
-    if (isDispatched) {
+hsa_status_t HSADispatch::dispatchKernelWaitComplete() {
+    if (isDispatched_) {
         return HSA_STATUS_ERROR_INVALID_ARGUMENT;
     }

     // WaitComplete dispatches need to ensure all data is released to system scope
     // This ensures the op is truly "complete" before continuing.
     // This WaitComplete path is used for AMP-style dispatches and may merit future review & optimization.
-    aql.header =
-        ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
-        ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+    aql_.header =
+        ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
+        ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);

-    {
-        // extract hsa_queue_t from HSAQueue
-        hsa_queue_t* rocrQueue = hsaQueue()->acquireLockedRocrQueue();
+    // extract hsa_queue_t from HSAQueue
+    hsa_queue_t* rocrQueue = hsaQueue()->acquireLockedRocrQueue();

-        // dispatch kernel
-        status = dispatchKernel(rocrQueue, arg_vec.data(), arg_vec.size(), true);
-        STATUS_CHECK(status, __LINE__);
+    // dispatch kernel
+    // TODO: sizeof(kernargMemory_.get()) is the size of a pointer, not of the
+    //       locked kernarg block; the callable's size must be plumbed through.
+    auto status = dispatchKernel(
+        rocrQueue, kernargMemory_.get(), sizeof(kernargMemory_.get()), true);
+    STATUS_CHECK(status, __LINE__);

-        hsaQueue()->releaseLockedRocrQueue();
-    }
+    hsaQueue()->releaseLockedRocrQueue();

     // wait for completion
     status = waitComplete();
@@ -4603,16 +4375,17 @@ HSADispatch::dispatchKernelWaitComplete() {

 // Flavor used when launching dispatch with args and signal created by HCC
 // (As opposed to the dispatch_hsa_kernel path)
-inline hsa_status_t
-HSADispatch::dispatchKernelAsyncFromOp()
+hsa_status_t HSADispatch::dispatchKernelAsyncFromOp()
 {
-    return dispatchKernelAsync(arg_vec.data(), arg_vec.size(), true);
+    // TODO: as above, sizeof(kernargMemory_.get()) is the pointer's size, not
+    //       the kernarg segment's; this needs the real callable size.
+    return dispatchKernelAsync(
+        kernargMemory_.get(), sizeof(kernargMemory_.get()), true);
 }

-inline hsa_status_t
-HSADispatch::dispatchKernelAsync(const void *hostKernarg, int hostKernargSize, bool allocSignal) {
-
-
+hsa_status_t HSADispatch::dispatchKernelAsync(
+    void *hostKernarg,
+    std::size_t hostKernargSize,
+    bool allocSignal)
+{
     if (HCC_SERIALIZE_KERNEL & 0x1) {
         hsaQueue()->wait();
     }
@@ -4637,9 +4410,8 @@ HSADispatch::dispatchKernelAsync(const void *hostKernarg, int hostKernargSize, b

     // dynamically allocate a std::shared_future object
-    future = new std::shared_future<void>(std::async(std::launch::deferred, [&] {
-        waitComplete();
-    }).share());
+    future_.reset(new std::shared_future<void>{
+        std::async(std::launch::deferred, [&] { waitComplete(); }).share()});

     if (HCC_SERIALIZE_KERNEL & 0x2) {
         status = waitComplete();
@@ -4652,16 +4424,6 @@ HSADispatch::dispatchKernelAsync(const void *hostKernarg, int hostKernargSize, b

 inline void
 HSADispatch::dispose() {
-    hsa_status_t status;
-    if (kernargMemory != nullptr) {
-        //std::cerr << "op#" << getSeqNum() << " releasing kernal arg buffer index=" << kernargMemoryIndex<< "\n";
-
device->releaseKernargBuffer(kernargMemory, kernargMemoryIndex); - kernargMemory = nullptr; - } - - clearArgs(); - std::vector().swap(arg_vec); - if (HCC_PROFILE & HCC_PROFILE_TRACE) { uint64_t start = getBeginTimestamp(); uint64_t end = getEndTimestamp(); @@ -4670,11 +4432,6 @@ HSADispatch::dispose() { LOG_PROFILE(this, start, end, "kernel", getKernelName(), ""); } Kalmar::ctx.releaseSignal(_signal, _signalIndex); - - if (future != nullptr) { - delete future; - future = nullptr; - } } inline uint64_t @@ -4696,14 +4453,18 @@ void HSADispatch::overrideAcquireFenceIfNeeded() if (hsaQueue()->nextKernelNeedsSysAcquire()) { DBOUT( DB_CMD2, " kernel AQL packet adding system-scope acquire\n"); // Pick up system acquire if needed. - aql.header |= ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) ; + aql_.header |= ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) ; hsaQueue()->setNextKernelNeedsSysAcquire(false); } } -inline hsa_status_t -HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t *localDims, - const int dynamicGroupSize) { +inline +hsa_status_t HSADispatch::setLaunchConfiguration( + int dims, + const size_t* globalDims, + const size_t* localDims, + int dynamicGroupSize) +{ assert((0 < dims) && (dims <= 3)); #if KALMAR_DEBUG && HCC_DEBUG_KARG @@ -4712,7 +4473,7 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t * #endif // Set group dims // for each workgroup dimension, make sure it does not exceed the maximum allowable limit - const uint16_t* workgroup_max_dim = device->getWorkgroupMaxDim(); + const uint16_t* workgroup_max_dim = device_->getWorkgroupMaxDim(); unsigned int workgroup_size[3] = { 1, 1, 1}; @@ -4723,12 +4484,12 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t * // throw an error if (localDims[i] > workgroup_max_dim[i]) { std::stringstream msg; - msg << "The extent of the tile (" << localDims[i] + msg << "The extent of the tile (" << localDims[i] << ") exceeds the device limit (" << workgroup_max_dim[i] << ")."; throw Kalmar::runtime_exception(msg.str().c_str(), -1); } else if (localDims[i] > globalDims[i]) { std::stringstream msg; - msg << "The extent of the tile (" << localDims[i] + msg << "The extent of the tile (" << localDims[i] << ") exceeds the compute grid extent (" << globalDims[i] << ")."; throw Kalmar::runtime_exception(msg.str().c_str(), -1); } @@ -4749,7 +4510,7 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t * for (unsigned int i = 1; ; i<<=1) { if (i == recommended_flat_workgroup_size || i >= globalDims[0]) { - workgroup_size[0] = + workgroup_size[0] = std::min(i, static_cast(globalDims[0])); break; } @@ -4764,7 +4525,7 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t * for (unsigned int i = 1; ; i<<=1) { if (i == recommended_flat_workgroup_size || i >= globalDims[0]) { - workgroup_size[0] = + workgroup_size[0] = std::min(i, static_cast(globalDims[0])); break; } @@ -4779,26 +4540,26 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t * } else if (flat_group_size == recommended_flat_workgroup_size || j >= globalDims[1]) { - workgroup_size[1] = + workgroup_size[1] = std::min(j, static_cast(globalDims[1])); break; } } // compute the group size for the 3rd dimension - workgroup_size[2] = recommended_flat_workgroup_size / + workgroup_size[2] = recommended_flat_workgroup_size / (workgroup_size[0] * workgroup_size[1]); } } - auto kernel = 
this->kernel; + auto kernel = this->kernel_; auto calculate_kernel_max_flat_workgroup_size = [&] { constexpr unsigned int max_num_vgprs_per_work_item = 256; constexpr unsigned int num_work_items_per_simd = 64; constexpr unsigned int num_simds_per_cu = 4; const unsigned int workitem_vgpr_count = std::max((unsigned int)kernel->workitem_vgpr_count, 1u); - unsigned int max_flat_group_size = (max_num_vgprs_per_work_item / workitem_vgpr_count) + unsigned int max_flat_group_size = (max_num_vgprs_per_work_item / workitem_vgpr_count) * num_work_items_per_simd * num_simds_per_cu; return max_flat_group_size; }; @@ -4808,7 +4569,7 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t * const unsigned int max_num_work_items_per_cu = calculate_kernel_max_flat_workgroup_size(); if (actual_flat_group_size > max_num_work_items_per_cu) { std::stringstream msg; - msg << "The number of work items (" << actual_flat_group_size + msg << "The number of work items (" << actual_flat_group_size << ") per work group exceeds the limit (" << max_num_work_items_per_cu << ") of kernel " << kernel->kernelName << " ."; throw Kalmar::runtime_exception(msg.str().c_str(), -1); @@ -4816,34 +4577,34 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t * }; validate_kernel_flat_group_size(); - memset(&aql, 0, sizeof(aql)); + aql_ = {}; // Copy info from kernel into AQL packet: // bind kernel code - aql.kernel_object = kernel->kernelCodeHandle; + aql_.kernel_object = kernel->kernelCodeHandle; - aql.group_segment_size = kernel->static_group_segment_size + dynamicGroupSize; - aql.private_segment_size = kernel->private_segment_size; + aql_.group_segment_size = kernel->static_group_segment_size + dynamicGroupSize; + aql_.private_segment_size = kernel->private_segment_size; // Set global dims: - aql.grid_size_x = globalDims[0]; - aql.grid_size_y = (dims > 1 ) ? globalDims[1] : 1; - aql.grid_size_z = (dims > 2 ) ? globalDims[2] : 1; + aql_.grid_size_x = globalDims[0]; + aql_.grid_size_y = (dims > 1 ) ? globalDims[1] : 1; + aql_.grid_size_z = (dims > 2 ) ? 
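A worked example for calculate_kernel_max_flat_workgroup_size above, using its constants (a budget of 256 VGPRs per lane, 64 work-items per SIMD, 4 SIMDs per CU): a kernel using 64 VGPRs per work-item is capped at (256 / 64) * 64 * 4 = 1024 work-items per workgroup. A compile-time restatement of the same bound (hypothetical free function, illustration only):

    constexpr unsigned int sketch_max_flat_workgroup_size(unsigned int vgprs)
    {
        // Clamp to at least 1 VGPR, as the lambda does with std::max(..., 1u).
        return (256u / (vgprs ? vgprs : 1u)) * 64u * 4u;
    }

    static_assert(sketch_max_flat_workgroup_size(64) == 1024, "64 VGPRs -> 1024");
    static_assert(sketch_max_flat_workgroup_size(256) == 256, "256 VGPRs -> 256");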
globalDims[2] : 1; - aql.workgroup_size_x = workgroup_size[0]; - aql.workgroup_size_y = workgroup_size[1]; - aql.workgroup_size_z = workgroup_size[2]; + aql_.workgroup_size_x = workgroup_size[0]; + aql_.workgroup_size_y = workgroup_size[1]; + aql_.workgroup_size_z = workgroup_size[2]; - aql.setup = dims << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + aql_.setup = dims << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - aql.header = 0; + aql_.header = 0; if (HCC_OPT_FLUSH) { - aql.header = ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + aql_.header = ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | + ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); overrideAcquireFenceIfNeeded(); } else { - aql.header = ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + aql_.header = ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | + ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); } return HSA_STATUS_SUCCESS; @@ -5626,39 +5387,6 @@ extern "C" void *GetContextImpl() { return &Kalmar::ctx; } -extern "C" void PushArgImpl(void *ker, int idx, size_t sz, const void *v) { - //std::cerr << "pushing:" << ker << " of size " << sz << "\n"; - HSADispatch *dispatch = - reinterpret_cast(ker); - void *val = const_cast(v); - switch (sz) { - case sizeof(double): - dispatch->pushDoubleArg(*reinterpret_cast(val)); - break; - case sizeof(short): - dispatch->pushShortArg(*reinterpret_cast(val)); - break; - case sizeof(int): - dispatch->pushIntArg(*reinterpret_cast(val)); - //std::cerr << "(int) value = " << *reinterpret_cast(val) <<"\n"; - break; - case sizeof(unsigned char): - dispatch->pushBooleanArg(*reinterpret_cast(val)); - break; - default: - assert(0 && "Unsupported kernel argument size"); - } -} - -extern "C" void PushArgPtrImpl(void *ker, int idx, size_t sz, const void *v) { - //std::cerr << "pushing:" << ker << " of size " << sz << "\n"; - HSADispatch *dispatch = - reinterpret_cast(ker); - void *val = const_cast(v); - dispatch->pushPointerArg(val); -} - - // op printer std::ostream& operator<<(std::ostream& os, const HSAOp & op) { diff --git a/lib/mcwamp.cpp b/lib/mcwamp.cpp index f71b255cdaf..56c4844db9b 100644 --- a/lib/mcwamp.cpp +++ b/lib/mcwamp.cpp @@ -37,8 +37,6 @@ struct RuntimeImpl { RuntimeImpl(const char* libraryName) : m_ImplName(libraryName), m_RuntimeHandle(nullptr), - m_PushArgImpl(nullptr), - m_PushArgPtrImpl(nullptr), m_GetContextImpl(nullptr), isCPU(false) { //std::cout << "dlopen(" << libraryName << ")\n"; @@ -58,8 +56,6 @@ struct RuntimeImpl { // load symbols from C++AMP runtime implementation void LoadSymbols() { - m_PushArgImpl = (PushArgImpl_t) dlsym(m_RuntimeHandle, "PushArgImpl"); - m_PushArgPtrImpl = (PushArgPtrImpl_t) dlsym(m_RuntimeHandle, "PushArgPtrImpl"); m_GetContextImpl= (GetContextImpl_t) dlsym(m_RuntimeHandle, "GetContextImpl"); } @@ -68,8 +64,6 @@ struct RuntimeImpl { std::string m_ImplName; void* m_RuntimeHandle; - PushArgImpl_t m_PushArgImpl; - PushArgPtrImpl_t m_PushArgPtrImpl; GetContextImpl_t m_GetContextImpl; bool isCPU; }; @@ -360,18 +354,15 @@ void LoadInMemoryProgram(KalmarQueue* pQueue) { } // used in parallel_for_each.h -void *CreateKernel(std::string s, KalmarQueue* pQueue) { +void* CreateKernel( + const char* name, + KalmarQueue* pQueue, + const void* callable, + std::size_t callable_size) +{ // TODO - 
should create a HSAQueue:: CreateKernel member function that creates and returns a dispatch. - return pQueue->getDev()->CreateKernel(s.c_str(), pQueue); -} - -void PushArg(void *k_, int idx, size_t sz, const void *s) { - GetOrInitRuntime()->m_PushArgImpl(k_, idx, sz, s); + return pQueue->getDev()->CreateKernel(name, pQueue, callable, callable_size); } -void PushArgPtr(void *k_, int idx, size_t sz, const void *s) { - GetOrInitRuntime()->m_PushArgPtrImpl(k_, idx, sz, s); -} - } // namespace CLAMP KalmarContext *getContext() { diff --git a/lib/mcwamp_atomic.cpp b/lib/mcwamp_atomic.cpp deleted file mode 100644 index 01b964b2515..00000000000 --- a/lib/mcwamp_atomic.cpp +++ /dev/null @@ -1,183 +0,0 @@ -#include -#include - -// FIXME : need to consider how to let hc namespace could also use functions here -namespace Concurrency { - -std::mutex afx_u, afx_i, afx_f; -unsigned int atomic_exchange_unsigned(unsigned int *x, unsigned int y) { - std::lock_guard guard(afx_u); - unsigned int old = *x; - *x = y; - return old; -} -int atomic_exchange_int(int *x, int y) { - std::lock_guard guard(afx_i); - int old = *x; - *x = y; - return old; -} -float atomic_exchange_float(float* x, float y) { - std::lock_guard guard(afx_f); - int old = *x; - *x = y; - return old; -} - -std::mutex afcas_u, afcas_i; -unsigned int atomic_compare_exchange_unsigned(unsigned int *x, unsigned int y, unsigned int z) { - std::lock_guard guard(afcas_u); - unsigned int old = *x; - if (*x == y) { - *x = z; - } - return old; -} -int atomic_compare_exchange_int(int *x, int y, int z) { - std::lock_guard guard(afcas_i); - int old = *x; - if (*x == y) { - *x = z; - } - return old; -} - -std::mutex afa_u, afa_i, afa_f; -unsigned int atomic_add_unsigned(unsigned int *x, unsigned int y) { - std::lock_guard guard(afa_u); - unsigned int old = *x; - *x += y; - return old; -} -int atomic_add_int(int *x, int y) { - std::lock_guard guard(afa_i); - int old = *x; - *x += y; - return old; -} -float atomic_add_float(float* x, float y) { - std::lock_guard guard(afa_f); - float old = *x; - *x += y; - return old; -} - -std::mutex afs_u, afs_i, afs_f; -unsigned int atomic_sub_unsigned(unsigned int *x, unsigned int y) { - std::lock_guard guard(afa_u); - unsigned int old = *x; - *x -= y; - return old; -} -int atomic_sub_int(int *x, int y) { - std::lock_guard guard(afa_i); - int old = *x; - *x -= y; - return old; -} -float atomic_sub_float(float* x, float y) { - std::lock_guard guard(afa_f); - float old = *x; - *x -= y; - return old; -} - -std::mutex afand_u, afand_i; -unsigned int atomic_and_unsigned(unsigned int *x, unsigned int y) { - std::lock_guard guard(afand_u); - unsigned int old = *x; - *x &= y; - return old; -} -int atomic_and_int(int *x, int y) { - std::lock_guard guard(afand_i); - int old = *x; - *x &= y; - return old; -} - -std::mutex afor_u, afor_i; -unsigned int atomic_or_unsigned(unsigned int *x, unsigned int y) { - std::lock_guard guard(afor_u); - unsigned int old = *x; - *x |= y; - return old; -} -int atomic_or_int(int *x, int y) { - std::lock_guard guard(afor_i); - int old = *x; - *x |= y; - return old; -} - -std::mutex afxor_u, afxor_i; -unsigned int atomic_xor_unsigned(unsigned int *x, unsigned int y) { - std::lock_guard guard(afxor_u); - unsigned int old = *x; - *x ^= y; - return old; -} -int atomic_xor_int(int *x, int y) { - std::lock_guard guard(afxor_i); - int old = *x; - *x ^= y; - return old; -} - -std::mutex afmax_u, afmax_i; -unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val) { - std::lock_guard guard(afmax_u); - 
unsigned int old = *p; - *p = std::max(*p, val); - return old; -} -int atomic_max_int(int *p, int val) { - std::lock_guard guard(afmax_i); - int old = *p; - *p = std::max(*p, val); - return old; -} - -std::mutex afmin_u, afmin_i; -unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val) { - std::lock_guard guard(afmin_u); - unsigned int old = *p; - *p = std::min(*p, val); - return old; -} -int atomic_min_int(int *p, int val) { - std::lock_guard guard(afmin_i); - int old = *p; - *p = std::min(*p, val); - return old; -} - -std::mutex afi_u, afi_i; -unsigned int atomic_inc_unsigned(unsigned int *p) { - std::lock_guard guard(afi_u); - unsigned int old = *p; - *p += 1; - return old; -} -int atomic_inc_int(int *p) { - std::lock_guard guard(afi_i); - int old = *p; - *p += 1; - return old; -} - -std::mutex afd_u, afd_i; -unsigned int atomic_dec_unsigned(unsigned int *p) { - std::lock_guard guard(afd_u); - unsigned int old = *p; - *p -= 1; - return old; -} -int atomic_dec_int(int *p) { - std::lock_guard guard(afd_i); - int old = *p; - *p -= 1; - return old; -} - -} diff --git a/lib/mcwamp_impl.hpp b/lib/mcwamp_impl.hpp index 0a0e544cf8d..24ff102f0f2 100644 --- a/lib/mcwamp_impl.hpp +++ b/lib/mcwamp_impl.hpp @@ -1,7 +1,3 @@ #pragma once -#include - -typedef void* (*PushArgImpl_t)(void *, int, size_t, const void *); -typedef void* (*PushArgPtrImpl_t)(void *, int, size_t, const void *); typedef void* (*GetContextImpl_t)(); diff --git a/tests/Unit/Codegen/deser_decl.cpp b/tests/Unit/Codegen/deser_decl.cpp deleted file mode 100644 index c841a02b20f..00000000000 --- a/tests/Unit/Codegen/deser_decl.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// RUN: %amp_device -c -S -emit-llvm %s -o -|%cppfilt|%FileCheck %s -class base{ - public: - __attribute__((annotate("deserialize"))) /* For compiler */ - base(float a_,float b_) restrict(amp) :a(a_), b(b_) {} - float a; - float b; -}; -class baz { - public: -#if 0 // This declaration is supposed to be generated - __attribute__((annotate("deserialize"))) /* For compiler */ - baz(float a, float b, int foo) restrict(amp); -#endif - void cho(void) restrict(amp) {}; - - base B; - int bar; -}; - -int kerker(void) restrict(amp,cpu) { - // Will pass if deserializer declaration and definition are generated - baz bl(0.0, 0.0, 1); - return bl.bar; -} -// The definition should be generated by clang -// CHECK: define {{.*}}void @baz::baz(float, float, int)( diff --git a/tests/Unit/Codegen/deser_decl_support_inheritclass.cpp b/tests/Unit/Codegen/deser_decl_support_inheritclass.cpp deleted file mode 100644 index d238d13757f..00000000000 --- a/tests/Unit/Codegen/deser_decl_support_inheritclass.cpp +++ /dev/null @@ -1,28 +0,0 @@ -// RUN: %amp_device -c -S -emit-llvm %s -o-|%cppfilt|%FileCheck %s -class base { - public: - __attribute__((annotate("deserialize"))) /* For compiler */ - base(float a_,float b_) restrict(amp) :a(a_), b(b_) {} - float a; - float b; -}; - -class baz:public base { - public: -#if 0 // This declaration is supposed to be generated - __attribute__((annotate("deserialize"))) /* For compiler */ - baz(float a, float b, int foo) restrict(amp); -#endif - void cho(void) restrict(amp) {}; - - int bar; -}; - -int kerker(void) restrict(amp,cpu) { - // Will pass if deserializer declaration and definition are generated - baz bl(0.0, 0.0, 1); - return bl.bar; -} - -// The definition should be generated by clang -// CHECK: define {{.*}}void @baz::baz(float, float, int)( diff --git a/tests/Unit/Codegen/deser_def.cpp b/tests/Unit/Codegen/deser_def.cpp deleted file mode 100644 
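The mutex-based host atomics and the PushArgImpl/PushArgPtrImpl hooks deleted above disappear together with the per-argument dispatch model: the builtin-based implementations in the new include/atomics.hpp replace the former, and the reworked CreateKernel in lib/mcwamp.cpp replaces the latter by taking the whole callable as one (pointer, size) blob. A minimal sketch of the new convention, matching the CreateKernel signature shown earlier (the forward_functor wrapper itself is hypothetical):

    template<typename Callable>
    void* forward_functor(const char* name, KalmarQueue* q, const Callable& f) {
        // No per-argument PushArg calls remain; the functor is handed over
        // as a single opaque blob for the runtime to place in kernarg memory.
        return CLAMP::CreateKernel(name, q, &f, sizeof(Callable));
    }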
index 43dfa76a031..00000000000 --- a/tests/Unit/Codegen/deser_def.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: %amp_device -c -S -emit-llvm %s -o -|%cppfilt|%FileCheck %s -class baz { - public: - int cho(void) restrict(amp) { - return 45; - } - baz(void): foo(1234) {} - __attribute__((used)) /* Forcing this function to be present in the output */ - __attribute__((annotate("auto_deserialize"))) /* For compiler */ - baz(int foo_, float bar_) restrict(amp); - // :foo(foo_), bar(bar_) {} - private: - int foo; - float bar; -}; - -int kerker(void) restrict(amp,cpu) { - baz b1; - baz bll(1, 2.0); - return b1.cho()+bll.cho(); -} -// The definition should be generated by clang -// CHECK: define {{.*}}void @baz::baz(int, float)(%class.baz*{{.*}}, i32{{.*}}, float{{.*}}) diff --git a/tests/Unit/Codegen/deser_def_body.cpp b/tests/Unit/Codegen/deser_def_body.cpp deleted file mode 100644 index 719e5b6a0fa..00000000000 --- a/tests/Unit/Codegen/deser_def_body.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: %gtest_amp %s -o %t && %t -#include -#ifndef __KALMAR_ACCELERATOR__ //gtest requires rtti, but amp_device forbids rtti -#include -#endif -class baz { - public: - baz(void): foo(1234) {} - __attribute__((annotate("auto_deserialize"))) baz(int foo_, float bar_) restrict(amp,cpu); - //:foo(foo_), bar(bar_) {} - int foo; - float bar; -}; - - __attribute__((annotate("user_deserialize"))) -int fake_use(void) - restrict(amp) { - baz bll(1, 2.0); - return bll.foo; -} -#ifndef __KALMAR_ACCELERATOR__ -TEST(GPUCodeGen, Constructor) { - baz bll(1, 2.0); - EXPECT_EQ(bll.foo, 1); -} -#endif diff --git a/tests/Unit/Codegen/deser_def_body_compound.cpp b/tests/Unit/Codegen/deser_def_body_compound.cpp deleted file mode 100644 index 2dde8e3314b..00000000000 --- a/tests/Unit/Codegen/deser_def_body_compound.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// XFAIL: * -// RUN: %gtest_amp %s -o %t && %t -#include -#ifndef __KALMAR_ACCELERATOR__ //gtest requires rtti, but amp_device forbids rtti -#include -#endif -class Member { - public: - // Compiler-generated constructor - __attribute__((noinline)) - __attribute__((annotate("auto_deserialize"))) Member(float, int) restrict(amp); - float bzzt; - int zzz; -}; - -class baz { - public: - // Compiler-generated constructor - __attribute__((annotate("auto_deserialize"))) baz(float m1, int m2, - int foo_, float bar_) restrict(amp,cpu); - - Member m; - int foo; - float bar; -}; - -__attribute__((annotate("user_deserialize"))) -int fake_use(void) restrict(amp) { - baz bll(0.0, 0, 1, 2.0); - return bll.foo; -} -#ifndef __KALMAR_ACCELERATOR__ -TEST(GPUCodeGen, ConstructorCompound) { - float local_float = 2.78f; - baz bll(local_float, 2, 1, 2.0); - EXPECT_EQ(bll.foo, 1); - EXPECT_EQ(bll.m.bzzt, local_float); - EXPECT_EQ(bll.m.zzz, 2); -} -#endif diff --git a/tests/Unit/Codegen/deser_def_body_compound_support_inheritclass.cpp b/tests/Unit/Codegen/deser_def_body_compound_support_inheritclass.cpp deleted file mode 100644 index b0dced430eb..00000000000 --- a/tests/Unit/Codegen/deser_def_body_compound_support_inheritclass.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__=1 %s -c -o %t.device.o -// RUN: %gtest_amp %s %t.device.o -o %t && %t -// XFAIL: * - -#include -#ifndef __KALMAR_ACCELERATOR__ //gtest requires rtti, but amp_device forbids rtti -#include -#endif -class Member { - public: - // Compiler-generated constructor - __attribute__((annotate("auto_deserialize"))) Member(float, int) restrict(amp, cpu); - float bzzt; - int zzz; -}; - -class base { - public: - // 
Compiler-generated constructor - __attribute__((annotate("auto_deserialize"))) base(float m1, int m2, - int foo_, float bar_) restrict(amp, cpu); - - Member m; - int foo; - float bar; -}; - -class baz :public base { - public: - // Compiler-generated constructor - __attribute__((annotate("auto_deserialize"))) baz(float m1, int m2, - int foo_, float bar_, int bar_foo_) restrict(amp, cpu); - int baz_foo; -}; - -__attribute__((annotate("user_deserialize"))) -int fake_use(void) restrict(amp) { - baz bll(0, 0, 1, 2.0, 1); - return bll.foo; -} -#ifndef __KALMAR_ACCELERATOR__ -TEST(GPUCodeGen, ConstructorCompound) { - float local_float = 2.78f; - baz bll(local_float, 2, 1, 2.0,1); - EXPECT_EQ(bll.foo, 1); - EXPECT_EQ(bll.m.bzzt, local_float); - EXPECT_EQ(bll.m.zzz, 2); - EXPECT_EQ(bll.baz_foo, 1); -} -#endif diff --git a/tests/Unit/Codegen/deser_def_ref.cpp b/tests/Unit/Codegen/deser_def_ref.cpp deleted file mode 100644 index 46dd8bfa12c..00000000000 --- a/tests/Unit/Codegen/deser_def_ref.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// XFAIL: * -// RUN: %amp_device -c -D__KALMAR_ACCELERATOR__=1 -S -emit-llvm %s -o -|%cppfilt|%FileCheck %s -// RUN: %amp_device -c -D__KALMAR_ACCELERATOR__=1 %s -o %t.device.o -// RUN: %gtest_amp %s %t.device.o -o %t && %t -#ifndef __KALMAR_ACCELERATOR__ -#include -#endif -class base{ - public: - __attribute__((annotate("deserialize"))) /* For compiler */ - base(int a_,float b_) restrict(amp) :a(a_), b(b_) {} - int a; - float b; -}; -class baz { - public: -#if 0 // This declaration is supposed to be generated - __attribute__((annotate("deserialize"))) /* For compiler */ - baz(base&, int foo) restrict(amp); -#endif - void cho(void) restrict(amp) {}; - - base &B; // No reference type is considered amp-compatible - int bar; -}; - -#ifdef __KALMAR_ACCELERATOR__ -int kerker(void) restrict(amp,cpu) { - base b(1234, 0.0f); - // Will pass if deserializer declaration and definition are generated - baz bl(b, 1); - return bl.B.a; -} -#else -extern int kerker(void) restrict(amp,cpu); -TEST(GPUCodeGen, ConstructorWithRef) { - EXPECT_EQ(kerker(), 1234); -} -#endif -// The definition should be generated by clang -// CHECK: define {{.*}}void @baz::baz(base&, int)( diff --git a/tests/Unit/Codegen/ser_decl.cpp b/tests/Unit/Codegen/ser_decl.cpp deleted file mode 100644 index ee1644d1ac5..00000000000 --- a/tests/Unit/Codegen/ser_decl.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s -#include //for size_t -//Serialization object decl -namespace Concurrency { -class Serialize { - public: - void Append(size_t x, const void *s); -}; -} - -class baz { - public: - int cho(void) restrict(amp) { - return 45; - } - baz(void): foo(1234) {} -#if 0 - __attribute__((annotate("serialize")))/* For compiler */ - __cxxamp_serialize(Concurrency::Serialize& s) const; -#endif - private: - int foo; -}; - -int kerker(void) restrict(amp,cpu) { - baz b1; - Concurrency::Serialize s; - b1.__cxxamp_serialize(s); - return b1.cho(); -} -// The definition should be generated by clang -// CHECK: define {{.*}}void @baz::__cxxamp_serialize(Concurrency::Serialize&) diff --git a/tests/Unit/Codegen/ser_decl_ref.cpp b/tests/Unit/Codegen/ser_decl_ref.cpp deleted file mode 100644 index 2a2236eec88..00000000000 --- a/tests/Unit/Codegen/ser_decl_ref.cpp +++ /dev/null @@ -1,42 +0,0 @@ -XFAIL: * -// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s -//Serialization object decl -#include -namespace Concurrency { -class Serialize { - public: - void Append(size_t x, const void 
*s); -}; -} - -class base{ - public: - __attribute__((annotate("user_deserialize"))) /* For compiler */ - base(int a_,float b_) restrict(amp,cpu) :a(a_), b(b_) {} - int cho(void) restrict(amp); - int a; - float b; -}; -class baz { - public: -#if 0 // This declaration is supposed to be generated - __attribute__((annotate("deserialize"))) /* For compiler */ - baz(base&, int foo) restrict(amp); -#endif - int cho(void) restrict(amp) { return 0; }; - - base &B; // reference object is not allowed in amp codes - int bar; -}; - -int kerker(void) restrict(amp,cpu) { - base b(1234, 0.0f); - // Will pass if deserializer declaration and definition are generated - baz bl(b, 1); - Concurrency::Serialize s; - bl.__cxxamp_serialize(s); - return bl.cho(); -} -// The definition should be generated by clang -// CHECK: define {{.*}}void @baz::__cxxamp_serialize(Concurrency::Serialize&) -// CHECK: call void @base::__cxxamp_serialize(Concurrency::Serialize&) diff --git a/tests/Unit/Codegen/ser_def.cpp b/tests/Unit/Codegen/ser_def.cpp deleted file mode 100644 index f1c0506b593..00000000000 --- a/tests/Unit/Codegen/ser_def.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s -#include //for size_t -//Serialization object decl -namespace Concurrency { -class Serialize { - public: - void Append(size_t x, const void *s); -}; -} - -class baz { - public: - int cho(void) restrict(amp) { - return 45; - } - baz(void): foo(1234) {} - void __cxxamp_serialize(Concurrency::Serialize& s); - private: - int foo; -}; - -int kerker(void) restrict(amp,cpu) { - baz b1; - Concurrency::Serialize s; - b1.__cxxamp_serialize(s); - return b1.cho(); -} -// The definition should be generated by clang -// CHECK: define {{.*}}void @baz::__cxxamp_serialize(Concurrency::Serialize&) diff --git a/tests/Unit/Codegen/ser_def_body.cpp b/tests/Unit/Codegen/ser_def_body.cpp deleted file mode 100644 index e8a76f9849f..00000000000 --- a/tests/Unit/Codegen/ser_def_body.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s -// RUN: %gtest_amp %s -DUSING_GTEST=1 -o %t && %t -#include //for size_t -//Serialization object decl -namespace Concurrency { -class Serialize { - public: - Serialize():x(0) {} - void Append(size_t sz, const void *s) { - x++; - } - int x; -}; -template -class gmac_array { - public: - __attribute__((annotate("serialize")))/* For compiler */ - void __cxxamp_serialize(Serialize& s) const { - s.Append(0, NULL); - } - T t; -}; -} -class nontemplate { - public: - __attribute__((annotate("serialize")))/* For compiler */ - void __cxxamp_serialize(Concurrency::Serialize& s) const { - s.Append(0, NULL); - } -}; -class baz { - public: - __attribute__((annotate("serialize")))/* For compiler */ - void __cxxamp_serialize(Concurrency::Serialize& s) const; - private: - Concurrency::gmac_array foo; - Concurrency::gmac_array bar; - nontemplate nt; -}; - -int kerker(void) restrict(amp,cpu) { - baz b1; - Concurrency::Serialize s; - b1.__cxxamp_serialize(s); - return 1; -} -#ifdef USING_GTEST -// The definition should be generated by clang -// CHECK: call {{.*}}void @Concurrency::gmac_array::__cxxamp_serialize -// Executable tests -#include -TEST(Serialization, Call) { - baz bl; - Concurrency::Serialize s; - bl.__cxxamp_serialize(s); - EXPECT_EQ(3, s.x); -} -#endif diff --git a/tests/Unit/Codegen/ser_def_body_support_inheritclass.cpp b/tests/Unit/Codegen/ser_def_body_support_inheritclass.cpp deleted file mode 100644 index f273b7362f7..00000000000 --- 
a/tests/Unit/Codegen/ser_def_body_support_inheritclass.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s -// RUN: %gtest_amp %s -DUSING_GTEST=1 -o %t && %t -#include //for size_t -#include -//Serialization object decl -namespace Concurrency { -class Serialize { - public: - Serialize():x(0) {} - void Append(size_t sz, const void *s) { - x+=1; - } - int x; -}; -template -class gmac_array { - public: - __attribute__((annotate("serialize")))/* For compiler */ - void __cxxamp_serialize(Serialize& s) const { - s.Append(0, NULL); - } - T t; -}; -} -class base { - public: - __attribute__((annotate("serialize")))/* For compiler */ - void __cxxamp_serialize(Concurrency::Serialize& s) const; - private: - Concurrency::gmac_array a; - int i; -}; -class derive:public base { - public: - __attribute__((annotate("serialize")))/* For compiler */ - void __cxxamp_serialize(Concurrency::Serialize& s) const; - private: - float f; - Concurrency::gmac_array b; -}; - -int kerker(void) restrict(amp,cpu) { - derive b1; - Concurrency::Serialize s; - b1.__cxxamp_serialize(s); - return 1; -} - -// The definition should be generated by clang -// CHECK: define {{.*}}derive::__cxxamp_serialize -// CHECK: call {{.*}}void @base::__cxxamp_serialize(Concurrency::Serialize&) const -// CHECK: } - -#ifdef USING_GTEST -// Executable tests -#include -TEST(Serialization, Call) { - derive bl; - Concurrency::Serialize s; - bl.__cxxamp_serialize(s); - EXPECT_EQ(4, s.x); -} -#endif diff --git a/tests/Unit/Codegen/ser_def_body_support_scalar.cpp b/tests/Unit/Codegen/ser_def_body_support_scalar.cpp deleted file mode 100644 index 8ceb9cd9419..00000000000 --- a/tests/Unit/Codegen/ser_def_body_support_scalar.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s -// RUN: %gtest_amp %s -DUSING_GTEST=1 -o %t && %t -#include //for size_t -#include -//Serialization object decl -namespace Concurrency { -class Serialize { - public: - Serialize():x(0) {} - void Append(size_t sz, const void *s) { - x++; - } - int x; -}; -template -class gmac_array { - public: - __attribute__((annotate("serialize")))/* For compiler */ - void __cxxamp_serialize(Serialize& s) const { - s.Append(0, NULL); - } - T t; -}; -} -class baz { - public: - __attribute__((annotate("serialize")))/* For compiler */ - void __cxxamp_serialize(Concurrency::Serialize& s) const; - private: - Concurrency::gmac_array foo; - int i; - float f; -}; - -int kerker(void) restrict(amp,cpu) { - baz b1; - Concurrency::Serialize s; - b1.__cxxamp_serialize(s); - return 1; -} - -// The definition should be generated by clang -// CHECK: define {{.*}}baz::__cxxamp_serialize -// CHECK: call {{.*}}void @Concurrency::gmac_array::__cxxamp_serialize -// CHECK: call {{.*}}void @Concurrency::Serialize::Append -// CHECK: } - -#ifdef USING_GTEST -// Executable tests -#include -TEST(Serialization, Call) { - baz bl; - Concurrency::Serialize s; - bl.__cxxamp_serialize(s); - EXPECT_EQ(3, s.x); -} -#endif diff --git a/tests/Unit/Codegen/signature.cpp b/tests/Unit/Codegen/signature.cpp deleted file mode 100644 index a850fe8e7bc..00000000000 --- a/tests/Unit/Codegen/signature.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// XFAIL: * -// RUN: %amp_device -O2 -D__KALMAR_ACCELERATOR__=1 %s -c -o %t.device.o -// RUN: %gtest_amp %s %t.device.o -O2 -o %t && %t -#include -#ifndef __KALMAR_ACCELERATOR__ //gtest requires rtti, but amp_device forbids rtti -#include -#endif -class member { - public: - void cho(void) restrict(amp) {}; - 
member(int i) { - _i = i+1; - } - int _i; -}; -class base { - public: - void cho(void) restrict(amp) {}; - base(float f) { - _f = f+1; - } - float _f; -}; -class baz: public base { - public: - void cho(void) restrict(amp) {}; - // User-defined constructor with same signature as generated - // deserializer - baz(float f, int bar_, int i): base(f), bar(bar_), m(i){} - int bar; - member m; -}; -#ifdef __KALMAR_ACCELERATOR__ -__attribute__((annotate("user_deserialize"))) -float fake_use(void) restrict(amp) { - baz bll(1.1, 2, 1); // calls the deserializer - return bll._f; -} -#else -extern float fake_use(void); -TEST(GPUCodeGen, Constructor) { - baz user(1.1f, 2, 1); //calls user-defined constructor - EXPECT_EQ(user._f, 2.1f); - EXPECT_EQ(1.1f, fake_use()); //fake_use calls the generated constructor -} -#endif diff --git a/tests/Unit/Codegen/trampoline.cpp b/tests/Unit/Codegen/trampoline.cpp deleted file mode 100644 index bcd261cb235..00000000000 --- a/tests/Unit/Codegen/trampoline.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ -c -S -emit-llvm %s -o -|%cppfilt|%FileCheck %s -#include "amp.h" -class baz { - public: - void operator()(Concurrency::index<1> idx) restrict(amp) { - } -#if 0 - // The declaration and body of this function will be generated - static __attribute__((annotate("__cxxamp_trampoline"))) - void __cxxamp_trampoline(int, float) restrict(amp); -#endif - private: - int foo; - float bar; -}; -template -void kerker(void) restrict(amp) { - // This reference triggers declaration&definition of __cxxamp_trampoline - int* b = reinterpret_cast(&Foo::__cxxamp_trampoline); -} -void kk(void) restrict(amp) { - kerker(); -} -// The definition should be generated by clang -// CHECK: define {{.*}}void @baz::__cxxamp_trampoline diff --git a/tests/Unit/Codegen/trampoline_byref.cpp b/tests/Unit/Codegen/trampoline_byref.cpp deleted file mode 100644 index 64597bda877..00000000000 --- a/tests/Unit/Codegen/trampoline_byref.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ -c -S -emit-llvm %s -o -|%cppfilt|%FileCheck %s -#include "amp.h" -class baz { - public: - void operator()(Concurrency::index<1> &idx) restrict(amp) { - } -#if 0 - // The declaration and body of this function will be generated - static __attribute__((annotate("__cxxamp_trampoline"))) - void __cxxamp_trampoline(int foo, float bar) restrict(amp); -#endif - private: - int foo; - float bar; -}; -template -void kerker(void) restrict(amp) { - // This reference triggers declaration&definition of __cxxamp_trampoline - int* b = reinterpret_cast(&Foo::__cxxamp_trampoline); -} -void kk(void) restrict(amp) { - kerker(); -} -// The definition should be generated by clang -// CHECK: define {{.*}}void @baz::__cxxamp_trampoline diff --git a/tests/Unit/Codegen/trampoline_name.cpp b/tests/Unit/Codegen/trampoline_name.cpp deleted file mode 100644 index a4a09c25d75..00000000000 --- a/tests/Unit/Codegen/trampoline_name.cpp +++ /dev/null @@ -1,21 +0,0 @@ -// RUN: %gtest_amp %s -o %t && %t -#include -#include -#include -// the functor to test -class baz { - public: - void operator()(Concurrency::index<1> idx) restrict(amp) {} - int foo; - float bar; -}; - -TEST(GPUCodeGen, TrampolineName) { - // Inject the trampoline declaration - void* bar = reinterpret_cast(&baz::__cxxamp_trampoline); - // An injected member function __cxxamp_trampoline_name - // should return the mangled name of the trampoline - // hardcoded for now.. 
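// (For reference: the hard-coded string below follows the Itanium C++ ABI,
//  _Z N 3baz 19__cxxamp_trampoline E i f, and demangles to
//  baz::__cxxamp_trampoline(int, float): one trampoline parameter per
//  captured field, here the int foo and the float bar.)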
- EXPECT_EQ(std::string("_ZN3baz19__cxxamp_trampolineEif"), - std::string(baz::__cxxamp_trampoline_name())); -} diff --git a/tests/Unit/HC/capture_struct_with_carray_by_copy.cpp b/tests/Unit/HC/capture_struct_with_carray_by_copy.cpp index 02341ad0663..917d394b58e 100644 --- a/tests/Unit/HC/capture_struct_with_carray_by_copy.cpp +++ b/tests/Unit/HC/capture_struct_with_carray_by_copy.cpp @@ -5,29 +5,9 @@ struct Foo { int table[3]; - - Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(int x0, int x1, int x2) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - table[2] = x2; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(int), &table[0]); - s.Append(sizeof(int), &table[1]); - s.Append(sizeof(int), &table[2]); - } }; int main() { - - // XXX the test would cause soft hang now - // explicitly disable the test for now -#if 0 using namespace hc; Foo f; @@ -44,8 +24,8 @@ int main() { av.copy(data, data_d, 3 * sizeof(int)); parallel_for_each(extent<1>(3), [=](index<1> idx) [[hc]] { - data_d[idx[0]] = f.table[idx[0]] + 999; - }); + data_d[idx[0]] = f.table[idx[0]] + 999; + }); av.copy(data_d, data, 3 * sizeof(int)); @@ -57,7 +37,4 @@ int main() { am_free(data_d); return !(ret == true); -#else - return !(false == true); -#endif } diff --git a/tests/Unit/HC/capture_struct_with_carray_by_copy2.cpp b/tests/Unit/HC/capture_struct_with_carray_by_copy2.cpp index 719b9d05606..df613216d4b 100644 --- a/tests/Unit/HC/capture_struct_with_carray_by_copy2.cpp +++ b/tests/Unit/HC/capture_struct_with_carray_by_copy2.cpp @@ -6,22 +6,6 @@ template struct Foo { T table[3]; - - Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(T x0, T x1, T x2) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - table[2] = x2; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(T), &table[0]); - s.Append(sizeof(T), &table[1]); - s.Append(sizeof(T), &table[2]); - } }; template diff --git a/tests/Unit/HC/capture_struct_with_carray_by_copy3.cpp b/tests/Unit/HC/capture_struct_with_carray_by_copy3.cpp index f2d38707f7f..0638d933f36 100644 --- a/tests/Unit/HC/capture_struct_with_carray_by_copy3.cpp +++ b/tests/Unit/HC/capture_struct_with_carray_by_copy3.cpp @@ -16,110 +16,30 @@ struct Foo { template struct Foo { T table[1]; - - Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(T x0) [[cpu]][[hc]] { - table[0] = x0; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(T), &table[0]); - } }; // partial specialization of Foo template struct Foo { T table[2]; - - Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(T x0, T x1) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(T), &table[0]); - s.Append(sizeof(T), &table[1]); - } }; // partial specialization of Foo template struct Foo { T table[3]; - - Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(T x0, T x1, T x2) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - table[2] = x2; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(T), &table[0]); - s.Append(sizeof(T), &table[1]); - s.Append(sizeof(T), &table[2]); - } }; // partial specialization of Foo template struct Foo { T table[4]; - - 
Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(T x0, T x1, T x2, T x3) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - table[2] = x2; - table[3] = x3; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(T), &table[0]); - s.Append(sizeof(T), &table[1]); - s.Append(sizeof(T), &table[2]); - s.Append(sizeof(T), &table[3]); - } }; // partial specialization of Foo template struct Foo { T table[5]; - - Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(T x0, T x1, T x2, T x3, T x4) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - table[2] = x2; - table[3] = x3; - table[4] = x4; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(T), &table[0]); - s.Append(sizeof(T), &table[1]); - s.Append(sizeof(T), &table[2]); - s.Append(sizeof(T), &table[3]); - s.Append(sizeof(T), &table[4]); - } }; template @@ -140,8 +60,8 @@ bool test() { av.copy(data, data_d, N * sizeof(T)); parallel_for_each(extent<1>(N), [=](index<1> idx) [[hc]] { - data_d[idx[0]] = f.table[idx[0]] + T(999); - }); + data_d[idx[0]] = f.table[idx[0]] + T(999); + }); av.copy(data_d, data, N * sizeof(T)); diff --git a/tests/Unit/HC/capture_struct_with_carray_by_copy4.cpp b/tests/Unit/HC/capture_struct_with_carray_by_copy4.cpp index 2b04359a173..c6dc6556df0 100644 --- a/tests/Unit/HC/capture_struct_with_carray_by_copy4.cpp +++ b/tests/Unit/HC/capture_struct_with_carray_by_copy4.cpp @@ -17,110 +17,30 @@ struct Foo { template struct Foo { T table[1]; - - Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(T x0) [[cpu]][[hc]] { - table[0] = x0; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(T), &table[0]); - } }; // partial specialization of Foo template struct Foo { T table[2]; - - Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(T x0, T x1) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(T), &table[0]); - s.Append(sizeof(T), &table[1]); - } }; // partial specialization of Foo template struct Foo { T table[3]; - - Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(T x0, T x1, T x2) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - table[2] = x2; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(T), &table[0]); - s.Append(sizeof(T), &table[1]); - s.Append(sizeof(T), &table[2]); - } }; // partial specialization of Foo template struct Foo { T table[4]; - - Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(T x0, T x1, T x2, T x3) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - table[2] = x2; - table[3] = x3; - } - - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(T), &table[0]); - s.Append(sizeof(T), &table[1]); - s.Append(sizeof(T), &table[2]); - s.Append(sizeof(T), &table[3]); - } }; // partial specialization of Foo template struct Foo { T table[5]; - - Foo() = default; - - __attribute__((annotate("user_deserialize"))) - Foo(T x0, T x1, T x2, T x3, T x4) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - table[2] = x2; - table[3] = x3; - table[4] = x4; - } - - __attribute__((annotate("serialize"))) - void 
__cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(T), &table[0]); - s.Append(sizeof(T), &table[1]); - s.Append(sizeof(T), &table[2]); - s.Append(sizeof(T), &table[3]); - s.Append(sizeof(T), &table[4]); - } }; // Bar extends Foo @@ -187,8 +107,8 @@ bool test() { av.copy(data, data_d, N * sizeof(T)); parallel_for_each(extent<1>(N), [=](index<1> idx) [[hc]] { - data_d[idx[0]] = f.table[idx[0]] + T(999); - }); + data_d[idx[0]] = f.table[idx[0]] + T(999); + }); av.copy(data_d, data, N * sizeof(T)); @@ -205,10 +125,7 @@ bool test() { int main() { bool ret = true; - // XXX the test would cause soft hang now - // explicitly disable the test for now -#if 0 - ret &= test >(); + ret &= test >(); ret &= test >(); ret &= test >(); ret &= test >(); @@ -257,7 +174,4 @@ int main() { ret &= test >(); return !(ret == true); -#else - return !(false == true); -#endif } From c4e8739a4e44cabe5b80394f9444bce5d831b1d7 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sun, 5 Aug 2018 17:14:44 +0300 Subject: [PATCH 002/134] Start removing Kalmar. Re-implement array. Tie Callable lifetime to pfe. --- include/amp.h | 107 +- include/amp_math.h | 8 +- include/atomics.hpp | 46 +- include/hc.hpp | 2012 +++++++++++++++++++----------- include/hc_am.hpp | 25 +- include/hc_defines.h | 8 +- include/hc_short_vector.inl | 24 + include/kalmar_aligned_alloc.h | 4 +- include/kalmar_buffer.h | 39 +- include/kalmar_exception.h | 4 +- include/kalmar_index.h | 17 +- include/kalmar_launch.h | 71 +- include/kalmar_math.h | 365 +++--- include/kalmar_runtime.h | 270 ++-- include/kalmar_serialize.h | 29 +- lib/cpu/mcwamp_cpu.cpp | 38 +- lib/hsa/mcwamp_hsa.cpp | 360 +++--- lib/hsa/unpinned_copy_engine.cpp | 2 +- lib/mcwamp.cpp | 35 +- 19 files changed, 2068 insertions(+), 1396 deletions(-) diff --git a/include/amp.h b/include/amp.h index bddb29e1d5c..4cd6c58718d 100644 --- a/include/amp.h +++ b/include/amp.h @@ -48,11 +48,11 @@ namespace Concurrency { * Represents a unique position in N-dimensional space. 
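 *
 * For example:
 *
 *   Concurrency::index<2> idx(2, 3); // the point (2, 3) in 2-D space
 *   int row = idx[0];                // components are read via operator[]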
*/ template -using index = Kalmar::index; +using index = detail::index; -using runtime_exception = Kalmar::runtime_exception; -using invalid_compute_domain = Kalmar::invalid_compute_domain; -using accelerator_view_removed = Kalmar::accelerator_view_removed; +using runtime_exception = detail::runtime_exception; +using invalid_compute_domain = detail::invalid_compute_domain; +using accelerator_view_removed = detail::accelerator_view_removed; } // namespace Concurrency @@ -63,8 +63,8 @@ using accelerator_view_removed = Kalmar::accelerator_view_removed; namespace Concurrency { using namespace hc::atomics; -using namespace Kalmar::enums; -using namespace Kalmar::CLAMP; +using namespace detail::enums; +using namespace detail::CLAMP; // ------------------------------------------------------------------------ // accelerator_view @@ -221,20 +221,20 @@ class accelerator_view { bool operator!=(const accelerator_view& other) const { return !(*this == other); } private: - accelerator_view(std::shared_ptr pQueue) : pQueue(pQueue) {} - std::shared_ptr pQueue; + accelerator_view(std::shared_ptr pQueue) : pQueue(pQueue) {} + std::shared_ptr pQueue; friend class accelerator; template friend - void Kalmar::launch_kernel( - const std::shared_ptr&, + void detail::launch_kernel( + const std::shared_ptr&, const Domain&, const Kernel&); template friend - std::shared_future Kalmar::launch_kernel_async( - const std::shared_ptr&, + std::shared_future detail::launch_kernel_async( + const std::shared_ptr&, const Domain&, const Kernel&); @@ -319,7 +319,7 @@ class accelerator * @param[in] path The device path of this accelerator. */ explicit accelerator(const std::wstring& path) - : pDev(Kalmar::getContext()->getDevice(path)) {} + : pDev(detail::getContext()->getDevice(path)) {} /** * Copy constructs an accelerator object. This function does a shallow copy @@ -338,7 +338,7 @@ class accelerator * @return A vector of accelerators. */ static std::vector get_all() { - auto Devices = Kalmar::getContext()->getDevices(); + auto Devices = detail::getContext()->getDevices(); std::vector ret(Devices.size()); for (std::size_t i = 0; i < ret.size(); ++i) ret[i] = Devices[i]; @@ -359,7 +359,7 @@ class accelerator * false, and the function will have no effect. */ static bool set_default(const std::wstring& path) { - return Kalmar::getContext()->set_default(path); + return detail::getContext()->set_default(path); } /** @@ -378,7 +378,7 @@ class accelerator * of the target for a parallel_for_each execution. */ static accelerator_view get_auto_selection_view() { - return Kalmar::getContext()->auto_select(); + return detail::getContext()->auto_select(); } /** @@ -537,9 +537,9 @@ class accelerator access_type get_default_cpu_access_type() const { return pDev->get_access(); } private: - accelerator(Kalmar::KalmarDevice* pDev) : pDev(pDev) {} + accelerator(detail::HCCDevice* pDev) : pDev(pDev) {} friend class accelerator_view; - Kalmar::KalmarDevice* pDev; + detail::HCCDevice* pDev; }; // ------------------------------------------------------------------------ @@ -880,7 +880,7 @@ class extent { * by this extent (with an assumed origin of zero). */ bool contains(const index& idx) const restrict(amp,cpu) { - return Kalmar::amp_helper, extent>::contains(idx, *this); + return detail::amp_helper, extent>::contains(idx, *this); } /** @@ -889,7 +889,7 @@ class extent { * extent[0] * extent[1] ... 
* extent[N-1] */ unsigned int size() const restrict(amp,cpu) { - return Kalmar::index_helper>::count_size(*this); + return detail::index_helper>::count_size(*this); } @@ -939,7 +939,7 @@ class extent { // FIXME: the signature is not entirely the same as defined in: // C++AMP spec v1.2 #1255 bool operator==(const extent& other) const restrict(amp,cpu) { - return Kalmar::index_helper >::equal(*this, other); + return detail::index_helper >::equal(*this, other); } bool operator!=(const extent& other) const restrict(amp,cpu) { return !(*this == other); @@ -1077,10 +1077,10 @@ class extent { : base_(other.base_) {} private: - typedef Kalmar::index_impl::type> base; + typedef detail::index_impl::type> base; base base_; - template friend struct Kalmar::index_helper; - template friend struct Kalmar::amp_helper; + template friend struct detail::index_helper; + template friend struct detail::amp_helper; }; // ------------------------------------------------------------------------ @@ -1333,7 +1333,7 @@ class tiled_index { void parallel_for_each( const accelerator_view&, const tiled_extent&, const K&); friend - struct Kalmar::Indexer; + struct detail::Indexer; }; /** @@ -1453,7 +1453,7 @@ class tiled_index { void parallel_for_each( const accelerator_view&, const tiled_extent&, const K&); friend - struct Kalmar::Indexer; + struct detail::Indexer; }; /** @@ -1577,7 +1577,7 @@ class tiled_index { void parallel_for_each( const accelerator_view&, const tiled_extent&, const K&); friend - struct Kalmar::Indexer; + struct detail::Indexer; }; // ------------------------------------------------------------------------ @@ -2165,9 +2165,9 @@ class array { static_assert(0 == (sizeof(T) % sizeof(int)), "only value types whose size is a multiple of the size of an integer are allowed in array"); public: #if __KALMAR_ACCELERATOR__ == 1 - typedef Kalmar::_data acc_buffer_t; + typedef detail::_data acc_buffer_t; #else - typedef Kalmar::_data_host acc_buffer_t; + typedef detail::_data_host acc_buffer_t; #endif /** @@ -2724,7 +2724,7 @@ class array { m_device.synchronize(true); #endif T *ptr = reinterpret_cast(m_device.get()); - return ptr[Kalmar::amp_helper, Concurrency::extent>::flatten(idx, extent)]; + return ptr[detail::amp_helper, Concurrency::extent>::flatten(idx, extent)]; } T& operator()(const index& idx) restrict(amp,cpu) { return (*this)[idx]; @@ -2750,7 +2750,7 @@ class array { m_device.synchronize(); #endif T *ptr = reinterpret_cast(m_device.get()); - return ptr[Kalmar::amp_helper, Concurrency::extent>::flatten(idx, extent)]; + return ptr[detail::amp_helper, Concurrency::extent>::flatten(idx, extent)]; } const T& operator()(const index& idx) const restrict(amp,cpu) { return (*this)[idx]; @@ -2848,7 +2848,7 @@ class array { */ array_view section(const Concurrency::index& origin, const Concurrency::extent& ext) restrict(amp,cpu) { #if __KALMAR_ACCELERATOR__ != 1 - if( !Kalmar::amp_helper, Concurrency::extent>::contains(origin, ext ,this->extent) ) + if( !detail::amp_helper, Concurrency::extent>::contains(origin, ext ,this->extent) ) throw runtime_exception("errorMsg_throw", 0); #endif array_view av(*this); @@ -2867,7 +2867,7 @@ class array { */ array_view section(const index& idx) restrict(amp,cpu) { #if __KALMAR_ACCELERATOR__ != 1 - if( !Kalmar::amp_helper, Concurrency::extent>::contains(idx, this->extent ) ) + if( !detail::amp_helper, Concurrency::extent>::contains(idx, this->extent ) ) throw runtime_exception("errorMsg_throw", 0); #endif array_view av(*this); @@ -3082,9 +3082,9 @@ class array_view public: 
typedef typename std::remove_const::type nc_T; #if __KALMAR_ACCELERATOR__ == 1 - typedef Kalmar::_data acc_buffer_t; + typedef detail::_data acc_buffer_t; #else - typedef Kalmar::_data_host acc_buffer_t; + typedef detail::_data_host acc_buffer_t; #endif /** @@ -3453,7 +3453,7 @@ class array_view cache.get_cpu_access(true); #endif T *ptr = reinterpret_cast(cache.get() + offset); - return ptr[Kalmar::amp_helper, Concurrency::extent>::flatten(idx + index_base, extent_base)]; + return ptr[detail::amp_helper, Concurrency::extent>::flatten(idx + index_base, extent_base)]; } T& operator() (const index& idx) const restrict(amp,cpu) { @@ -3546,7 +3546,7 @@ class array_view array_view section(const Concurrency::index& idx, const Concurrency::extent& ext) const restrict(amp,cpu) { #if __KALMAR_ACCELERATOR__ != 1 - if ( !Kalmar::amp_helper, Concurrency::extent>::contains(idx, ext,this->extent ) ) + if ( !detail::amp_helper, Concurrency::extent>::contains(idx, ext,this->extent ) ) throw runtime_exception("errorMsg_throw", 0); #endif array_view av(cache, ext, extent_base, idx + index_base, offset); @@ -3558,7 +3558,7 @@ class array_view */ array_view section(const Concurrency::index& idx) const restrict(amp,cpu) { Concurrency::extent ext(extent); - Kalmar::amp_helper, Concurrency::extent>::minus(idx, ext); + detail::amp_helper, Concurrency::extent>::minus(idx, ext); return section(idx, ext); } @@ -3653,7 +3653,7 @@ class array_view cache.get_cpu_access(true); #endif T *ptr = reinterpret_cast(cache.get() + offset); - return ptr[Kalmar::amp_helper, Concurrency::extent>::flatten(idx.global + index_base, extent_base)]; + return ptr[detail::amp_helper, Concurrency::extent>::flatten(idx.global + index_base, extent_base)]; } const acc_buffer_t& internal() const restrict(amp,cpu) { return cache; } @@ -3717,9 +3717,9 @@ class array_view typedef typename std::remove_const::type nc_T; #if __KALMAR_ACCELERATOR__ == 1 - typedef Kalmar::_data acc_buffer_t; + typedef detail::_data acc_buffer_t; #else - typedef Kalmar::_data_host acc_buffer_t; + typedef detail::_data_host acc_buffer_t; #endif /** @@ -4032,7 +4032,7 @@ class array_view cache.get_cpu_access(); #endif const T *ptr = reinterpret_cast(cache.get() + offset); - return ptr[Kalmar::amp_helper, Concurrency::extent>::flatten(idx + index_base, extent_base)]; + return ptr[detail::amp_helper, Concurrency::extent>::flatten(idx + index_base, extent_base)]; } const T& operator() (const index& idx) const restrict(amp,cpu) { return (*this)[idx]; @@ -4137,7 +4137,7 @@ class array_view */ array_view section(const Concurrency::index& idx) const restrict(amp,cpu) { Concurrency::extent ext(extent); - Kalmar::amp_helper, Concurrency::extent>::minus(idx, ext); + detail::amp_helper, Concurrency::extent>::minus(idx, ext); return section(idx, ext); } @@ -5092,16 +5092,17 @@ void parallel_for_each( { if (av.get_accelerator().get_device_path() == L"cpu") { throw runtime_exception{ - Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL}; + detail::__errorMsg_UnsupportedAccelerator, E_FAIL}; } validate_compute_domain(compute_domain); - Kalmar::launch_kernel(av.pQueue, compute_domain, f); + detail::launch_kernel(av.pQueue, compute_domain, f); } // parallel_for_each, tiled +template inline void validate_tile_dims() {} @@ -5110,19 +5111,21 @@ template inline void validate_tile_dims() { + static_assert(dim > 0, "The number of threads in a tile must be positive."); static_assert( dim <= 1024, "The maximum number of threads in a tile is 1024."); validate_tile_dims(); } - template inline void 
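// To make the recursion above concrete: validate_tile_dims<64, 4>() checks
// 64 > 0 and 64 <= 1024, recurses into validate_tile_dims<4>(), and ends at
// the empty validate_tile_dims<>() base overload; a tile extent such as
// <2048> fails the 1024-thread static_assert at compile time.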
validate_tiled_compute_domain(const tiled_extent& compute_domain) { - constexpr int tmp[]{dims...}; + validate_tile_dims(); + validate_compute_domain(compute_domain); + constexpr int tmp[]{dims...}; for (auto i = 0u; i != compute_domain.rank; ++i) { if (compute_domain[i] % tmp[i]) { throw invalid_compute_domain{"Extent not divisible by tile size."}; @@ -5137,15 +5140,13 @@ void parallel_for_each( const tiled_extent& compute_domain, const Kernel& f) { - validate_tile_dims(); - if (av.get_accelerator().get_device_path() == L"cpu") { throw runtime_exception{ - Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL}; + detail::__errorMsg_UnsupportedAccelerator, E_FAIL}; } - validate_compute_domain(compute_domain); - validate_tiled_domain(compute_domain); - Kalmar::launch_kernel(av.pQueue, compute_domain, f); + validate_tiled_compute_domain(compute_domain); + + detail::launch_kernel(av.pQueue, compute_domain, f); } } // namespace Concurrency diff --git a/include/amp_math.h b/include/amp_math.h index 7f229c2a4e3..d487a0072e8 100644 --- a/include/amp_math.h +++ b/include/amp_math.h @@ -13,11 +13,11 @@ namespace Concurrency { // namespace alias -// namespace Concurrency::fast_math is an alias of namespace Kalmar::fast_math -namespace fast_math = Kalmar::fast_math; +// namespace Concurrency::fast_math is an alias of namespace detail::fast_math +namespace fast_math = detail::fast_math; -// namespace Concurrency::precise_math is an alias of namespace Kalmar::precise_math -namespace precise_math = Kalmar::precise_math; +// namespace Concurrency::precise_math is an alias of namespace detail::precise_math +namespace precise_math = detail::precise_math; } // namespace Concurrency diff --git a/include/atomics.hpp b/include/atomics.hpp index c7f541f032c..ab2fe7e3964 100644 --- a/include/atomics.hpp +++ b/include/atomics.hpp @@ -43,7 +43,7 @@ namespace hc return __atomic_exchange_n(dest, val, __ATOMIC_RELAXED); } inline - float atomic_exchange(float* dest, float val) //[[cpu]][[hc]] + float atomic_exchange(float* dest, float val) [[cpu]][[hc]] { static_assert(sizeof(float) == sizeof(unsigned int), ""); @@ -59,7 +59,7 @@ namespace hc return r; } inline - double atomic_exchange(double* dest, double val) //[[cpu]][[hc]] + double atomic_exchange(double* dest, double val) [[cpu]][[hc]] { static_assert(sizeof(double) == sizeof(std::uint64_t), ""); @@ -175,7 +175,8 @@ namespace hc typename T, typename std::enable_if< std::is_integral{} && - sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr> inline + sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr> + inline T atomic_fetch_sub(T* dest, T val) [[cpu]][[hc]] { return __atomic_fetch_sub(dest, val, __ATOMIC_RELAXED); @@ -185,27 +186,54 @@ namespace hc typename T, typename std::enable_if< std::is_integral{} && - sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr> inline + std::is_signed{} && + sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr> + inline T atomic_fetch_max(T* dest, T val) [[cpu]][[hc]] { return __sync_fetch_and_max(dest, val); } + template< + typename T, + typename std::enable_if< + std::is_integral{} && + std::is_unsigned{} && + sizeof(T) >= sizeof(std::uint32_t)>::type* = nullptr> + inline + T atomic_fetch_max(T* dest, T val) [[cpu]][[hc]] + { + return __sync_fetch_and_umax(dest, val); + } template< typename T, typename std::enable_if< std::is_integral{} && - sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr> inline + std::is_signed{} && + sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr> + inline T atomic_fetch_min(T* dest, T val) [[cpu]][[hc]] 
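// The signed/unsigned overload pairs in this hunk map each operation to the
// matching builtin: signed types go to __sync_fetch_and_max/min, unsigned
// types to __sync_fetch_and_umax/umin. A usage sketch (assuming ctr points
// to device-visible memory):
//   unsigned int* ctr = /* ... */;
//   atomic_fetch_max(ctr, 42u); // resolves to the unsigned overload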
{
        return __sync_fetch_and_min(dest, val);
    }
+    template<
+        typename T,
+        typename std::enable_if<
+            std::is_integral<T>{} &&
+            std::is_unsigned<T>{} &&
+            sizeof(T) >= sizeof(std::uint32_t)>::type* = nullptr>
+    inline
+    T atomic_fetch_min(T* dest, T val) [[cpu]][[hc]]
+    {
+        return __sync_fetch_and_umin(dest, val);
+    }

     template<
         typename T,
         typename std::enable_if<
             std::is_integral<T>{} &&
-            sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr> inline
+            sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+    inline
     T atomic_fetch_and(T* dest, T val) [[cpu]][[hc]]
     {
         return __atomic_fetch_and(dest, val, __ATOMIC_RELAXED);
@@ -215,7 +243,8 @@ namespace hc
         typename T,
         typename std::enable_if<
             std::is_integral<T>{} &&
-            sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr> inline
+            sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+    inline
     T atomic_fetch_or(T* dest, T val) [[cpu]][[hc]]
     {
         return __atomic_fetch_or(dest, val, __ATOMIC_RELAXED);
@@ -225,7 +254,8 @@ namespace hc
         typename T,
         typename std::enable_if<
             std::is_integral<T>{} &&
-            sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr> inline
+            sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+    inline
     T atomic_fetch_xor(T* dest, T val) [[cpu]][[hc]]
     {
         return __atomic_fetch_xor(dest, val, __ATOMIC_RELAXED);
diff --git a/include/hc.hpp b/include/hc.hpp
index b80944897df..5031a90cf84 100644
--- a/include/hc.hpp
+++ b/include/hc.hpp
@@ -24,9 +24,21 @@
 #include "hcc_features.hpp"

+//#include <hsa/hsa.h>
+//#include <hsa/hsa_ext_amd.h>
+
+#include "/opt/rocm/include/hsa/hsa.h"
+#include "/opt/rocm/include/hsa/hsa_ext_amd.h"
+
+#include
+#include
 #include
 #include
+#include
+#include
+#include
 #include
+#include

 #ifndef __HC__
 # define __HC__ [[hc]]
@@ -42,7 +54,7 @@ typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t;
 /**
  * @namespace hc
  * Heterogeneous C++ (HC) namespace
  */
-namespace Kalmar {
+namespace detail {
 class HSAQueue;
 };

@@ -51,8 +63,8 @@ namespace hc {

 class AmPointerInfo;

 using namespace atomics;
-using namespace Kalmar::enums;
-using namespace Kalmar::CLAMP;
+using namespace detail::enums;
+using namespace detail::CLAMP;

 // forward declaration
@@ -67,11 +79,11 @@ template <typename T, int N> class array;

 // namespace alias

-// namespace hc::fast_math is an alias of namespace Kalmar::fast_math
-namespace fast_math = Kalmar::fast_math;
+// namespace hc::fast_math is an alias of namespace detail::fast_math
+namespace fast_math = detail::fast_math;

-// namespace hc::precise_math is an alias of namespace Kalmar::precise_math
-namespace precise_math = Kalmar::precise_math;
+// namespace hc::precise_math is an alias of namespace detail::precise_math
+namespace precise_math = detail::precise_math;

 // type alias

 /**
  * Represents a unique position in N-dimensional space.
*/ template -using index = Kalmar::index; +using index = detail::index; -using runtime_exception = Kalmar::runtime_exception; -using invalid_compute_domain = Kalmar::invalid_compute_domain; -using accelerator_view_removed = Kalmar::accelerator_view_removed; +using runtime_exception = detail::runtime_exception; +using invalid_compute_domain = detail::invalid_compute_domain; +using accelerator_view_removed = detail::accelerator_view_removed; // ------------------------------------------------------------------------ // global functions @@ -95,17 +107,17 @@ using accelerator_view_removed = Kalmar::accelerator_view_removed; * @return An implementation-defined tick count */ inline uint64_t get_system_ticks() { - return Kalmar::getContext()->getSystemTicks(); + return detail::getContext()->getSystemTicks(); } /** - * Get the frequency of ticks per second for the underlying asynchrnous operation. + * Get the frequency of ticks per second for the underlying asynchronous operation. * * @return An implementation-defined frequency in Hz in case the instance is * created by a kernel dispatch or a barrier packet. 0 otherwise. */ inline uint64_t get_tick_frequency() { - return Kalmar::getContext()->getSystemTickFrequency(); + return detail::getContext()->getSystemTickFrequency(); } #define GET_SYMBOL_ADDRESS(acc, symbol) \ @@ -198,7 +210,7 @@ class accelerator_view { * the parent accelerator. */ // FIXME: dummy implementation now - bool get_is_debug() const { return 0; } + bool get_is_debug() const { return 0; } /** * Performs a blocking wait for completion of all commands submitted to the @@ -209,9 +221,9 @@ class accelerator_view { * hcWaitModeActive would be used to reduce latency with * the expense of using one CPU core for active waiting. */ - void wait(hcWaitMode waitMode = hcWaitModeBlocked) { - pQueue->wait(waitMode); - Kalmar::getContext()->flushPrintfBuffer(); + void wait(hcWaitMode waitMode = hcWaitModeBlocked) { + pQueue->wait(waitMode); + detail::getContext()->flushPrintfBuffer(); } /** @@ -221,10 +233,10 @@ class accelerator_view { * An accelerator_view internally maintains a buffer of commands such as * data transfers between the host memory and device buffers, and kernel * invocations (parallel_for_each calls). This member function sends the - * commands to the device for processing. Normally, these commands + * commands to the device for processing. Normally, these commands * to the GPU automatically whenever the runtime determines that they need - * to be, such as when the command buffer is full or when waiting for - * transfer of data from the device buffers to host memory. The flush + * to be, such as when the command buffer is full or when waiting for + * transfer of data from the device buffers to host memory. The flush * member function will send the commands manually to the device. * * Calling this member function incurs an overhead and must be used with @@ -235,7 +247,7 @@ class accelerator_view { * references to them have been removed. * * Because flush operates asynchronously, it can return either before or - * after the device finishes executing the buffered commandser, the + * after the device finishes executing the buffered commands, the * commands will eventually always complete. * * If the queuing_mode is queuing_mode_immediate, this function has no effect. @@ -250,7 +262,7 @@ class accelerator_view { * commands that were submitted prior to the marker event creation have * completed, the future is ready. 
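 *
 * For instance, given an accelerator_view av (a minimal sketch):
 *
 *   hc::completion_future cf = av.create_marker(hc::system_scope);
 *   cf.wait(); // all commands submitted to av before the marker are done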
* - * Regardless of the accelerator_view's execute_order (execute_any_order, execute_in_order), + * Regardless of the accelerator_view's execute_order (execute_any_order, execute_in_order), * the marker always ensures older commands complete before the returned completion_future * is marked ready. Thus, markers provide a mechanism to enforce order between * commands in an execute_any_order accelerator_view. @@ -273,7 +285,7 @@ class accelerator_view { * dependent event and all commands submitted prior to the marker event * creation have been completed, the future is ready. * - * Regardless of the accelerator_view's execute_order (execute_any_order, execute_in_order), + * Regardless of the accelerator_view's execute_order (execute_any_order, execute_in_order), * the marker always ensures older commands complete before the returned completion_future * is marked ready. Thus, markers provide a mechanism to enforce order between * commands in an execute_any_order accelerator_view. @@ -284,7 +296,7 @@ class accelerator_view { * - system_scope: Memory is acquired from and released to system scope (all accelerators including CPUs) * * dependent_futures may be recorded in another queue or another accelerator. If in another accelerator, - * the runtime performs cross-accelerator sychronization. + * the runtime performs cross-accelerator synchronisation. * * @return A future which can be waited on, and will block until the * current batch of commands, plus the dependent event have @@ -300,7 +312,7 @@ class accelerator_view { * dependent events and all commands submitted prior to the marker event * creation have been completed, the completion_future is ready. * - * Regardless of the accelerator_view's execute_order (execute_any_order, execute_in_order), + * Regardless of the accelerator_view's execute_order (execute_any_order, execute_in_order), * the marker always ensures older commands complete before the returned completion_future * is marked ready. Thus, markers provide a mechanism to enforce order between * commands in an execute_any_order accelerator_view. @@ -325,7 +337,7 @@ class accelerator_view { * dependent events and all commands submitted prior to the marker event * creation have been completed, the completion_future is ready. * - * Regardless of the accelerator_view's execute_order (execute_any_order, execute_in_order), + * Regardless of the accelerator_view's execute_order (execute_any_order, execute_in_order), * the marker always ensures older commands complete before the returned completion_future * is marked ready. Thus, markers provide a mechanism to enforce order between * commands in an execute_any_order accelerator_view. @@ -338,8 +350,8 @@ class accelerator_view { completion_future create_blocking_marker(InputIterator first, InputIterator last, memory_scope scope) const; /** - * Copies size_bytes bytes from src to dst. - * Src and dst must not overlap. + * Copies size_bytes bytes from src to dst. + * Src and dst must not overlap. * Note the src is the first parameter and dst is second, following C++ convention. * The copy command will execute after any commands already inserted into the accelerator_view finish. * This is a synchronous copy command, and the copy operation complete before this call returns. @@ -350,15 +362,15 @@ class accelerator_view { /** - * Copies size_bytes bytes from src to dst. - * Src and dst must not overlap. + * Copies size_bytes bytes from src to dst. + * Src and dst must not overlap. 
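Since the marker semantics above are subtle, a hedged usage sketch (the queue contents are assumed context; illustrative only):

    hc::accelerator_view av = hc::accelerator{}.get_default_view();

    hc::completion_future older = av.create_marker(hc::system_scope);
    hc::completion_future m = av.create_blocking_marker(older, hc::system_scope);
    m.wait();   // all commands enqueued before the markers, plus `older`, are done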
 * Note the src is the first parameter and dst is second, following C++ convention.
 * The copy command will execute after any commands already inserted into the accelerator_view finish.
 * This is a synchronous copy command, and the copy operation completes before this call returns.
 * The copy_ext flavor allows the caller to provide additional information about each pointer, which can improve performance by eliminating replicated lookups.
 * This interface is intended for language runtimes such as HIP.
-
- @p copyDir : Specify direction of copy. Must be hcMemcpyHostToHost, hcMemcpyHostToDevice, hcMemcpyDeviceToHost, or hcMemcpyDeviceToDevice.
+
+ @p copyDir : Specify direction of copy. Must be hcMemcpyHostToHost, hcMemcpyHostToDevice, hcMemcpyDeviceToHost, or hcMemcpyDeviceToDevice.
 @p forceUnpinnedCopy : Force copy to be performed with host involvement rather than with accelerator copy engines.
 */
 void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, const hc::accelerator *copyAcc, bool forceUnpinnedCopy);
@@ -368,14 +380,14 @@
 void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, bool forceUnpinnedCopy);

 /**
- * Copies size_bytes bytes from src to dst.
- * Src and dst must not overlap.
- * Note the src is the first parameter and dst is second, following C++ convention.
+ * Copies size_bytes bytes from src to dst.
+ * Src and dst must not overlap.
+ * Note the src is the first parameter and dst is second, following C++ convention.
 * This is an asynchronous copy command, and this call may return before the copy operation completes.
 * If the source or dest is host memory, the memory must be pinned or a runtime exception will be thrown.
 * Pinned memory can be created with am_alloc with the amHostPinned flag.
 *
- * The copy command will be implicitly ordered with respect to commands previously equeued to this accelerator_view:
+ * The copy command will be implicitly ordered with respect to commands previously enqueued to this accelerator_view:
 * - If the accelerator_view execute_order is execute_in_order (the default), then the copy will execute after all previously sent commands finish execution.
 * - If the accelerator_view execute_order is execute_any_order, then the copy will start after all previously sent commands start but can execute in any order.
 *
 */
 completion_future copy_async(const void *src, void *dst, size_t size_bytes);

-
 /**
- * Copies size_bytes bytes from src to dst.
- * Src and dst must not overlap.
- * Note the src is the first parameter and dst is second, following C++ convention.
+ * Copies size_bytes bytes from src to dst.
+ * Src and dst must not overlap.
+ * Note the src is the first parameter and dst is second, following C++ convention.
 * This is an asynchronous copy command, and this call may return before the copy operation completes.
 * If the source or dest is host memory, the memory must be pinned or a runtime exception will be thrown.
 * Pinned memory can be created with am_alloc with the amHostPinned flag.
 *
@@ -398,18 +409,18 @@
 * The copyAcc determines where the copy is executed and does not affect the ordering.
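Because the pinning requirement above is easy to trip over, a hedged sketch of the asynchronous path follows (am_alloc/am_free and amHostPinned come from hc_am.hpp, as the text notes; sizes are illustrative):

    hc::accelerator acc;
    hc::accelerator_view av = acc.get_default_view();

    auto pinned = hc::am_alloc(4096, acc, amHostPinned); // pinned host staging buffer
    auto dev    = hc::am_alloc(4096, acc, 0);            // device-resident allocation

    hc::completion_future cf = av.copy_async(pinned, dev, 4096);
    cf.wait();                                           // the copy has completed here

    hc::am_free(dev);
    hc::am_free(pinned);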
* * The copy_async_ext flavor allows caller to provide additional information about each pointer, which can improve performance by eliminating replicated lookups, - * and also allow control over which device performs the copy. + * and also allow control over which device performs the copy. * This interface is intended for language runtimes such as HIP. * - * @p copyDir : Specify direction of copy. Must be hcMemcpyHostToHost, hcMemcpyHostToDevice, hcMemcpyDeviceToHost, or hcMemcpyDeviceToDevice. + * @p copyDir : Specify direction of copy. Must be hcMemcpyHostToHost, hcMemcpyHostToDevice, hcMemcpyDeviceToHost, or hcMemcpyDeviceToDevice. * @p copyAcc : Specify which accelerator performs the copy operation. The specified accelerator must have access to the source and dest pointers - either * because the memory is allocated on those devices or because the accelerator has peer access to the memory. * If copyAcc is nullptr, then the copy will be performed by the host. In this case, the host accelerator must have access to both pointers. - * The copy operation will be performed by the specified engine but is not synchronized with respect to any operations on that device. + * The copy operation will be performed by the specified engine but is not synchronized with respect to any operations on that device. * */ - completion_future copy_async_ext(const void *src, void *dst, size_t size_bytes, - hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, + completion_future copy_async_ext(const void *src, void *dst, size_t size_bytes, + hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, const hc::accelerator *copyAcc); /** @@ -485,7 +496,7 @@ class accelerator_view { /** * Returns an opaque handle which points to the AM region on the HSA agent. - * This region can be used to allocate accelerator memory which is accessible from the + * This region can be used to allocate accelerator memory which is accessible from the * specified accelerator. * * @return An opaque handle of the region, if the accelerator is based @@ -498,7 +509,7 @@ class accelerator_view { /** * Returns an opaque handle which points to the AM system region on the HSA agent. - * This region can be used to allocate system memory which is accessible from the + * This region can be used to allocate system memory which is accessible from the * specified accelerator. * * @return An opaque handle of the region, if the accelerator is based @@ -510,7 +521,7 @@ class accelerator_view { /** * Returns an opaque handle which points to the AM system region on the HSA agent. - * This region can be used to allocate finegrained system memory which is accessible from the + * This region can be used to allocate finegrained system memory which is accessible from the * specified accelerator. * * @return An opaque handle of the region, if the accelerator is based @@ -541,34 +552,34 @@ class accelerator_view { /** * Dispatch a kernel into the accelerator_view. * - * This function is intended to provide a gateway to dispatch code objects, with + * This function is intended to provide a gateway to dispatch code objects, with * some assistance from HCC. Kernels are specified in the standard code object - * format, and can be created from a varety of compiler tools including the + * format, and can be created from a variety of compiler tools including the * assembler, offline cl compilers, or other tools. The caller also - * specifies the execution configuration and kernel arguments. 
HCC
+ * specifies the execution configuration and kernel arguments. HCC
 * will copy the kernel arguments into an appropriate segment and insert
- * the packet into the queue. HCC will also automatically handle signal
+ * the packet into the queue. HCC will also automatically handle signal
 * and kernarg allocation and deallocation for the command.
 *
- * The kernel is dispatched asynchronously, and thus this API may return before the
+ * The kernel is dispatched asynchronously, and thus this API may return before the
 * kernel finishes executing.
-
+ * Kernels dispatched with this API may be interleaved with other copy and kernel
- * commands generated from copy or parallel_for_each commands.
- * The kernel honors the execute_order associated with the accelerator_view.
+ * commands generated from copy or parallel_for_each commands.
+ * The kernel honors the execute_order associated with the accelerator_view.
 * Specifically, if execute_order is execute_in_order, then the kernel
 * will wait for older data and kernel commands in the same queue before
- * beginning execution. If execute_order is execute_any_order, then the
- * kernel may begin executing without regards to the state of older kernels.
- * This call honors the packer barrier bit (1 << HSA_PACKET_HEADER_BARRIER)
+ * beginning execution. If execute_order is execute_any_order, then the
+ * kernel may begin executing without regard to the state of older kernels.
+ * This call honors the packet barrier bit (1 << HSA_PACKET_HEADER_BARRIER)
 * if set in the aql.header field. If set, this provides the same synchronization
- * behaviora as execute_in_order for the command generated by this API.
+ * behavior as execute_in_order for the command generated by this API.
 *
- * @p aql is an HSA-format "AQL" packet. The following fields must
+ * @p aql is an HSA-format "AQL" packet. The following fields must
 * be set by the caller:
- *   aql.kernel_object
+ *   aql.kernel_object
 *   aql.group_segment_size : includes static + dynamic group size
- *   aql.private_segment_size
+ *   aql.private_segment_size
 *   aql.grid_size_x, aql.grid_size_y, aql.grid_size_z
 *   aql.group_size_x, aql.group_size_y, aql.group_size_z
 *   aql.setup : The 2 bits at HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS.
@@ -578,19 +589,19 @@
               (1 << HSA_PACKET_HEADER_BARRIER);
 * The following fields are ignored. The API will set up these fields before dispatching the AQL packet:
- *   aql.completion_signal
- *   aql.kernarg
- *
- * @p args : Pointer to kernel arguments with the size and aligment expected by the kernel. The args are copied and then passed directly to the kernel. After this function returns, the args memory may be deallocated.
+ *   aql.completion_signal
+ *   aql.kernarg
+ *
+ * @p args : Pointer to kernel arguments with the size and alignment expected by the kernel. The args are copied and then passed directly to the kernel. After this function returns, the args memory may be deallocated.
 * @p argSz : Size of the arguments.
 * @p cf : Written with a completion_future that can be used to track the status
- *         of the dispatch. May be NULL, in which case no completion_future is
- *         returned and the caller must use other synchronization techniqueues
+ *         of the dispatch. May be NULL, in which case no completion_future is
+ *         returned and the caller must use other synchronization techniques
 *         such as calling accelerator_view::wait() or waiting on a younger command
 *         in the same queue.
- * @p kernel_name : Optionally specify the name of the kernel for debug and profiling. + * @p kernel_name : Optionally specify the name of the kernel for debug and profiling. * May be null. If specified, the caller is responsible for ensuring the memory for the name remains allocated until the kernel completes. - * + * * * The dispatch_hsa_kernel call will perform the following operations: * - Efficiently allocate a kernarg region and copy the arguments. @@ -598,17 +609,17 @@ class accelerator_view { * - Dispatch the command into the queue and flush it to the GPU. * - Kernargs and signals are automatically reclaimed by the HCC runtime. */ - void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, + void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, const void * args, size_t argsize, - hc::completion_future *cf=nullptr, const char *kernel_name = nullptr) + hc::completion_future *cf=nullptr, const char *kernel_name = nullptr) { pQueue->dispatch_hsa_kernel(aql, args, argsize, cf, kernel_name); } /** - * Set a CU affinity to specific command queues. + * Set a CU affinity to specific command queues. * The setting is permanent until the queue is destroyed or CU affinity is - * set again. This setting is "atomic", it won't affect the dispatch in flight. + * set again. This setting is "atomic", it won't affect the dispatch in flight. * * @param cu_mask a bool vector to indicate what CUs you want to use. True * represents using the cu. The first 32 elements represents the first @@ -628,8 +639,8 @@ class accelerator_view { } private: - accelerator_view(std::shared_ptr pQueue) : pQueue(pQueue) {} - std::shared_ptr pQueue; + accelerator_view(std::shared_ptr pQueue) : pQueue(pQueue) {} + std::shared_ptr pQueue; friend class accelerator; template friend class array; @@ -637,27 +648,27 @@ class accelerator_view { template friend - void Kalmar::launch_kernel_with_dynamic_group_memory( - const std::shared_ptr&, + void detail::launch_kernel_with_dynamic_group_memory( + const std::shared_ptr&, const Domain&, const Kernel&); template friend - std::shared_ptr - Kalmar::launch_kernel_with_dynamic_group_memory_async( - const std::shared_ptr&, + std::shared_ptr + detail::launch_kernel_with_dynamic_group_memory_async( + const std::shared_ptr&, const Domain&, const Kernel&); template friend - void Kalmar::launch_kernel( - const std::shared_ptr&, + void detail::launch_kernel( + const std::shared_ptr&, const Domain&, const Kernel&); template friend - std::shared_ptr Kalmar::launch_kernel_async( - const std::shared_ptr&, + std::shared_ptr detail::launch_kernel_async( + const std::shared_ptr&, const Domain&, const Kernel&); @@ -676,8 +687,8 @@ class accelerator_view { const accelerator_view&, const tiled_extent&, const Kernel&); accelerator_view() __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - throw runtime_exception("errorMsg_throw", 0); +#if __HCC_ACCELERATOR__ != 1 + throw runtime_exception{"errorMsg_throw", 0}; #endif } }; @@ -696,7 +707,7 @@ class accelerator public: /** * Constructs a new accelerator object that represents the default - * accelerator. This is equivalent to calling the constructor + * accelerator. This is equivalent to calling the constructor * @code{.cpp} * accelerator(accelerator::default_accelerator) * @endcode @@ -722,7 +733,7 @@ class accelerator * @param[in] path The device path of this accelerator. 
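Tying the field requirements above together, a hedged dispatch sketch; kernel_object and dev_ptr are assumptions (obtained from a separately loaded code object and a prior allocation), and the sketch uses the hsa.h field names workgroup_size_* for what the text calls group_size_*:

    #include <cstdint>

    hsa_kernel_dispatch_packet_t aql{};
    aql.kernel_object        = kernel_object;   // assumed: from a loaded code object
    aql.group_segment_size   = 0;               // static + dynamic group size
    aql.private_segment_size = 0;
    aql.grid_size_x = 1024; aql.grid_size_y = 1; aql.grid_size_z = 1;
    aql.workgroup_size_x = 64; aql.workgroup_size_y = 1; aql.workgroup_size_z = 1;
    aql.setup  = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    aql.header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
                 (1 << HSA_PACKET_HEADER_BARRIER);

    struct { void* out; std::uint32_t n; } args{dev_ptr, 1024u}; // assumed kernarg layout

    hc::completion_future cf;
    av.dispatch_hsa_kernel(&aql, &args, sizeof(args), &cf, "example_kernel");
    cf.wait();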
 */
    explicit accelerator(const std::wstring& path)
-      : pDev(Kalmar::getContext()->getDevice(path)) {}
+      : pDev(detail::getContext()->getDevice(path)) {}

    /**
     * Copy constructs an accelerator object. This function does a shallow copy
@@ -741,7 +752,7 @@
     * @return A vector of accelerators.
     */
    static std::vector<accelerator> get_all() {
-        auto Devices = Kalmar::getContext()->getDevices();
+        auto Devices = detail::getContext()->getDevices();
        std::vector<accelerator> ret;
        for(auto&& i : Devices)
            ret.push_back(i);
@@ -762,7 +773,7 @@
     * false, and the function will have no effect.
     */
    static bool set_default(const std::wstring& path) {
-        return Kalmar::getContext()->set_default(path);
+        return detail::getContext()->set_default(path);
    }

    /**
@@ -781,7 +792,7 @@
     * of the target for a parallel_for_each execution.
     */
    static accelerator_view get_auto_selection_view() {
-        return Kalmar::getContext()->auto_select();
+        return detail::getContext()->auto_select();
    }

    /**
@@ -812,14 +823,14 @@
     *
     * @param[in] qmode The queuing mode of the accelerator_view to be created.
     *                  See "Queuing Mode". The default value would be
-     *                  queueing_mdoe_automatic if not specified.
+     *                  queuing_mode_automatic if not specified.
     */
    accelerator_view create_view(execute_order order = execute_in_order, queuing_mode mode = queuing_mode_automatic) {
        auto pQueue = pDev->createQueue(order);
        pQueue->set_mode(mode);
        return pQueue;
    }
-
+
    /**
     * Compares "this" accelerator with the passed accelerator object to
     * determine if they represent the same underlying device.
@@ -848,9 +859,9 @@
     * this accelerator.
     *
     * This method only succeeds if the default_cpu_access_type for the
-     * accelerator has not already been overriden by a previous call to this
-     * method and the runtime selected default_cpu_access_type for this
-     * accelerator has not yet been used for allocating an array or for an
+     * accelerator has not already been overridden by a previous call to this
+     * method and the runtime selected default_cpu_access_type for this
+     * accelerator has not yet been used for allocating an array or for an
     * implicit array_view memory allocation on this accelerator.
     *
     * @param[in] default_cpu_access_type The default cpu access_type to be used
@@ -938,8 +949,8 @@
     * Get the default cpu access_type for buffers created on this accelerator
     */
    access_type get_default_cpu_access_type() const { return pDev->get_access(); }
-
-
+
+
    /**
     * Returns the maximum size of tile static area available on this
     * accelerator.
@@ -947,13 +958,13 @@
    size_t get_max_tile_static_size() {
      return get_default_view().get_max_tile_static_size();
    }
-
+
    /**
     * Returns a vector of all accelerator_view associated with this accelerator.
    */
    std::vector<accelerator_view> get_all_views() {
        std::vector<accelerator_view> result;
-        std::vector< std::shared_ptr<Kalmar::KalmarQueue> > queues = pDev->get_all_queues();
+        std::vector< std::shared_ptr<detail::HCCQueue> > queues = pDev->get_all_queues();
        for (auto q : queues) {
            result.push_back(q);
        }
@@ -962,7 +973,7 @@
    /**
     * Returns an opaque handle which points to the AM region on the HSA agent.
-     * This region can be used to allocate accelerator memory which is accessible from the
+     * This region can be used to allocate accelerator memory which is accessible from the
     * specified accelerator.
     *
     * @return An opaque handle of the region, if the accelerator is based
@@ -974,7 +985,7 @@
    /**
     * Returns an opaque handle which points to the AM system region on the HSA agent.
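For orientation, a short sketch of enumeration and queue creation with the accessors above (get_device_path belongs to the wider accelerator interface, not shown in this hunk):

    #include <iostream>

    for (const hc::accelerator& a : hc::accelerator::get_all())
        std::wcout << a.get_device_path() << L'\n';

    hc::accelerator acc;    // the default accelerator
    hc::accelerator_view av =
        acc.create_view(hc::execute_in_order, hc::queuing_mode_automatic);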
- * This region can be used to allocate system memory which is accessible from the + * This region can be used to allocate system memory which is accessible from the * specified accelerator. * * @return An opaque handle of the region, if the accelerator is based @@ -986,7 +997,7 @@ class accelerator /** * Returns an opaque handle which points to the AM system region on the HSA agent. - * This region can be used to allocate finegrained system memory which is accessible from the + * This region can be used to allocate finegrained system memory which is accessible from the * specified accelerator. * * @return An opaque handle of the region, if the accelerator is based @@ -1050,27 +1061,24 @@ class accelerator * Check if @p other is peer of this accelerator. * * @return true if other can access this accelerator's device memory pool or false if not. - * The acceleratos is not its own peer. + * The accelerator is not its own peer. */ bool get_is_peer(const accelerator& other) const { return pDev->is_peer(other.pDev); } - + /** - * Return a std::vector of this accelerator's peers. peer is other accelerator which can access this + * Return a std::vector of this accelerator's peers. peer is other accelerator which can access this * accelerator's device memory using map_to_peer family of APIs. * */ - std::vector get_peers() const { + std::vector get_peers() const + { // TODO: remove / optimise. std::vector peers; - const auto &accs = get_all(); + static const auto accs = get_all(); + for (auto&& acc : accs) if (get_is_peer(acc)) peers.push_back(acc); - for(auto iter = accs.begin(); iter != accs.end(); iter++) - { - if(this->get_is_peer(*iter)) - peers.push_back(*iter); - } return peers; } @@ -1101,12 +1109,12 @@ class accelerator return pDev->has_cpu_accessible_am(); }; - Kalmar::KalmarDevice *get_dev_ptr() const { return pDev; }; + detail::HCCDevice *get_dev_ptr() const { return pDev; }; private: - accelerator(Kalmar::KalmarDevice* pDev) : pDev(pDev) {} + accelerator(detail::HCCDevice* pDev) : pDev(pDev) {} friend class accelerator_view; - Kalmar::KalmarDevice* pDev; + detail::HCCDevice* pDev; }; // ------------------------------------------------------------------------ @@ -1132,7 +1140,7 @@ class completion_future { completion_future() : __amp_future(), __thread_then(nullptr), __asyncOp(nullptr) {}; /** - * Copy constructor. Constructs a new completion_future object that referes + * Copy constructor. Constructs a new completion_future object that refers * to the same asynchronous operation as the other completion_future object. * * @param[in] other An object of type completion_future from which to @@ -1143,7 +1151,7 @@ class completion_future { /** * Move constructor. Move constructs a new completion_future object that - * referes to the same asynchronous operation as originally refered by the + * refers to the same asynchronous operation as originally referred by the * other completion_future object. After this constructor returns, * other.valid() == false * @@ -1230,12 +1238,12 @@ class completion_future { if (this->valid()) { if (__asyncOp != nullptr) { __asyncOp->setWaitMode(mode); - } + } //TODO-ASYNC - need to reclaim older AsyncOps here. 
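A brief sketch of the tidied get_peers above:

    hc::accelerator here;
    for (const hc::accelerator& peer : here.get_peers()) {
        // `peer` can access here's device memory pool; note an accelerator
        // is never reported as its own peer.
    }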
__amp_future.wait(); } - Kalmar::getContext()->flushPrintfBuffer(); + detail::getContext()->flushPrintfBuffer(); } template @@ -1269,19 +1277,17 @@ class completion_future { // the original signature in the specification should be // template // void then(const functor& func) const; - template - void then(const functor & func) { -#if __KALMAR_ACCELERATOR__ != 1 - // could only assign once - if (__thread_then == nullptr) { - // spawn a new thread to wait on the future and then execute the callback functor - __thread_then = new std::thread([&]() __CPU__ { - this->wait(); - if(this->valid()) - func(); - }); - } -#endif + template + void then(const F& func) + { // TODO: this should be completely redone, it is inefficient and odd. + // could only assign once + if (__thread_then == nullptr) { + // spawn a new thread to wait on the future and then execute the callback functor + __thread_then = new std::thread([&]() __CPU__ { + this->wait(); + if(this->valid()) func(); + }); + } } /** @@ -1290,7 +1296,7 @@ class completion_future { * purpose. * Applications should retain the parent completion_future to ensure * the native handle is not deallocated by the HCC runtime. The completion_future - * pointer to the native handle is reference counted, so a copy of + * pointer to the native handle is reference counted, so a copy of * the completion_future is sufficient to retain the native_handle. */ void* get_native_handle() const { @@ -1330,7 +1336,8 @@ class completion_future { } /** - * Get the frequency of ticks per second for the underlying asynchrnous operation. + * Get the frequency of ticks per second for the underlying asynchronous + * operation. * * @return An implementation-defined frequency in Hz in case the instance is * created by a kernel dispatch or a barrier packet. 0 otherwise. @@ -1362,7 +1369,7 @@ class completion_future { } delete __thread_then; __thread_then = nullptr; - + if (__asyncOp != nullptr) { __asyncOp = nullptr; } @@ -1377,14 +1384,14 @@ class completion_future { private: std::shared_future __amp_future; std::thread* __thread_then = nullptr; - std::shared_ptr __asyncOp; + std::shared_ptr __asyncOp; - completion_future(std::shared_ptr event) : __amp_future(*(event->getFuture())), __asyncOp(event) {} + completion_future(std::shared_ptr event) : __amp_future(*(event->getFuture())), __asyncOp(event) {} completion_future(const std::shared_future &__future) : __amp_future(__future), __thread_then(nullptr), __asyncOp(nullptr) {} - friend class Kalmar::HSAQueue; + friend class detail::HSAQueue; // non-tiled parallel_for_each // generic version @@ -1441,10 +1448,10 @@ accelerator_view::get_accelerator() const { return pQueue->getDev(); } inline completion_future accelerator_view::create_marker(memory_scope scope) const { - std::shared_ptr deps[1]; + std::shared_ptr deps[1]; // If necessary create an explicit dependency on previous command // This is necessary for example if copy command is followed by marker - we need the marker to wait for the copy to complete. 
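Given the TODO above, a hedged sketch of continuation use; the callback runs on a helper std::thread spawned by then(), so the completion_future must outlive it (av is assumed context):

    #include <cstdio>

    hc::completion_future cf = av.create_marker(hc::system_scope);
    cf.then([]{ std::puts("older commands have retired"); });
    cf.wait();   // wait() also flushes the printf buffer, per the body above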
- std::shared_ptr depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr); + std::shared_ptr depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr); int cnt = 0; if (depOp) { @@ -1457,11 +1464,11 @@ accelerator_view::create_marker(memory_scope scope) const { inline unsigned int accelerator_view::get_version() const { return get_accelerator().get_version(); } inline completion_future accelerator_view::create_blocking_marker(completion_future& dependent_future, memory_scope scope) const { - std::shared_ptr deps[2]; + std::shared_ptr deps[2]; // If necessary create an explicit dependency on previous command // This is necessary for example if copy command is followed by marker - we need the marker to wait for the copy to complete. - std::shared_ptr depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr); + std::shared_ptr depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr); int cnt = 0; if (depOp) { @@ -1470,21 +1477,21 @@ inline completion_future accelerator_view::create_blocking_marker(completion_fut if (dependent_future.__asyncOp) { deps[cnt++] = dependent_future.__asyncOp; // retrieve async op associated with completion_future - } - + } + return completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope)); } template inline completion_future accelerator_view::create_blocking_marker(InputIterator first, InputIterator last, memory_scope scope) const { - std::shared_ptr deps[5]; // array of 5 pointers to the native handle of async ops. 5 is the max supported by barrier packet + std::shared_ptr deps[5]; // array of 5 pointers to the native handle of async ops. 5 is the max supported by barrier packet hc::completion_future lastMarker; // If necessary create an explicit dependency on previous command // This is necessary for example if copy command is followed by marker - we need the marker to wait for the copy to complete. - std::shared_ptr depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr); + std::shared_ptr depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr); int cnt = 0; if (depOp) { @@ -1534,8 +1541,8 @@ accelerator_view::copy_async(const void *src, void *dst, size_t size_bytes) { inline completion_future accelerator_view::copy_async_ext(const void *src, void *dst, size_t size_bytes, - hcCommandKind copyDir, - const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, + hcCommandKind copyDir, + const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, const hc::accelerator *copyAcc) { return completion_future(pQueue->EnqueueAsyncCopyExt(src, dst, size_bytes, copyDir, srcInfo, dstInfo, copyAcc ? copyAcc->pDev : nullptr)); @@ -1663,7 +1670,7 @@ class extent { * by this extent (with an assumed origin of zero). */ bool contains(const index& idx) const __CPU__ __HC__ { - return Kalmar::amp_helper, extent>::contains(idx, *this); + return detail::amp_helper, extent>::contains(idx, *this); } /** @@ -1672,7 +1679,7 @@ class extent { * extent[0] * extent[1] ... * extent[N-1] */ unsigned int size() const __CPU__ __HC__ { - return Kalmar::index_helper>::count_size(*this); + return detail::index_helper>::count_size(*this); } /** @{ */ @@ -1715,7 +1722,7 @@ class extent { * @param[in] other The right-hand extent to be compared. 
*/ bool operator==(const extent& other) const __CPU__ __HC__ { - return Kalmar::index_helper >::equal(*this, other); + return detail::index_helper >::equal(*this, other); } bool operator!=(const extent& other) const __CPU__ __HC__ { return !(*this == other); @@ -1845,10 +1852,10 @@ class extent { /** @} */ private: - typedef Kalmar::index_impl::type> base; + typedef detail::index_impl::type> base; base base_; - template friend struct Kalmar::index_helper; - template friend struct Kalmar::amp_helper; + template friend struct detail::index_helper; + template friend struct detail::amp_helper; }; // ------------------------------------------------------------------------ @@ -1975,12 +1982,12 @@ template class tiled_extent : public extent { public: static const int rank = N; - + /** * Tile size for each dimension. */ int tile_dim[N]; - + /** * Default constructor. The origin and extent is default-constructed and * thus zero. @@ -2063,7 +2070,7 @@ class tiled_extent<1> : public extent<1> { * @param[in] ext The extent of this tiled_extent * @param[in] t0 Size of tile. */ - tiled_extent(const extent<1>& ext, int t0) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0} {} + tiled_extent(const extent<1>& ext, int t0) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0} {} /** * Constructs a tiled_extent with the extent "ext". @@ -2350,7 +2357,7 @@ tiled_extent<3> extent::tile_with_dynamic(int t0, int t1, int t2, int dynamic * @return The size of a wavefront. */ #define __HSA_WAVEFRONT_SIZE__ (64) -extern "C" unsigned int __wavesize() __HC__; +extern "C" unsigned int __wavesize() __HC__; #if __hcc_backend__==HCC_BACKEND_AMDGPU @@ -2362,7 +2369,7 @@ extern "C" inline unsigned int __wavesize() __HC__ { /** * Count number of 1 bits in the input * - * @param[in] input An unsinged 32-bit integer. + * @param[in] input An unsigned 32-bit integer. * @return Number of 1 bits in the input. */ extern "C" inline unsigned int __popcount_u32_b32(unsigned int input) __HC__ { @@ -2372,7 +2379,7 @@ extern "C" inline unsigned int __popcount_u32_b32(unsigned int input) __HC__ { /** * Count number of 1 bits in the input * - * @param[in] input An unsinged 64-bit integer. + * @param[in] input An unsigned 64-bit integer. * @return Number of 1 bits in the input. */ extern "C" inline unsigned int __popcount_u32_b64(unsigned long long int input) __HC__ { @@ -2531,7 +2538,7 @@ extern "C" inline unsigned int __lastbit_u32_s64(unsigned long long input) __HC_ /** @{ */ /** * Copy and interleave the lower half of the elements from - * each source into the desitionation + * each source into the destination * * Please refer to HSA PRM 5.9 for more detailed specification of these functions. */ @@ -2559,7 +2566,7 @@ extern "C" int64_t __unpacklo_s32x2(int64_t src0, int64_t src1) __HC__; /** @{ */ /** * Copy and interleave the upper half of the elements from - * each source into the desitionation + * each source into the destination * * Please refer to HSA PRM 5.9 for more detailed specification of these functions. */ @@ -2890,15 +2897,15 @@ inline float __amdgcn_ds_swizzle(float src, int pattern) [[hc]] { /** * move DPP intrinsic */ -extern "C" int __amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl) [[hc]]; +extern "C" int __amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl) [[hc]]; /** - * Shift the value of src to the right by one thread within a wavefront. 
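A minimal sketch of the extent algebra documented above:

    hc::extent<2> e{4, 8};
    unsigned int n = e.size();                       // 32 == 4 * 8
    bool inside    = e.contains(hc::index<2>{3, 7}); // true: within [0,4) x [0,8)
    bool same      = (e == hc::extent<2>{4, 8});     // true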
- * + * Shift the value of src to the right by one thread within a wavefront. + * * @param[in] src variable being shifted * @param[in] bound_ctrl When set to true, a zero will be shifted into thread 0; otherwise, the original value will be returned for thread 0 - * @return value of src being shifted into from the neighboring lane - * + * @return value of src being shifted into from the neighboring lane + * */ extern "C" int __amdgcn_wave_sr1(int src, bool bound_ctrl) [[hc]]; inline unsigned int __amdgcn_wave_sr1(unsigned int src, bool bound_ctrl) [[hc]] { @@ -2913,14 +2920,14 @@ inline float __amdgcn_wave_sr1(float src, bool bound_ctrl) [[hc]] { } /** - * Shift the value of src to the left by one thread within a wavefront. - * + * Shift the value of src to the left by one thread within a wavefront. + * * @param[in] src variable being shifted * @param[in] bound_ctrl When set to true, a zero will be shifted into thread 63; otherwise, the original value will be returned for thread 63 - * @return value of src being shifted into from the neighboring lane - * + * @return value of src being shifted into from the neighboring lane + * */ -extern "C" int __amdgcn_wave_sl1(int src, bool bound_ctrl) [[hc]]; +extern "C" int __amdgcn_wave_sl1(int src, bool bound_ctrl) [[hc]]; inline unsigned int __amdgcn_wave_sl1(unsigned int src, bool bound_ctrl) [[hc]] { __u tmp; tmp.u = src; tmp.i = __amdgcn_wave_sl1(tmp.i, bound_ctrl); @@ -2934,11 +2941,11 @@ inline float __amdgcn_wave_sl1(float src, bool bound_ctrl) [[hc]] { /** - * Rotate the value of src to the right by one thread within a wavefront. - * + * Rotate the value of src to the right by one thread within a wavefront. + * * @param[in] src variable being rotated - * @return value of src being rotated into from the neighboring lane - * + * @return value of src being rotated into from the neighboring lane + * */ extern "C" int __amdgcn_wave_rr1(int src) [[hc]]; inline unsigned int __amdgcn_wave_rr1(unsigned int src) [[hc]] { @@ -2953,11 +2960,11 @@ inline float __amdgcn_wave_rr1(float src) [[hc]] { } /** - * Rotate the value of src to the left by one thread within a wavefront. - * + * Rotate the value of src to the left by one thread within a wavefront. 
+ * * @param[in] src variable being rotated - * @return value of src being rotated into from the neighboring lane - * + * @return value of src being rotated into from the neighboring lane + * */ extern "C" int __amdgcn_wave_rl1(int src) [[hc]]; inline unsigned int __amdgcn_wave_rl1(unsigned int src) [[hc]] { @@ -2973,7 +2980,7 @@ inline float __amdgcn_wave_rl1(float src) [[hc]] { #endif -/* definition to expand macro then apply to pragma message +/* definition to expand macro then apply to pragma message #define VALUE_TO_STRING(x) #x #define VALUE(x) VALUE_TO_STRING(x) #define VAR_NAME_VALUE(var) #var "=" VALUE(var) @@ -2988,8 +2995,6 @@ inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ return __amdgcn_ds_bpermute(index<<2, var); } -#endif - inline unsigned int __shfl(unsigned int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ { __u tmp; tmp.u = var; tmp.i = __shfl(tmp.i, srcLane, width); @@ -3003,6 +3008,8 @@ inline float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __ return tmp.f; } +#endif + // FIXME: support half type /** @} */ @@ -3037,8 +3044,6 @@ inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WA return __amdgcn_ds_bpermute(index<<2, var); } -#endif - inline unsigned int __shfl_up(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ { __u tmp; tmp.u = var; tmp.i = __shfl_up(tmp.i, delta, width); @@ -3051,6 +3056,8 @@ inline float __shfl_up(float var, const unsigned int delta, const int width=__HS return tmp.f; } +#endif + // FIXME: support half type /** @} */ @@ -3086,8 +3093,6 @@ inline int __shfl_down(int var, const unsigned int delta, const int width=__HSA_ return __amdgcn_ds_bpermute(index<<2, var); } -#endif - inline unsigned int __shfl_down(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ { __u tmp; tmp.u = var; tmp.i = __shfl_down(tmp.i, delta, width); @@ -3100,6 +3105,7 @@ inline float __shfl_down(float var, const unsigned int delta, const int width=__ return tmp.f; } +#endif // FIXME: support half type /** @} */ @@ -3132,8 +3138,6 @@ inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) _ return __amdgcn_ds_bpermute(index<<2, var); } -#endif - inline float __shfl_xor(float var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ { __u tmp; tmp.f = var; tmp.i = __shfl_xor(tmp.i, laneMask, width); @@ -3149,6 +3153,8 @@ inline unsigned int __shfl_xor(unsigned int var, int laneMask, int width=__HSA_W return tmp.u; } +#endif + /** * Multiply two unsigned integers (x,y) but only the lower 24 bits will be used in the multiplication. * @@ -3251,7 +3257,7 @@ class tile_barrier { * @param[in] other An object of type tile_barrier from which to initialize * this. */ - tile_barrier(const tile_barrier& other) __CPU__ __HC__ {} + tile_barrier(const tile_barrier&) __CPU__ __HC__ = default; /** * Blocks execution of all threads in the thread tile until all threads in @@ -3368,7 +3374,8 @@ class tiled_index { * @param[in] other An object of type tiled_index from which to initialize * this. 
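To make the lane-exchange intrinsics above concrete, a hedged wavefront-reduction sketch (assumes the AMDGPU backend and a 64-wide wavefront, per the surrounding guards):

    float wave_sum(float v) [[hc]]
    {
        for (int offset = __HSA_WAVEFRONT_SIZE__ / 2; offset > 0; offset /= 2)
            v += __shfl_down(v, offset);
        return v;   // lane 0 now holds the wavefront-wide sum
    }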
*/ - tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {} + tiled_index(const tiled_index&) [[cpu, hc]] = default; + tiled_index(tiled_index&&) [[cpu, hc]] = default; /** * An index of rank 1, 2, or 3 that represents the global index within an @@ -3417,19 +3424,27 @@ class tiled_index { private: tiled_index() __HC__ - : global(index<3>(amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0))), - local(index<3>(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0))), - tile(index<3>(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0))), - tile_origin(index<3>(amp_get_global_id(2) - amp_get_local_id(2), - amp_get_global_id(1) - amp_get_local_id(1), - amp_get_global_id(0) - amp_get_local_id(0))), - tile_dim(index<3>(amp_get_local_size(2), amp_get_local_size(1), amp_get_local_size(0))) + : + global( + amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0)), + local(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0)), + tile(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0)), + tile_origin( + amp_get_global_id(2) - amp_get_local_id(2), + amp_get_global_id(1) - amp_get_local_id(1), + amp_get_global_id(0) - amp_get_local_id(0)), + tile_dim( + amp_get_local_size(2), + amp_get_local_size(1), + amp_get_local_size(0)) {} - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent&, const Kernel&); + template friend - struct Kalmar::Indexer; + completion_future parallel_for_each( + const accelerator_view&, const tiled_extent&, const Kernel&); + friend + struct detail::Indexer; }; @@ -3454,7 +3469,7 @@ class tiled_index<1> { * @param[in] other An object of type tiled_index from which to initialize * this. */ - tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {} + tiled_index(const tiled_index& other) __CPU__ __HC__ = default; /** * An index of rank 1, 2, or 3 that represents the global index within an @@ -3503,17 +3518,19 @@ class tiled_index<1> { private: tiled_index() __HC__ - : global(index<1>(amp_get_global_id(0))), - local(index<1>(amp_get_local_id(0))), - tile(index<1>(amp_get_group_id(0))), - tile_origin(index<1>(amp_get_global_id(0) - amp_get_local_id(0))), - tile_dim(index<1>(amp_get_local_size(0))) + : global(amp_get_global_id(0)), + local(amp_get_local_id(0)), + tile(amp_get_group_id(0)), + tile_origin(amp_get_global_id(0) - amp_get_local_id(0)), + tile_dim(amp_get_local_size(0)) {} - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&); + template + friend + completion_future parallel_for_each( + const accelerator_view&, const tiled_extent<1>&, const Kernel&); friend - struct Kalmar::Indexer; + struct detail::Indexer; }; /** @@ -3537,7 +3554,7 @@ class tiled_index<2> { * @param[in] other An object of type tiled_index from which to initialize * this. 
*/ - tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {} + tiled_index(const tiled_index& other) __CPU__ __HC__ = default; /** * An index of rank 1, 2, or 3 that represents the global index within an @@ -3586,18 +3603,21 @@ class tiled_index<2> { private: tiled_index() __HC__ - : global(index<2>(amp_get_global_id(1), amp_get_global_id(0))), - local(index<2>(amp_get_local_id(1), amp_get_local_id(0))), - tile(index<2>(amp_get_group_id(1), amp_get_group_id(0))), - tile_origin(index<2>(amp_get_global_id(1) - amp_get_local_id(1), - amp_get_global_id(0) - amp_get_local_id(0))), - tile_dim(index<2>(amp_get_local_size(1), amp_get_local_size(0))) + : global(amp_get_global_id(1), amp_get_global_id(0)), + local(amp_get_local_id(1), amp_get_local_id(0)), + tile(amp_get_group_id(1), amp_get_group_id(0)), + tile_origin( + amp_get_global_id(1) - amp_get_local_id(1), + amp_get_global_id(0) - amp_get_local_id(0)), + tile_dim(amp_get_local_size(1), amp_get_local_size(0)) {} - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&); + template friend - struct Kalmar::Indexer; + completion_future parallel_for_each( + const accelerator_view&, const tiled_extent<2>&, const Kernel&); + friend + struct detail::Indexer; }; // ------------------------------------------------------------------------ @@ -3648,14 +3668,14 @@ struct projection_helper // T& operator[](int i) const __CPU__ __HC__; typedef T& result_type; static result_type project(array_view& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 now.cache.get_cpu_access(true); #endif T *ptr = reinterpret_cast(now.cache.get() + i + now.offset + now.index_base[0]); return *ptr; } static result_type project(const array_view& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 now.cache.get_cpu_access(true); #endif T *ptr = reinterpret_cast(now.cache.get() + i + now.offset + now.index_base[0]); @@ -3709,14 +3729,14 @@ struct projection_helper // const T& operator[](int i) const __CPU__ __HC__; typedef const T& const_result_type; static const_result_type project(array_view& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 now.cache.get_cpu_access(); #endif const T *ptr = reinterpret_cast(now.cache.get() + i + now.offset + now.index_base[0]); return *ptr; } static const_result_type project(const array_view& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 now.cache.get_cpu_access(); #endif const T *ptr = reinterpret_cast(now.cache.get() + i + now.offset + now.index_base[0]); @@ -3772,18 +3792,18 @@ struct array_projection_helper typedef array_view result_type; typedef array_view const_result_type; static result_type project(array& now, int stride) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 if( stride < 0) - throw runtime_exception("errorMsg_throw", 0); + throw runtime_exception{"errorMsg_throw", 0}; #endif int comp[N - 1], i; for (i = N - 1; i > 0; --i) comp[i - 1] = now.extent[i]; extent ext(comp); int offset = ext.size() * stride; -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 if( offset >= now.extent.size()) - throw runtime_exception("errorMsg_throw", 0); + throw runtime_exception{"errorMsg_throw", 0}; #endif return 
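A brief sketch of tiled dispatch consuming the tiled_index members above (hypothetical kernel body):

    hc::extent<1> ext{1024};
    hc::parallel_for_each(ext.tile(64), [=](hc::tiled_index<1> tidx) [[hc]] {
        tile_static float lds[64];
        lds[tidx.local[0]] = static_cast<float>(tidx.global[0]);
        tidx.barrier.wait();
        // ... threads in the 64-wide tile can now cooperate on lds ...
    });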
result_type(now.m_device, ext, ext, index(), offset); } @@ -3806,14 +3826,14 @@ struct array_projection_helper typedef T& result_type; typedef const T& const_result_type; static result_type project(array& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 now.m_device.synchronize(true); #endif T *ptr = reinterpret_cast(now.m_device.get() + i); return *ptr; } static const_result_type project(const array& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 now.m_device.synchronize(); #endif const T *ptr = reinterpret_cast(now.m_device.get() + i); @@ -3824,11 +3844,11 @@ struct array_projection_helper template const extent& check(const extent& ext) { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 for (int i = 0; i < N; i++) { if(ext[i] <=0) - throw runtime_exception("errorMsg_throw", 0); + throw runtime_exception{"errorMsg_throw", 0}; } #endif return ext; @@ -3838,40 +3858,40 @@ const extent& check(const extent& ext) // forward declarations of copy routines used by array / array_view // ------------------------------------------------------------------------ -template +template void copy(const array_view& src, const array_view& dest); -template +template void copy(const array_view& src, const array_view& dest); -template +template void copy(const array& src, const array_view& dest); -template +template void copy(const array& src, array& dest); -template +template void copy(const array_view& src, array& dest); -template +template void copy(const array_view& src, array& dest); -template +template void copy(InputIter srcBegin, InputIter srcEnd, const array_view& dest); -template +template void copy(InputIter srcBegin, InputIter srcEnd, array& dest); -template +template void copy(InputIter srcBegin, const array_view& dest); -template +template void copy(InputIter srcBegin, array& dest); -template +template void copy(const array_view &src, OutputIter destBegin); -template +template void copy(const array &src, OutputIter destBegin); // ------------------------------------------------------------------------ @@ -3887,18 +3907,100 @@ void copy(const array &src, OutputIter destBegin); */ template class array { - static_assert(!std::is_const::value, "array is not supported"); -public: -#if __KALMAR_ACCELERATOR__ == 1 - typedef Kalmar::_data acc_buffer_t; -#else - typedef Kalmar::_data_host acc_buffer_t; -#endif + static_assert(!std::is_const{}, "array is not supported"); + static_assert( + std::is_trivially_copyable{}, + "Only trivially copyable types are supported."); + static_assert( + std::is_trivially_destructible{}, + "Only trivially destructible types are supported."); + + struct Deleter { + void operator()(T* ptr) + { // TODO: this may throw in a dtor, which is bad. + if (hsa_memory_free(ptr) != HSA_STATUS_SUCCESS) { + throw std::runtime_error{"Failed to deallocate array memory."}; + } + } + }; + using Guarded_locked_ptr = std::pair; + + inline static constexpr std::size_t max_array_cnt_{65521u}; // Prime. 
+ inline static std::array locked_ptrs_{}; + accelerator_view owner_; + accelerator_view associate_; + extent extent_; + access_type cpu_access_; + std::unique_ptr data_; + std::size_t this_idx_{max_array_cnt_}; + + template + friend + struct projection_helper; + template + friend + struct array_projection_helper; + + template + friend + void copy(const array&, const array_view&); + template + friend + void copy(const array_view&, array&); + + T* allocate_() + { + hsa_region_t* r{nullptr}; + switch (cpu_access_) { + case access_type_none: case access_type_auto: + r = static_cast(owner_.get_hsa_am_region()); + break; + default: + r = static_cast(owner_.get_hsa_am_system_region()); + } + + void* tmp{nullptr}; + + auto s = hsa_memory_allocate(*r, extent_.size() * sizeof(T), &tmp); + if (s != HSA_STATUS_SUCCESS) { + throw std::runtime_error{"Failed to allocate array storage."}; + } + + return static_cast(tmp); + } + std::size_t lock_this_() + { + const auto n = reinterpret_cast(this) % max_array_cnt_; + do { + while (locked_ptrs_[n].first.test_and_set()); + // TODO: add backoff here. + + auto s = hsa_amd_memory_lock( + this, + sizeof(*this), + static_cast(owner_.get_hsa_agent()), + 1, + reinterpret_cast(&locked_ptrs_[n].second)); + + if (s != HSA_STATUS_SUCCESS) { + throw std::runtime_error{"Failed to lock array address."}; + } + + return n; + } while (true); // TODO: add termination after a number of attempts. + } + array* this_() const [[hc]] + { + const auto n = reinterpret_cast(this) % max_array_cnt_; + + return locked_ptrs_[n].second; + } +public: /** * The rank of this array. */ - static const int rank = N; + static constexpr int rank = N; /** * The element type of this array. @@ -3909,7 +4011,7 @@ class array { * There is no default constructor for array. */ array() = delete; - + /** * Copy constructor. Constructs a new array from the supplied argument * other. The new array is located on the same accelerator_view as the @@ -3919,8 +4021,10 @@ class array { * this new array. */ array(const array& other) - : array(other.get_extent(), other.get_accelerator_view()) - { copy(other, *this); } + : array{other.extent_, other.owner_, other.associate_} + { // TODO: if both arrays resolve to the same slot this will deadlock. + copy(other, *this); + } /** * Move constructor. Constructs a new array by moving from the @@ -3930,8 +4034,26 @@ class array { * this new array. */ array(array&& other) - : m_device(other.m_device), extent(other.extent) - { other.m_device.reset(); } + : + owner_{std::move(other.owner_)}, + associate_{std::move(other.associate_)}, + extent_{std::move(other.extent_)}, + cpu_access_{other.cpu_access_}, + data_{std::move(other.data_)} + { + const auto n = reinterpret_cast(this) % max_array_cnt_; + + if (n == other.this_idx_) { + if (hsa_amd_memory_unlock(&other) != HSA_STATUS_SUCCESS) { + throw std::runtime_error{ + "Failed to unlock locked array pointer."}; + } + + other.this_idx_ = max_array_cnt_; + } + + this_idx_ = lock_this_(); + } /** * Constructs a new array with the supplied extent, located on the default @@ -3940,8 +4062,10 @@ class array { * * @param[in] ext The extent in each dimension of this array. */ - explicit array(const hc::extent& ext) - : array(ext, accelerator(L"default").get_default_view()) {} + explicit + array(const hc::extent& ext) + : array{ext, accelerator::get_auto_selection_view()} + {} /** @{ */ /** @@ -3950,12 +4074,15 @@ class array { * @param[in] e0,e1,e2 The component values that will form the extent of * this array. 
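The slot-locking scheme introduced above merits a standalone illustration: a fixed, prime-sized table of spinlock-guarded slots, indexed by hashing the object's address, is what lets device code recover a host-side `this`. A simplified sketch (C++20, for a well-defined default-initialised std::atomic_flag; the real code stores the hsa_amd_memory_lock result rather than the raw pointer):

    #include <array>
    #include <atomic>
    #include <cstdint>
    #include <utility>

    constexpr std::size_t table_size = 65521;   // prime, to spread addresses
    std::array<std::pair<std::atomic_flag, void*>, table_size> slots{};

    std::size_t lock_slot(void* p)
    {
        const auto n = reinterpret_cast<std::uintptr_t>(p) % table_size;
        while (slots[n].first.test_and_set(std::memory_order_acquire))
            ;                      // spin; the patch leaves backoff as a TODO
        slots[n].second = p;       // the device-visible alias would go here
        return n;
    }

    void unlock_slot(std::size_t n)
    {
        slots[n].second = nullptr;
        slots[n].first.clear(std::memory_order_release);
    }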
*/ - explicit array(int e0) - : array(hc::extent(e0)) { static_assert(N == 1, "illegal"); } - explicit array(int e0, int e1) - : array(hc::extent(e0, e1)) {} - explicit array(int e0, int e1, int e2) - : array(hc::extent(e0, e1, e2)) {} + explicit + array(int e0) : array{hc::extent{e0}} + { + static_assert(N == 1, "illegal"); + } + explicit + array(int e0, int e1) : array{hc::extent{e0, e1}} {} + explicit + array(int e0, int e1, int e2) : array{hc::extent{e0, e1, e2}} {} /** @} */ @@ -3973,12 +4100,14 @@ class array { * @param[in] srcBegin A beginning iterator into the source container. * @param[in] srcEnd An ending iterator into the source container. */ - template - array(const hc::extent& ext, InputIter srcBegin) - : array(ext, srcBegin, accelerator(L"default").get_default_view()) {} - template - array(const hc::extent& ext, InputIter srcBegin, InputIter srcEnd) - : array(ext, srcBegin, srcEnd, accelerator(L"default").get_default_view()) {} + template + array(const hc::extent& ext, InputIter srcBegin) + : array{ext, srcBegin, accelerator::get_auto_selection_view()} + {} + template + array(const hc::extent& ext, InputIter srcBegin, InputIter srcEnd) + : array{ext, srcBegin, srcEnd, accelerator::get_auto_selection_view()} + {} /** @} */ @@ -3989,27 +4118,31 @@ class array { * * @param[in] e0,e1,e2 The component values that will form the extent of * this array. - * @param[in] srcBegin A beginning iterator into the source container. + * @param[in] srcBegin A beginning iterator into the source container. * @param[in] srcEnd An ending iterator into the source container. */ - template - array(int e0, InputIter srcBegin) - : array(hc::extent(e0), srcBegin) {} - template - array(int e0, InputIter srcBegin, InputIter srcEnd) - : array(hc::extent(e0), srcBegin, srcEnd) {} - template - array(int e0, int e1, InputIter srcBegin) - : array(hc::extent(e0, e1), srcBegin) {} - template - array(int e0, int e1, InputIter srcBegin, InputIter srcEnd) - : array(hc::extent(e0, e1), srcBegin, srcEnd) {} - template - array(int e0, int e1, int e2, InputIter srcBegin) - : array(hc::extent(e0, e1, e2), srcBegin) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd) - : array(hc::extent(e0, e1, e2), srcBegin, srcEnd) {} + template + array(int e0, InputIter srcBegin) : array{hc::extent{e0}, srcBegin} {} + template + array(int e0, InputIter srcBegin, InputIter srcEnd) + : array{hc::extent{e0}, srcBegin, srcEnd} + {} + template + array(int e0, int e1, InputIter srcBegin) + : array{hc::extent{e0, e1}, srcBegin} + {} + template + array(int e0, int e1, InputIter srcBegin, InputIter srcEnd) + : array{hc::extent{e0, e1}, srcBegin, srcEnd} + {} + template + array(int e0, int e1, int e2, InputIter srcBegin) + : array{hc::extent{e0, e1, e2}, srcBegin} + {} + template + array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd) + : array{hc::extent{e0, e1, e2}, srcBegin, srcEnd} + {} /** @} */ @@ -4024,9 +4157,12 @@ class array { * this array (and also to determine the extent of this * array). */ - explicit array(const array_view& src) - : array(src.get_extent(), accelerator(L"default").get_default_view()) - { copy(src, *this); } + explicit + array(const array_view& src) + : array{src.get_extent(), accelerator::get_auto_selection_view()} + { + copy(src, *this); + } /** * Constructs a new array with the supplied extent, located on the @@ -4050,26 +4186,52 @@ class array { * this array. * @param[in] access_type The type of CPU access desired for this array. 
*/ - array(const hc::extent& ext, accelerator_view av, access_type cpu_access_type = access_type_auto) -#if __KALMAR_ACCELERATOR__ == 1 - : m_device(ext.size()), extent(ext) {} -#else - : m_device(av.pQueue, av.pQueue, check(ext).size(), cpu_access_type), extent(ext) {} -#endif + array( + const hc::extent& ext, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : + owner_{std::move(av)}, + associate_{owner_}, + extent_{ext}, + cpu_access_{cpu_access_type}, + data_{allocate_(), Deleter{}}, + this_idx_{lock_this_()} + {} /** @{ */ /** * Constructs an array instance based on the given pointer on the device memory. */ - explicit array(int e0, void* accelerator_pointer) - : array(hc::extent(e0), accelerator(L"default").get_default_view(), accelerator_pointer) {} - explicit array(int e0, int e1, void* accelerator_pointer) - : array(hc::extent(e0, e1), accelerator(L"default").get_default_view(), accelerator_pointer) {} - explicit array(int e0, int e1, int e2, void* accelerator_pointer) - : array(hc::extent(e0, e1, e2), accelerator(L"default").get_default_view(), accelerator_pointer) {} + array(int e0, void* accelerator_pointer) + : + array{ + hc::extent{e0}, + accelerator::get_auto_selection_view(), + static_cast(accelerator_pointer)} + {} + array(int e0, int e1, void* accelerator_pointer) + : + array{ + hc::extent{e0, e1}, + accelerator::get_auto_selection_view(), + static_cast(accelerator_pointer)} + {} + array(int e0, int e1, int e2, void* accelerator_pointer) + : + array{ + hc::extent{e0, e1, e2}, + accelerator::get_auto_selection_view(), + static_cast(accelerator_pointer)} + {} - explicit array(const hc::extent& ext, void* accelerator_pointer) - : array(ext, accelerator(L"default").get_default_view(), accelerator_pointer) {} + array(const hc::extent& ext, void* accelerator_pointer) + : + array{ + ext, + accelerator::get_auto_selection_view(), + static_cast(accelerator_pointer)} + {} /** @} */ /** @@ -4081,17 +4243,24 @@ class array { * @param[in] accelerator_pointer The pointer to the device memory. * @param[in] access_type The type of CPU access desired for this array. */ - explicit array(const extent& ext, accelerator_view av, void* accelerator_pointer, access_type cpu_access_type = access_type_auto) -#if __KALMAR_ACCELERATOR__ == 1 - : m_device(ext.size(), accelerator_pointer), extent(ext) {} -#else - : m_device(av.pQueue, av.pQueue, check(ext).size(), accelerator_pointer, cpu_access_type), extent(ext) {} -#endif + array( + const extent& ext, + accelerator_view av, + void* accelerator_pointer, + access_type cpu_access_type = access_type_auto) + : + owner_{av}, + associate_{owner_}, + extent_{ext}, + cpu_access_{cpu_access_type}, + data_{static_cast(accelerator_pointer), Deleter{}}, + this_idx_{lock_this_()} + {} /** @{ */ /** * Equivalent to construction using - * "array(extent(e0 [, e1 [, e2 ]]), av, cpu_access_type)". + * "array(extent(e0 [, e1 [, e2 ]]), av, cpu_access_type)". * * @param[in] e0,e1,e2 The component values that will form the extent of * this array. @@ -4099,12 +4268,27 @@ class array { * this array. * @param[in] access_type The type of CPU access desired for this array. 
*/ - array(int e0, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0), av, cpu_access_type) {} - array(int e0, int e1, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1), av, cpu_access_type) {} - array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1, e2), av, cpu_access_type) {} + array( + int e0, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : array{hc::extent{e0}, std::move(av), cpu_access_type} + {} + array( + int e0, + int e1, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : array{hc::extent{e0, e1}, std::move(av), cpu_access_type} + {} + array( + int e0, + int e1, + int e2, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : array{hc::extent{e0, e1, e2}, std::move(av), cpu_access_type} + {} /** @} */ @@ -4135,18 +4319,27 @@ class array { * location of this array. * @param[in] access_type The type of CPU access desired for this array. */ - template - array(const hc::extent& ext, InputIter srcBegin, accelerator_view av, - access_type cpu_access_type = access_type_auto) - : array(ext, av, cpu_access_type) { copy(srcBegin, *this); } - template - array(const hc::extent& ext, InputIter srcBegin, InputIter srcEnd, - accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(ext, av, cpu_access_type) { - if (ext.size() < std::distance(srcBegin, srcEnd)) - throw runtime_exception("errorMsg_throw", 0); - copy(srcBegin, srcEnd, *this); - } + template + array( + const hc::extent& ext, + InputIter srcBegin, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : array{ext, std::move(av), cpu_access_type} + { + copy(srcBegin, *this); + } + template + array( + const hc::extent& ext, + InputIter srcBegin, + InputIter srcEnd, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : array{ext, std::move(av), cpu_access_type} + { + copy(srcBegin, srcEnd, *this); + } /** @} */ @@ -4159,7 +4352,7 @@ class array { * * Users can optionally specify the type of CPU access desired for "this" * array thus requesting creation of an array that is accessible both on - * the specified accelerator_view "av" as well as the CPU (with the + * the specified accelerator_view "av" as well as the CPU (with the * specified CPU access_type). If a value other than access_type_auto or * access_type_none is specified for the cpu_access_type parameter and the * accelerator corresponding to the accelerator_view “av†does not support @@ -4176,8 +4369,14 @@ class array { * location of this array. * @param[in] access_type The type of CPU access desired for this array. */ - array(const array_view& src, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(src.get_extent(), av, cpu_access_type) { copy(src, *this); } + array( + const array_view& src, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : array{src.get_extent(), std::move(av), cpu_access_type} + { + copy(src, *this); + } /** @{ */ /** @@ -4192,24 +4391,79 @@ class array { * location of this array. * @param[in] access_type The type of CPU access desired for this array. 
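[Editor's note: one behavioural point in the iterator-pair overload above: the old constructor threw `runtime_exception` when the extent could not hold `std::distance(srcBegin, srcEnd)`, while the replacement delegates straight to `copy`, whose host-side rewrite later in this patch reports size mismatches as `std::logic_error` instead. A stand-alone sketch of that kind of up-front range check; the names are mine, not the patch's:]

```cpp
#include <cstddef>
#include <iterator>
#include <stdexcept>
#include <vector>

template<typename InputIter>
void validate_range(InputIter first, InputIter last, std::size_t capacity)
{
    const auto n = std::distance(first, last);
    if (n < 0 || static_cast<std::size_t>(n) > capacity) {
        throw std::logic_error{"source range does not fit the extent"};
    }
}

int main()
{
    std::vector<int> v(10);
    validate_range(v.begin(), v.end(), 10);    // fine
    // validate_range(v.begin(), v.end(), 8);  // would throw
}
```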
*/ - template - array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0), srcBegin, av, cpu_access_type) {} - template - array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0), srcBegin, srcEnd, av, cpu_access_type) {} - template - array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1), srcBegin, av, cpu_access_type) {} - template - array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1), srcBegin, srcEnd, av, cpu_access_type) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1, e2), srcBegin, av, cpu_access_type) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1, e2), srcBegin, srcEnd, av, cpu_access_type) {} + template + array( + int e0, + InputIter srcBegin, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : array{hc::extent{e0}, srcBegin, std::move(av), cpu_access_type} + {} + template + array( + int e0, + InputIter srcBegin, + InputIter srcEnd, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : + array{ + hc::extent{e0}, srcBegin, srcEnd, std::move(av), cpu_access_type} + {} + template + array( + int e0, + int e1, + InputIter srcBegin, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : array{hc::extent{e0, e1}, srcBegin, std::move(av), cpu_access_type} + {} + template + array( + int e0, + int e1, + InputIter srcBegin, + InputIter srcEnd, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : + array{ + hc::extent{e0, e1}, + srcBegin, + srcEnd, + std::move(av), + cpu_access_type} + {} + template + array( + int e0, + int e1, + int e2, + InputIter srcBegin, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : + array{ + hc::extent{e0, e1, e2}, srcBegin, std::move(av), cpu_access_type} + {} + template + array( + int e0, + int e1, + int e2, + InputIter srcBegin, + InputIter srcEnd, + accelerator_view av, + access_type cpu_access_type = access_type_auto) + : + array{ + hc::extent{e0, e1, e2}, + srcBegin, + srcEnd, + std::move(av), + cpu_access_type} + {} /** @} */ @@ -4225,16 +4479,22 @@ class array { * @param[in] associated_av An accelerator_view object which specifies a * target device accelerator. */ - array(const hc::extent& ext, accelerator_view av, accelerator_view associated_av) -#if __KALMAR_ACCELERATOR__ == 1 - : m_device(ext.size()), extent(ext) {} -#else - : m_device(av.pQueue, associated_av.pQueue, check(ext).size(), access_type_auto), extent(ext) {} -#endif + array( + const hc::extent& ext, + accelerator_view av, + accelerator_view associated_av) + : + owner_{std::move(av)}, + associate_{std::move(associated_av)}, + extent_{ext}, + cpu_access_{access_type_auto}, + data_{allocate_(), Deleter{}}, + this_idx_{lock_this_()} + {} /** @{ */ /** - * Equivalent to construction using + * Equivalent to construction using * "array(extent(e0 [, e1 [, e2 ]]), av, associated_av)". 
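[Editor's note: throughout these constructors the patch takes `accelerator_view av` by value and forwards it with `std::move(av)`: the standard sink-argument idiom, costing one copy at the outermost call and moves from there on. A generic illustration:]

```cpp
#include <string>
#include <utility>

class widget {
    std::string name_;
public:
    // Sink parameter: rvalue callers pay a move, lvalue callers one copy;
    // the constructor then moves into the member.
    explicit widget(std::string name) : name_{std::move(name)} {}
};

int main()
{
    std::string n{"gfx900"};
    widget a{n};              // one copy, into the parameter
    widget b{std::move(n)};   // no copy at all
}
```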
* * @param[in] e0,e1,e2 The component values that will form the extent of @@ -4245,11 +4505,14 @@ class array { * target device accelerator. */ array(int e0, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0), av, associated_av) {} + : array{hc::extent{e0}, std::move(av), associated_av} + {} array(int e0, int e1, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1), av, associated_av) {} + : array{hc::extent{e0, e1}, std::move(av), associated_av} + {} array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1, e2), av, associated_av) {} + : array{hc::extent{e0, e1, e2}, std::move(av), associated_av} + {} /** @} */ @@ -4268,16 +4531,27 @@ class array { * @param[in] associated_av An accelerator_view object which specifies a * target device accelerator. */ - template - array(const hc::extent& ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(ext, av, associated_av) { copy(srcBegin, *this); } - template - array(const hc::extent& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(ext, av, associated_av) { - if (ext.size() < std::distance(srcBegin, srcEnd)) - throw runtime_exception("errorMsg_throw", 0); - copy(srcBegin, srcEnd, *this); - } + template + array( + const hc::extent& ext, + InputIter srcBegin, + accelerator_view av, + accelerator_view associated_av) + : array{ext, std::move(av), std::move(associated_av)} + { + copy(srcBegin, *this); + } + template + array( + const hc::extent& ext, + InputIter srcBegin, + InputIter srcEnd, + accelerator_view av, + accelerator_view associated_av) + : array{ext, std::move(av), associated_av} + { + copy(srcBegin, srcEnd, *this); + } /** @} */ @@ -4297,9 +4571,14 @@ class array { * @param[in] associated_av An accelerator_view object which specifies a * target device accelerator. */ - array(const array_view& src, accelerator_view av, accelerator_view associated_av) - : array(src.get_extent(), av, associated_av) - { copy(src, *this); } + array( + const array_view& src, + accelerator_view av, + accelerator_view associated_av) + : array{src.get_extent(), std::move(av), associated_av} + { + copy(src, *this); + } /** @{ */ /** @@ -4315,49 +4594,114 @@ class array { * @param[in] associated_av An accelerator_view object which specifies a * target device accelerator. 
*/ - template - array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0), srcBegin, av, associated_av) {} - template - array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0), srcBegin, srcEnd, av, associated_av) {} - template - array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1), srcBegin, av, associated_av) {} - template - array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1), srcBegin, srcEnd, av, associated_av) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1, e2), srcBegin, av, associated_av) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1, e2), srcBegin, srcEnd, av, associated_av) {} + template + array( + int e0, + InputIter srcBegin, + accelerator_view av, + accelerator_view associated_av) + : array{hc::extent{e0}, srcBegin, std::move(av), associated_av} + {} + template + array( + int e0, + InputIter srcBegin, + InputIter srcEnd, + accelerator_view av, + accelerator_view associated_av) + : + array{hc::extent{e0}, srcBegin, srcEnd, std::move(av), associated_av} + {} + template + array( + int e0, + int e1, + InputIter srcBegin, + accelerator_view av, + accelerator_view associated_av) + : array{hc::extent{e0, e1}, srcBegin, std::move(av), associated_av} + {} + template + array( + int e0, + int e1, + InputIter srcBegin, + InputIter srcEnd, + accelerator_view av, + accelerator_view associated_av) + : + array{ + hc::extent{e0, e1}, + srcBegin, + srcEnd, + std::move(av), + associated_av} + {} + template + array( + int e0, + int e1, + int e2, + InputIter srcBegin, + accelerator_view av, + accelerator_view associated_av) + : + array{hc::extent{e0, e1, e2}, srcBegin, std::move(av), associated_av} + {} + template + array( + int e0, + int e1, + int e2, + InputIter srcBegin, + InputIter srcEnd, + accelerator_view av, + accelerator_view associated_av) + : + array{ + hc::extent(e0, e1, e2), + srcBegin, + srcEnd, + std::move(av), + associated_av} + {} /** @} */ /** * Access the extent that defines the shape of this array. */ - hc::extent get_extent() const __CPU__ __HC__ { return extent; } + hc::extent get_extent() const __CPU__ __HC__ + { + return extent_; + } /** * This property returns the accelerator_view representing the location * where this array has been allocated. */ - accelerator_view get_accelerator_view() const { return m_device.get_av(); } + accelerator_view get_accelerator_view() const + { + return owner_; + } /** * This property returns the accelerator_view representing the preferred * target where this array can be copied. */ - accelerator_view get_associated_accelerator_view() const { return m_device.get_stage(); } + accelerator_view get_associated_accelerator_view() const + { + return associate_; + } /** * This property returns the CPU "access_type" allowed for this array. */ - access_type get_cpu_access_type() const { return m_device.get_access(); } - + access_type get_cpu_access_type() const + { + return cpu_access_; + } + /** * Assigns the contents of the array "other" to this array, using a deep * copy. @@ -4381,12 +4725,11 @@ class array { * this array. * @return Returns *this. 
*/ - array& operator=(array&& other) { - if (this != &other) { - extent = other.extent; - m_device = other.m_device; - other.m_device.reset(); - } + array& operator=(array&& other) + { // TODO: potentially inefficient. + array tmp{std::move(other)}; + std::swap(*this, tmp); + return *this; } @@ -4398,12 +4741,16 @@ class array { * this array. * @return Returns *this. */ - array& operator=(const array_view& src) { - array arr(src); - *this = std::move(arr); + array& operator=(const array_view& src) + { + using std::swap; + + array tmp{src}; + swap(*this, tmp); + return *this; } - + /** * Copies the contents of this array to the array given by "dest", as * if by calling "copy(*this, dest)". @@ -4411,14 +4758,8 @@ class array { * @param[out] dest An object of type array to which to copy data * from this array. */ - void copy_to(array& dest) const { -#if __KALMAR_ACCELERATOR__ != 1 - for(int i = 0 ; i < N ; i++) - { - if (dest.extent[i] < this->extent[i] ) - throw runtime_exception("errorMsg_throw", 0); - } -#endif + void copy_to(array& dest) const + { copy(*this, dest); } @@ -4429,20 +4770,19 @@ class array { * @param[out] dest An object of type array_view to which to copy data * from this array. */ - void copy_to(const array_view& dest) const { copy(*this, dest); } + void copy_to(const array_view& dest) const + { + copy(*this, dest); + } /** * Returns a pointer to the raw data underlying this array. * - * @return A (const) pointer to the first element in the linearized array. + * @return A (const) pointer to the first element in the linearised array. */ - T* data() const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if (!m_device.get()) - return nullptr; - m_device.synchronize(true); -#endif - return reinterpret_cast(m_device.get()); + T* data() const [[cpu, hc]] + { + return data_.get(); } /** @@ -4451,8 +4791,11 @@ class array { * @return A (const) pointer to the first element in the array on the * device memory. */ - T* accelerator_pointer() const __CPU__ __HC__ { - return reinterpret_cast(m_device.get_device_pointer()); + T* accelerator_pointer() const [[cpu, hc]] + { // TODO: this is dumb, array is an owning owned container i.e. data_ IS + // an accelerator pointer; it is NOT array_view, and this function + // should be removed. + return data_.get(); } /** @@ -4463,9 +4806,9 @@ class array { * contained on the array. */ operator std::vector() const { - std::vector vec(extent.size()); + std::vector vec(extent_.size()); hc::copy(*this, vec.data()); - return std::move(vec); + return vec; } /** @{ */ @@ -4479,16 +4822,38 @@ class array { * @param[in] idx An object of type index from that specifies the * location of the element. */ - T& operator[](const index& idx) __CPU__ __HC__ { -#ifndef __KALMAR_ACCELERATOR__ - if (!m_device.get()) - throw runtime_exception("The array is not accessible on CPU.", 0); - m_device.synchronize(true); -#endif - T *ptr = reinterpret_cast(m_device.get()); - return ptr[Kalmar::amp_helper, hc::extent>::flatten(idx, extent)]; + T& operator[](const index& idx) [[cpu]] + { // TODO: simplify, this is a placeholder. 
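[Editor's note: the assignment operators above follow the copy-and-swap shape, with a TODO admitting it may be inefficient. One pitfall with this shape: `std::swap` (or an unqualified `swap` that resolves to it) is itself built from move assignment, so calling it on `*this` inside the move-assignment operator recurses. The conventional arrangement builds the assignment operators on a member-wise `swap`; a generic sketch, not HC code:]

```cpp
#include <utility>
#include <vector>

class buffer {
    std::vector<int> data_;
    int flags_{0};
public:
    buffer() = default;
    buffer(const buffer&) = default;
    buffer(buffer&&) = default;

    // Member-wise swap: touches no assignment operator, so no recursion.
    void swap(buffer& other) noexcept
    {
        using std::swap;
        swap(data_, other.data_);
        swap(flags_, other.flags_);
    }

    // Unified copy-and-swap assignment: the by-value parameter absorbs
    // both copy assignment and move assignment.
    buffer& operator=(buffer other) noexcept
    {
        swap(other);
        return *this;
    }
};
```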
+ static const accelerator cpu{L"cpu"}; + + switch (cpu_access_) { + case access_type_none: + throw runtime_exception{"The array is not accessible on CPU.", 0}; + case access_type_auto: + if (owner_.get_accelerator() != cpu) { + throw runtime_exception{ + "The array is not accessible on CPU.", 0}; + } + break; + default: + break; + } + + return data_[detail::amp_helper< + N, index, hc::extent>::flatten(idx, extent_)]; + } + T& operator[](const index& idx) [[hc]] + { + return this_()->data_[detail::amp_helper< + N, index, hc::extent>::flatten(idx, this_()->extent_)]; + } + template::type* = nullptr> + T& operator[](int i0) [[cpu, hc]] + { + return operator[](index<1>{i0}); } - T& operator()(const index& idx) __CPU__ __HC__ { + T& operator()(const index& idx) [[cpu, hc]] + { return (*this)[idx]; } @@ -4505,17 +4870,18 @@ class array { * @param[in] idx An object of type index from that specifies the * location of the element. */ - const T& operator[](const index& idx) const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if (!m_device.get()) - throw runtime_exception("The array is not accessible on CPU.", 0); - m_device.synchronize(); -#endif - T *ptr = reinterpret_cast(m_device.get()); - return ptr[Kalmar::amp_helper, hc::extent>::flatten(idx, extent)]; + const T& operator[](const index& idx) const [[cpu, hc]] + { // TODO: semi-ghastly, even though Scott Meyers approves of it. + return (*const_cast(this))[idx]; } - const T& operator()(const index& idx) const __CPU__ __HC__ { - return (*this)[idx]; + template::type* = nullptr> + const T& operator[](int i0) const [[cpu, hc]] + { + return operator[](index{i0}); + } + const T& operator()(const index& idx) const [[cpu, hc]] + { + return operator[](idx); } /** @} */ @@ -4528,11 +4894,20 @@ class array { * @param[in] i0,i1,i2 The component values that will form the index into * this array. */ - T& operator()(int i0, int i1) __CPU__ __HC__ { - return (*this)[index<2>(i0, i1)]; + template::type* = nullptr> + T& operator()(int i0) [[cpu, hc]] + { + return operator[](index<1>{i0}); } - T& operator()(int i0, int i1, int i2) __CPU__ __HC__ { - return (*this)[index<3>(i0, i1, i2)]; + template::type* = nullptr> + T& operator()(int i0, int i1) [[cpu, hc]] + { + return operator[](index<2>{i0, i1}); + } + template::type* = nullptr> + T& operator()(int i0, int i1, int i2) [[cpu, hc]] + { + return operator[](index<3>{i0, i1, i2}); } /** @} */ @@ -4545,11 +4920,20 @@ class array { * @param[in] i0,i1,i2 The component values that will form the index into * this array. */ - const T& operator()(int i0, int i1) const __CPU__ __HC__ { - return (*this)[index<2>(i0, i1)]; + template::type* = nullptr> + const T& operator()(int i0) const [[cpu, hc]] + { + return (*const_cast(this))(i0); } - const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ { - return (*this)[index<3>(i0, i1, i2)]; + template::type* = nullptr> + const T& operator()(int i0, int i1) const [[cpu, hc]] + { + return (*const_cast(this))(i0, i1); + } + template::type* = nullptr> + const T& operator()(int i0, int i1, int i2) const [[cpu, hc]] + { + return (*const_cast(this))(i0, i1, i2); } /** @{ */ @@ -4569,22 +4953,35 @@ class array { * @return Returns an array_view whose dimension is one lower than that of * this array. 
*/ - typename array_projection_helper::result_type - operator[] (int i) __CPU__ __HC__ { - return array_projection_helper::project(*this, i); - } - typename array_projection_helper::result_type - operator()(int i0) __CPU__ __HC__ { - return (*this)[i0]; - } - typename array_projection_helper::const_result_type - operator[] (int i) const __CPU__ __HC__ { - return array_projection_helper::project(*this, i); - } - typename array_projection_helper::const_result_type - operator()(int i0) const __CPU__ __HC__ { - return (*this)[i0]; - } + template 1)>::type* = nullptr> + array_view operator[](int i0) [[cpu, hc]] + { + hc::extent tmp; + for (auto i = 1; i != m; ++i) tmp[i - 1] = extent_[i]; + + return array_view{tmp, data() + i0 * tmp.size()}; + } + + template 1)>::type* = nullptr> + array_view operator[](int i0) const [[cpu, hc]] + { + hc::extent tmp; + for (auto i = 1; i != m; ++i) tmp[i - 1] = extent_[i]; + + return array_view{tmp, data() + i0 * tmp.size()}; + } + + template 1)>::type* = nullptr> + array_view operator()(int i0) [[cpu, hc]] + { + return (*this)[i0]; + } + + template 1)>::type* = nullptr> + array_view operator()(int i0) const [[cpu, hc]] + { + return (*this)[i0]; + } /** @} */ @@ -4606,17 +5003,34 @@ class array { * @return Returns a subsection of the source array at specified origin, * and with the specified extent. */ - array_view section(const index& origin, const hc::extent& ext) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if ( !Kalmar::amp_helper, hc::extent>::contains(origin, ext ,this->extent) ) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view av(*this); - return av.section(origin, ext); + array_view section( + const index& origin, const hc::extent& ext) [[cpu]] + { + if (extent_.size() < (ext + origin).size()) { + throw runtime_exception{"errorMsg_throw", 0}; + } + + return array_view{*this}.section(origin, ext); + } + array_view section( + const index& origin, const hc::extent& ext) [[hc]] + { + return array_view{*this}.section(origin, ext); + } + + array_view section( + const index& origin, const hc::extent& ext) const [[cpu]] + { + if (extent_.size() < (ext + origin).size()) { + throw runtime_exception{"errorMsg_throw", 0}; + } + + return array_view{*this}.section(origin, ext); } - array_view section(const index& origin, const hc::extent& ext) const __CPU__ __HC__ { - array_view av(*this); - return av.section(origin, ext); + array_view section( + const index& origin, const hc::extent& ext) const [[hc]] + { + return array_view{*this}.section(origin, ext); } /** @} */ @@ -4625,17 +5039,30 @@ class array { /** * Equivalent to "section(idx, this->extent – idx)". 
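[Editor's note: the `section` precondition above compares element counts, `extent_.size() < (ext + origin).size()`. A product comparison can pass a section that overflows one axis while undershooting another, so a per-dimension containment test is the stricter form. A stand-alone sketch with a worked counterexample:]

```cpp
#include <array>
#include <stdexcept>

template<int N>
void validate_section(const std::array<int, N>& origin,
                      const std::array<int, N>& ext,
                      const std::array<int, N>& bounds)
{
    for (int i = 0; i != N; ++i) {
        if (origin[i] < 0 || ext[i] < 1 || origin[i] + ext[i] > bounds[i]) {
            throw std::out_of_range{"section escapes the source extent"};
        }
    }
}

int main()
{   // A size-only test accepts the second call: 1 * 25 <= 5 * 5, yet a
    // row of 25 overflows a 5-wide extent.
    validate_section<2>({{0, 0}}, {{1, 5}}, {{5, 5}});        // fine
    // validate_section<2>({{0, 0}}, {{1, 25}}, {{5, 5}});    // would throw
}
```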
*/ - array_view section(const index& idx) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if ( !Kalmar::amp_helper, hc::extent>::contains(idx, this->extent ) ) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view av(*this); - return av.section(idx); + array_view section(const index& idx) [[cpu]] + { + if (!extent_.contains(idx)) { + throw runtime_exception{"errorMsg_throw", 0}; + } + + return array_view{*this}.section(idx); } - array_view section(const index& idx) const __CPU__ __HC__ { - array_view av(*this); - return av.section(idx); + array_view section(const index& idx) [[hc]] + { + return array_view{*this}.section(idx); + } + + array_view section(const index& idx) const [[cpu]] + { + if (!extent_.contains(idx)) { + throw runtime_exception{"errorMsg_throw", 0}; + } + + return array_view{*this}.section(idx); + } + array_view section(const index& idx) const [[hc]] + { + return array_view{*this}.section(idx); } /** @} */ @@ -4644,13 +5071,13 @@ class array { /** * Equivalent to "section(index(), ext)". */ - array_view section(const hc::extent& ext) __CPU__ __HC__ { - array_view av(*this); - return av.section(ext); + array_view section(const hc::extent& ext) [[cpu, hc]] + { + return array_view{*this}.section(ext); } - array_view section(const hc::extent& ext) const __CPU__ __HC__ { - array_view av(*this); - return av.section(ext); + array_view section(const hc::extent& ext) const [[cpu, hc]] + { + return array_view{*this}.section(ext); } /** @} */ @@ -4658,36 +5085,53 @@ class array { /** @{ */ /** * Equivalent to - * "array::section(index(i0 [, i1 [, i2 ]]), extent(e0 [, e1 [, e2 ]])) const". + * "array::section( + * index(i0 [, i1 [, i2 ]]), extent(e0 [, e1 [, e2 ]])) const". * * @param[in] i0,i1,i2 The component values that will form the origin of * the section * @param[in] e0,e1,e2 The component values that will form the extent of * the section */ - array_view section(int i0, int e0) __CPU__ __HC__ { - static_assert(N == 1, "Rank must be 1"); - return section(index<1>(i0), hc::extent<1>(e0)); + array_view section(int i0, int e0) [[cpu, hc]] + { + static_assert(N == 1, "Rank must be 1."); + + return section(index<1>{i0}, hc::extent<1>{e0}); } - array_view section(int i0, int e0) const __CPU__ __HC__ { - static_assert(N == 1, "Rank must be 1"); - return section(index<1>(i0), hc::extent<1>(e0)); + array_view section(int i0, int i1, int e0, int e1) [[cpu, hc]] + { + static_assert(N == 2, "Rank must be 2."); + + return section(index<2>{i0, i1}, hc::extent<2>{e0, e1}); } - array_view section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ { - static_assert(N == 2, "Rank must be 2"); - return section(index<2>(i0, i1), hc::extent<2>(e0, e1)); + array_view section( + int i0, int i1, int i2, int e0, int e1, int e2) [[cpu, hc]] + { + static_assert(N == 3, "Rank must be 3."); + + return section(index<3>{i0, i1, i2}, hc::extent<3>{e0, e1, e2}); } - array_view section(int i0, int i1, int e0, int e1) __CPU__ __HC__ { - static_assert(N == 2, "Rank must be 2"); - return section(index<2>(i0, i1), hc::extent<2>(e0, e1)); + + array_view section(int i0, int e0) const [[cpu, hc]] + { + static_assert(N == 1, "Rank must be 1."); + + return section(index<1>{i0}, hc::extent<1>{e0}); } - array_view section(int i0, int i1, int i2, int e0, int e1, int e2) __CPU__ __HC__ { - static_assert(N == 3, "Rank must be 3"); - return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2)); + array_view section( + int i0, int i1, int e0, int e1) const [[cpu, hc]] + { + static_assert(N == 2, "Rank 
must be 2."); + + return section(index<2>{i0, i1}, hc::extent<2>{e0, e1}); } - array_view section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ { - static_assert(N == 3, "Rank must be 3"); - return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2)); + array_view section( + int i0, int i1, int i2, int e0, int e1, int e2) const [[cpu, hc]] + { + static_assert(N == 3, "Rank must be 3."); + + return section(index<3>{i0, i1, i2}, hc::extent<3>{e0, e1, e2}); } /** @} */ @@ -4713,31 +5157,44 @@ class array { * reinterpreted from T to ElementType, and the rank reduced from N * to 1. */ - template - array_view reinterpret_as() __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - static_assert( ! (std::is_pointer::value ),"can't use pointer in the kernel"); - static_assert( ! (std::is_same::value ),"can't use short in the kernel"); - if( (extent.size() * sizeof(T)) % sizeof(ElementType)) - throw runtime_exception("errorMsg_throw", 0); -#endif - int size = extent.size() * sizeof(T) / sizeof(ElementType); - using buffer_type = typename array_view::acc_buffer_t; - array_view av(buffer_type(m_device), extent<1>(size), 0); - return av; + template + array_view reinterpret_as() [[cpu]] + { + int size{extent_.size() / sizeof(U) * sizeof(T)}; + + if (size * sizeof(U) != extent_.size() * sizeof(T)) { + throw runtime_exception{"errorMsg_throw", 0}; } - template - array_view reinterpret_as() const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - static_assert( ! (std::is_pointer::value ),"can't use pointer in the kernel"); - static_assert( ! (std::is_same::value ),"can't use short in the kernel"); -#endif - int size = extent.size() * sizeof(T) / sizeof(ElementType); - using buffer_type = typename array_view::acc_buffer_t; - array_view av(buffer_type(m_device), extent<1>(size), 0); - return av; + + return array_view{extent<1>{size}, data()}; + } + template + array_view reinterpret_as() [[hc]] + { + int size{extent_.size() / sizeof(U) * sizeof(T)}; + + return array_view{extent<1>{size}, data()}; + } + + template + array_view reinterpret_as() const [[cpu]] + { + int size{extent_.size() / sizeof(U) * sizeof(T)}; + + if (size * sizeof(U) != extent_.size() * sizeof(T)) { + throw runtime_exception{"errorMsg_throw", 0}; } + return array_view{extent<1>{size}, data()}; + } + template + array_view reinterpret_as() const [[hc]] + { + int size{extent_.size() / sizeof(U) * sizeof(T)}; + + return array_view{extent<1>{size}, data()}; + } + /** @} */ /** @{ */ @@ -4753,45 +5210,52 @@ class array { * @return Returns an array_view from this array with the rank changed * to K from N. 
*/ - template array_view - view_as(const hc::extent& viewExtent) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if( viewExtent.size() > extent.size()) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view av(m_device, viewExtent, 0); - return av; + template + array_view view_as(const hc::extent& view_extent) [[cpu]] + { + if (extent_.size() < view_extent.size()) { + throw runtime_exception{"errorMsg_throw", 0}; } - template array_view - view_as(const hc::extent& viewExtent) const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if( viewExtent.size() > extent.size()) - throw runtime_exception("errorMsg_throw", 0); -#endif - const array_view av(m_device, viewExtent, 0); - return av; + + return array_view{view_extent, data()}; + } + template + array_view view_as(const hc::extent& view_extent) [[hc]] + { + return array_view{view_extent, data()}; + } + + template + array_view view_as( + const hc::extent& view_extent) const [[cpu]] + { + if (extent_.size() < view_extent.size()) { + throw runtime_exception{"errorMsg_throw", 0}; } + return array_view{view_extent, data()}; + } + template + array_view view_as( + const hc::extent& view_extent) const [[hc]] + { + return array_view{view_extent, data()}; + } + /** @} */ - ~array() = default; + ~array() + { + if (this_idx_ == max_array_cnt_) return; - // FIXME: functions below may be considered to move to private - const acc_buffer_t& internal() const __CPU__ __HC__ { return m_device; } - int get_offset() const __CPU__ __HC__ { return 0; } - index get_index_base() const __CPU__ __HC__ { return index(); } -private: - template friend struct projection_helper; - template friend struct array_projection_helper; - acc_buffer_t m_device; - extent extent; - - template friend - void copy(const array&, const array_view&); - template friend - void copy(const array_view&, array&); -}; + if (hsa_amd_memory_unlock(this) != HSA_STATUS_SUCCESS) { + // TODO: this is very bad and temporary. + throw std::runtime_error{"Failed to unlock locked array pointer."}; + } + locked_ptrs_[this_idx_].first.clear(); + } +}; // ------------------------------------------------------------------------ // array_view // ------------------------------------------------------------------------ @@ -4807,10 +5271,10 @@ class array_view { public: typedef typename std::remove_const::type nc_T; -#if __KALMAR_ACCELERATOR__ == 1 - typedef Kalmar::_data acc_buffer_t; +#if __HCC_ACCELERATOR__ == 1 + typedef detail::_data acc_buffer_t; #else - typedef Kalmar::_data_host acc_buffer_t; + typedef detail::_data_host acc_buffer_t; #endif /** @@ -4836,8 +5300,9 @@ class array_view * @param[in] src An array which contains the data that this array_view is * bound to. */ - array_view(hc::array& src) __CPU__ __HC__ - : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {} + array_view(hc::array& src) [[cpu, hc]] + : array_view{src.get_extent(), src.data()} + {} // FIXME: following interfaces were not implemented yet // template @@ -4871,7 +5336,7 @@ class array_view * @param[in] ext The extent of this array_view. */ array_view(const hc::extent& ext, value_type* src) __CPU__ __HC__ -#if __KALMAR_ACCELERATOR__ == 1 +#if __HCC_ACCELERATOR__ == 1 : cache((T *)(src)), extent(ext), extent_base(ext), offset(0) {} #else : cache(ext.size(), (T *)(src)), extent(ext), extent_base(ext), offset(0) {} @@ -4896,7 +5361,7 @@ class array_view * * @param[in] e0,e1,e2 The component values that will form the extent of * this array_view. 
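[Editor's note: the new destructor throws when `hsa_amd_memory_unlock` fails, and its TODO marks this as temporary; since destructors are implicitly `noexcept` from C++11 on, an exception escaping there goes straight to `std::terminate`. The usual compromise is to report and swallow. A hedged sketch with a stand-in for the HSA call:]

```cpp
#include <cstdio>

enum status_t { success, failure };
status_t unlock_memory(void*) noexcept { return success; }  // HSA stand-in

class locked_block {
    void* p_{nullptr};
public:
    explicit locked_block(void* p) : p_{p} {}

    ~locked_block()  // implicitly noexcept: a throw here would terminate
    {
        if (!p_) return;
        if (unlock_memory(p_) != success) {
            // Report without propagating: destructors must not throw.
            std::fprintf(stderr, "failed to unlock pointer %p\n", p_);
        }
    }
};
```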
- * @param[in] src A template argument that must resolve to a contiguousi + * @param[in] src A template argument that must resolve to a contiguous * container that supports .data() and .size() members (such * as std::vector or std::array) */ @@ -4994,11 +5459,11 @@ class array_view * this array. */ void copy_to(array& dest) const { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 for(int i= 0 ;i< N;i++) { if (dest.get_extent()[i] < this->extent[i]) - throw runtime_exception("errorMsg_throw", 0); + throw runtime_exception{"errorMsg_throw", 0}; } #endif copy(*this, dest); @@ -5026,11 +5491,11 @@ class array_view * source or any of its views are accessed on an accelerator_view through a * parallel_for_each or a copy operation. * - * @return A pointer to the first element in the linearized array. + * @return A pointer to the first element in the linearised array. */ T* data() const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 cache.get_cpu_access(true); #endif static_assert(N == 1, "data() is only permissible on array views of rank 1"); @@ -5140,10 +5605,8 @@ class array_view * synchronized for. */ // FIXME: type parameter is not implemented - void synchronize_to(const accelerator_view& av) const { -#if __KALMAR_ACCELERATOR__ != 1 + void synchronize_to(const accelerator_view& av) const [[cpu]] { cache.sync_to(av.pQueue); -#endif } /** @@ -5172,7 +5635,7 @@ class array_view * not needed. */ void discard_data() const { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 cache.discard(); #endif } @@ -5186,11 +5649,11 @@ class array_view * the element. */ T& operator[] (const index& idx) const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 cache.get_cpu_access(true); #endif T *ptr = reinterpret_cast(cache.get() + offset); - return ptr[Kalmar::amp_helper, hc::extent>::flatten(idx + index_base, extent_base)]; + return ptr[detail::amp_helper, hc::extent>::flatten(idx + index_base, extent_base)]; } T& operator()(const index& idx) const __CPU__ __HC__ { @@ -5280,9 +5743,9 @@ class array_view */ array_view section(const index& idx, const hc::extent& ext) const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if ( !Kalmar::amp_helper, hc::extent>::contains(idx, ext,this->extent ) ) - throw runtime_exception("errorMsg_throw", 0); +#if __HCC_ACCELERATOR__ != 1 + if ( !detail::amp_helper, hc::extent>::contains(idx, ext,this->extent ) ) + throw runtime_exception{"errorMsg_throw", 0}; #endif array_view av(cache, ext, extent_base, idx + index_base, offset); return av; @@ -5293,7 +5756,7 @@ class array_view */ array_view section(const index& idx) const __CPU__ __HC__ { hc::extent ext(extent); - Kalmar::amp_helper, hc::extent>::minus(idx, ext); + detail::amp_helper, hc::extent>::minus(idx, ext); return section(idx, ext); } @@ -5307,7 +5770,7 @@ class array_view /** @{ */ /** - * Equivalent to + * Equivalent to * "section(index(i0 [, i1 [, i2 ]]), extent(e0 [, e1 [, e2 ]]))". * * @param[in] i0,i1,i2 The component values that will form the origin of @@ -5346,11 +5809,11 @@ class array_view template array_view reinterpret_as() const __CPU__ __HC__ { static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1"); -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 static_assert( ! (std::is_pointer::value ),"can't use pointer in the kernel"); static_assert( ! 
(std::is_same::value ),"can't use short in the kernel"); if ( (extent.size() * sizeof(T)) % sizeof(ElementType)) - throw runtime_exception("errorMsg_throw", 0); + throw runtime_exception{"errorMsg_throw", 0}; #endif int size = extent.size() * sizeof(T) / sizeof(ElementType); using buffer_type = typename array_view::acc_buffer_t; @@ -5371,9 +5834,9 @@ class array_view template array_view view_as(hc::extent viewExtent) const __CPU__ __HC__ { static_assert(N == 1, "view_as is only permissible on array views of rank 1"); -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 if ( viewExtent.size() > extent.size()) - throw runtime_exception("errorMsg_throw", 0); + throw runtime_exception{"errorMsg_throw", 0}; #endif array_view av(cache, viewExtent, offset + index_base[0]); return av; @@ -5389,24 +5852,30 @@ class array_view index get_index_base() const __CPU__ __HC__ { return index_base; } private: - template friend struct projection_helper; - template friend struct array_projection_helper; - template friend class array; - template friend class array_view; - - template friend - bool is_flat(const array_view&) noexcept; - template friend - void copy(const array&, const array_view&); - template friend - void copy(InputIter, InputIter, const array_view&); - template friend - void copy(const array_view&, array&); - template friend - void copy(const array_view&, OutputIter); - template friend - void copy(const array_view& src, const array_view& dest); - + template friend struct projection_helper; + template friend struct array_projection_helper; + template friend class array; + template friend class array_view; + + template + friend + bool is_flat(const array_view&) noexcept; + template + friend + void copy(const array&, const array_view&); + template + friend + void copy(InputIter, InputIter, const array_view&); + template + friend + void copy(const array_view&, array&); + template + friend + void copy(const array_view&, OutputIter); + template + friend + void copy(const array_view&, const array_view&); + // used by view_as and reinterpret_as array_view(const acc_buffer_t& cache, const hc::extent& ext, int offset) __CPU__ __HC__ @@ -5418,7 +5887,7 @@ class array_view const index& idx_b, int off) __CPU__ __HC__ : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), offset(off) {} - + acc_buffer_t cache; hc::extent extent; hc::extent extent_base; @@ -5443,10 +5912,10 @@ class array_view public: typedef typename std::remove_const::type nc_T; -#if __KALMAR_ACCELERATOR__ == 1 - typedef Kalmar::_data acc_buffer_t; +#if __HCC_ACCELERATOR__ == 1 + typedef detail::_data acc_buffer_t; #else - typedef Kalmar::_data_host acc_buffer_t; + typedef detail::_data_host acc_buffer_t; #endif /** @@ -5507,7 +5976,7 @@ class array_view * @param[in] ext The extent of this array_view. */ array_view(const hc::extent& ext, const value_type* src) __CPU__ __HC__ -#if __KALMAR_ACCELERATOR__ == 1 +#if __HCC_ACCELERATOR__ == 1 : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {} #else : cache(ext.size(), src), extent(ext), extent_base(ext), offset(0) {} @@ -5519,7 +5988,7 @@ class array_view * * @param[in] e0,e1,e2 The component values that will form the extent of * this array_view. 
- * @param[in] src A template argument that must resolve to a contiguousi + * @param[in] src A template argument that must resolve to a contiguous * container that supports .data() and .size() members (such * as std::vector or std::array) */ @@ -5603,7 +6072,7 @@ class array_view offset = other.offset; return *this; } - + array_view& operator=(const array_view& other) __CPU__ __HC__ { if (this != &other) { cache = other.cache; @@ -5648,10 +6117,10 @@ class array_view * source or any of its views are accessed on an accelerator_view through a * parallel_for_each or a copy operation. * - * @return A const pointer to the first element in the linearized array. + * @return A const pointer to the first element in the linearised array. */ const T* data() const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 cache.get_cpu_access(); #endif static_assert(N == 1, "data() is only permissible on array views of rank 1"); @@ -5732,10 +6201,9 @@ class array_view * @param[in] av The target accelerator_view that "this" array_view is * synchronized for access on. */ - void synchronize_to(const accelerator_view& av) const { -#if __KALMAR_ACCELERATOR__ != 1 + void synchronize_to(const accelerator_view& av) const [[cpu]] + { cache.sync_to(av.pQueue); -#endif } /** @@ -5765,11 +6233,11 @@ class array_view * the element. */ const T& operator[](const index& idx) const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 cache.get_cpu_access(); #endif const T *ptr = reinterpret_cast(cache.get() + offset); - return ptr[Kalmar::amp_helper, hc::extent>::flatten(idx + index_base, extent_base)]; + return ptr[detail::amp_helper, hc::extent>::flatten(idx + index_base, extent_base)]; } const T& operator()(const index& idx) const __CPU__ __HC__ { return (*this)[idx]; @@ -5803,7 +6271,7 @@ class array_view static_assert(N == 1, "const T& array_view::operator()(int) is only permissible on array_view"); return (*this)[index<1>(i0)]; } - + const T& operator()(int i0, int i1) const __CPU__ __HC__ { static_assert(N == 2, "const T& array_view::operator()(int,int) is only permissible on array_view"); return (*this)[index<2>(i0, i1)]; @@ -5874,7 +6342,7 @@ class array_view */ array_view section(const index& idx) const __CPU__ __HC__ { hc::extent ext(extent); - Kalmar::amp_helper, hc::extent>::minus(idx, ext); + detail::amp_helper, hc::extent>::minus(idx, ext); return section(idx, ext); } @@ -5888,7 +6356,7 @@ class array_view /** @{ */ /** - * Equivalent to + * Equivalent to * "section(index(i0 [, i1 [, i2 ]]), extent(e0 [, e1 [, e2 ]]))". * * @param[in] i0,i1,i2 The component values that will form the origin of @@ -5927,7 +6395,7 @@ class array_view template array_view reinterpret_as() const __CPU__ __HC__ { static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1"); -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 static_assert( ! (std::is_pointer::value ),"can't use pointer in the kernel"); static_assert( ! 
(std::is_same::value ),"can't use short in the kernel"); #endif @@ -5950,9 +6418,9 @@ class array_view template array_view view_as(hc::extent viewExtent) const __CPU__ __HC__ { static_assert(N == 1, "view_as is only permissible on array views of rank 1"); -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 if ( viewExtent.size() > extent.size()) - throw runtime_exception("errorMsg_throw", 0); + throw runtime_exception{"errorMsg_throw", 0}; #endif array_view av(cache, viewExtent, offset + index_base[0]); return av; @@ -5968,36 +6436,42 @@ class array_view index get_index_base() const __CPU__ __HC__ { return index_base; } private: - template friend struct projection_helper; - template friend struct array_projection_helper; - template friend class array; - template friend class array_view; - - template friend - bool is_flat(const array_view&) noexcept; - template friend - void copy(const array&, const array_view&); - template friend - void copy(InputIter, InputIter, const array_view&); - template friend - void copy(const array_view&, array&); - template friend - void copy(const array_view&, OutputIter); - template friend - void copy(const array_view& src, const array_view& dest); - + template friend struct projection_helper; + template friend struct array_projection_helper; + template friend class array; + template friend class array_view; + + template + friend + bool is_flat(const array_view&) noexcept; + template + friend + void copy(const array&, const array_view&); + template + friend + void copy(InputIter, InputIter, const array_view&); + template + friend + void copy(const array_view&, array&); + template + friend + void copy(const array_view&, OutputIter); + template + friend + void copy(const array_view&, const array_view&); + // used by view_as and reinterpret_as array_view(const acc_buffer_t& cache, const hc::extent& ext, int offset) __CPU__ __HC__ : cache(cache), extent(ext), extent_base(ext), offset(offset) {} - + // used by section and projection array_view(const acc_buffer_t& cache, const hc::extent& ext_now, const hc::extent& ext_b, const index& idx_b, int off) __CPU__ __HC__ : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), offset(off) {} - + acc_buffer_t cache; hc::extent extent; hc::extent extent_base; @@ -6015,7 +6489,7 @@ static inline bool is_flat(const array_view& av) noexcept { } template -static inline bool is_flat(const array_view& av) noexcept { return true; } +static inline bool is_flat(const array_view&) noexcept { return true; } template struct copy_input @@ -6038,7 +6512,7 @@ template struct copy_input { void operator()(InputIter& It, T* ptr, const extent& ext, - const extent& base, const index& idx) + const extent&, const index& idx) { InputIter end = It; std::advance(end, ext[N - 1]); @@ -6068,7 +6542,7 @@ template struct copy_output { void operator()(const T* ptr, OutputIter& It, const extent& ext, - const extent& base, const index& idx) + const extent&, const index& idx) { ptr += idx[N - 1]; It = std::copy(ptr, ptr + ext[N - 1], It); @@ -6104,8 +6578,8 @@ template struct copy_bidir { void operator()(const T* src, T* dst, const extent& ext, - const extent& base1, const index& idx1, - const extent& base2, const index& idx2) + const extent&, const index& idx1, + const extent&, const index& idx2) { src += idx1[N - 1]; dst += idx2[N - 1]; @@ -6203,9 +6677,20 @@ struct do_copy * @param[in] src An object of type array to be copied from. * @param[out] dest An object of type array to be copied to. 
*/ -template -void copy(const array& src, array& dest) { - src.internal().copy(dest.internal(), 0, 0, 0); +template +inline +void copy(const array& src, array& dest) +{ + if (src.get_extent() != dest.get_extent()) { + throw std::logic_error{"Tried to copy arrays of mismatched extents."}; + } + + src.get_accelerator_view().wait(); // TODO: overly conservative, temporary. + + auto s = hsa_memory_copy( + dest.data(), src.data(), src.get_extent().size() * sizeof(T)); + + if (s != HSA_STATUS_SUCCESS) throw std::runtime_error{"Array copy failed."}; } /** @{ */ @@ -6393,20 +6878,41 @@ void copy(const array_view& src, const array_view& dest) { * @param[in] srcEnd An interator to the end of a source container. * @param[out] dest An object of type array to be copied to. */ -template -void copy(InputIter srcBegin, InputIter srcEnd, array& dest) { -#if __KALMAR_ACCELERATOR__ != 1 - if( ( std::distance(srcBegin,srcEnd) <=0 )||( std::distance(srcBegin,srcEnd) < dest.get_extent().size() )) - throw runtime_exception("errorMsg_throw ,copy between different types", 0); -#endif - do_copy()(srcBegin, srcEnd, dest); +template +inline +void copy(InputIter srcBegin, InputIter srcEnd, array& dest) +{ + static_assert( + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>{}, + "Only contiguous random access iterators supported."); + static_assert( + std::is_same::value_type, T>{}, + "Only same type copies supported."); + + if (std::distance(srcBegin, srcEnd) != dest.get_extent().size()) { + throw std::logic_error{"Mismatched copy sizes."}; + } + + copy(srcBegin, dest); } -template -void copy(InputIter srcBegin, array& dest) { - InputIter srcEnd = srcBegin; - std::advance(srcEnd, dest.get_extent().size()); - hc::copy(srcBegin, srcEnd, dest); +template +inline +void copy(InputIter srcBegin, array& dest) +{ + static_assert( + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>{}, + "Only contiguous random access iterators supported."); + static_assert( + std::is_same::value_type, T>{}, + "Only same type copies supported."); + + hsa_memory_copy( // TODO: add to_address() and use it instead of &*. + dest.data(), &*srcBegin, dest.get_extent().size() * sizeof(T)); } /** @} */ @@ -6459,9 +6965,27 @@ void copy(InputIter srcBegin, const array_view& dest) { * @param[out] destBegin An output iterator addressing the position of the * first element in the destination container. */ -template -void copy(const array &src, OutputIter destBegin) { - do_copy()(src, destBegin); +template +inline +void copy(const array &src, OutputIter destBegin) +{ + static_assert( + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>{}, + "Only contiguous random access iterators supported."); + static_assert( + std::is_same< + typename std::iterator_traits::value_type, T>{}, + "Only same type copies supported."); + + src.get_accelerator_view().wait(); // TODO: conservative, temporary. + + // TODO: must add to_address() and use instead of &*. + auto s = hsa_memory_copy( + &*destBegin, src.data(), src.get_extent().size() * sizeof(T)); + + if (s != HSA_STATUS_SUCCESS) throw std::runtime_error{"Array copy failed."}; } /** @@ -6505,10 +7029,11 @@ void copy(const array_view &src, OutputIter destBegin) { * @param[in] src An object of type array to be copied from. * @param[out] dest An object of type array to be copied to. 
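[Editor's note: with the `static_assert`s above, the host-side `copy` overloads accept only contiguous random-access ranges of the exact element type, and the `&*srcBegin` dereference (flagged by the TODO about `to_address`) assumes contiguity too. A self-contained analogue with the same checks, `memcpy` standing in for `hsa_memory_copy`:]

```cpp
#include <cstddef>
#include <cstring>
#include <iterator>
#include <stdexcept>
#include <type_traits>
#include <vector>

template<typename T, typename InputIter>
void checked_copy(InputIter first, InputIter last, T* dst, std::size_t capacity)
{
    static_assert(
        std::is_same<
            typename std::iterator_traits<InputIter>::iterator_category,
            std::random_access_iterator_tag>::value,
        "Only contiguous random access iterators supported.");
    static_assert(
        std::is_same<
            typename std::iterator_traits<InputIter>::value_type, T>::value,
        "Only same type copies supported.");

    const auto n = static_cast<std::size_t>(std::distance(first, last));
    if (n != capacity) throw std::logic_error{"Mismatched copy sizes."};

    std::memcpy(dst, &*first, n * sizeof(T));  // hsa_memory_copy stand-in
}

int main()
{
    std::vector<double> src(16, 1.0), dst(16);
    checked_copy(src.begin(), src.end(), dst.data(), dst.size());
}
```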
*/ -template -completion_future copy_async(const array& src, array& dest) { - std::future fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); }); - return completion_future(fut.share()); +template +inline +completion_future copy_async(const array& src, array& dest) +{ + return completion_future{std::async([&]() { copy(src, dest); }).share()}; } /** @@ -6584,16 +7109,42 @@ completion_future copy_async(const array_view& src, const array_view * @param[in] srcEnd An interator to the end of a source container. * @param[out] dest An object of type array to be copied to. */ -template -completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array& dest) { - std::future fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); }); - return completion_future(fut.share()); +template +inline +completion_future copy_async( + InputIter srcBegin, InputIter srcEnd, array& dest) +{ + static_assert( + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>{}, + "Only contiguous random access iterators supported."); + static_assert( + std::is_same::value_type, T>{}, + "Only same type copies supported."); + + if (std::distance(srcBegin, srcEnd) != dest.get_extent().size()) { + throw std::logic_error{"Mismatched copy sizes."}; + } + + return copy_async(srcBegin, dest); } -template -completion_future copy_async(InputIter srcBegin, array& dest) { - std::future fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); }); - return completion_future(fut.share()); +template +inline +completion_future copy_async(InputIter srcBegin, array& dest) +{ + static_assert( + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>{}, + "Only contiguous random access iterators supported."); + static_assert( + std::is_same::value_type, T>{}, + "Only same type copies supported."); + + return completion_future{ + std::async([&, srcBegin]() { copy(srcBegin, dest); }).share()}; } /** @} */ @@ -6636,10 +7187,22 @@ completion_future copy_async(InputIter srcBegin, const array_view& dest) { * @param[out] destBegin An output iterator addressing the position of the * first element in the destination container. 
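[Editor's note: the `copy_async` overloads wrap `std::async(...).share()`; with the default launch policy the copy may start immediately on another thread, and the `[&]` captures mean `src` and `dest` must outlive the returned future. A compact std-only illustration of both properties:]

```cpp
#include <algorithm>
#include <future>
#include <vector>

int main()
{
    std::vector<int> src(1 << 20, 1), dst(1 << 20);

    // Default launch policy: the lambda may start at once on another
    // thread. src and dst are captured by reference, so they must stay
    // alive until the future completes.
    auto fut = std::async([&] {
                   std::copy(src.begin(), src.end(), dst.begin());
               }).share();

    fut.wait();  // completion_future::wait() plays the same role
}
```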
*/ -template -completion_future copy_async(const array& src, OutputIter destBegin) { - std::future fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); }); - return completion_future(fut.share()); +template +inline +completion_future copy_async(const array& src, OutputIter destBegin) +{ + static_assert( + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>{}, + "Only contiguous random access iterators supported."); + static_assert( + std::is_same< + typename std::iterator_traits::value_type, T>{}, + "Only same type copies supported."); + + return completion_future{ + std::async([&, destBegin]() { copy(src, destBegin); }).share()}; } /** @@ -6707,6 +7270,18 @@ completion_future parallel_for_each( accelerator::get_auto_selection_view(), compute_domain, f); } +template +inline +void validate_compute_domain(const hc::extent& compute_domain) +{ + std::size_t sz{1}; + for (auto i = 0; i != n; ++i) { + sz *= compute_domain[i]; + + if (sz < 1) throw invalid_compute_domain{"Extent is not positive."}; + if (sz > UINT_MAX) throw invalid_compute_domain{"Extent is too large."}; + } +} //ND parallel_for_each, nontiled template @@ -6720,11 +7295,40 @@ completion_future parallel_for_each( if (av.get_accelerator().get_device_path() == L"cpu") { throw hc::runtime_exception{ - Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL}; + detail::__errorMsg_UnsupportedAccelerator, E_FAIL}; } + validate_compute_domain(compute_domain); + return completion_future{ - Kalmar::launch_kernel_async(av.pQueue, compute_domain, f)}; + detail::launch_kernel_async(av.pQueue, compute_domain, f)}; +} + +template +inline +void validate_tiled_compute_domain(const tiled_extent& compute_domain) +{ + validate_compute_domain(compute_domain); + + size_t sz{1}; + for (auto i = 0u; i != n; ++i) { + if (compute_domain.tile_dim[i] < 0) { + throw invalid_compute_domain{ + "The extent of the tile must be positive."}; + } + + constexpr int max_tile_dim{1024}; // Should be read via the HSArt. + sz *= compute_domain.tile_dim[i]; + if (max_tile_dim < sz) { + throw invalid_compute_domain{ + "The extent of the tile exceeds the device limit"}; + } + + if (compute_domain[i] < compute_domain.tile_dim[i]) { + throw invalid_compute_domain{ + "The extent of the tile exceeds the compute grid extent"}; + } + } } //ND parallel_for_each, tiled @@ -6738,11 +7342,13 @@ completion_future parallel_for_each( if (av.get_accelerator().get_device_path() == L"cpu") { throw hc::runtime_exception{ - Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL}; + detail::__errorMsg_UnsupportedAccelerator, E_FAIL}; } + validate_tiled_compute_domain(compute_domain); + return completion_future{ - Kalmar::launch_kernel_with_dynamic_group_memory_async( + detail::launch_kernel_with_dynamic_group_memory_async( av.pQueue, compute_domain, f)}; } } // namespace hc \ No newline at end of file diff --git a/include/hc_am.hpp b/include/hc_am.hpp index 592efa6e59c..fb3389ba93a 100644 --- a/include/hc_am.hpp +++ b/include/hc_am.hpp @@ -1,6 +1,6 @@ #pragma once -#include "hc.hpp" +//#include "hc.hpp" #include #include #include @@ -16,7 +16,6 @@ typedef int am_status_t; #define amHostCoherent 0x2 ///< Allocate coherent pinned host memory accessible from all GPUs. 
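[Editor's note: `validate_compute_domain` above accumulates the product of the extent components and checks it after every multiply, so an empty dimension or a grid past `UINT_MAX` is rejected as soon as it appears. A stand-alone replica of that logic, assuming an LP64 host where `std::size_t` is wider than `unsigned int`:]

```cpp
#include <climits>
#include <cstddef>
#include <stdexcept>
#include <vector>

void validate_extents(const std::vector<int>& extents)
{
    std::size_t sz{1};
    for (int d : extents) {
        sz *= d;  // checked immediately, before the next multiply
        if (sz < 1) throw std::invalid_argument{"Extent is not positive."};
        if (sz > UINT_MAX) {
            throw std::invalid_argument{"Extent is too large."};
        }
    }
}

int main()
{
    validate_extents({64, 64, 16});               // fine: 65536 work items
    // validate_extents({64, 0, 16});             // throws: empty dimension
    // validate_extents({1 << 16, 1 << 16, 2});   // throws: above UINT_MAX
}
```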
namespace hc { - // Info for each pointer in the memtry tracker: class AmPointerInfo { public: @@ -144,7 +143,7 @@ am_status_t am_copy(void* dst, const void* src, std::size_t size) __attribute_ * @returns AM_SUCCESS if pointer is tracked and writes info to @p info. if @ info is NULL, * no info is written but the returned status indicates if the pointer was tracked. * - * @see AM_memtracker_add + * @see AM_memtracker_add */ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr); @@ -162,21 +161,21 @@ am_status_t am_memtracker_add(void* ptr, hc::AmPointerInfo &info); /* * Update info for an existing pointer in the memory tracker. * - * @returns AM_ERROR_MISC if pointer is not found in tracker. - * @returns AM_SUCCESS if pointer is not found in tracker. + * @returns AM_ERROR_MISC if pointer is not found in tracker. + * @returns AM_SUCCESS if pointer is not found in tracker. * * @see am_memtracker_getinfo, am_memtracker_add */ am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags, void *appPtr=nullptr); -/** +/** * Remove @ptr from the tracker structure. * * @p ptr may be anywhere in a tracked memory range. * - * @returns AM_ERROR_MISC if pointer is not found in tracker. - * @returns AM_SUCCESS if pointer is not found in tracker. + * @returns AM_ERROR_MISC if pointer is not found in tracker. + * @returns AM_SUCCESS if pointer is not found in tracker. * * @see am_memtracker_getinfo, am_memtracker_add */ @@ -211,7 +210,7 @@ void am_memtracker_update_peers(const hc::accelerator &acc, int peerCnt, hsa_age /* * Map device memory or hsa allocated host memory pointed to by @p ptr to the peers. - * + * * @p ptr pointer which points to device memory or host memory * @p num_peer number of peers to map * @p peers pointer to peer accelerator list. @@ -221,11 +220,11 @@ void am_memtracker_update_peers(const hc::accelerator &acc, int peerCnt, hsa_age * @return AM_ERROR_MISC if @p ptr is not found in the pointer tracker. * @return AM_ERROR_MISC if @p peers incudes a non peer accelerator. */ -am_status_t am_map_to_peers(void* ptr, std::size_t num_peer, const hc::accelerator* peers); +am_status_t am_map_to_peers(void* ptr, std::size_t num_peer, const hc::accelerator* peers); /* * Locks a host pointer to a vector of agents - * + * * @p ac acclerator corresponding to current device * @p hostPtr pointer to host memory which should be page-locked * @p size size of hostPtr to be page-locked @@ -238,9 +237,9 @@ am_status_t am_memory_host_lock(hc::accelerator &ac, void *hostPtr, std::size_t /* * Unlock page locked host memory - * + * * @p ac current device accelerator - * @p hostPtr host pointer + * @p hostPtr host pointer * @return AM_SUCCESS if unlocked successfully. * @return AM_ERROR_MISC if @p hostPtr unlock is un-successful. */ diff --git a/include/hc_defines.h b/include/hc_defines.h index 2ec34e0e712..7be087ef312 100644 --- a/include/hc_defines.h +++ b/include/hc_defines.h @@ -61,11 +61,11 @@ extern "C" __attribute__((noduplicate,amp)) void amp_barrier(unsigned int n) ; #endif /** - * @namespace Kalmar - * namespace for internal classes of Kalmar compiler / runtime + * @namespace detail + * namespace for internal classes of detail compiler / runtime */ -namespace Kalmar { -} // namespace Kalmar +namespace detail { +} // namespace detail // Provide automatic type conversion for void*. 
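[Editor's note: these hunks rename the internal `Kalmar` namespace to `detail` across the headers. For out-of-tree code that still spells the old name, a transitional namespace alias is the cheap bridge; this is a suggestion only, not something the patch adds:]

```cpp
namespace detail {
    // ... internal compiler / runtime machinery ...
}

// Transitional spelling for downstream code still using the old name;
// removable once callers migrate.
namespace Kalmar = detail;
```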
class auto_voidp { diff --git a/include/hc_short_vector.inl b/include/hc_short_vector.inl index 64125bde1d5..bbaf325fb71 100644 --- a/include/hc_short_vector.inl +++ b/include/hc_short_vector.inl @@ -263,6 +263,10 @@ public: __vector_data_container(vector_value_type v) __CPU_GPU__ { data = v; } + + __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y) __CPU_GPU__ { + data = { x, y }; + } }; @@ -288,6 +292,10 @@ public: __vector_data_container(vector_value_type v) __CPU_GPU__ { data = v; } + + __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, const SCALAR_TYPE z) __CPU_GPU__ { + data = { x, y, z }; + } }; @@ -313,6 +321,10 @@ public: __vector_data_container(vector_value_type v) __CPU_GPU__ { data = v; } + + __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, const SCALAR_TYPE z, const SCALAR_TYPE w) __CPU_GPU__ { + data = { x,y,z,w }; + } }; @@ -338,6 +350,11 @@ public: __vector_data_container(vector_value_type v) __CPU_GPU__ { data = v; } + + __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, const SCALAR_TYPE z, const SCALAR_TYPE w + , const SCALAR_TYPE s4, const SCALAR_TYPE s5, const SCALAR_TYPE s6, const SCALAR_TYPE s7) __CPU_GPU__ { + data = { x,y,z,w,s4,s5,s6,s7 }; + } }; @@ -363,6 +380,13 @@ public: __vector_data_container(vector_value_type v) __CPU_GPU__ { data = v; } + + __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, const SCALAR_TYPE z, const SCALAR_TYPE w + , const SCALAR_TYPE s4, const SCALAR_TYPE s5, const SCALAR_TYPE s6, const SCALAR_TYPE s7 + , const SCALAR_TYPE s8, const SCALAR_TYPE s9, const SCALAR_TYPE sA, const SCALAR_TYPE sB + , const SCALAR_TYPE sC, const SCALAR_TYPE sD, const SCALAR_TYPE sE, const SCALAR_TYPE sF) __CPU_GPU__ { + data = { x,y,z,w,s4,s5,s6,s7,s8,s9,sA,sB,sC,sD,sE,sF }; + } }; diff --git a/include/kalmar_aligned_alloc.h b/include/kalmar_aligned_alloc.h index 727004a1e42..75112aa8075 100644 --- a/include/kalmar_aligned_alloc.h +++ b/include/kalmar_aligned_alloc.h @@ -11,7 +11,7 @@ #include /** \cond HIDDEN_SYMBOLS */ -namespace Kalmar { +namespace detail { constexpr inline bool kalmar_is_alignment(std::size_t value) noexcept { return (value > 0) && ((value & (value - 1)) == 0); @@ -39,5 +39,5 @@ inline void kalmar_aligned_free(void* ptr) noexcept { } } -} // namespace Kalmar +} // namespace detail /** \endcond */ diff --git a/include/kalmar_buffer.h b/include/kalmar_buffer.h index 78f2a6e9e82..0b0873dc9d5 100644 --- a/include/kalmar_buffer.h +++ b/include/kalmar_buffer.h @@ -11,17 +11,18 @@ #include "kalmar_serialize.h" /** \cond HIDDEN_SYMBOLS */ -namespace Kalmar { +namespace detail { // Dummy interface that looks somewhat like std::shared_ptr template class _data { public: _data() = delete; - _data(int count) : p_(nullptr) {} + explicit + _data(int) : p_(nullptr) {} _data(const _data& d) restrict(cpu, amp) : p_(d.p_) {} - _data(int count, void* d) restrict(cpu, amp) + _data(int, void* d) restrict(cpu, amp) : p_(static_cast(d)) {} template _data(const _data& d) restrict(cpu, amp) @@ -29,20 +30,20 @@ class _data { explicit _data(T* t) restrict(cpu, amp) { p_ = t; } T* get(void) const restrict(cpu, amp) { return p_; } T* get_device_pointer() const restrict(cpu, amp) { return p_; } - std::shared_ptr get_av() const { return nullptr; } + std::shared_ptr get_av() const { return nullptr; } void reset() const {} - T* map_ptr(bool modify, size_t count, size_t offset) const { return nullptr; } - void unmap_ptr(const void* addr, bool modify, size_t count, size_t offset) const {} - void 
synchronize(bool modify = false) const {} - void get_cpu_access(bool modify = false) const {} - void copy(_data other, int, int, int) const {} - void write(const T*, int , int offset = 0, bool blocking = false) const {} - void read(T*, int , int offset = 0) const {} + T* map_ptr(bool, size_t, size_t) const { return nullptr; } + void unmap_ptr(const void*, bool, size_t, size_t) const {} + void synchronize(bool = false) const {} + void get_cpu_access(bool = false) const {} + void copy(_data, int, int, int) const {} + void write(const T*, int , int = 0, bool = false) const {} + void read(T*, int , int = 0) const {} void refresh() const {} void set_const() const {} access_type get_access() const { return access_type_auto; } - std::shared_ptr get_stage() const { return nullptr; } + std::shared_ptr get_stage() const { return nullptr; } private: T* p_; @@ -58,11 +59,11 @@ class _data_host { : mm(std::make_shared(count*sizeof(T), const_cast(src))), isArray(false) {} - _data_host(std::shared_ptr av, std::shared_ptr stage, int count, + _data_host(std::shared_ptr av, std::shared_ptr stage, int count, access_type mode) : mm(std::make_shared(av, stage, count*sizeof(T), mode)), isArray(true) {} - _data_host(std::shared_ptr av, std::shared_ptr stage, int count, + _data_host(std::shared_ptr av, std::shared_ptr stage, int count, void* device_pointer, access_type mode) : mm(std::make_shared(av, stage, count*sizeof(T), device_pointer, mode)), isArray(true) {} @@ -79,8 +80,8 @@ class _data_host { size_t size() const { return mm->count; } void reset() const { mm.reset(); } void get_cpu_access(bool modify = false) const { mm->get_cpu_access(modify); } - std::shared_ptr get_av() const { return mm->master; } - std::shared_ptr get_stage() const { return mm->stage; } + std::shared_ptr get_av() const { return mm->master; } + std::shared_ptr get_stage() const { return mm->stage; } access_type get_access() const { return mm->mode; } void copy(_data_host other, int src_offset, int dst_offset, int size) const { mm->copy(other.mm.get(), src_offset * sizeof(T), dst_offset * sizeof(T), size * sizeof(T)); @@ -95,10 +96,10 @@ class _data_host { return (T*)mm->map(count * sizeof(T), offset * sizeof(T), modify); } void unmap_ptr(const void* addr, bool modify, size_t count, size_t offset) const { return mm->unmap(const_cast(addr), count * sizeof(T), offset * sizeof(T), modify); } - void sync_to(std::shared_ptr pQueue) const { mm->sync(pQueue, false); } + void sync_to(std::shared_ptr pQueue) const { mm->sync(pQueue, false); } - explicit _data_host(typename std::remove_const::type* t) {} + explicit _data_host(typename std::remove_const::type*) {} }; -} // namespace Kalmar +} // namespace detail /** \endcond */ diff --git a/include/kalmar_exception.h b/include/kalmar_exception.h index 55d7cb82c2f..c8d8390d03f 100644 --- a/include/kalmar_exception.h +++ b/include/kalmar_exception.h @@ -10,7 +10,7 @@ #include #include -namespace Kalmar { +namespace detail { #ifndef E_FAIL #define E_FAIL 0x80004005 @@ -59,5 +59,5 @@ class accelerator_view_removed : public runtime_exception HRESULT get_view_removed_reason() const throw() { return get_error_code(); } }; -} // namespace Kalmar +} // namespace detail diff --git a/include/kalmar_index.h b/include/kalmar_index.h index c5e77478c46..8eed92aead3 100644 --- a/include/kalmar_index.h +++ b/include/kalmar_index.h @@ -10,7 +10,7 @@ namespace hc { template class extent; } // namespace hc -namespace Kalmar { +namespace detail { /** \cond HIDDEN_SYMBOLS */ template struct __indices {}; @@ -206,7 
+206,7 @@ struct amp_helper<1, _Tp1, _Tp2> return idx[0] >= 0 && ext[0] > 0 && (idx[0] + ext[0]) <= ext2[0] ; } - static int inline flatten(const _Tp1& idx, const _Tp2& ext) restrict(amp,cpu) { + static int inline flatten(const _Tp1& idx, const _Tp2&) restrict(amp,cpu) { return idx[0]; } static void inline minus(const _Tp1& idx, _Tp2& ext) restrict(amp,cpu) { @@ -448,17 +448,6 @@ class index { template friend class hc::extent; template friend struct index_helper; template friend struct amp_helper; - -public: - __attribute__((annotate("__cxxamp_opencl_index"))) - void __cxxamp_opencl_index() restrict(amp, cpu) -#if __KALMAR_ACCELERATOR__ == 1 - { - index_helper>::set(*this); - } -#else - ; -#endif }; /////////////////////////////////////////////////////////////////////////////// @@ -581,5 +570,5 @@ index operator%(int value, const index& idx) restrict(amp,cpu) { /** @} */ -} // namespace Kalmar +} // namespace detail diff --git a/include/kalmar_launch.h b/include/kalmar_launch.h index 28cf1134dd0..0cdc8f2e65f 100644 --- a/include/kalmar_launch.h +++ b/include/kalmar_launch.h @@ -35,30 +35,7 @@ namespace hc } /** \cond HIDDEN_SYMBOLS */ -namespace Kalmar { - -template -inline -void append_kernel( - const std::shared_ptr& pQueue, const Kernel& f, void* kernel) -{ - Kalmar::BufferArgumentsAppender vis(pQueue, kernel); - Kalmar::Serialize s(&vis); - //f.__cxxamp_serialize(s); -} - -// template -// inline -// std::shared_ptr get_available_que(const Kernel& f) -// { -// Kalmar::QueueSearcher ser; -// Kalmar::Serialize s(&ser); -// f.__cxxamp_serialize(s); -// if (ser.get_que()) -// return ser.get_que(); -// else -// return getContext()->auto_select(); -// } +namespace detail { struct Indexer { template @@ -87,11 +64,14 @@ template struct Kernel_emitter { static __attribute__((used, annotate("__HCC_KERNEL__"))) - void entry_point(Kernel f) restrict(cpu, amp) + void entry_point(Kernel f) [[cpu]][[hc]] { - #if __KALMAR_ACCELERATOR__ != 0 + #if __HCC_ACCELERATOR__ != 0 Index tmp = Indexer{}; f(tmp); + #else + struct { void operator()(const Kernel&) {} } tmp{}; + tmp(f); #endif } }; @@ -193,13 +173,18 @@ using IndexType = typename Index_type::index_type; template inline void* make_registered_kernel( - const std::shared_ptr& q, const Kernel& f) + const std::shared_ptr& q, const Kernel& f) { - using K = Kalmar::Kernel_emitter, Kernel>; + struct Deleter { + void operator()(void* p) const { delete static_cast(p); } + }; + using K = detail::Kernel_emitter, Kernel>; + + std::unique_ptr tmp{ + new Kernel{f}, [](void* p) { delete static_cast(p); }}; void *kernel{CLAMP::CreateKernel( - linker_name_for(), q.get(), &f, sizeof(Kernel))}; - append_kernel(q, f, kernel); + linker_name_for(), q.get(), std::move(tmp), sizeof(Kernel))}; return kernel; } @@ -236,22 +221,25 @@ inline std::pair< std::array, std::array> dimensions(const Domain& domain) -{ +{ // TODO: optimise. 
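The Indexer/Kernel_emitter machinery above is a small trampoline: Indexer is a tag whose templated conversion operator manufactures whatever index type the user functor consumes, so the emitted kernel body reduces to `f(Indexer{})`. Below is a self-contained sketch of the pattern; the names and the trivial host-side index are illustrative, standing in for the device-side work-item dispatch IDs.

```cpp
#include <cstdio>

// Tag whose conversion operator builds whatever index type the kernel wants.
// On the device this would be populated from the dispatch IDs; here it just
// default-constructs, which is enough to show the mechanism.
struct Indexer {
    template<typename Index>
    operator Index() const { return Index{}; }
};

// Mirrors Kernel_emitter<Index, Kernel>::entry_point in shape only.
template<typename Index, typename Kernel>
void entry_point(Kernel f)
{
    Index tmp = Indexer{}; // the conversion operator selects Index
    f(tmp);                // invoke the user functor with that index
}

struct index1 { int value = 0; }; // stand-in for a 1D hc index

int main()
{
    entry_point<index1>([](index1 i) { std::printf("%d\n", i.value); });
}
```

The benefit of the tag is that entry_point never needs to know how a given Index is constructed; any index type the functor accepts can be materialised through the same conversion path.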
using R = std::pair< std::array, std::array>; R r{}; - for (auto i = 0; i != domain.rank; ++i) r.first[i] = domain[i]; - r.second = local_dimensions(domain); + auto tmp = local_dimensions(domain); + for (auto i = 0; i != Domain::rank; ++i) { + r.first[i] = domain[i]; + r.second[i] = tmp[i]; + } return r; } template inline -std::shared_ptr launch_kernel_async( - const std::shared_ptr& q, +std::shared_ptr launch_kernel_async( + const std::shared_ptr& q, const Domain& domain, const Kernel& f) { @@ -267,7 +255,7 @@ std::shared_ptr launch_kernel_async( template inline void launch_kernel( - const std::shared_ptr& q, + const std::shared_ptr& q, const Domain& domain, const Kernel& f) { @@ -283,10 +271,9 @@ void launch_kernel( template inline void launch_kernel_with_dynamic_group_memory( - const std::shared_ptr& q, + const std::shared_ptr& q, const Domain& domain, - const Kernel& f, - std::size_t dynamic_group_memory_size) + const Kernel& f) { const auto dims{dimensions(domain)}; @@ -300,8 +287,8 @@ void launch_kernel_with_dynamic_group_memory( template inline -std::shared_ptr launch_kernel_with_dynamic_group_memory_async( - const std::shared_ptr& q, +std::shared_ptr launch_kernel_with_dynamic_group_memory_async( + const std::shared_ptr& q, const Domain& domain, const Kernel& f) { @@ -314,5 +301,5 @@ std::shared_ptr launch_kernel_with_dynamic_group_memory_async( dims.second.data(), domain.get_dynamic_group_segment_size()); } -} // namespace Kalmar +} // namespace detail /** \endcond */ diff --git a/include/kalmar_math.h b/include/kalmar_math.h index f96b422cf53..3355bf62bac 100644 --- a/include/kalmar_math.h +++ b/include/kalmar_math.h @@ -7,292 +7,295 @@ #pragma once +#include "hc_defines.h" + #include #include -extern "C" __fp16 __hc_acos_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_acos_half(_Float16 x) restrict(amp); extern "C" float __hc_acos(float x) restrict(amp); extern "C" double __hc_acos_double(double x) restrict(amp); -extern "C" __fp16 __hc_acosh_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_acosh_half(_Float16 x) restrict(amp); extern "C" float __hc_acosh(float x) restrict(amp); extern "C" double __hc_acosh_double(double x) restrict(amp); -extern "C" __fp16 __hc_asin_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_asin_half(_Float16 x) restrict(amp); extern "C" float __hc_asin(float x) restrict(amp); extern "C" double __hc_asin_double(double x) restrict(amp); -extern "C" __fp16 __hc_asinh_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_asinh_half(_Float16 x) restrict(amp); extern "C" float __hc_asinh(float x) restrict(amp); extern "C" double __hc_asinh_double(double x) restrict(amp); -extern "C" __fp16 __hc_atan_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_atan_half(_Float16 x) restrict(amp); extern "C" float __hc_atan(float x) restrict(amp); extern "C" double __hc_atan_double(double x) restrict(amp); -extern "C" __fp16 __hc_atanh_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_atanh_half(_Float16 x) restrict(amp); extern "C" float __hc_atanh(float x) restrict(amp); extern "C" double __hc_atanh_double(double x) restrict(amp); -extern "C" __fp16 __hc_atan2_half(__fp16 y, __fp16 x) restrict(amp); +extern "C" _Float16 __hc_atan2_half(_Float16 y, _Float16 x) restrict(amp); extern "C" float __hc_atan2(float y, float x) restrict(amp); extern "C" double __hc_atan2_double(double y, double x) restrict(amp); -extern "C" __fp16 __hc_cbrt_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_cbrt_half(_Float16 x) restrict(amp); extern "C" 
float __hc_cbrt(float x) restrict(amp); extern "C" double __hc_cbrt_double(double x) restrict(amp); -extern "C" __fp16 __hc_ceil_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_ceil_half(_Float16 x) restrict(amp); extern "C" float __hc_ceil(float x) restrict(amp); extern "C" double __hc_ceil_double(double x) restrict(amp); -extern "C" __fp16 __hc_copysign_half(__fp16 x, __fp16 y) restrict(amp); +extern "C" _Float16 __hc_copysign_half(_Float16 x, _Float16 y) restrict(amp); extern "C" float __hc_copysign(float x, float y) restrict(amp); extern "C" double __hc_copysign_double(double x, double y) restrict(amp); -extern "C" __fp16 __hc_cos_half(__fp16 x) restrict(amp); -extern "C" __fp16 __hc_cos_native_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_cos_half(_Float16 x) restrict(amp); +extern "C" _Float16 __hc_cos_native_half(_Float16 x) restrict(amp); extern "C" float __hc_cos(float x) restrict(amp); extern "C" float __hc_cos_native(float x) restrict(amp); extern "C" double __hc_cos_double(double x) restrict(amp); -extern "C" __fp16 __hc_cosh_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_cosh_half(_Float16 x) restrict(amp); extern "C" float __hc_cosh(float x) restrict(amp); extern "C" double __hc_cosh_double(double x) restrict(amp); -extern "C" __fp16 __hc_cospi_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_cospi_half(_Float16 x) restrict(amp); extern "C" float __hc_cospi(float x) restrict(amp); extern "C" double __hc_cospi_double(double x) restrict(amp); -extern "C" __fp16 __hc_erf_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_erf_half(_Float16 x) restrict(amp); extern "C" float __hc_erf(float x) restrict(amp); extern "C" double __hc_erf_double(double x) restrict(amp); -extern "C" __fp16 __hc_erfc_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_erfc_half(_Float16 x) restrict(amp); extern "C" float __hc_erfc(float x) restrict(amp); extern "C" double __hc_erfc_double(double x) restrict(amp); -extern "C" __fp16 __hc_erfcinv_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_erfcinv_half(_Float16 x) restrict(amp); extern "C" float __hc_erfcinv(float x) restrict(amp); extern "C" double __hc_erfcinv_double(double x) restrict(amp); -extern "C" __fp16 __hc_erfinv_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_erfinv_half(_Float16 x) restrict(amp); extern "C" float __hc_erfinv(float x) restrict(amp); extern "C" double __hc_erfinv_double(double x) restrict(amp); -extern "C" __fp16 __hc_exp_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_exp_half(_Float16 x) restrict(amp); extern "C" float __hc_exp(float x) restrict(amp); extern "C" double __hc_exp_double(double x) restrict(amp); -extern "C" __fp16 __hc_exp10_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_exp10_half(_Float16 x) restrict(amp); extern "C" float __hc_exp10(float x) restrict(amp); extern "C" double __hc_exp10_double(double x) restrict(amp); -extern "C" __fp16 __hc_exp2_native_half(__fp16 x) restrict(amp); -extern "C" __fp16 __hc_exp2_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_exp2_native_half(_Float16 x) restrict(amp); +extern "C" _Float16 __hc_exp2_half(_Float16 x) restrict(amp); extern "C" float __hc_exp2(float x) restrict(amp); extern "C" float __hc_exp2_native(float x) restrict(amp); extern "C" double __hc_exp2_double(double x) restrict(amp); -extern "C" __fp16 __hc_expm1_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_expm1_half(_Float16 x) restrict(amp); extern "C" float __hc_expm1(float x) restrict(amp); extern "C" double 
__hc_expm1_double(double x) restrict(amp); -extern "C" __fp16 __hc_fabs_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_fabs_half(_Float16 x) restrict(amp); extern "C" float __hc_fabs(float x) restrict(amp); extern "C" double __hc_fabs_double(double x) restrict(amp); -extern "C" __fp16 __hc_fdim_half(__fp16 x, __fp16 y) restrict(amp); +extern "C" _Float16 __hc_fdim_half(_Float16 x, _Float16 y) restrict(amp); extern "C" float __hc_fdim(float x, float y) restrict(amp); extern "C" double __hc_fdim_double(double x, double y) restrict(amp); -extern "C" __fp16 __hc_floor_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_floor_half(_Float16 x) restrict(amp); extern "C" float __hc_floor(float x) restrict(amp); extern "C" double __hc_floor_double(double x) restrict(amp); -extern "C" __fp16 __hc_fma_half(__fp16 x, __fp16 y, __fp16 z) restrict(amp); +extern "C" _Float16 __hc_fma_half( + _Float16 x, _Float16 y, _Float16 z) restrict(amp); extern "C" float __hc_fma(float x, float y, float z) restrict(amp); extern "C" double __hc_fma_double(double x, double y, double z) restrict(amp); -extern "C" __fp16 __hc_fmax_half(__fp16 x, __fp16 y) restrict(amp); +extern "C" _Float16 __hc_fmax_half(_Float16 x, _Float16 y) restrict(amp); extern "C" float __hc_fmax(float x, float y) restrict(amp); extern "C" double __hc_fmax_double(double x, double y) restrict(amp); -extern "C" __fp16 __hc_fmin_half(__fp16 x, __fp16 y) restrict(amp); +extern "C" _Float16 __hc_fmin_half(_Float16 x, _Float16 y) restrict(amp); extern "C" float __hc_fmin(float x, float y) restrict(amp); extern "C" double __hc_fmin_double(double x, double y) restrict(amp); -extern "C" __fp16 __hc_fmod_half(__fp16 x, __fp16 y) restrict(amp); +extern "C" _Float16 __hc_fmod_half(_Float16 x, _Float16 y) restrict(amp); extern "C" float __hc_fmod(float x, float y) restrict(amp); extern "C" double __hc_fmod_double(double x, double y) restrict(amp); -extern "C" int __hc_fpclassify_half(__fp16 x) restrict(amp); +extern "C" int __hc_fpclassify_half(_Float16 x) restrict(amp); extern "C" int __hc_fpclassify(float x) restrict(amp); extern "C" int __hc_fpclassify_double(double x) restrict(amp); -extern "C" __fp16 __hc_frexp_half(__fp16 x, int *exp) restrict(amp); +extern "C" _Float16 __hc_frexp_half(_Float16 x, int *exp) restrict(amp); extern "C" float __hc_frexp(float x, int *exp) restrict(amp); extern "C" double __hc_frexp_double(double x, int *exp) restrict(amp); -extern "C" __fp16 __hc_hypot_half(__fp16 x, __fp16 y) restrict(amp); +extern "C" _Float16 __hc_hypot_half(_Float16 x, _Float16 y) restrict(amp); extern "C" float __hc_hypot(float x, float y) restrict(amp); extern "C" double __hc_hypot_double(double x, double y) restrict(amp); -extern "C" int __hc_ilogb_half(__fp16 x) restrict(amp); +extern "C" int __hc_ilogb_half(_Float16 x) restrict(amp); extern "C" int __hc_ilogb(float x) restrict(amp); extern "C" int __hc_ilogb_double(double x) restrict(amp); -extern "C" int __hc_isfinite_half(__fp16 x) restrict(amp); +extern "C" int __hc_isfinite_half(_Float16 x) restrict(amp); extern "C" int __hc_isfinite(float x) restrict(amp); extern "C" int __hc_isfinite_double(double x) restrict(amp); -extern "C" int __hc_isinf_half(__fp16 x) restrict(amp); +extern "C" int __hc_isinf_half(_Float16 x) restrict(amp); extern "C" int __hc_isinf(float x) restrict(amp); extern "C" int __hc_isinf_double(double x) restrict(amp); -extern "C" int __hc_isnan_half(__fp16 x) restrict(amp); +extern "C" int __hc_isnan_half(_Float16 x) restrict(amp); extern "C" int __hc_isnan(float 
x) restrict(amp); extern "C" int __hc_isnan_double(double x) restrict(amp); -extern "C" int __hc_isnormal_half(__fp16 x) restrict(amp); +extern "C" int __hc_isnormal_half(_Float16 x) restrict(amp); extern "C" int __hc_isnormal(float x) restrict(amp); extern "C" int __hc_isnormal_double(double x) restrict(amp); -extern "C" __fp16 __hc_ldexp_half(__fp16 x, std::int16_t exp) [[hc]]; +extern "C" _Float16 __hc_ldexp_half(_Float16 x, std::int16_t exp) [[hc]]; extern "C" float __hc_ldexp(float x, int exp) restrict(amp); extern "C" double __hc_ldexp_double(double x, int exp) restrict(amp); -extern "C" __fp16 __hc_lgamma_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_lgamma_half(_Float16 x) restrict(amp); extern "C" float __hc_lgamma(float x) restrict(amp); extern "C" double __hc_lgamma_double(double x) restrict(amp); -extern "C" __fp16 __hc_log_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_log_half(_Float16 x) restrict(amp); extern "C" float __hc_log(float x) restrict(amp); extern "C" float __hc_log_native(float x) restrict(amp); extern "C" double __hc_log_double(double x) restrict(amp); -extern "C" __fp16 __hc_log10_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_log10_half(_Float16 x) restrict(amp); extern "C" float __hc_log10(float x) restrict(amp); extern "C" double __hc_log10_double(double x) restrict(amp); -extern "C" __fp16 __hc_log2_half(__fp16 x) restrict(amp); -extern "C" __fp16 __hc_log2_native_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_log2_half(_Float16 x) restrict(amp); +extern "C" _Float16 __hc_log2_native_half(_Float16 x) restrict(amp); extern "C" float __hc_log2(float x) restrict(amp); extern "C" float __hc_log2_native(float x) restrict(amp); extern "C" double __hc_log2_double(double x) restrict(amp); -extern "C" __fp16 __hc_log1p_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_log1p_half(_Float16 x) restrict(amp); extern "C" float __hc_log1p(float x) restrict(amp); extern "C" double __hc_log1p_double(double x) restrict(amp); -extern "C" __fp16 __hc_logb_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_logb_half(_Float16 x) restrict(amp); extern "C" float __hc_logb(float x) restrict(amp); extern "C" double __hc_logb_double(double x) restrict(amp); -extern "C" __fp16 __hc_modf_half(__fp16 x, __fp16 *iptr) restrict(amp); +extern "C" _Float16 __hc_modf_half(_Float16 x, _Float16 *iptr) restrict(amp); extern "C" float __hc_modf(float x, float *iptr) restrict(amp); extern "C" double __hc_modf_double(double x, double *iptr) restrict(amp); -extern "C" __fp16 __hc_nan_half(int tagp) restrict(amp); +extern "C" _Float16 __hc_nan_half(int tagp) restrict(amp); extern "C" float __hc_nan(int tagp) restrict(amp); extern "C" double __hc_nan_double(unsigned long tagp) restrict(amp); -extern "C" __fp16 __hc_nearbyint_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_nearbyint_half(_Float16 x) restrict(amp); extern "C" float __hc_nearbyint(float x) restrict(amp); extern "C" double __hc_nearbyint_double(double x) restrict(amp); -extern "C" __fp16 __hc_nextafter_half(__fp16 x, __fp16 y) restrict(amp); +extern "C" _Float16 __hc_nextafter_half(_Float16 x, _Float16 y) restrict(amp); extern "C" float __hc_nextafter(float x, float y) restrict(amp); extern "C" double __hc_nextafter_double(double x, double y) restrict(amp); -extern "C" __fp16 __hc_pow_half(__fp16 x, __fp16 y) restrict(amp); +extern "C" _Float16 __hc_pow_half(_Float16 x, _Float16 y) restrict(amp); extern "C" float __hc_pow(float x, float y) restrict(amp); extern "C" double 
__hc_pow_double(double x, double y) restrict(amp); -extern "C" __fp16 __hc_rcbrt_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_rcbrt_half(_Float16 x) restrict(amp); extern "C" float __hc_rcbrt(float x) restrict(amp); extern "C" double __hc_rcbrt_double(double x) restrict(amp); // TODO: rcp is implementation only, it does not have a public interface. -extern "C" __fp16 __hc_rcp_native_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_rcp_native_half(_Float16 x) restrict(amp); extern "C" float __hc_rcp_native(float x) restrict(amp); -extern "C" __fp16 __hc_remainder_half(__fp16 x, __fp16 y) restrict(amp); +extern "C" _Float16 __hc_remainder_half(_Float16 x, _Float16 y) restrict(amp); extern "C" float __hc_remainder(float x, float y) restrict(amp); extern "C" double __hc_remainder_double(double x, double y) restrict(amp); -extern "C" __fp16 __hc_remquo_half(__fp16 x, __fp16 y, int *quo) restrict(amp); +extern "C" _Float16 __hc_remquo_half(_Float16 x, _Float16 y, int *quo) restrict(amp); extern "C" float __hc_remquo(float x, float y, int *quo) restrict(amp); extern "C" double __hc_remquo_double(double x, double y, int *quo) restrict(amp); -extern "C" __fp16 __hc_round_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_round_half(_Float16 x) restrict(amp); extern "C" float __hc_round(float x) restrict(amp); extern "C" double __hc_round_double(double x) restrict(amp); -extern "C" __fp16 __hc_rsqrt_half(__fp16 x) restrict(amp); -extern "C" __fp16 __hc_rsqrt_native_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_rsqrt_half(_Float16 x) restrict(amp); +extern "C" _Float16 __hc_rsqrt_native_half(_Float16 x) restrict(amp); extern "C" float __hc_rsqrt(float x) restrict(amp); extern "C" float __hc_rsqrt_native(float x) restrict(amp); extern "C" double __hc_rsqrt_double(double x) restrict(amp); -extern "C" __fp16 __hc_scalb_half(__fp16 x, __fp16 exp) restrict(amp); +extern "C" _Float16 __hc_scalb_half(_Float16 x, _Float16 exp) restrict(amp); extern "C" float __hc_scalb(float x, float exp) restrict(amp); extern "C" double __hc_scalb_double(double x, double exp) restrict(amp); -extern "C" __fp16 __hc_scalbn_half(__fp16 x, int exp) restrict(amp); +extern "C" _Float16 __hc_scalbn_half(_Float16 x, int exp) restrict(amp); extern "C" float __hc_scalbn(float x, int exp) restrict(amp); extern "C" double __hc_scalbn_double(double x, int exp) restrict(amp); -extern "C" __fp16 __hc_sinpi_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_sinpi_half(_Float16 x) restrict(amp); extern "C" float __hc_sinpi(float x) restrict(amp); extern "C" double __hc_sinpi_double(double x) restrict(amp); -extern "C" int __hc_signbit_half(__fp16 x) restrict(amp); +extern "C" int __hc_signbit_half(_Float16 x) restrict(amp); extern "C" int __hc_signbit(float x) restrict(amp); extern "C" int __hc_signbit_double(double x) restrict(amp); -extern "C" __fp16 __hc_sin_half(__fp16 x) restrict(amp); -extern "C" __fp16 __hc_sin_native_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_sin_half(_Float16 x) restrict(amp); +extern "C" _Float16 __hc_sin_native_half(_Float16 x) restrict(amp); extern "C" float __hc_sin(float x) restrict(amp); extern "C" float __hc_sin_native(float x) restrict(amp); extern "C" double __hc_sin_double(double x) restrict(amp); -extern "C" __fp16 __hc_sincos_half(__fp16 x, __fp16 *c) restrict(amp); +extern "C" _Float16 __hc_sincos_half(_Float16 x, _Float16 *c) restrict(amp); extern "C" float __hc_sincos(float x, float *c) restrict(amp); extern "C" double __hc_sincos_double(double x, 
double *c) restrict(amp); -extern "C" __fp16 __hc_sinh_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_sinh_half(_Float16 x) restrict(amp); extern "C" float __hc_sinh(float x) restrict(amp); extern "C" double __hc_sinh_double(double x) restrict(amp); -extern "C" __fp16 __hc_sqrt_half(__fp16 x) restrict(amp); -extern "C" __fp16 __hc_sqrt_native_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_sqrt_half(_Float16 x) restrict(amp); +extern "C" _Float16 __hc_sqrt_native_half(_Float16 x) restrict(amp); extern "C" float __hc_sqrt(float x) restrict(amp); extern "C" float __hc_sqrt_native(float x) restrict(amp); extern "C" double __hc_sqrt_double(double x) restrict(amp); -extern "C" __fp16 __hc_tgamma_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_tgamma_half(_Float16 x) restrict(amp); extern "C" float __hc_tgamma(float x) restrict(amp); extern "C" double __hc_tgamma_double(double x) restrict(amp); -extern "C" __fp16 __hc_tan_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_tan_half(_Float16 x) restrict(amp); extern "C" float __hc_tan(float x) restrict(amp); extern "C" double __hc_tan_double(double x) restrict(amp); -extern "C" __fp16 __hc_tanh_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_tanh_half(_Float16 x) restrict(amp); extern "C" float __hc_tanh(float x) restrict(amp); extern "C" double __hc_tanh_double(double x) restrict(amp); -extern "C" __fp16 __hc_tanpi_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_tanpi_half(_Float16 x) restrict(amp); extern "C" float __hc_tanpi(float x) restrict(amp); extern "C" double __hc_tanpi_double(double x) restrict(amp); -extern "C" __fp16 __hc_trunc_half(__fp16 x) restrict(amp); +extern "C" _Float16 __hc_trunc_half(_Float16 x) restrict(amp); extern "C" float __hc_trunc(float x) restrict(amp); extern "C" double __hc_trunc_double(double x) restrict(amp); #define HCC_MATH_LIB_FN inline __attribute__((used, hc)) -namespace Kalmar +namespace detail { namespace fast_math { @@ -364,7 +367,7 @@ namespace Kalmar float acosf(float x) { return __hc_acos(x); } HCC_MATH_LIB_FN - __fp16 acos(__fp16 x) { return __hc_acos_half(x); } + _Float16 acos(_Float16 x) { return __hc_acos_half(x); } HCC_MATH_LIB_FN float acos(float x) { return fast_math::acosf(x); } @@ -373,7 +376,7 @@ namespace Kalmar float asinf(float x) { return __hc_asin(x); } HCC_MATH_LIB_FN - __fp16 asin(__fp16 x) { return __hc_asin_half(x); } + _Float16 asin(_Float16 x) { return __hc_asin_half(x); } HCC_MATH_LIB_FN float asin(float x) { return fast_math::asinf(x); } @@ -382,7 +385,7 @@ namespace Kalmar float atanf(float x) { return __hc_atan(x); } HCC_MATH_LIB_FN - __fp16 atan(__fp16 x) { return __hc_atan_half(x); } + _Float16 atan(_Float16 x) { return __hc_atan_half(x); } HCC_MATH_LIB_FN float atan(float x) { return fast_math::atanf(x); } @@ -391,7 +394,7 @@ namespace Kalmar float atan2f(float y, float x) { return __hc_atan2(y, x); } HCC_MATH_LIB_FN - __fp16 atan2(__fp16 y, __fp16 x) { return __hc_atan2_half(y, x); } + _Float16 atan2(_Float16 y, _Float16 x) { return __hc_atan2_half(y, x); } HCC_MATH_LIB_FN float atan2(float y, float x) { return fast_math::atan2f(y, x); } @@ -400,7 +403,7 @@ namespace Kalmar float ceilf(float x) { return __hc_ceil(x); } HCC_MATH_LIB_FN - __fp16 ceil(__fp16 x) { return __hc_ceil_half(x); } + _Float16 ceil(_Float16 x) { return __hc_ceil_half(x); } HCC_MATH_LIB_FN float ceil(float x) { return fast_math::ceilf(x); } @@ -409,7 +412,7 @@ namespace Kalmar float cosf(float x) { return __hc_cos_native(x); } HCC_MATH_LIB_FN - __fp16 
cos(__fp16 x) { return __hc_cos_native_half(x); } + _Float16 cos(_Float16 x) { return __hc_cos_native_half(x); } HCC_MATH_LIB_FN float cos(float x) { return fast_math::cosf(x); } @@ -418,7 +421,7 @@ namespace Kalmar float coshf(float x) { return __hc_cosh(x); } HCC_MATH_LIB_FN - __fp16 cosh(__fp16 x) { return __hc_cosh_half(x); } + _Float16 cosh(_Float16 x) { return __hc_cosh_half(x); } HCC_MATH_LIB_FN float cosh(float x) { return fast_math::coshf(x); } @@ -427,7 +430,7 @@ namespace Kalmar float expf(float x) { return __hc_exp2_native(M_LOG2E * x); } HCC_MATH_LIB_FN - __fp16 exp(__fp16 x) { return __hc_exp2_native_half(M_LOG2E * x); } + _Float16 exp(_Float16 x) { return __hc_exp2_native_half(M_LOG2E * x); } HCC_MATH_LIB_FN float exp(float x) { return fast_math::expf(x); } @@ -436,7 +439,7 @@ namespace Kalmar float exp2f(float x) { return __hc_exp2_native(x); } HCC_MATH_LIB_FN - __fp16 exp2(__fp16 x) { return __hc_exp2_native_half(x); } + _Float16 exp2(_Float16 x) { return __hc_exp2_native_half(x); } HCC_MATH_LIB_FN float exp2(float x) { return fast_math::exp2f(x); } @@ -445,7 +448,7 @@ namespace Kalmar float fabsf(float x) { return __hc_fabs(x); } HCC_MATH_LIB_FN - __fp16 fabs(__fp16 x) { return __hc_fabs_half(x); } + _Float16 fabs(_Float16 x) { return __hc_fabs_half(x); } HCC_MATH_LIB_FN float fabs(float x) { return fast_math::fabsf(x); } @@ -454,7 +457,7 @@ namespace Kalmar float floorf(float x) { return __hc_floor(x); } HCC_MATH_LIB_FN - __fp16 floor(__fp16 x) { return __hc_floor_half(x); } + _Float16 floor(_Float16 x) { return __hc_floor_half(x); } HCC_MATH_LIB_FN float floor(float x) { return fast_math::floorf(x); } @@ -463,7 +466,7 @@ namespace Kalmar float fmaxf(float x, float y) { return __hc_fmax(x, y); } HCC_MATH_LIB_FN - __fp16 fmax(__fp16 x, __fp16 y) { return __hc_fmax_half(x, y); } + _Float16 fmax(_Float16 x, _Float16 y) { return __hc_fmax_half(x, y); } HCC_MATH_LIB_FN float fmax(float x, float y) { return fast_math::fmaxf(x, y); } @@ -472,7 +475,7 @@ namespace Kalmar float fminf(float x, float y) { return __hc_fmin(x, y); } HCC_MATH_LIB_FN - __fp16 fmin(__fp16 x, __fp16 y) { return __hc_fmin_half(x, y); } + _Float16 fmin(_Float16 x, _Float16 y) { return __hc_fmin_half(x, y); } HCC_MATH_LIB_FN float fmin(float x, float y) { return fast_math::fminf(x, y); } @@ -481,7 +484,7 @@ namespace Kalmar float fmodf(float x, float y) { return __hc_fmod(x, y); } HCC_MATH_LIB_FN - __fp16 fmod(__fp16 x, __fp16 y) { return __hc_fmod_half(x, y); } + _Float16 fmod(_Float16 x, _Float16 y) { return __hc_fmod_half(x, y); } HCC_MATH_LIB_FN float fmod(float x, float y) { return fast_math::fmodf(x, y); } @@ -490,25 +493,25 @@ namespace Kalmar float frexpf(float x, int *exp) { return __hc_frexp(x, exp); } HCC_MATH_LIB_FN - __fp16 frexp(__fp16 x, int *exp) { return __hc_frexp_half(x, exp); } + _Float16 frexp(_Float16 x, int *exp) { return __hc_frexp_half(x, exp); } HCC_MATH_LIB_FN float frexp(float x, int *exp) { return fast_math::frexpf(x, exp); } HCC_MATH_LIB_FN - int isfinite(__fp16 x) { return __hc_isfinite_half(x); } + int isfinite(_Float16 x) { return __hc_isfinite_half(x); } HCC_MATH_LIB_FN int isfinite(float x) { return __hc_isfinite(x); } HCC_MATH_LIB_FN - int isinf(__fp16 x) { return __hc_isinf_half(x); } + int isinf(_Float16 x) { return __hc_isinf_half(x); } HCC_MATH_LIB_FN int isinf(float x) { return __hc_isinf(x); } HCC_MATH_LIB_FN - int isnan(__fp16 x) { return __hc_isnan_half(x); } + int isnan(_Float16 x) { return __hc_isnan_half(x); } HCC_MATH_LIB_FN int isnan(float x) { return 
__hc_isnan(x); } @@ -517,7 +520,7 @@ namespace Kalmar float ldexpf(float x, int exp) { return __hc_ldexp(x,exp); } HCC_MATH_LIB_FN - __fp16 ldexp(__fp16 x, std::uint16_t exp) + _Float16 ldexp(_Float16 x, std::uint16_t exp) { return __hc_ldexp_half(x, exp); } @@ -539,9 +542,10 @@ namespace Kalmar float logf(float x) { return __hc_log2_native(x) * M_RLOG2_E_F; } HCC_MATH_LIB_FN - __fp16 log(__fp16 x) + _Float16 log(_Float16 x) { - return __hc_log2_native_half(x) * static_cast<__fp16>(M_RLOG2_E_F); + return + __hc_log2_native_half(x) * static_cast<_Float16>(M_RLOG2_E_F); } HCC_MATH_LIB_FN @@ -551,9 +555,10 @@ namespace Kalmar float log10f(float x) { return __hc_log2_native(x) * M_RLOG2_10_F; } HCC_MATH_LIB_FN - __fp16 log10(__fp16 x) + _Float16 log10(_Float16 x) { - return __hc_log2_native_half(x) * static_cast<__fp16>(M_RLOG2_10_F); + return + __hc_log2_native_half(x) * static_cast<_Float16>(M_RLOG2_10_F); } HCC_MATH_LIB_FN @@ -563,7 +568,7 @@ namespace Kalmar float log2f(float x) { return __hc_log2_native(x); } HCC_MATH_LIB_FN - __fp16 log2(__fp16 x) { return __hc_log2_native_half(x); } + _Float16 log2(_Float16 x) { return __hc_log2_native_half(x); } HCC_MATH_LIB_FN float log2(float x) { return fast_math::log2f(x); } @@ -572,7 +577,7 @@ namespace Kalmar float modff(float x, float *iptr) { return __hc_modf(x, iptr); } HCC_MATH_LIB_FN - __fp16 modf(__fp16 x, __fp16 *iptr) { return __hc_modf_half(x, iptr); } + _Float16 modf(_Float16 x, _Float16 *iptr) { return __hc_modf_half(x, iptr); } HCC_MATH_LIB_FN @@ -582,7 +587,7 @@ namespace Kalmar float powf(float x, float y) { return __hc_pow(x, y); } HCC_MATH_LIB_FN - __fp16 pow(__fp16 x, __fp16 y) { return __hc_pow_half(x, y); } + _Float16 pow(_Float16 x, _Float16 y) { return __hc_pow_half(x, y); } HCC_MATH_LIB_FN float pow(float x, float y) { return fast_math::powf(x, y); } @@ -591,7 +596,7 @@ namespace Kalmar float roundf(float x) { return __hc_round(x); } HCC_MATH_LIB_FN - __fp16 round(__fp16 x) { return __hc_round_half(x); } + _Float16 round(_Float16 x) { return __hc_round_half(x); } HCC_MATH_LIB_FN float round(float x) { return fast_math::roundf(x); } @@ -600,7 +605,7 @@ namespace Kalmar float rsqrtf(float x) { return __hc_rsqrt_native(x); } HCC_MATH_LIB_FN - __fp16 rsqrt(__fp16 x) { return __hc_rsqrt_native_half(x); } + _Float16 rsqrt(_Float16 x) { return __hc_rsqrt_native_half(x); } HCC_MATH_LIB_FN float rsqrt(float x) { return fast_math::rsqrtf(x); } @@ -609,7 +614,7 @@ namespace Kalmar int signbitf(float x) { return __hc_signbit(x); } HCC_MATH_LIB_FN - int signbit(__fp16 x) { return __hc_signbit_half(x); } + int signbit(_Float16 x) { return __hc_signbit_half(x); } HCC_MATH_LIB_FN int signbit(float x) { return fast_math::signbitf(x); } @@ -618,7 +623,7 @@ namespace Kalmar float sinf(float x) { return __hc_sin_native(x); } HCC_MATH_LIB_FN - __fp16 sin(__fp16 x) { return __hc_sin_native_half(x); } + _Float16 sin(_Float16 x) { return __hc_sin_native_half(x); } HCC_MATH_LIB_FN float sin(float x) { return fast_math::sinf(x); } @@ -627,7 +632,7 @@ namespace Kalmar void sincosf(float x, float *s, float *c) { *s = __hc_sincos(x, c); } HCC_MATH_LIB_FN - void sincos(__fp16 x, __fp16 *s, __fp16 *c) + void sincos(_Float16 x, _Float16 *s, _Float16 *c) { *s = __hc_sincos_half(x, c); } @@ -642,7 +647,7 @@ namespace Kalmar float sinhf(float x) { return __hc_sinh(x); } HCC_MATH_LIB_FN - __fp16 sinh(__fp16 x) { return __hc_sinh_half(x); } + _Float16 sinh(_Float16 x) { return __hc_sinh_half(x); } HCC_MATH_LIB_FN float sinh(float x) { return 
fast_math::sinhf(x); } @@ -651,7 +656,7 @@ namespace Kalmar float sqrtf(float x) { return __hc_sqrt_native(x); } HCC_MATH_LIB_FN - __fp16 sqrt(__fp16 x) { return __hc_sqrt_native_half(x); } + _Float16 sqrt(_Float16 x) { return __hc_sqrt_native_half(x); } HCC_MATH_LIB_FN float sqrt(float x) { return fast_math::sqrtf(x); } @@ -660,7 +665,7 @@ namespace Kalmar float tanf(float x) { return __hc_tan(x); } HCC_MATH_LIB_FN - __fp16 tan(__fp16 x) + _Float16 tan(_Float16 x) { return __hc_sin_native_half(x) * __hc_rcp_native_half(__hc_cos_native_half(x)); @@ -673,7 +678,7 @@ namespace Kalmar float tanhf(float x) { return __hc_tanh(x); } HCC_MATH_LIB_FN - __fp16 tanh(__fp16 x) { return __hc_tanh_half(x); } + _Float16 tanh(_Float16 x) { return __hc_tanh_half(x); } HCC_MATH_LIB_FN float tanh(float x) { return fast_math::tanhf(x); } @@ -682,7 +687,7 @@ namespace Kalmar float truncf(float x) { return __hc_trunc(x); } HCC_MATH_LIB_FN - __fp16 trunc(__fp16 x) { return __hc_trunc_half(x); } + _Float16 trunc(_Float16 x) { return __hc_trunc_half(x); } HCC_MATH_LIB_FN float trunc(float x) { return fast_math::truncf(x); } @@ -798,7 +803,7 @@ namespace Kalmar float acosf(float x) { return __hc_acos(x); } HCC_MATH_LIB_FN - __fp16 acos(__fp16 x) { return __hc_acos_half(x); } + _Float16 acos(_Float16 x) { return __hc_acos_half(x); } HCC_MATH_LIB_FN float acos(float x) { return precise_math::acosf(x); } @@ -810,7 +815,7 @@ namespace Kalmar float acoshf(float x) { return __hc_acosh(x); } HCC_MATH_LIB_FN - __fp16 acosh(__fp16 x) { return __hc_acosh_half(x); } + _Float16 acosh(_Float16 x) { return __hc_acosh_half(x); } HCC_MATH_LIB_FN float acosh(float x) { return precise_math::acoshf(x); } @@ -822,7 +827,7 @@ namespace Kalmar float asinf(float x) { return __hc_asin(x); } HCC_MATH_LIB_FN - __fp16 asin(__fp16 x) { return __hc_asin_half(x); } + _Float16 asin(_Float16 x) { return __hc_asin_half(x); } HCC_MATH_LIB_FN float asin(float x) { return precise_math::asinf(x); } @@ -834,7 +839,7 @@ namespace Kalmar float asinhf(float x) { return __hc_asinh(x); } HCC_MATH_LIB_FN - __fp16 asinh(__fp16 x) { return __hc_asinh_half(x); } + _Float16 asinh(_Float16 x) { return __hc_asinh_half(x); } HCC_MATH_LIB_FN float asinh(float x) { return precise_math::asinhf(x); } @@ -846,7 +851,7 @@ namespace Kalmar float atanf(float x) { return __hc_atan(x); } HCC_MATH_LIB_FN - __fp16 atan(__fp16 x) { return __hc_atan_half(x); } + _Float16 atan(_Float16 x) { return __hc_atan_half(x); } HCC_MATH_LIB_FN float atan(float x) { return precise_math::atanf(x); } @@ -858,7 +863,7 @@ namespace Kalmar float atanhf(float x) { return __hc_atanh(x); } HCC_MATH_LIB_FN - __fp16 atanh(__fp16 x) { return __hc_atanh_half(x); } + _Float16 atanh(_Float16 x) { return __hc_atanh_half(x); } HCC_MATH_LIB_FN float atanh(float x) { return precise_math::atanhf(x); } @@ -870,7 +875,7 @@ namespace Kalmar float atan2f(float y, float x) { return __hc_atan2(y, x); } HCC_MATH_LIB_FN - __fp16 atan2(__fp16 x, __fp16 y) { return __hc_atan2_half(x, y); } + _Float16 atan2(_Float16 x, _Float16 y) { return __hc_atan2_half(x, y); } HCC_MATH_LIB_FN float atan2(float y, float x) { return precise_math::atan2f(y, x); } @@ -882,7 +887,7 @@ namespace Kalmar float cbrtf(float x) { return __hc_cbrt(x); } HCC_MATH_LIB_FN - __fp16 cbrt(__fp16 x) { return __hc_cbrt_half(x); } + _Float16 cbrt(_Float16 x) { return __hc_cbrt_half(x); } HCC_MATH_LIB_FN float cbrt(float x) { return precise_math::cbrtf(x); } @@ -894,7 +899,7 @@ namespace Kalmar float ceilf(float x) { return __hc_ceil(x); } 
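Each math function above is exposed as an overload set over _Float16, float, and double, which lets generic device code stay precision-agnostic. The following sketch shows the intended consumption pattern; the demo_math namespace and the placeholder bodies are stand-ins (the real header forwards to the __hc_* intrinsics), and compiling the _Float16 overload assumes a toolchain with native _Float16 support.

```cpp
// Stand-in overload set mirroring the fast_math/precise_math layout above.
// __builtin_sqrtf/__builtin_sqrt are placeholder bodies only.
namespace demo_math {
    inline _Float16 sqrt(_Float16 x)
    {
        return static_cast<_Float16>(__builtin_sqrtf(static_cast<float>(x)));
    }
    inline float  sqrt(float x)  { return __builtin_sqrtf(x); }
    inline double sqrt(double x) { return __builtin_sqrt(x); }
}

// Generic code picks the matching precision via overload resolution, with no
// casts and no suffixed names (sqrtf vs. sqrt).
template<typename T>
T hypotenuse(T x, T y)
{
    return demo_math::sqrt(x * x + y * y);
}

// hypotenuse(3.0f, 4.0f) resolves to the float overload,
// hypotenuse(3.0, 4.0) to the double overload.
```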
HCC_MATH_LIB_FN - __fp16 ceil(__fp16 x) { return __hc_ceil_half(x); } + _Float16 ceil(_Float16 x) { return __hc_ceil_half(x); } HCC_MATH_LIB_FN float ceil(float x) { return precise_math::ceilf(x); } @@ -906,7 +911,10 @@ namespace Kalmar float copysignf(float x, float y) { return __hc_copysign(x, y); } HCC_MATH_LIB_FN - __fp16 copysign(__fp16 x, __fp16 y) { return __hc_copysign_half(x, y); } + _Float16 copysign(_Float16 x, _Float16 y) + { + return __hc_copysign_half(x, y); + } HCC_MATH_LIB_FN float copysign(float x, float y) @@ -924,7 +932,7 @@ namespace Kalmar float cosf(float x) { return __hc_cos(x); } HCC_MATH_LIB_FN - __fp16 cos(__fp16 x) { return __hc_cos_half(x); } + _Float16 cos(_Float16 x) { return __hc_cos_half(x); } HCC_MATH_LIB_FN float cos(float x) { return precise_math::cosf(x); } @@ -936,7 +944,7 @@ namespace Kalmar float coshf(float x) { return __hc_cosh(x); } HCC_MATH_LIB_FN - __fp16 cosh(__fp16 x) { return __hc_cosh_half(x); } + _Float16 cosh(_Float16 x) { return __hc_cosh_half(x); } HCC_MATH_LIB_FN float cosh(float x) { return precise_math::coshf(x); } @@ -948,7 +956,7 @@ namespace Kalmar float cospif(float x) { return __hc_cospi(x); } HCC_MATH_LIB_FN - __fp16 cospi(__fp16 x) { return __hc_cospi_half(x); } + _Float16 cospi(_Float16 x) { return __hc_cospi_half(x); } HCC_MATH_LIB_FN float cospi(float x) { return precise_math::cospif(x); } @@ -960,7 +968,7 @@ namespace Kalmar float erff(float x) { return __hc_erf(x); } HCC_MATH_LIB_FN - __fp16 erf(__fp16 x) { return __hc_erf_half(x); } + _Float16 erf(_Float16 x) { return __hc_erf_half(x); } HCC_MATH_LIB_FN float erf(float x) { return precise_math::erff(x); } @@ -972,7 +980,7 @@ namespace Kalmar float erfcf(float x) { return __hc_erfc(x); } HCC_MATH_LIB_FN - __fp16 erfc(__fp16 x) { return __hc_erfc_half(x); } + _Float16 erfc(_Float16 x) { return __hc_erfc_half(x); } HCC_MATH_LIB_FN float erfc(float x) { return precise_math::erfcf(x); } @@ -984,7 +992,7 @@ namespace Kalmar float erfcinvf(float x) { return __hc_erfcinv(x); } HCC_MATH_LIB_FN - __fp16 erfcinv(__fp16 x) { return __hc_erfcinv_half(x); } + _Float16 erfcinv(_Float16 x) { return __hc_erfcinv_half(x); } HCC_MATH_LIB_FN float erfcinv(float x) { return precise_math::erfcinvf(x); } @@ -996,7 +1004,7 @@ namespace Kalmar float erfinvf(float x) { return __hc_erfinv(x); } HCC_MATH_LIB_FN - __fp16 erfinv(__fp16 x) { return __hc_erfinv_half(x); } + _Float16 erfinv(_Float16 x) { return __hc_erfinv_half(x); } HCC_MATH_LIB_FN float erfinv(float x) { return precise_math::erfinvf(x); } @@ -1008,7 +1016,7 @@ namespace Kalmar float expf(float x) { return __hc_exp(x); } HCC_MATH_LIB_FN - __fp16 exp(__fp16 x) { return __hc_exp_half(x); } + _Float16 exp(_Float16 x) { return __hc_exp_half(x); } HCC_MATH_LIB_FN float exp(float x) { return precise_math::expf(x); } @@ -1020,7 +1028,7 @@ namespace Kalmar float exp2f(float x) { return __hc_exp2(x); } HCC_MATH_LIB_FN - __fp16 exp2(__fp16 x) { return __hc_exp2_half(x); } + _Float16 exp2(_Float16 x) { return __hc_exp2_half(x); } HCC_MATH_LIB_FN float exp2(float x) { return precise_math::exp2f(x); } @@ -1032,7 +1040,7 @@ namespace Kalmar float exp10f(float x) { return __hc_exp10(x); } HCC_MATH_LIB_FN - __fp16 exp10(__fp16 x) { return __hc_exp10_half(x); } + _Float16 exp10(_Float16 x) { return __hc_exp10_half(x); } HCC_MATH_LIB_FN float exp10(float x) { return precise_math::exp10f(x); } @@ -1044,7 +1052,7 @@ namespace Kalmar float expm1f(float x) { return __hc_expm1(x); } HCC_MATH_LIB_FN - __fp16 expm1(__fp16 x) { return __hc_expm1_half(x); } + 
_Float16 expm1(_Float16 x) { return __hc_expm1_half(x); } HCC_MATH_LIB_FN float expm1(float x) { return precise_math::expm1f(x); } @@ -1056,7 +1064,7 @@ namespace Kalmar float fabsf(float x) { return __hc_fabs(x); } HCC_MATH_LIB_FN - __fp16 fabs(__fp16 x) { return __hc_fabs_half(x); } + _Float16 fabs(_Float16 x) { return __hc_fabs_half(x); } HCC_MATH_LIB_FN float fabs(float x) { return precise_math::fabsf(x); } @@ -1068,7 +1076,7 @@ namespace Kalmar float fdimf(float x, float y) { return __hc_fdim(x, y); } HCC_MATH_LIB_FN - __fp16 fdim(__fp16 x, __fp16 y) { return __hc_fdim_half(x, y); } + _Float16 fdim(_Float16 x, _Float16 y) { return __hc_fdim_half(x, y); } HCC_MATH_LIB_FN float fdim(float x, float y) { return precise_math::fdimf(x, y); } @@ -1080,7 +1088,7 @@ namespace Kalmar float floorf(float x) { return __hc_floor(x); } HCC_MATH_LIB_FN - __fp16 floor(__fp16 x) { return __hc_floor_half(x); } + _Float16 floor(_Float16 x) { return __hc_floor_half(x); } HCC_MATH_LIB_FN float floor(float x) { return precise_math::floorf(x); } @@ -1092,7 +1100,7 @@ namespace Kalmar float fmaf(float x, float y, float z) { return __hc_fma(x, y, z); } HCC_MATH_LIB_FN - __fp16 fma(__fp16 x, __fp16 y, __fp16 z) + _Float16 fma(_Float16 x, _Float16 y, _Float16 z) { return __hc_fma_half(x, y, z); } @@ -1113,7 +1121,7 @@ namespace Kalmar float fmaxf(float x, float y) { return __hc_fmax(x, y); } HCC_MATH_LIB_FN - __fp16 fmax(__fp16 x, __fp16 y) { return __hc_fmax_half(x, y); } + _Float16 fmax(_Float16 x, _Float16 y) { return __hc_fmax_half(x, y); } HCC_MATH_LIB_FN float fmax(float x, float y) { return precise_math::fmaxf(x, y); } @@ -1125,7 +1133,7 @@ namespace Kalmar float fminf(float x, float y) { return __hc_fmin(x, y); } HCC_MATH_LIB_FN - __fp16 fmin(__fp16 x, __fp16 y) { return __hc_fmin_half(x, y); } + _Float16 fmin(_Float16 x, _Float16 y) { return __hc_fmin_half(x, y); } HCC_MATH_LIB_FN float fmin(float x, float y) { return precise_math::fminf(x, y); } @@ -1137,7 +1145,7 @@ namespace Kalmar float fmodf(float x, float y) { return __hc_fmod(x, y); } HCC_MATH_LIB_FN - __fp16 fmod(__fp16 x, __fp16 y) { return __hc_fmod_half(x, y); } + _Float16 fmod(_Float16 x, _Float16 y) { return __hc_fmod_half(x, y); } HCC_MATH_LIB_FN float fmod(float x, float y) { return precise_math::fmodf(x, y); } @@ -1146,7 +1154,7 @@ namespace Kalmar double fmod(double x, double y) { return __hc_fmod_double(x, y); } HCC_MATH_LIB_FN - int fpclassify(__fp16 x) { return __hc_fpclassify_half(x); } + int fpclassify(_Float16 x) { return __hc_fpclassify_half(x); } HCC_MATH_LIB_FN int fpclassify(float x) { return __hc_fpclassify(x); } @@ -1158,7 +1166,7 @@ namespace Kalmar float frexpf(float x, int *exp) { return __hc_frexp(x, exp); } HCC_MATH_LIB_FN - __fp16 frexp(__fp16 x, int* exp) { return __hc_frexp_half(x, exp); } + _Float16 frexp(_Float16 x, int* exp) { return __hc_frexp_half(x, exp); } HCC_MATH_LIB_FN float frexp(float x, int *exp) { return precise_math::frexpf(x, exp); } @@ -1170,7 +1178,7 @@ namespace Kalmar float hypotf(float x, float y) { return __hc_hypot(x, y); } HCC_MATH_LIB_FN - __fp16 hypot(__fp16 x, __fp16 y) { return __hc_hypot_half(x, y); } + _Float16 hypot(_Float16 x, _Float16 y) { return __hc_hypot_half(x, y); } HCC_MATH_LIB_FN float hypot(float x, float y) { return precise_math::hypotf(x, y); } @@ -1182,7 +1190,7 @@ namespace Kalmar int ilogbf(float x) { return __hc_ilogb(x); } HCC_MATH_LIB_FN - int ilogb(__fp16 x) { return __hc_ilogb_half(x); } + int ilogb(_Float16 x) { return __hc_ilogb_half(x); } HCC_MATH_LIB_FN int 
ilogb(float x) { return precise_math::ilogbf(x); } @@ -1191,7 +1199,7 @@ namespace Kalmar int ilogb(double x) { return __hc_ilogb_double(x); } HCC_MATH_LIB_FN - int isfinite(__fp16 x) { return __hc_isfinite_half(x); } + int isfinite(_Float16 x) { return __hc_isfinite_half(x); } HCC_MATH_LIB_FN int isfinite(float x) { return __hc_isfinite(x); } @@ -1200,7 +1208,7 @@ namespace Kalmar int isfinite(double x) { return __hc_isfinite_double(x); } HCC_MATH_LIB_FN - int isinf(__fp16 x) { return __hc_isinf_half(x); } + int isinf(_Float16 x) { return __hc_isinf_half(x); } HCC_MATH_LIB_FN int isinf(float x) { return __hc_isinf(x); } @@ -1209,7 +1217,7 @@ namespace Kalmar int isinf(double x) { return __hc_isinf_double(x); } HCC_MATH_LIB_FN - int isnan(__fp16 x) { return __hc_isnan_half(x); } + int isnan(_Float16 x) { return __hc_isnan_half(x); } HCC_MATH_LIB_FN int isnan(float x) { return __hc_isnan(x); } @@ -1218,7 +1226,7 @@ namespace Kalmar int isnan(double x) { return __hc_isnan_double(x); } HCC_MATH_LIB_FN - int isnormal(__fp16 x) { return __hc_isnormal_half(x); } + int isnormal(_Float16 x) { return __hc_isnormal_half(x); } HCC_MATH_LIB_FN int isnormal(float x) { return __hc_isnormal(x); } @@ -1230,7 +1238,10 @@ namespace Kalmar float ldexpf(float x, int exp) { return __hc_ldexp(x, exp); } HCC_MATH_LIB_FN - __fp16 ldexp(__fp16 x, std::int16_t e) { return __hc_ldexp_half(x, e); } + _Float16 ldexp(_Float16 x, std::int16_t e) + { + return __hc_ldexp_half(x, e); + } HCC_MATH_LIB_FN float ldexp(float x, int exp) { return precise_math::ldexpf(x, exp); } @@ -1242,7 +1253,7 @@ namespace Kalmar float lgammaf(float x) { return __hc_lgamma(x); } HCC_MATH_LIB_FN - __fp16 lgamma(__fp16 x) { return __hc_lgamma_half(x); } + _Float16 lgamma(_Float16 x) { return __hc_lgamma_half(x); } HCC_MATH_LIB_FN float lgamma(float x) { return precise_math::lgammaf(x); } @@ -1254,7 +1265,7 @@ namespace Kalmar float logf(float x) { return __hc_log(x); } HCC_MATH_LIB_FN - __fp16 log(__fp16 x) { return __hc_log_half(x); } + _Float16 log(_Float16 x) { return __hc_log_half(x); } HCC_MATH_LIB_FN float log(float x) { return precise_math::logf(x); } @@ -1266,7 +1277,7 @@ namespace Kalmar float log10f(float x) { return __hc_log10(x); } HCC_MATH_LIB_FN - __fp16 log10(__fp16 x) { return __hc_log10_half(x); } + _Float16 log10(_Float16 x) { return __hc_log10_half(x); } HCC_MATH_LIB_FN float log10(float x) { return precise_math::log10f(x); } @@ -1278,7 +1289,7 @@ namespace Kalmar float log2f(float x) { return __hc_log2(x); } HCC_MATH_LIB_FN - __fp16 log2(__fp16 x) { return __hc_log2_half(x); } + _Float16 log2(_Float16 x) { return __hc_log2_half(x); } HCC_MATH_LIB_FN float log2(float x) { return precise_math::log2f(x); } @@ -1290,7 +1301,7 @@ namespace Kalmar float log1pf(float x) { return __hc_log1p(x); } HCC_MATH_LIB_FN - __fp16 log1p(__fp16 x) { return __hc_log1p_half(x); } + _Float16 log1p(_Float16 x) { return __hc_log1p_half(x); } HCC_MATH_LIB_FN float log1p(float x) { return precise_math::log1pf(x); } @@ -1302,7 +1313,7 @@ namespace Kalmar float logbf(float x) { return __hc_logb(x); } HCC_MATH_LIB_FN - __fp16 logb(__fp16 x) { return __hc_logb_half(x); } + _Float16 logb(_Float16 x) { return __hc_logb_half(x); } HCC_MATH_LIB_FN float logb(float x) { return precise_math::logbf(x); } @@ -1314,7 +1325,7 @@ namespace Kalmar float modff(float x, float *iptr) { return __hc_modf(x, iptr); } HCC_MATH_LIB_FN - __fp16 modf(__fp16 x, __fp16* p) { return __hc_modf_half(x, p); } + _Float16 modf(_Float16 x, _Float16* p) { return __hc_modf_half(x, p); 
} HCC_MATH_LIB_FN float modf(float x, float* p) { return precise_math::modff(x, p); } @@ -1323,7 +1334,7 @@ namespace Kalmar double modf(double x, double* p) { return __hc_modf_double(x, p); } HCC_MATH_LIB_FN - __fp16 nanh(int x) { return __hc_nan_half(x); } + _Float16 nanh(int x) { return __hc_nan_half(x); } HCC_MATH_LIB_FN float nanf(int tagp) { return __hc_nan(tagp); } @@ -1338,7 +1349,7 @@ namespace Kalmar float nearbyintf(float x) { return __hc_nearbyint(x); } HCC_MATH_LIB_FN - __fp16 nearbyint(__fp16 x) { return __hc_nearbyint_half(x); } + _Float16 nearbyint(_Float16 x) { return __hc_nearbyint_half(x); } HCC_MATH_LIB_FN float nearbyint(float x) { return precise_math::nearbyintf(x); } @@ -1350,7 +1361,7 @@ namespace Kalmar float nextafterf(float x, float y) { return __hc_nextafter(x, y); } HCC_MATH_LIB_FN - __fp16 nextafter(__fp16 x, __fp16 y) + _Float16 nextafter(_Float16 x, _Float16 y) { return __hc_nextafter_half(x, y); } @@ -1371,7 +1382,7 @@ namespace Kalmar float powf(float x, float y) { return __hc_pow(x, y); } HCC_MATH_LIB_FN - __fp16 pow(__fp16 x, __fp16 y) { return __hc_pow_half(x, y); } + _Float16 pow(_Float16 x, _Float16 y) { return __hc_pow_half(x, y); } HCC_MATH_LIB_FN float pow(float x, float y) { return precise_math::powf(x, y); } @@ -1383,7 +1394,7 @@ namespace Kalmar float rcbrtf(float x) { return __hc_rcbrt(x); } HCC_MATH_LIB_FN - __fp16 rcbrt(__fp16 x) { return __hc_rcbrt_half(x); } + _Float16 rcbrt(_Float16 x) { return __hc_rcbrt_half(x); } HCC_MATH_LIB_FN float rcbrt(float x) { return precise_math::rcbrtf(x); } @@ -1395,7 +1406,7 @@ namespace Kalmar float remainderf(float x, float y) { return __hc_remainder(x, y); } HCC_MATH_LIB_FN - __fp16 remainder(__fp16 x, __fp16 y) + _Float16 remainder(_Float16 x, _Float16 y) { return __hc_remainder_half(x, y); } @@ -1419,7 +1430,7 @@ namespace Kalmar } HCC_MATH_LIB_FN - __fp16 remquo(__fp16 x, __fp16 y, int* q) + _Float16 remquo(_Float16 x, _Float16 y, int* q) { return __hc_remquo_half(x, y, q); } @@ -1440,7 +1451,7 @@ namespace Kalmar float roundf(float x) { return __hc_round(x); } HCC_MATH_LIB_FN - __fp16 round(__fp16 x) { return __hc_round_half(x); } + _Float16 round(_Float16 x) { return __hc_round_half(x); } HCC_MATH_LIB_FN float round(float x) { return precise_math::roundf(x); } @@ -1452,7 +1463,7 @@ namespace Kalmar float rsqrtf(float x) { return __hc_rsqrt(x); } HCC_MATH_LIB_FN - __fp16 rsqrt(__fp16 x) { return __hc_rsqrt_half(x); } + _Float16 rsqrt(_Float16 x) { return __hc_rsqrt_half(x); } HCC_MATH_LIB_FN float rsqrt(float x) { return precise_math::rsqrtf(x); } @@ -1464,7 +1475,7 @@ namespace Kalmar float sinpif(float x) { return __hc_sinpi(x); } HCC_MATH_LIB_FN - __fp16 sinpi(__fp16 x) { return __hc_sinpi_half(x); } + _Float16 sinpi(_Float16 x) { return __hc_sinpi_half(x); } HCC_MATH_LIB_FN float sinpi(float x) { return precise_math::sinpif(x); } @@ -1476,7 +1487,7 @@ namespace Kalmar float scalbf(float x, float exp) { return __hc_scalb(x, exp); } HCC_MATH_LIB_FN - __fp16 scalb(__fp16 x, __fp16 y) { return __hc_scalb_half(x, y); } + _Float16 scalb(_Float16 x, _Float16 y) { return __hc_scalb_half(x, y); } HCC_MATH_LIB_FN float scalb(float x, float exp) { return precise_math::scalbf(x, exp); } @@ -1488,7 +1499,7 @@ namespace Kalmar float scalbnf(float x, int exp) { return __hc_scalbn(x, exp); } HCC_MATH_LIB_FN - __fp16 scalbn(__fp16 x, int e) { return __hc_scalbn_half(x, e); } + _Float16 scalbn(_Float16 x, int e) { return __hc_scalbn_half(x, e); } HCC_MATH_LIB_FN float scalbn(float x, int exp) { return 
precise_math::scalbnf(x, exp); } @@ -1500,7 +1511,7 @@ namespace Kalmar int signbitf(float x) { return __hc_signbit(x); } HCC_MATH_LIB_FN - int signbit(__fp16 x) { return __hc_signbit_half(x); } + int signbit(_Float16 x) { return __hc_signbit_half(x); } HCC_MATH_LIB_FN int signbit(float x) { return precise_math::signbitf(x); } @@ -1512,7 +1523,7 @@ namespace Kalmar float sinf(float x) { return __hc_sin(x); } HCC_MATH_LIB_FN - __fp16 sin(__fp16 x) { return __hc_sin_half(x); } + _Float16 sin(_Float16 x) { return __hc_sin_half(x); } HCC_MATH_LIB_FN float sin(float x) { return precise_math::sinf(x); } @@ -1524,7 +1535,7 @@ namespace Kalmar void sincosf(float x, float *s, float *c) { *s = __hc_sincos(x, c); } HCC_MATH_LIB_FN - void sincos(__fp16 x, __fp16* s, __fp16* c) + void sincos(_Float16 x, _Float16* s, _Float16* c) { *s = __hc_sincos_half(x, c); } @@ -1545,7 +1556,7 @@ namespace Kalmar float sinhf(float x) { return __hc_sinh(x); } HCC_MATH_LIB_FN - __fp16 sinh(__fp16 x) { return __hc_sinh_half(x); } + _Float16 sinh(_Float16 x) { return __hc_sinh_half(x); } HCC_MATH_LIB_FN float sinh(float x) { return precise_math::sinhf(x); } @@ -1557,7 +1568,7 @@ namespace Kalmar float sqrtf(float x) { return __hc_sqrt(x); } HCC_MATH_LIB_FN - __fp16 sqrt(__fp16 x) { return __hc_sqrt_half(x); } + _Float16 sqrt(_Float16 x) { return __hc_sqrt_half(x); } HCC_MATH_LIB_FN float sqrt(float x) { return precise_math::sqrtf(x); } @@ -1569,7 +1580,7 @@ namespace Kalmar float tgammaf(float x) { return __hc_tgamma(x); } HCC_MATH_LIB_FN - __fp16 tgamma(__fp16 x) { return __hc_tgamma_half(x); } + _Float16 tgamma(_Float16 x) { return __hc_tgamma_half(x); } HCC_MATH_LIB_FN float tgamma(float x) { return precise_math::tgammaf(x); } @@ -1581,7 +1592,7 @@ namespace Kalmar float tanf(float x) { return __hc_tan(x); } HCC_MATH_LIB_FN - __fp16 tan(__fp16 x) { return __hc_tan_half(x); } + _Float16 tan(_Float16 x) { return __hc_tan_half(x); } HCC_MATH_LIB_FN float tan(float x) { return precise_math::tanf(x); } @@ -1593,7 +1604,7 @@ namespace Kalmar float tanhf(float x) { return __hc_tanh(x); } HCC_MATH_LIB_FN - __fp16 tanh(__fp16 x) { return __hc_tanh_half(x); } + _Float16 tanh(_Float16 x) { return __hc_tanh_half(x); } HCC_MATH_LIB_FN float tanh(float x) { return precise_math::tanhf(x); } @@ -1605,7 +1616,7 @@ namespace Kalmar float tanpif(float x) { return __hc_tanpi(x); } HCC_MATH_LIB_FN - __fp16 tanpi(__fp16 x) { return __hc_tanpi_half(x); } + _Float16 tanpi(_Float16 x) { return __hc_tanpi_half(x); } HCC_MATH_LIB_FN float tanpi(float x) { return precise_math::tanpif(x); } @@ -1617,7 +1628,7 @@ namespace Kalmar float truncf(float x) { return __hc_trunc(x); } HCC_MATH_LIB_FN - __fp16 trunc(__fp16 x) { return __hc_trunc_half(x); } + _Float16 trunc(_Float16 x) { return __hc_trunc_half(x); } HCC_MATH_LIB_FN float trunc(float x) { return precise_math::truncf(x); } @@ -1625,4 +1636,4 @@ namespace Kalmar HCC_MATH_LIB_FN double trunc(double x) { return __hc_trunc_double(x); } } // namespace precise_math -} // namespace Kalmar +} // namespace detail \ No newline at end of file diff --git a/include/kalmar_runtime.h b/include/kalmar_runtime.h index 193d3eba456..9393d84cc80 100644 --- a/include/kalmar_runtime.h +++ b/include/kalmar_runtime.h @@ -12,7 +12,7 @@ class completion_future; typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t; -namespace Kalmar { +namespace detail { namespace enums { /// access_type is used for accelerator that supports unified memory @@ -73,7 +73,7 @@ enum hcCommandKind { // Commands sent to 
copy queues: -static inline bool isCopyCommand(hcCommandKind k) +static inline bool isCopyCommand(hcCommandKind k) { switch (k) { case hcMemcpyHostToHost: @@ -107,34 +107,34 @@ enum hcAgentProfile { }; } // namespace enums -} // namespace Kalmar +} // namespace detail + - /** \cond HIDDEN_SYMBOLS */ -namespace Kalmar { +namespace detail { -using namespace Kalmar::enums; +using namespace enums; /// forward declaration -class KalmarDevice; -class KalmarQueue; +class HCCDevice; +class HCCQueue; struct rw_info; -/// KalmarAsyncOp +/// HCCAsyncOp /// -/// This is an abstraction of all asynchronous operations within Kalmar -class KalmarAsyncOp { +/// This is an abstraction of all asynchronous operations within detail +class HCCAsyncOp { public: - KalmarAsyncOp(KalmarQueue *xqueue, hcCommandKind xCommandKind) : queue(xqueue), commandKind(xCommandKind), seqNum(0) {} + HCCAsyncOp(HCCQueue *xqueue, hcCommandKind xCommandKind) : queue(xqueue), commandKind(xCommandKind), seqNum(0) {} - virtual ~KalmarAsyncOp() {} + virtual ~HCCAsyncOp() {} virtual std::shared_future* getFuture() { return nullptr; } virtual void* getNativeHandle() { return nullptr;} /** * Get the timestamp when the asynchronous operation begins. * - * @return An implementaion-defined timestamp. + * @return An implementation-defined timestamp. */ virtual uint64_t getBeginTimestamp() { return 0L; } @@ -164,7 +164,7 @@ class KalmarAsyncOp { * * @param mode[in] wait mode, must be one of the value in hcWaitMode enum. */ - virtual void setWaitMode(hcWaitMode mode) {} + virtual void setWaitMode(hcWaitMode mode) = 0; void setSeqNumFromQueue(); uint64_t getSeqNum () const { return seqNum;}; @@ -172,10 +172,10 @@ class KalmarAsyncOp { hcCommandKind getCommandKind() const { return commandKind; }; void setCommandKind(hcCommandKind xCommandKind) { commandKind = xCommandKind; }; - KalmarQueue *getQueue() const { return queue; }; + HCCQueue *getQueue() const { return queue; }; private: - KalmarQueue *queue; + HCCQueue *queue; // Kind of this command - copy, kernel, barrier, etc: hcCommandKind commandKind; @@ -186,20 +186,20 @@ class KalmarAsyncOp { }; -/// KalmarQueue +/// HCCQueue /// This is the implementation of accelerator_view -/// KalamrQueue is responsible for data operations and launch kernel -class KalmarQueue +/// HCCQueue is responsible for data operations and launch kernel +class HCCQueue { public: - KalmarQueue(KalmarDevice* pDev, queuing_mode mode = queuing_mode_automatic, execute_order order = execute_in_order) + HCCQueue(HCCDevice* pDev, queuing_mode mode = queuing_mode_automatic, execute_order order = execute_in_order) : pDev(pDev), mode(mode), order(order), opSeqNums(0) {} - virtual ~KalmarQueue() {} + virtual ~HCCQueue() {} virtual void flush() {} - virtual void wait(hcWaitMode mode = hcWaitModeBlocked) {} + virtual void wait(hcWaitMode mode = hcWaitModeBlocked) = 0; // sync kernel launch with dynamic group memory virtual @@ -212,7 +212,7 @@ class KalmarQueue // async kernel launch with dynamic group memory virtual - std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync( + std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync( void* kernel, std::size_t dim_ext, const std::size_t* ext, @@ -229,7 +229,7 @@ class KalmarQueue // async kernel launch virtual - std::shared_ptr LaunchKernelAsync( + std::shared_ptr LaunchKernelAsync( void* kernel, std::size_t dim_ext, const std::size_t* ext, @@ -238,7 +238,7 @@ class KalmarQueue /// read data from device to host virtual void read(void* device, void* dst, size_t count, size_t offset) = 
0; - /// wrtie data from host to device + /// write data from host to device virtual void write(void* device, const void* src, size_t count, size_t offset, bool blocking) = 0; /// copy data between two device pointers @@ -257,7 +257,7 @@ class KalmarQueue virtual uint32_t GetGroupSegmentSize(void *kernel) { return 0; } - KalmarDevice* getDev() const { return pDev; } + HCCDevice* getDev() const { return pDev; } queuing_mode get_mode() const { return mode; } void set_mode(queuing_mode mod) { mode = mod; } @@ -279,7 +279,7 @@ class KalmarQueue virtual void* getHSAAMRegion() { return nullptr; } virtual void* getHSAAMHostRegion() { return nullptr; } - + virtual void* getHSACoherentAMHostRegion() { return nullptr; } /// get kernarg region handle @@ -289,19 +289,19 @@ class KalmarQueue virtual bool hasHSAInterOp() { return false; } /// enqueue marker - virtual std::shared_ptr EnqueueMarker(memory_scope) { return nullptr; } + virtual std::shared_ptr EnqueueMarker(memory_scope) { return nullptr; } /// enqueue marker with prior dependency - virtual std::shared_ptr EnqueueMarkerWithDependency(int count, std::shared_ptr *depOps, memory_scope scope) { return nullptr; } + virtual std::shared_ptr EnqueueMarkerWithDependency(int count, std::shared_ptr *depOps, memory_scope scope) { return nullptr; } - virtual std::shared_ptr detectStreamDeps(hcCommandKind commandKind, KalmarAsyncOp *newCopyOp) { return nullptr; }; + virtual std::shared_ptr detectStreamDeps(hcCommandKind commandKind, HCCAsyncOp *newCopyOp) { return nullptr; }; /// copy src to dst asynchronously - virtual std::shared_ptr EnqueueAsyncCopy(const void* src, void* dst, size_t size_bytes) { return nullptr; } - virtual std::shared_ptr EnqueueAsyncCopyExt(const void* src, void* dst, size_t size_bytes, - hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, - const Kalmar::KalmarDevice *copyDevice) { return nullptr; }; + virtual std::shared_ptr EnqueueAsyncCopy(const void* src, void* dst, size_t size_bytes) { return nullptr; } + virtual std::shared_ptr EnqueueAsyncCopyExt(const void* src, void* dst, size_t size_bytes, + hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, + const detail::HCCDevice *copyDevice) { return nullptr; }; // Copy src to dst synchronously virtual void copy(const void *src, void *dst, size_t size_bytes) { } @@ -309,8 +309,8 @@ class KalmarQueue /// copy src to dst, with caller providing extended information about the pointers. //// TODO - remove me, this form is deprecated. 
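The interface above keeps synchronous launches (LaunchKernel, LaunchKernelWithDynamicGroupMemory) separate from the asynchronous variants, which hand back a std::shared_ptr<HCCAsyncOp> that both the queue and the caller may hold. A minimal, self-contained sketch of that shared-ownership pattern, using hypothetical stand-in types rather than the runtime's real classes:

#include <future>
#include <memory>
#include <vector>

// Hypothetical, simplified stand-ins for HCCAsyncOp / HCCQueue, only to
// illustrate the ownership pattern: the queue retains shared ownership of
// every outstanding op, and wait() drains them in submission order.
struct AsyncOp {
    std::shared_future<void> done;
};

struct Queue {
    std::vector<std::shared_ptr<AsyncOp>> ops;

    std::shared_ptr<AsyncOp> launch_async() {
        auto op = std::make_shared<AsyncOp>();
        op->done = std::async(std::launch::async, []{ /* kernel body */ }).share();
        ops.push_back(op);            // queue keeps the op alive
        return op;                    // caller may also keep it
    }

    void wait() {                     // analogous to HCCQueue::wait()
        for (auto& op : ops) op->done.wait();
        ops.clear();
    }
};

int main() {
    Queue q;
    auto op = q.launch_async();       // async launch returns a shared op
    q.wait();                         // blocks until every enqueued op completes
}

The shared_ptr is the load-bearing detail: the queue's bookkeeping and a caller-side completion_future can each outlive the other without either dangling.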
virtual void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, bool forceUnpinnedCopy) { }; - virtual void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, - const Kalmar::KalmarDevice *copyDev, bool forceUnpinnedCopy) { }; + virtual void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, + const detail::HCCDevice *copyDev, bool forceUnpinnedCopy) { }; /// cleanup internal resource /// this function is usually called by dtor of the implementation classes @@ -318,10 +318,10 @@ class KalmarQueue /// resource clean up sequence virtual void dispose() {} - virtual void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, + virtual void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, const void * args, size_t argsize, hc::completion_future *cf, const char *kernel_name) { }; - + /// set CU affinity of this queue. /// the setting is permanent until the queue is destroyed or another setting /// is called. @@ -331,17 +331,17 @@ class KalmarQueue uint64_t assign_op_seq_num() { return ++opSeqNums; }; private: - KalmarDevice* pDev; + HCCDevice* pDev; queuing_mode mode; execute_order order; uint64_t opSeqNums; // last seqnum assigned to an op in this queue }; -/// KalmarDevice +/// HCCDevice /// This is the base implementation of accelerator -/// KalmarDevice is responsible for create/release memory on device -class KalmarDevice +/// HCCDevice is responsible for create/release memory on device +class HCCDevice { private: access_type cpu_type; @@ -349,13 +349,13 @@ class KalmarDevice // Set true if the device has large bar #if !TLS_QUEUE - /// default KalmarQueue - std::shared_ptr def; - /// make sure KalamrQueue is created only once + /// default HCCQueue + std::shared_ptr def; + /// make sure HCCQueue is created only once std::once_flag flag; #else - /// default KalmarQueue for each calling thread - std::map< std::thread::id, std::shared_ptr > tlsDefaultQueueMap; + /// default HCCQueue for each calling thread + std::map< std::thread::id, std::shared_ptr > tlsDefaultQueueMap; /// mutex for tlsDefaultQueueMap std::mutex tlsDefaultQueueMap_mutex; #endif @@ -366,7 +366,7 @@ class KalmarDevice bool cpu_accessible_am; - KalmarDevice(access_type type = access_type_read_write) + HCCDevice(access_type type = access_type_read_write) : cpu_type(type), #if !TLS_QUEUE def(), flag() @@ -400,11 +400,11 @@ class KalmarDevice virtual void BuildProgram(void* size, void* source) {} /// create kernel - virtual + virtual void* CreateKernel( const char* fun, - KalmarQueue *queue, - const void* callable = nullptr, + HCCQueue *queue, + std::unique_ptr callable, std::size_t callable_size = 0u) = 0; /// check if a given kernel is compatible with the device @@ -413,13 +413,13 @@ class KalmarDevice /// check the dimension information is correct virtual bool check(size_t* size, size_t dim_ext) { return true; } - /// create KalmarQueue from current device - virtual std::shared_ptr createQueue(execute_order order = execute_in_order) = 0; - virtual ~KalmarDevice() {} + /// create HCCQueue from current device + virtual std::shared_ptr createQueue(execute_order order = execute_in_order) = 0; + virtual ~HCCDevice() {} - std::shared_ptr get_default_queue() { + std::shared_ptr get_default_queue() { #if !TLS_QUEUE - 
std::call_once(flag, [&]() { + std::call_once(flag, [&]() { def = createQueue(); }); return def; @@ -429,7 +429,7 @@ class KalmarDevice if (tlsDefaultQueueMap.find(tid) == tlsDefaultQueueMap.end()) { tlsDefaultQueueMap[tid] = createQueue(); } - std::shared_ptr result = tlsDefaultQueueMap[tid]; + std::shared_ptr result = tlsDefaultQueueMap[tid]; tlsDefaultQueueMap_mutex.unlock(); return result; #endif @@ -439,7 +439,7 @@ class KalmarDevice virtual size_t GetMaxTileStaticSize() { return 0; } /// get all queues associated with this device - virtual std::vector< std::shared_ptr > get_all_queues() { return std::vector< std::shared_ptr >(); } + virtual std::vector< std::shared_ptr > get_all_queues() { return std::vector< std::shared_ptr >(); } virtual void memcpySymbol(const char* symbolName, void* hostptr, size_t count, size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {} @@ -454,7 +454,7 @@ class KalmarDevice virtual hcAgentProfile getProfile() { return hcAgentProfileNone; } /// check if @p other can access to this device's device memory, return true if so, false otherwise - virtual bool is_peer(const KalmarDevice* other) {return false;} + virtual bool is_peer(const HCCDevice* other) {return false;} /// get device's compute unit count virtual unsigned int get_compute_unit_count() {return 0;} @@ -465,11 +465,11 @@ class KalmarDevice }; -class CPUQueue final : public KalmarQueue +class CPUQueue final : public HCCQueue { public: - CPUQueue(KalmarDevice* pDev) : KalmarQueue(pDev) {} + CPUQueue(HCCDevice* pDev) : HCCQueue(pDev) {} void read(void* device, void* dst, size_t count, size_t offset) override { if (dst != device) @@ -486,10 +486,11 @@ class CPUQueue final : public KalmarQueue memmove((char*)dst + dst_offset, (char*)src + src_offset, count); } + [[noreturn]] void* CreateKernel( - const char*, KalmarQueue*, const void*, std::size_t) override + const char*, HCCQueue*, const void*, std::size_t) override { - return nullptr; + throw std::runtime_error{"Unsupported."}; } void LaunchKernel( void*, @@ -500,7 +501,7 @@ class CPUQueue final : public KalmarQueue throw std::runtime_error{"Unsupported."}; } [[noreturn]] - std::shared_ptr LaunchKernelAsync( + std::shared_ptr LaunchKernelAsync( void*, std::size_t, const std::size_t*, @@ -518,7 +519,7 @@ class CPUQueue final : public KalmarQueue throw std::runtime_error{"Unsupported."}; } [[noreturn]] - std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync( + std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync( void*, std::size_t, const std::size_t*, @@ -535,10 +536,12 @@ class CPUQueue final : public KalmarQueue void unmap(void* device, void* addr, size_t count, size_t offset, bool modify) override {} void Push(void *kernel, int idx, void* device, bool modify) override {} + + void wait(hcWaitMode = hcWaitModeBlocked) override {} }; /// cpu accelerator -class CPUDevice final : public KalmarDevice +class CPUDevice final : public HCCDevice { public: std::wstring get_path() const override { return L"cpu"; } @@ -550,29 +553,34 @@ class CPUDevice final : public KalmarDevice bool is_emulated() const override { return true; } uint32_t get_version() const override { return 0; } - std::shared_ptr createQueue(execute_order order = execute_in_order) override { return std::shared_ptr(new CPUQueue(this)); } + std::shared_ptr createQueue( + execute_order order = execute_in_order) override + { + return std::shared_ptr(new CPUQueue(this)); + } void* create(size_t count, struct rw_info* /* not used */ ) override { return kalmar_aligned_alloc(0x1000, 
count); } - void release(void* ptr, struct rw_info* /* nout used */) override { kalmar_aligned_free(ptr); } + void release(void* ptr, struct rw_info* /* not used */) override { kalmar_aligned_free(ptr); } + [[noreturn]] void* CreateKernel( const char*, - KalmarQueue*, - const void* = nullptr, + HCCQueue*, + std::unique_ptr, std::size_t = 0u) { - return nullptr; + throw std::runtime_error{"Unsupported."}; } }; -/// KalmarContext +/// HCCContext /// This is responsible for managing all devices /// User will need to add their customize devices -class KalmarContext +class HCCContext { private: //TODO: Think about a system which has multiple CPU socket, e.g. server. In this case, //We might be able to assume that only the first device is CPU, or we only mimic one cpu - //device when constructing KalmarContext. - KalmarDevice* get_default_dev() { + //device when constructing HCCContext. + HCCDevice* get_default_dev() { if (!def) { if (Devices.size() <= 1) { fprintf(stderr, "There is no device can be used to do the computation\n"); @@ -584,46 +592,47 @@ class KalmarContext } protected: /// default device - KalmarDevice* def; - std::vector Devices; - KalmarContext() : def(nullptr), Devices() { Devices.push_back(new CPUDevice); } + HCCDevice* def; + std::vector Devices; + HCCContext() : def(nullptr), Devices() { Devices.push_back(new CPUDevice); } - bool init_success = false; + bool init_success = false; public: - virtual ~KalmarContext() {} + virtual ~HCCContext() {} - std::vector getDevices() { return Devices; } + std::vector getDevices() { return Devices; } /// set default device by path - bool set_default(const std::wstring& path) { - auto result = std::find_if(std::begin(Devices), std::end(Devices), - [&] (const KalmarDevice* pDev) - { return pDev->get_path() == path; }); - if (result == std::end(Devices)) - return false; - else { - def = *result; + bool set_default(const std::wstring& path) + { + for (auto&& Device : Devices) { + if (Device->get_path() != path) continue; + + def = Device; + return true; } + + return false; } /// get auto selection queue - std::shared_ptr auto_select() { + std::shared_ptr auto_select() { return get_default_dev()->get_default_queue(); } /// get device from path - KalmarDevice* getDevice(std::wstring path = L"") { - if (path == L"default" || path == L"") - return get_default_dev(); - auto result = std::find_if(std::begin(Devices), std::end(Devices), - [&] (const KalmarDevice* dev) - { return dev->get_path() == path; }); - if (result != std::end(Devices)) - return *result; - else - return get_default_dev(); + HCCDevice* getDevice(std::wstring path = L"") { + if (path == L"default" || path == L"") return get_default_dev(); + + for (auto&& Device : Devices) { + if (Device->get_path() != path) continue; + + return Device; + } + + return get_default_dev(); } /// get system ticks @@ -642,24 +651,27 @@ class KalmarContext virtual void* getPrintfBufferPointerVA() { return nullptr; }; }; -KalmarContext *getContext(); +HCCContext *getContext(); namespace CLAMP { void* CreateKernel( - const char*, KalmarQueue*, const void* = nullptr, std::size_t = 0u); + const char*, + HCCQueue*, + std::unique_ptr, + std::size_t = 0u); } // namespace CLAMP -static inline const std::shared_ptr get_cpu_queue() { +static inline const std::shared_ptr get_cpu_queue() { static auto cpu_queue = getContext()->getDevice(L"cpu")->get_default_queue(); return cpu_queue; } -static inline bool is_cpu_queue(const std::shared_ptr& Queue) { +static inline bool is_cpu_queue(const std::shared_ptr& Queue) { 
return Queue->getDev()->get_path() == L"cpu"; } -static inline void copy_helper(std::shared_ptr& srcQueue, void* src, - std::shared_ptr& dstQueue, void* dst, +static inline void copy_helper(std::shared_ptr& srcQueue, void* src, + std::shared_ptr& dstQueue, void* dst, size_t cnt, bool block, size_t src_offset = 0, size_t dst_offset = 0) { /// In shared memory architecture, src and dst may points to the same buffer @@ -668,8 +680,8 @@ static inline void copy_helper(std::shared_ptr& srcQueue, void* src return ; /// If device pointer comes from cpu, let the device queue to handle the copy /// For example, if src is on cpu and dst is on device, - /// in OpenCL, clEnqueueWrtieBuffer to write data from src to device - + /// in OpenCL, clEnqueueWriteBuffer to write data from src to device + if (is_cpu_queue(dstQueue)) srcQueue->read(src, (char*)dst + dst_offset, cnt, src_offset); else @@ -681,7 +693,7 @@ static inline void copy_helper(std::shared_ptr& srcQueue, void* src /// Used to avoid unnecessary copy when array_view is used enum states { - /// exclusive owned data, safe to read and wrtie + /// exclusive owned data, safe to read and write modified, /// shared on multiple devices, the content are all the same, cannot modify shared, @@ -718,17 +730,17 @@ struct rw_info /// unified memory and access_type is not none void *data; const size_t count; - /// This pointer pointes to the latest queue that manages the data - std::shared_ptr curr; - /// This pointer pointes to the queue that used to construct this rw_info + /// This pointer points to the latest queue that manages the data + std::shared_ptr curr; + /// This pointer points to the queue that used to construct this rw_info /// This will be null if the constructor is constructed by size only - std::shared_ptr master; + std::shared_ptr master; /// staged queue - std::shared_ptr stage; + std::shared_ptr stage; /// This is used as cache for device buffer /// When this rw_info is going to be used(computed) on device, /// rw_info will allocate buffer for the device - std::map devs; + std::map devs; access_type mode; /// This will be set if this rw_info is constructed with host pointer /// because rw_info cannot free host pointer @@ -740,7 +752,7 @@ struct rw_info bool toReleaseDevPointer; - /// consruct array_view + /// construct array_view /// According to standard, array_view will be constructed by size, or size with /// host pointer. /// If it is constructed with host pointer, treat it is constructed on cpu @@ -758,10 +770,10 @@ struct rw_info /// construct array /// According to AMP standard, array should be constructed with /// 1. one accelerator_view - /// 2. one acceleratir_view, with another staged one + /// 2. one accelerator_view, with another staged one /// In this case, master should be cpu device /// If it is not, ignore the stage one, fallback to case 1. 
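copy_helper above routes a transfer through whichever queue actually owns a device pointer: a read when the destination side is host-resident, a write otherwise, with an early out when shared memory makes source and destination the same buffer. A freestanding sketch of that routing decision (hypothetical MiniQueue type; the real code operates on HCCQueue shared_ptrs):

#include <cstddef>
#include <cstring>

// Hypothetical minimal queue with the same read/write split as HCCQueue.
struct MiniQueue {
    bool is_cpu;
    // read: device -> host; write: host -> device (plain memcpy here).
    void read (void* dev, void* dst, std::size_t n) { std::memcpy(dst, dev, n); }
    void write(void* dev, const void* src, std::size_t n) { std::memcpy(dev, src, n); }
};

// Mirrors the routing in copy_helper: the CPU-resident side determines
// which queue performs the transfer.
void copy_between(MiniQueue& srcQ, void* src, MiniQueue& dstQ, void* dst,
                  std::size_t count) {
    if (src == dst) return;          // same buffer on shared-memory systems
    if (dstQ.is_cpu)
        srcQ.read(src, dst, count);  // device -> host via the source queue
    else
        dstQ.write(dst, src, count); // host -> device via the destination queue
}

int main() {
    char a[8] = "payload", b[8] = {};
    MiniQueue cpu{true}, gpu{false};
    copy_between(gpu, a, cpu, b, sizeof a);   // routed through srcQ.read
}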
- rw_info(const std::shared_ptr& Queue, const std::shared_ptr& Stage, + rw_info(const std::shared_ptr& Queue, const std::shared_ptr& Stage, const size_t count, access_type mode_) : data(nullptr), count(count), curr(Queue), master(Queue), stage(nullptr), devs(), mode(mode_), HostPtr(false), toReleaseDevPointer(true) { if (mode == access_type_auto) @@ -785,7 +797,7 @@ struct rw_info /// toReleaseDevPointer is now set as false, so when this instance goes /// into destruction, device memory associated with it will NOT be /// released - rw_info(const std::shared_ptr& Queue, const std::shared_ptr& Stage, + rw_info(const std::shared_ptr& Queue, const std::shared_ptr& Stage, const size_t count, void* device_pointer, access_type mode_) : data(nullptr), count(count), curr(Queue), master(Queue), stage(nullptr), devs(), mode(mode_), HostPtr(false), toReleaseDevPointer(false) { @@ -809,7 +821,7 @@ struct rw_info return devs[curr->getDev()].data; } - void construct(std::shared_ptr pQueue) { + void construct(std::shared_ptr pQueue) { curr = pQueue; devs[pQueue->getDev()] = {pQueue->getDev()->create(count, this), invalid}; if (is_cpu_queue(pQueue)) @@ -825,7 +837,7 @@ struct rw_info /// shared, it implies that the data on cpu is the same on device where /// curr located, use data on cpu to perform the later operation /// For example, if data on device a is going to be copied to device b - /// and the data on device a and cpu is the same, it is okay to copy data + /// and the data on device a and cpu is the same, it is okay to copy data /// from cpu to device b void try_switch_to_cpu() { if (is_cpu_queue(curr)) @@ -836,12 +848,12 @@ struct rw_info curr = cpu_queue; } - /// synchronize data to device pQueue belongs to by using pQuquq + /// synchronize data to device pQueue belongs to by using pQueue /// @pQueue: queue that used to synchronize /// @modify: the data will be modified or not - /// @blcok: this call will be blocking or not + /// @block: this call will be blocking or not /// none blocking occurs in serialization stage - void sync(std::shared_ptr pQueue, bool modify, bool block = true) { + void sync(std::shared_ptr pQueue, bool modify, bool block = true) { if (!curr) { /// This can only happen if array_view is constructed with size and /// is not accessed before @@ -857,7 +869,7 @@ struct rw_info if (curr == pQueue) return; - /// If both queues are from the same device, upadte state only + /// If both queues are from the same device, update state only if (curr->getDev() == pQueue->getDev()) { // curr->wait(); curr = pQueue; @@ -994,7 +1006,7 @@ struct rw_info cpu_dev->release(devs[cpu_dev].data, this); devs.erase(cpu_dev); } - KalmarDevice* pDev; + HCCDevice* pDev; dev_info info; for (const auto it : devs) { std::tie(pDev, info) = it; @@ -1008,8 +1020,8 @@ struct rw_info //--- Implementation: // -inline void KalmarAsyncOp::setSeqNumFromQueue() { seqNum = queue->assign_op_seq_num(); }; +inline void HCCAsyncOp::setSeqNumFromQueue() { seqNum = queue->assign_op_seq_num(); }; -} // namespace Kalmar +} // namespace detail /** \endcond */ diff --git a/include/kalmar_serialize.h b/include/kalmar_serialize.h index f5e8a40c248..30f5e519853 100644 --- a/include/kalmar_serialize.h +++ b/include/kalmar_serialize.h @@ -5,14 +5,14 @@ #include "kalmar_exception.h" /** \cond HIDDEN_SYMBOLS */ -namespace Kalmar +namespace detail { /// traverse all the buffers that are going to be used in kernel class FunctorBufferWalker { public: - virtual void Append(size_t sz, const void* s) {} - virtual void AppendPtr(size_t sz, 
const void* s) {} + virtual void Append(size_t, const void*) {} + virtual void AppendPtr(size_t, const void*) {} virtual void visit_buffer(struct rw_info* rw, bool modify, bool isArray) = 0; }; @@ -32,10 +32,10 @@ class Serialize { /// before/after kernel launches in cpu path class CPUVisitor : public FunctorBufferWalker { - std::shared_ptr pQueue; + std::shared_ptr pQueue; std::set bufs; public: - CPUVisitor(std::shared_ptr pQueue) : pQueue(pQueue) {} + CPUVisitor(std::shared_ptr pQueue) : pQueue(pQueue) {} void visit_buffer(struct rw_info* rw, bool modify, bool isArray) override { if (isArray) { auto curr = pQueue->getDev()->get_path(); @@ -59,15 +59,15 @@ class CPUVisitor : public FunctorBufferWalker /// Append kernel argument to kernel class BufferArgumentsAppender : public FunctorBufferWalker { - std::shared_ptr pQueue; + std::shared_ptr pQueue; void* k_; int current_idx_; public: - BufferArgumentsAppender(std::shared_ptr pQueue, void* k) + BufferArgumentsAppender(std::shared_ptr pQueue, void* k) : pQueue(pQueue), k_(k), current_idx_(0) {} - void Append(size_t sz, const void *s) override { + void Append(size_t, const void*) override { } - void AppendPtr(size_t sz, const void *s) override { + void AppendPtr(size_t, const void*) override { } void visit_buffer(rw_info* rw, bool modify, bool isArray) override { if (isArray) { @@ -89,13 +89,14 @@ class BufferArgumentsAppender : public FunctorBufferWalker /// and the view using which work is submitted to the accelerator, is chosen /// from the objects of type array that were captured in the kernel lambda. /// -/// Thise Searcher will visit all the array and find a view to launch kernel +/// This Searcher will visit all the array and find a view to launch kernel class QueueSearcher : public FunctorBufferWalker { - std::shared_ptr pQueue; + std::shared_ptr pQueue; public: QueueSearcher() = default; - void visit_buffer(struct rw_info* rw, bool modify, bool isArray) override { + void visit_buffer(struct rw_info* rw, bool, bool isArray) override + { if (isArray && !pQueue) { if (rw->master->getDev()->get_path() != L"cpu") pQueue = rw->master; @@ -103,8 +104,8 @@ class QueueSearcher : public FunctorBufferWalker pQueue = rw->stage; } } - std::shared_ptr get_que() const { return pQueue; } + std::shared_ptr get_que() const { return pQueue; } }; -} // namespace Kalmar +} // namespace detail /** \endcond */ diff --git a/lib/cpu/mcwamp_cpu.cpp b/lib/cpu/mcwamp_cpu.cpp index 946428a61b0..0edc5ba0d18 100644 --- a/lib/cpu/mcwamp_cpu.cpp +++ b/lib/cpu/mcwamp_cpu.cpp @@ -15,13 +15,13 @@ #include #include -namespace Kalmar { +namespace detail { -class CPUFallbackQueue final : public KalmarQueue +class CPUFallbackQueue final : public HCCQueue { public: - CPUFallbackQueue(KalmarDevice* pDev) : KalmarQueue(pDev) {} + CPUFallbackQueue(HCCDevice* pDev) : HCCQueue(pDev) {} void LaunchKernel( void*, std::size_t, const std::size_t*, const std::size_t*) override @@ -29,7 +29,7 @@ class CPUFallbackQueue final : public KalmarQueue throw std::runtime_error{"Unsupported."}; } [[noreturn]] - std::shared_ptr LaunchKernelAsync( + std::shared_ptr LaunchKernelAsync( void*, std::size_t, const std::size_t*, @@ -47,7 +47,7 @@ class CPUFallbackQueue final : public KalmarQueue throw std::runtime_error{"Unsupported."}; } [[noreturn]] - std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync( + std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync( void*, std::size_t, const std::size_t*, @@ -79,12 +79,14 @@ class CPUFallbackQueue final : public KalmarQueue void unmap(void* device, 
void* addr, size_t count, size_t offset, bool modify) override {} void Push(void *kernel, int idx, void* device, bool isConst) override {} + + void wait(hcWaitMode = hcWaitModeBlocked) override {} }; -class CPUFallbackDevice final : public KalmarDevice +class CPUFallbackDevice final : public HCCDevice { public: - CPUFallbackDevice() : KalmarDevice() {} + CPUFallbackDevice() : HCCDevice() {} std::wstring get_path() const override { return L"fallback"; } std::wstring get_description() const override { return L"CPU Fallback"; } @@ -98,23 +100,29 @@ class CPUFallbackDevice final : public KalmarDevice void* create(size_t count, struct rw_info* /* not used */) override { return kalmar_aligned_alloc(0x1000, count); } - void release(void *device, struct rw_info* /* not used */ ) override { + void release(void *device, struct rw_info* /* not used */ ) override { kalmar_aligned_free(device); } - std::shared_ptr createQueue(execute_order order = execute_in_order) override { - return std::shared_ptr(new CPUFallbackQueue(this)); + std::shared_ptr createQueue( + execute_order = execute_in_order) override + { + return std::shared_ptr(new CPUFallbackQueue(this)); } + [[noreturn]] void* CreateKernel( - const char*, KalmarQueue*, const void* = nullptr, std::size_t = 0u) + const char*, + HCCQueue*, + std::unique_ptr, + std::size_t = 0u) { - return nullptr; + throw std::runtime_error{"Unsupported."}; } }; template inline void deleter(T* ptr) { delete ptr; } -class CPUContext final : public KalmarContext +class CPUContext final : public HCCContext { public: CPUContext() { Devices.push_back(new CPUFallbackDevice); } @@ -124,8 +132,8 @@ class CPUContext final : public KalmarContext static CPUContext ctx; -} // namespace Kalmar +} // namespace detail extern "C" void *GetContextImpl() { - return &Kalmar::ctx; + return &detail::ctx; } diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index 71ab2060c2a..d13ab632077 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -4,7 +4,7 @@ // //===----------------------------------------------------------------------===// -// Kalmar Runtime implementation (HSA version) +// detail Runtime implementation (HSA version) #include "kalmar_runtime.h" #include "kalmar_aligned_alloc.h" @@ -160,7 +160,7 @@ double QUEUE_FLUSHING_FRAC; sstream << "\t" << *op << ";";\ }\ sstream << msg << "\n";\ - Kalmar::ctx.getHccProfileStream() << sstream.str();\ + detail::ctx.getHccProfileStream() << sstream.str();\ } @@ -189,11 +189,11 @@ thread_local ShortTid hcc_tlsShortTid; #define CASE_STRING(X) case X: case_string = #X ;break; -static const char* getHcCommandKindString(Kalmar::hcCommandKind k) { +static const char* getHcCommandKindString(detail::hcCommandKind k) { const char* case_string; switch(k) { - using namespace Kalmar; + using namespace detail; CASE_STRING(hcCommandInvalid); CASE_STRING(hcMemcpyHostToHost); CASE_STRING(hcMemcpyHostToDevice); @@ -505,7 +505,7 @@ PrintfPacket** printf_buffer_locked_va = nullptr; } // namespace hc -namespace Kalmar { +namespace detail { enum class HCCRuntimeStatus{ @@ -540,7 +540,7 @@ inline static void checkHCCRuntimeStatus(const HCCRuntimeStatus status, const un fprintf(stderr, "### HCC runtime error: %s at %s line:%d\n", getHCCRuntimeStatusMessage(status), __FILENAME__, line); std::string m("HCC Runtime Error - "); m += getHCCRuntimeStatusMessage(status); - throw Kalmar::runtime_exception(m.c_str(), 0); + throw detail::runtime_exception(m.c_str(), 0); //if (q != nullptr) // assert(HSA_STATUS_SUCCESS == hsa_queue_destroy(q)); 
//assert(HSA_STATUS_SUCCESS == hsa_shut_down()); @@ -548,17 +548,17 @@ inline static void checkHCCRuntimeStatus(const HCCRuntimeStatus status, const un } } -} // namespace Kalmar +} // namespace detail // forward declaration -namespace Kalmar { +namespace detail { class HSAQueue; class HSADevice; namespace CLAMP { - void LoadInMemoryProgram(KalmarQueue*); + void LoadInMemoryProgram(HCCQueue*); } // namespace CLAMP -} // namespace Kalmar +} // namespace detail /// /// kernel compilation / kernel launching @@ -570,7 +570,7 @@ class HSAExecutable { hsa_code_object_reader_t hsaCodeObjectReader; hsa_executable_t hsaExecutable; friend class HSAKernel; - friend class Kalmar::HSADevice; + friend class detail::HSADevice; public: HSAExecutable(hsa_executable_t _hsaExecutable, @@ -685,16 +685,16 @@ class HSAKernel { // Stores the device and queue for op coordinate: struct HSAOpCoord { - HSAOpCoord(Kalmar::HSAQueue *queue); + HSAOpCoord(detail::HSAQueue *queue); int _deviceId; uint64_t _queueId; }; // Base class for the other HSA ops: -class HSAOp : public Kalmar::KalmarAsyncOp { +class HSAOp : public detail::HCCAsyncOp { public: - HSAOp(Kalmar::KalmarQueue *queue, hc::hcCommandKind commandKind) ; + HSAOp(detail::HCCQueue *queue, hc::hcCommandKind commandKind) ; const HSAOpCoord opCoord() const { return _opCoord; }; int asyncOpsIndex() const { return _asyncOpsIndex; }; @@ -706,7 +706,7 @@ class HSAOp : public Kalmar::KalmarAsyncOp { virtual bool barrierNextSyncNeedsSysRelease() const { return 0; }; virtual bool barrierNextKernelNeedsSysAcquire() const { return 0; }; - Kalmar::HSAQueue *hsaQueue() const; + detail::HSAQueue *hsaQueue() const; bool isReady() override; protected: uint64_t apiStartTick; @@ -737,7 +737,7 @@ class HSACopy : public HSAOp { // keep a reference which prevents those ops from being deleted until this op is deleted. std::shared_ptr depAsyncOp; - const Kalmar::HSADevice* copyDevice; // Which device did the copy. + const detail::HSADevice* copyDevice; // Which device did the copy. // source pointer const void* src; @@ -752,15 +752,15 @@ class HSACopy : public HSAOp { public: std::shared_future* getFuture() override { return future; } - const Kalmar::HSADevice* getCopyDevice() const { return copyDevice; } ; // Which device did the copy. + const detail::HSADevice* getCopyDevice() const { return copyDevice; } ; // Which device did the copy. - void setWaitMode(Kalmar::hcWaitMode mode) override { + void setWaitMode(detail::hcWaitMode mode) override { switch (mode) { - case Kalmar::hcWaitModeBlocked: + case detail::hcWaitModeBlocked: waitMode = HSA_WAIT_STATE_BLOCKED; break; - case Kalmar::hcWaitModeActive: + case detail::hcWaitModeActive: waitMode = HSA_WAIT_STATE_ACTIVE; break; } @@ -769,7 +769,7 @@ class HSACopy : public HSAOp { std::string getCopyCommandString() { - using namespace Kalmar; + using namespace detail; std::string s; switch (getCommandKind()) { @@ -803,7 +803,7 @@ class HSACopy : public HSAOp { // Copy mode will be set later on. 
// HSA signals would be waited in HSA_WAIT_STATE_ACTIVE by default for HSACopy instances - HSACopy(Kalmar::KalmarQueue *queue, const void* src_, void* dst_, size_t sizeBytes_); + HSACopy(detail::HCCQueue *queue, const void* src_, void* dst_, size_t sizeBytes_); @@ -817,7 +817,7 @@ class HSACopy : public HSAOp { dispose(); } - hsa_status_t enqueueAsyncCopyCommand(const Kalmar::HSADevice *copyDevice, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo); + hsa_status_t enqueueAsyncCopyCommand(const detail::HSADevice *copyDevice, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo); // wait for the async copy to complete hsa_status_t waitComplete(); @@ -839,11 +839,11 @@ class HSACopy : public HSAOp { void syncCopy(); void syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, - const Kalmar::HSADevice *copyDevice, bool forceUnpinnedCopy); + const detail::HSADevice *copyDevice, bool forceUnpinnedCopy); private: - hsa_status_t hcc_memory_async_copy(Kalmar::hcCommandKind copyKind, const Kalmar::HSADevice *copyDevice, + hsa_status_t hcc_memory_async_copy(detail::hcCommandKind copyKind, const detail::HSADevice *copyDevice, const hc::AmPointerInfo &dstPtrInfo, const hc::AmPointerInfo &srcPtrInfo, size_t sizeBytes, int depSignalCnt, const hsa_signal_t *depSignals, hsa_signal_t completion_signal); @@ -885,12 +885,12 @@ class HSABarrier : public HSAOp { bool barrierNextKernelNeedsSysAcquire() const override { return _barrierNextKernelNeedsSysAcquire; }; - void setWaitMode(Kalmar::hcWaitMode mode) override { + void setWaitMode(detail::hcWaitMode mode) override { switch (mode) { - case Kalmar::hcWaitModeBlocked: + case detail::hcWaitModeBlocked: waitMode = HSA_WAIT_STATE_BLOCKED; break; - case Kalmar::hcWaitModeActive: + case detail::hcWaitModeActive: waitMode = HSA_WAIT_STATE_ACTIVE; break; } @@ -902,8 +902,8 @@ class HSABarrier : public HSAOp { // constructor with 1 prior dependency - HSABarrier(Kalmar::KalmarQueue *queue, std::shared_ptr dependent_op) : - HSAOp(queue, Kalmar::hcCommandMarker), + HSABarrier(detail::HCCQueue *queue, std::shared_ptr dependent_op) : + HSAOp(queue, detail::hcCommandMarker), isDispatched(false), future(nullptr), _acquire_scope(hc::no_scope), @@ -913,7 +913,7 @@ class HSABarrier : public HSAOp { { if (dependent_op != nullptr) { - assert (dependent_op->getCommandKind() == Kalmar::hcCommandMarker); + assert (dependent_op->getCommandKind() == detail::hcCommandMarker); depAsyncOps[0] = std::static_pointer_cast (dependent_op); depCount = 1; @@ -923,8 +923,8 @@ class HSABarrier : public HSAOp { } // constructor with at most 5 prior dependencies - HSABarrier(Kalmar::KalmarQueue *queue, int count, std::shared_ptr *dependent_op_array) : - HSAOp(queue, Kalmar::hcCommandMarker), + HSABarrier(detail::HCCQueue *queue, int count, std::shared_ptr *dependent_op_array) : + HSAOp(queue, detail::hcCommandMarker), isDispatched(false), future(nullptr), _acquire_scope(hc::no_scope), @@ -943,7 +943,7 @@ class HSABarrier : public HSAOp { } } else { // throw an exception - throw Kalmar::runtime_exception("Incorrect number of dependent signals passed to HSABarrier constructor", count); + throw detail::runtime_exception("Incorrect number of dependent signals passed to HSABarrier constructor", count); } } @@ -978,11 +978,12 @@ class HSABarrier : public HSAOp { }; // end of HSABarrier class HSADispatch : public HSAOp { - Kalmar::HSADevice* device_{nullptr}; + detail::HSADevice* device_{nullptr}; const 
char* kernel_name_{nullptr}; const HSAKernel* kernel_{nullptr}; + std::unique_ptr callable_{nullptr, [](void*){}}; std::unique_ptr kernargMemory_{ nullptr, hsa_amd_memory_unlock}; @@ -1006,12 +1007,12 @@ class HSADispatch : public HSAOp { kernel_->getLongKernelName().c_str() : ""; } - void setWaitMode(Kalmar::hcWaitMode mode) override { + void setWaitMode(detail::hcWaitMode mode) override { switch (mode) { - case Kalmar::hcWaitModeBlocked: + case detail::hcWaitModeBlocked: waitMode_ = HSA_WAIT_STATE_BLOCKED; break; - case Kalmar::hcWaitModeActive: + case detail::hcWaitModeActive: waitMode_ = HSA_WAIT_STATE_ACTIVE; break; } @@ -1026,22 +1027,24 @@ class HSADispatch : public HSAOp { } HSADispatch( - Kalmar::HSADevice* device, - Kalmar::KalmarQueue* queue, + detail::HSADevice* device, + detail::HCCQueue* queue, HSAKernel* kernel, const hsa_kernel_dispatch_packet_t* aql = nullptr); HSADispatch( - Kalmar::HSADevice* device, - Kalmar::KalmarQueue* queue, + detail::HSADevice* device, + detail::HCCQueue* queue, HSAKernel* kernel, - const void* callable, + std::unique_ptr callable, std::size_t callable_size, const hsa_kernel_dispatch_packet_t* aql = nullptr) : HSADispatch{device, queue, kernel, aql} { + callable_ = std::move(callable); + void* tmp{nullptr}; auto r = hsa_amd_memory_lock( - const_cast(callable), callable_size, nullptr, 0, &tmp); + callable_.get(), callable_size, nullptr, 0, &tmp); STATUS_CHECK(r, __LINE__); @@ -1131,7 +1134,7 @@ pool_iterator::pool_iterator() /// /// memory allocator /// -namespace Kalmar { +namespace detail { @@ -1194,10 +1197,10 @@ struct RocrQueue { -class HSAQueue final : public KalmarQueue +class HSAQueue final : public HCCQueue { private: - friend class Kalmar::HSADevice; + friend class detail::HSADevice; friend class RocrQueue; friend std::ostream& operator<<(std::ostream& os, const HSAQueue & hav); @@ -1214,7 +1217,7 @@ class HSAQueue final : public KalmarQueue // tries to lock the queue to insert a new packet. // Step through the runtime code with the unit test HC/execute_order.cpp // for details - std::recursive_mutex qmutex; // Protect structures for this KalmarQueue. Currently just the hsaQueue. + std::recursive_mutex qmutex; // Protect structures for this HCCQueue. Currently just the hsaQueue. bool drainingQueue_; // mode that we are draining queue, used to allow barrier ops to be enqueued. @@ -1222,9 +1225,9 @@ class HSAQueue final : public KalmarQueue // // kernel dispatches and barriers associated with this HSAQueue instance // - // When a kernel k is dispatched, we'll get a KalmarAsyncOp f. + // When a kernel k is dispatched, we'll get a HCCAsyncOp f. // This vector would hold f. acccelerator_view::wait() would trigger - // HSAQueue::wait(), and all future objects in the KalmarAsyncOp objects + // HSAQueue::wait(), and all future objects in the HCCAsyncOp objects // will be waited on. // std::vector< std::shared_ptr > asyncOps; @@ -1269,7 +1272,7 @@ class HSAQueue final : public KalmarQueue // If there are previous kernel dispatches which use b, then we wait on // them before dispatch kernel k. bufferKernelMap[b] will be cleared then. 
// - // After kernel k is dispatched, we'll get a KalmarAsync object f, we then + // After kernel k is dispatched, we'll get a HCCAsync object f, we then // walk through each buffer b used by k and mark the association as: // bufferKernelMap[b] = f // @@ -1279,7 +1282,7 @@ class HSAQueue final : public KalmarQueue // association between buffers and kernel dispatches // key: buffer address // value: a vector of kernel dispatches - std::map > > bufferKernelMap; + std::map > > bufferKernelMap; // association between a kernel and buffers used by it // key: kernel @@ -1291,7 +1294,7 @@ class HSAQueue final : public KalmarQueue public: - HSAQueue(KalmarDevice* pDev, hsa_agent_t agent, execute_order order) ; + HSAQueue(HCCDevice* pDev, hsa_agent_t agent, execute_order order) ; bool nextKernelNeedsSysAcquire() const { return _nextKernelNeedsSysAcquire; }; void setNextKernelNeedsSysAcquire(bool r) { _nextKernelNeedsSysAcquire = r; }; @@ -1307,7 +1310,7 @@ class HSAQueue final : public KalmarQueue uint64_t getSeqNum() const { return queueSeqNum; }; - Kalmar::HSADevice * getHSADev() const; + detail::HSADevice * getHSADev() const; void dispose() override; @@ -1403,7 +1406,7 @@ class HSAQueue final : public KalmarQueue // // Also different modes and optimizations can control when dependencies are added. // TODO - return reference if possible to avoid shared ptr overhead. - std::shared_ptr detectStreamDeps(hcCommandKind newCommandKind, KalmarAsyncOp *kNewOp) { + std::shared_ptr detectStreamDeps(hcCommandKind newCommandKind, HCCAsyncOp *kNewOp) { const auto newOp = static_cast (kNewOp); @@ -1415,7 +1418,7 @@ class HSAQueue final : public KalmarQueue // Ensure we have not already added the op we are checking into asyncOps, // that must be done after we check for deps. if (newOp && (newOp == asyncOps.back().get())) { - throw Kalmar::runtime_exception("enqueued op before checking dependencies!", 0); + throw detail::runtime_exception("enqueued op before checking dependencies!", 0); } bool needDep = false; @@ -1460,7 +1463,7 @@ class HSAQueue final : public KalmarQueue void waitForStreamDeps (HSAOp *newOp) { - std::shared_ptr depOp = detectStreamDeps(newOp->getCommandKind(), newOp); + std::shared_ptr depOp = detectStreamDeps(newOp->getCommandKind(), newOp); if (depOp != nullptr) { EnqueueMarkerWithDependency(1, &depOp, HCC_OPT_FLUSH ? 
hc::no_scope : hc::system_scope); } @@ -1610,7 +1613,7 @@ class HSAQueue final : public KalmarQueue kernelBufferMap.erase(ker); } - std::shared_ptr LaunchKernelAsync( + std::shared_ptr LaunchKernelAsync( void* ker, std::size_t nr_dim, const std::size_t* global, @@ -1620,7 +1623,7 @@ class HSAQueue final : public KalmarQueue ker, nr_dim, global, local, 0); } - std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync( + std::shared_ptr LaunchKernelWithDynamicGroupMemoryAsync( void* ker, size_t nr_dim, const size_t* global, @@ -1642,7 +1645,7 @@ class HSAQueue final : public KalmarQueue waitForStreamDeps(dispatch); // create a shared_ptr instance - std::shared_ptr sp_dispatch(dispatch); + std::shared_ptr sp_dispatch(dispatch); // associate the kernel dispatch with this queue pushAsyncOp(std::static_pointer_cast (sp_dispatch)); @@ -2013,7 +2016,7 @@ class HSAQueue final : public KalmarQueue } // enqueue a barrier packet - std::shared_ptr EnqueueMarker(memory_scope release_scope) override { + std::shared_ptr EnqueueMarker(memory_scope release_scope) override { hsa_status_t status = HSA_STATUS_SUCCESS; @@ -2041,8 +2044,8 @@ class HSAQueue final : public KalmarQueue // // fenceScope specifies the scope of the acquire and release fence that will be // applied after the marker executes. See hc::memory_scope - std::shared_ptr EnqueueMarkerWithDependency(int count, - std::shared_ptr *depOps, + std::shared_ptr EnqueueMarkerWithDependency(int count, + std::shared_ptr *depOps, hc::memory_scope fenceScope) override { hsa_status_t status = HSA_STATUS_SUCCESS; @@ -2057,7 +2060,7 @@ class HSAQueue final : public KalmarQueue for (int i=0; idepAsyncOps[i]; if (depOp != nullptr) { - auto depHSAQueue = static_cast (depOp->getQueue()); + auto depHSAQueue = static_cast (depOp->getQueue()); // Same accelerator: // Inherit system-acquire and system-release bits op we are dependent on. 
// - barriers @@ -2111,15 +2114,15 @@ class HSAQueue final : public KalmarQueue return barrier; } else { // throw an exception - throw Kalmar::runtime_exception("Incorrect number of dependent signals passed to EnqueueMarkerWithDependency", count); + throw detail::runtime_exception("Incorrect number of dependent signals passed to EnqueueMarkerWithDependency", count); } } - std::shared_ptr EnqueueAsyncCopyExt(const void* src, void* dst, size_t size_bytes, + std::shared_ptr EnqueueAsyncCopyExt(const void* src, void* dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, - const Kalmar::KalmarDevice *copyDevice) override; + const detail::HCCDevice *copyDevice) override; - std::shared_ptr EnqueueAsyncCopy(const void *src, void *dst, size_t size_bytes) override ; + std::shared_ptr EnqueueAsyncCopy(const void *src, void *dst, size_t size_bytes) override ; // synchronous copy @@ -2144,7 +2147,7 @@ class HSAQueue final : public KalmarQueue } void copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, - const Kalmar::KalmarDevice *copyDevice, bool forceUnpinnedCopy) override ; + const detail::HCCDevice *copyDevice, bool forceUnpinnedCopy) override ; void copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, bool foo) override ; @@ -2163,7 +2166,7 @@ class HSAQueue final : public KalmarQueue // Both execute_in_order and execute_any_order flags always remove ops in-order at the end of the pipe. // Note if not found above targetIndex=-1 and we skip the loop: for (int i = targetIndex; i>=0; i--) { - Kalmar::KalmarAsyncOp *op = asyncOps[i].get(); + detail::HCCAsyncOp *op = asyncOps[i].get(); if (op) { asyncOps[i].reset(); @@ -2221,7 +2224,7 @@ hsa_status_t RocrQueue::setCuMask(HSAQueue *hccQueue) { } -class HSADevice final : public KalmarDevice +class HSADevice final : public HCCDevice { friend std::ostream& operator<<(std::ostream& os, const HSAQueue & hav); private: @@ -2238,7 +2241,7 @@ class HSADevice final : public KalmarDevice size_t queue_size; std::mutex queues_mutex; // protects access to the queues vector: - std::vector< std::weak_ptr > queues; + std::vector< std::weak_ptr > queues; std::mutex rocrQueuesMutex; // protects rocrQueues std::vector< RocrQueue *> rocrQueues; @@ -2276,7 +2279,7 @@ class HSADevice final : public KalmarDevice UnpinnedCopyEngine::CopyMode copy_mode; // Creates or steals a rocrQueue and returns it in theif->rocrQueue - void createOrstealRocrQueue(Kalmar::HSAQueue *thief) { + void createOrstealRocrQueue(detail::HSAQueue *thief) { RocrQueue *foundRQ = nullptr; this->rocrQueuesMutex.lock(); @@ -2740,8 +2743,8 @@ class HSADevice final : public KalmarDevice void* CreateKernel( const char* fun, - Kalmar::KalmarQueue *queue, - const void* callable, + detail::HCCQueue *queue, + std::unique_ptr callable, std::size_t callable_size) override { // try load kernels lazily in case it was not done so at bootstrap // due to HCC_LAZYINIT env var @@ -2846,14 +2849,15 @@ class HSADevice final : public KalmarDevice // HSADispatch instance will be deleted in: // HSAQueue::LaunchKernel() - // or it will be created as a shared_ptr in: + // or it will be created as a shared_ptr in: // HSAQueue::LaunchKernelAsync() - return new HSADispatch{this, queue, kernel, callable, callable_size}; + return new HSADispatch{ + this, queue, 
kernel, std::move(callable), callable_size}; } - std::shared_ptr createQueue(execute_order order = execute_in_order) override { + std::shared_ptr createQueue(execute_order order = execute_in_order) override { auto hsaAv = new HSAQueue(this, agent, order); - std::shared_ptr q = std::shared_ptr(hsaAv); + std::shared_ptr q = std::shared_ptr(hsaAv); queues_mutex.lock(); queues.push_back(q); hsaAv->queueSeqNum = this->queueSeqNums++; @@ -2865,8 +2869,8 @@ class HSADevice final : public KalmarDevice return max_tile_static_size; } - std::vector< std::shared_ptr > get_all_queues() override { - std::vector< std::shared_ptr > result; + std::vector< std::shared_ptr > get_all_queues() override { + std::vector< std::shared_ptr > result; queues_mutex.lock(); for (auto&& queue : queues) { if (!queue.expired()) { @@ -2907,7 +2911,7 @@ class HSADevice final : public KalmarDevice return ri._found_local_memory_pool; } - bool is_peer(const Kalmar::KalmarDevice* other) override { + bool is_peer(const detail::HCCDevice* other) override { hsa_status_t status; if(!hasHSACoarsegrainedRegion()) @@ -2916,7 +2920,7 @@ class HSADevice final : public KalmarDevice auto self_pool = getHSAAMRegion(); hsa_amd_memory_pool_access_t access; - hsa_agent_t* agent = static_cast( const_cast (other)->getHSAAgent()); + hsa_agent_t* agent = static_cast( const_cast (other)->getHSAAgent()); //TODO: CPU acclerator will return NULL currently, return false. if(nullptr == agent) @@ -3166,7 +3170,7 @@ void hccgetenv(const char *var_name, char **var, const char *usage) -class HSAContext final : public KalmarContext +class HSAContext final : public HCCContext { public: std::map agentToDeviceMap_; @@ -3253,7 +3257,7 @@ class HSAContext final : public KalmarContext void ReadHccEnv() ; std::ostream &getHccProfileStream() const { return *hccProfileStream; }; - HSAContext() : KalmarContext(), signalPool(), signalPoolFlag(), signalCursor(0), signalPoolMutex() { + HSAContext() : HCCContext(), signalPool(), signalPoolFlag(), signalCursor(0), signalPoolMutex() { host.handle = (uint64_t)-1; ReadHccEnv(); @@ -3442,7 +3446,7 @@ class HSAContext final : public KalmarContext STATUS_CHECK(status, __LINE__); hc::printf_buffer_locked_va = nullptr; - // destroy all KalmarDevices associated with this context + // destroy all HCCDevices associated with this context for (auto dev : Devices) delete dev; Devices.clear(); @@ -3527,12 +3531,12 @@ class HSAContext final : public KalmarContext static HSAContext ctx; -} // namespace Kalmar +} // namespace detail // ---------------------------------------------------------------------- // member function implementation of HSADevice // ---------------------------------------------------------------------- -namespace Kalmar { +namespace detail { // Global free function to read HCC_ENV vars. Really this should be called once per process not once-per-event. 
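As the comment above notes, the HCC_* environment variables are meant to be read once per process. A plausible shape for such a once-only read, sketched with std::call_once and an illustrative helper (read_env_int is hypothetical; HCC_OPT_FLUSH is one of the variables the runtime consults):

#include <cstdlib>
#include <mutex>

// Hypothetical once-per-process environment read, in the spirit of the
// ReadHccEnv() note above (the real runtime reads many HCC_* variables).
int read_env_int(const char* name, int fallback) {
    const char* v = std::getenv(name);
    return v ? std::atoi(v) : fallback;
}

int hcc_opt_flush() {
    static std::once_flag once;
    static int value = 0;
    std::call_once(once, [] { value = read_env_int("HCC_OPT_FLUSH", 1); });
    return value;                     // every later call is a cached load
}

int main() { return hcc_opt_flush() ? 0 : 1; }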
@@ -3608,7 +3612,7 @@ void HSAContext::ReadHccEnv() }; -HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) : KalmarDevice(access_type_read_write), +HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) : HCCDevice(access_type_read_write), agent(a), programs(), max_tile_static_size(0), queue_size(0), queues(), queues_mutex(), rocrQueues(0/*empty*/), rocrQueuesMutex(), @@ -3774,7 +3778,7 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) : KalmarD if (HCC_CHECK_COPY && !this->cpu_accessible_am) { - throw Kalmar::runtime_exception("HCC_CHECK_COPY can only be used on machines where accelerator memory is visible to CPU (ie large-bar systems)", 0); + throw detail::runtime_exception("HCC_CHECK_COPY can only be used on machines where accelerator memory is visible to CPU (ie large-bar systems)", 0); } @@ -3797,17 +3801,17 @@ static int get_seqnum_from_agent(hsa_agent_t hsaAgent) } } -} // namespace Kalmar +} // namespace detail // ---------------------------------------------------------------------- // member function implementation of HSAQueue // ---------------------------------------------------------------------- -namespace Kalmar { +namespace detail { std::ostream& operator<<(std::ostream& os, const HSAQueue & hav) { - auto device = static_cast(hav.getDev()); + auto device = static_cast(hav.getDev()); os << "queue#" << device->accSeqNum << "." << hav.queueSeqNum; return os; } @@ -3815,8 +3819,8 @@ std::ostream& operator<<(std::ostream& os, const HSAQueue & hav) -HSAQueue::HSAQueue(KalmarDevice* pDev, hsa_agent_t agent, execute_order order) : - KalmarQueue(pDev, queuing_mode_automatic, order), +HSAQueue::HSAQueue(HCCDevice* pDev, hsa_agent_t agent, execute_order order) : + HCCQueue(pDev, queuing_mode_automatic, order), rocrQueue(nullptr), asyncOps(), drainingQueue_(false), valid(true), _nextSyncNeedsSysRelease(false), _nextKernelNeedsSysAcquire(false), bufferKernelMap(), kernelBufferMap() @@ -3827,7 +3831,7 @@ HSAQueue::HSAQueue(KalmarDevice* pDev, hsa_agent_t agent, execute_order order) : std::lock_guard l(this->qmutex); - auto device = static_cast(this->getDev()); + auto device = static_cast(this->getDev()); device->createOrstealRocrQueue(this); } @@ -3846,7 +3850,7 @@ void HSAQueue::dispose() override { { DBOUT(DB_LOCK, " ptr:" << this << " dispose lock_guard...\n"); - Kalmar::HSADevice* device = static_cast(getDev()); + detail::HSADevice* device = static_cast(getDev()); // NOTE: needs to acquire rocrQueuesMutex and then the qumtex in this // sequence in order to avoid potential deadlock with other threads @@ -3884,15 +3888,15 @@ void HSAQueue::dispose() override { DBOUT(DB_INIT, "HSAQueue::dispose() " << this << " out\n"); } -Kalmar::HSADevice * HSAQueue::getHSADev() const { - return static_cast(this->getDev()); +detail::HSADevice * HSAQueue::getHSADev() const { + return static_cast(this->getDev()); }; hsa_queue_t *HSAQueue::acquireLockedRocrQueue() { DBOUT(DB_LOCK, " ptr:" << this << " lock...\n"); this->qmutex.lock(); if (this->rocrQueue == nullptr) { - auto device = static_cast(this->getDev()); + auto device = static_cast(this->getDev()); device->createOrstealRocrQueue(this); } @@ -3936,13 +3940,13 @@ HSAQueue::getHSAKernargRegion() override { } void HSAQueue::copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, - const Kalmar::KalmarDevice *copyDevice, bool forceUnpinnedCopy) override { + const detail::HCCDevice *copyDevice, 
bool forceUnpinnedCopy) override { // wait for all previous async commands in this queue to finish // TODO - can remove this synchronization, copy is tail-synchronous not required on front end. this->wait(); - const Kalmar::HSADevice *copyDeviceHsa = static_cast (copyDevice); + const detail::HSADevice *copyDeviceHsa = static_cast (copyDevice); // create a HSACopy instance HSACopy* copyCommand = new HSACopy(this, src, dst, size_bytes); @@ -3961,7 +3965,7 @@ void HSAQueue::copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCom // TODO - remove me void HSAQueue::copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, bool foo) override { - const Kalmar::KalmarDevice *copyDevice; + const detail::HCCDevice *copyDevice; if (srcPtrInfo._isInDeviceMem) { copyDevice = (srcPtrInfo._acc.get_dev_ptr()); } else if (dstPtrInfo._isInDeviceMem) { @@ -3974,15 +3978,15 @@ void HSAQueue::copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCom } -std::shared_ptr HSAQueue::EnqueueAsyncCopyExt(const void* src, void* dst, size_t size_bytes, +std::shared_ptr HSAQueue::EnqueueAsyncCopyExt(const void* src, void* dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, - const Kalmar::KalmarDevice *copyDevice) override { + const detail::HCCDevice *copyDevice) override { hsa_status_t status = HSA_STATUS_SUCCESS; // create shared_ptr instance - const Kalmar::HSADevice *copyDeviceHsa = static_cast (copyDevice); + const detail::HSADevice *copyDeviceHsa = static_cast (copyDevice); std::shared_ptr copyCommand = std::make_shared(this, src, dst, size_bytes); // euqueue the async copy command @@ -3997,7 +4001,7 @@ std::shared_ptr HSAQueue::EnqueueAsyncCopyExt(const void* src, vo // enqueue an async copy command -std::shared_ptr HSAQueue::EnqueueAsyncCopy(const void *src, void *dst, size_t size_bytes) override { +std::shared_ptr HSAQueue::EnqueueAsyncCopy(const void *src, void *dst, size_t size_bytes) override { hsa_status_t status = HSA_STATUS_SUCCESS; // create shared_ptr instance @@ -4013,10 +4017,10 @@ std::shared_ptr HSAQueue::EnqueueAsyncCopy(const void *src, void if (!srcInTracker) { // throw an exception - throw Kalmar::runtime_exception("trying to copy from unpinned src pointer", 0); + throw detail::runtime_exception("trying to copy from unpinned src pointer", 0); } else if (!dstInTracker) { // throw an exception - throw Kalmar::runtime_exception("trying to copy from unpinned dst pointer", 0); + throw detail::runtime_exception("trying to copy from unpinned dst pointer", 0); }; @@ -4027,11 +4031,11 @@ std::shared_ptr HSAQueue::EnqueueAsyncCopy(const void *src, void // The caller of this function is responsible for avoiding this situation, by examining the // host and device allow-access mappings and using a CPU staging copy BEFORE calling // this routine. 
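The selection that follows classifies the transfer from pointer residency alone: a device-resident source picks the source device (D2H or D2D), otherwise a device-resident destination picks the destination device (H2D), and a host-to-host copy needs no copy agent at all. A freestanding illustration, with plain bools standing in for AmPointerInfo:

#include <cassert>

enum class CopyDir { H2H, H2D, D2H, D2D };

// Mirrors the residency-based classification used below: the device-side
// pointer (source first, then destination) decides which agent drives the copy.
CopyDir classify(bool srcInDeviceMem, bool dstInDeviceMem) {
    if (srcInDeviceMem && dstInDeviceMem) return CopyDir::D2D;
    if (srcInDeviceMem)                   return CopyDir::D2H;
    if (dstInDeviceMem)                   return CopyDir::H2D;
    return CopyDir::H2H;                  // no copy agent needed
}

int main() {
    assert(classify(true,  false) == CopyDir::D2H);
    assert(classify(false, true)  == CopyDir::H2D);
    assert(classify(false, false) == CopyDir::H2H);
}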
- const Kalmar::HSADevice *copyDevice; + const detail::HSADevice *copyDevice; if (srcPtrInfo._isInDeviceMem) { // D2H or D2D - copyDevice = static_cast(srcPtrInfo._acc.get_dev_ptr()); + copyDevice = static_cast(srcPtrInfo._acc.get_dev_ptr()); } else if (dstPtrInfo._isInDeviceMem) { // H2D - copyDevice = static_cast(dstPtrInfo._acc.get_dev_ptr()); + copyDevice = static_cast(dstPtrInfo._acc.get_dev_ptr()); } else { copyDevice = nullptr; // H2H } @@ -4059,7 +4063,7 @@ HSAQueue::dispatch_hsa_kernel( ((1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS) - 1); if (dims == 0) { - throw Kalmar::runtime_exception("dispatch_hsa_kernel: must set dims in aql.header", 0); + throw detail::runtime_exception("dispatch_hsa_kernel: must set dims in aql.header", 0); } uint16_t packetType = (aql->header >> HSA_PACKET_HEADER_TYPE) & @@ -4067,11 +4071,11 @@ HSAQueue::dispatch_hsa_kernel( if (packetType != HSA_PACKET_TYPE_KERNEL_DISPATCH) { - throw Kalmar::runtime_exception("dispatch_hsa_kernel: must set packetType and fence bits in aql.header", 0); + throw detail::runtime_exception("dispatch_hsa_kernel: must set packetType and fence bits in aql.header", 0); } - Kalmar::HSADevice* device = static_cast(this->getDev()); + detail::HSADevice* device = static_cast(this->getDev()); std::shared_ptr sp_dispatch = std::make_shared(device, this/*queue*/, nullptr, aql); if (HCC_OPT_FLUSH) { @@ -4101,18 +4105,18 @@ HSAQueue::dispatch_hsa_kernel( } }; -} // namespace Kalmar +} // namespace detail // ---------------------------------------------------------------------- // member function implementation of HSADispatch // ---------------------------------------------------------------------- HSADispatch::HSADispatch( - Kalmar::HSADevice* device, - Kalmar::KalmarQueue *queue, + detail::HSADevice* device, + detail::HCCQueue *queue, HSAKernel* kernel, const hsa_kernel_dispatch_packet_t *aql) : - HSAOp{queue, Kalmar::hcCommandKernel}, + HSAOp{queue, detail::hcCommandKernel}, device_{device}, kernel_name_{nullptr}, kernel_{kernel}, @@ -4242,7 +4246,7 @@ hsa_status_t HSADispatch::dispatchKernel( // set dispatch fences // The fence bits must be set on entry into this function. uint16_t header = aql_.header; - if (hsaQueue()->get_execute_order() == Kalmar::execute_in_order) { + if (hsaQueue()->get_execute_order() == detail::execute_in_order) { //std::cout << "barrier bit on\n"; // set AQL header with barrier bit on if execute in order header |= ((HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | @@ -4255,15 +4259,13 @@ hsa_status_t HSADispatch::dispatchKernel( aql_.kernarg_address = kernargMemory_.get(); - std::cout << aql_.kernarg_address << std::endl; - // write packet uint32_t queueMask = lockedHsaQueue->size - 1; // TODO: Need to check if package write is correct. uint64_t index = hsa_queue_load_write_index_relaxed(lockedHsaQueue); uint64_t nextIndex = index + 1; if (nextIndex - hsa_queue_load_read_index_scacquire(lockedHsaQueue) >= lockedHsaQueue->size) { - checkHCCRuntimeStatus(Kalmar::HCCRuntimeStatus::HCCRT_STATUS_ERROR_COMMAND_QUEUE_OVERFLOW, __LINE__, lockedHsaQueue); + checkHCCRuntimeStatus(detail::HCCRuntimeStatus::HCCRT_STATUS_ERROR_COMMAND_QUEUE_OVERFLOW, __LINE__, lockedHsaQueue); } @@ -4278,7 +4280,7 @@ hsa_status_t HSADispatch::dispatchKernel( /* * Create a signal to wait for the dispatch to finish. 
*/ - std::pair ret = Kalmar::ctx.getSignal(); + std::pair ret = detail::ctx.getSignal(); _signal = ret.first; _signalIndex = ret.second; q_aql->completion_signal = _signal; @@ -4321,7 +4323,7 @@ HSADispatch::waitComplete() { // wait for completion if (hsa_signal_wait_scacquire(_signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), waitMode_) != 0) { - throw Kalmar::runtime_exception("Signal wait returned unexpected value\n", 0); + throw detail::runtime_exception("Signal wait returned unexpected value\n", 0); } DBOUT (DB_MISC, "complete!\n"); @@ -4431,7 +4433,7 @@ HSADispatch::dispose() { //LOG_PROFILE(this, start, end, "kernel", kname.c_str(), std::hex << "kernel="<< kernel << " " << (kernel? kernel->kernelCodeHandle:0x0) << " aql.kernel_object=" << aql.kernel_object << std::dec); LOG_PROFILE(this, start, end, "kernel", getKernelName(), ""); } - Kalmar::ctx.releaseSignal(_signal, _signalIndex); + detail::ctx.releaseSignal(_signal, _signalIndex); } inline uint64_t @@ -4486,12 +4488,12 @@ hsa_status_t HSADispatch::setLaunchConfiguration( std::stringstream msg; msg << "The extent of the tile (" << localDims[i] << ") exceeds the device limit (" << workgroup_max_dim[i] << ")."; - throw Kalmar::runtime_exception(msg.str().c_str(), -1); + throw detail::runtime_exception(msg.str().c_str(), -1); } else if (localDims[i] > globalDims[i]) { std::stringstream msg; msg << "The extent of the tile (" << localDims[i] << ") exceeds the compute grid extent (" << globalDims[i] << ")."; - throw Kalmar::runtime_exception(msg.str().c_str(), -1); + throw detail::runtime_exception(msg.str().c_str(), -1); } workgroup_size[i] = localDims[i]; } @@ -4572,7 +4574,7 @@ hsa_status_t HSADispatch::setLaunchConfiguration( msg << "The number of work items (" << actual_flat_group_size << ") per work group exceeds the limit (" << max_num_work_items_per_cu << ") of kernel " << kernel->kernelName << " ."; - throw Kalmar::runtime_exception(msg.str().c_str(), -1); + throw detail::runtime_exception(msg.str().c_str(), -1); } }; validate_kernel_flat_group_size(); @@ -4689,7 +4691,7 @@ HSABarrier::enqueueAsync(hc::memory_scope fenceScope) { } // Create a signal to wait for the barrier to finish. 
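
That barrier signal, like the dispatch signal above, is borrowed from a context-wide pool via ctx.getSignal() and handed back in dispose() via ctx.releaseSignal(), since creating an HSA signal per packet is comparatively expensive. A reduced sketch of the recycling pattern with hypothetical names; the real pool and its locking live in HSAContext:

#include <hsa/hsa.h>
#include <mutex>
#include <stdexcept>
#include <utility>
#include <vector>

class SignalPool {
    std::mutex mutex_;
    std::vector<hsa_signal_t> free_;
    std::vector<hsa_signal_t> all_;
public:
    // Analogue of ctx.getSignal(): hand out a re-armed signal plus an index.
    std::pair<hsa_signal_t, int> get()
    {
        std::lock_guard<std::mutex> lck{mutex_};
        if (free_.empty()) {
            hsa_signal_t s;
            if (hsa_signal_create(1, 0, nullptr, &s) != HSA_STATUS_SUCCESS)
                throw std::runtime_error{"hsa_signal_create failed"};
            all_.push_back(s);
            free_.push_back(s);
        }
        hsa_signal_t s = free_.back();
        free_.pop_back();
        hsa_signal_store_relaxed(s, 1); // re-arm before handing it out
        return {s, static_cast<int>(all_.size()) - 1}; // index is illustrative
    }
    // Analogue of ctx.releaseSignal(): recycle instead of destroying.
    void release(hsa_signal_t s)
    {
        std::lock_guard<std::mutex> lck{mutex_};
        free_.push_back(s);
    }
    ~SignalPool()
    {
        for (auto&& s : all_) hsa_signal_destroy(s);
    }
};
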
-    std::pair<hsa_signal_t, int> ret = Kalmar::ctx.getSignal();
+    std::pair<hsa_signal_t, int> ret = detail::ctx.getSignal();
     _signal = ret.first;
     _signalIndex = ret.second;
 
@@ -4711,7 +4713,7 @@ HSABarrier::enqueueAsync(hc::memory_scope fenceScope) {
     const uint32_t queueMask = rocrQueue->size - 1;
     uint64_t nextIndex = index + 1;
     if (nextIndex - hsa_queue_load_read_index_scacquire(rocrQueue) >= rocrQueue->size) {
-        checkHCCRuntimeStatus(Kalmar::HCCRuntimeStatus::HCCRT_STATUS_ERROR_COMMAND_QUEUE_OVERFLOW, __LINE__, rocrQueue);
+        checkHCCRuntimeStatus(detail::HCCRuntimeStatus::HCCRT_STATUS_ERROR_COMMAND_QUEUE_OVERFLOW, __LINE__, rocrQueue);
     }
 
     // Define the barrier packet to be at the calculated queue index address
@@ -4789,7 +4791,7 @@ HSABarrier::dispose() {
         };
         LOG_PROFILE(this, start, end, "barrier", "depcnt=" + std::to_string(depCount) + ",acq=" + fenceToString(acqBits) + ",rel=" + fenceToString(relBits), depss.str())
     }
-    Kalmar::ctx.releaseSignal(_signal, _signalIndex);
+    detail::ctx.releaseSignal(_signal, _signalIndex);
 
     // Release reference to our dependent ops:
     for (int i=0; igetDev()->get_seqnum()), _queueId(queue->getSeqNum()) {}
 
-HSAOp::HSAOp(Kalmar::KalmarQueue *queue, hc::hcCommandKind commandKind) :
-    KalmarAsyncOp(queue, commandKind),
-    _opCoord(static_cast<Kalmar::HSAQueue*> (queue)),
+HSAOp::HSAOp(detail::HCCQueue *queue, hc::hcCommandKind commandKind) :
+    HCCAsyncOp(queue, commandKind),
+    _opCoord(static_cast<detail::HSAQueue*> (queue)),
     _asyncOpsIndex(-1),
 
     _signalIndex(-1),
-    _agent(static_cast<Kalmar::HSADevice*>(hsaQueue()->getDev())->getAgent())
+    _agent(static_cast<detail::HSADevice*>(hsaQueue()->getDev())->getAgent())
 {
     _signal.handle=0;
 
-    apiStartTick = Kalmar::ctx.getSystemTicks();
+    apiStartTick = detail::ctx.getSystemTicks();
 };
 
-Kalmar::HSAQueue *HSAOp::hsaQueue() const
+detail::HSAQueue *HSAOp::hsaQueue() const
 {
-    return static_cast<Kalmar::HSAQueue*> (this->getQueue());
+    return static_cast<detail::HSAQueue*> (this->getQueue());
 };
 
 bool HSAOp::isReady() override {
@@ -4858,14 +4860,14 @@ bool HSAOp::isReady() override {
 //
 // Copy mode will be set later on.
 // HSA signals would be waited in HSA_WAIT_STATE_ACTIVE by default for HSACopy instances
-HSACopy::HSACopy(Kalmar::KalmarQueue *queue, const void* src_, void* dst_, size_t sizeBytes_) : HSAOp(queue, Kalmar::hcCommandInvalid),
+HSACopy::HSACopy(detail::HCCQueue *queue, const void* src_, void* dst_, size_t sizeBytes_) : HSAOp(queue, detail::hcCommandInvalid),
     isSubmitted(false), isAsync(false), isSingleStepCopy(false), isPeerToPeer(false), future(nullptr), depAsyncOp(nullptr), copyDevice(nullptr), waitMode(HSA_WAIT_STATE_ACTIVE), src(src_), dst(dst_), sizeBytes(sizeBytes_)
 {
-    apiStartTick = Kalmar::ctx.getSystemTicks();
+    apiStartTick = detail::ctx.getSystemTicks();
 }
 
 // wait for the async copy to complete
@@ -4905,7 +4907,7 @@ HSACopy::waitComplete() {
 
 void checkCopy(const void *s1, const void *s2, size_t sizeBytes)
 {
     if (memcmp(s1, s2, sizeBytes) != 0) {
-        throw Kalmar::runtime_exception("HCC_CHECK_COPY mismatch detected", 0);
+        throw detail::runtime_exception("HCC_CHECK_COPY mismatch detected", 0);
     }
 }
 
@@ -4913,7 +4915,7 @@ void checkCopy(const void *s1, const void *s2, size_t sizeBytes)
 // Small wrapper that calls hsa_amd_memory_async_copy.
// HCC knows exactly which copy-engine it wants to perform the copy and has already made that choice.
-hsa_status_t HSACopy::hcc_memory_async_copy(Kalmar::hcCommandKind copyKind, const Kalmar::HSADevice *copyDeviceArg, +hsa_status_t HSACopy::hcc_memory_async_copy(detail::hcCommandKind copyKind, const detail::HSADevice *copyDeviceArg, const hc::AmPointerInfo &dstPtrInfo, const hc::AmPointerInfo &srcPtrInfo, size_t sizeBytes, int depSignalCnt, const hsa_signal_t *depSignals, hsa_signal_t completion_signal) @@ -4922,18 +4924,18 @@ hsa_status_t HSACopy::hcc_memory_async_copy(Kalmar::hcCommandKind copyKind, cons this->copyDevice = copyDeviceArg; // beautiful...: - hsa_agent_t copyAgent = * static_cast(const_cast(copyDeviceArg)->getHSAAgent()); + hsa_agent_t copyAgent = * static_cast(const_cast(copyDeviceArg)->getHSAAgent()); hsa_status_t status; hsa_device_type_t device_type; status = hsa_agent_get_info(copyAgent, HSA_AGENT_INFO_DEVICE, &device_type); if (status != HSA_STATUS_SUCCESS) { - throw Kalmar::runtime_exception("invalid copy agent used for hcc_memory_async_copy", status); + throw detail::runtime_exception("invalid copy agent used for hcc_memory_async_copy", status); } if (device_type != HSA_DEVICE_TYPE_GPU) { - throw Kalmar::runtime_exception("copy agent must be GPU hcc_memory_async_copy", -1); + throw detail::runtime_exception("copy agent must be GPU hcc_memory_async_copy", -1); } - hsa_agent_t hostAgent = const_cast (copyDeviceArg)->getHostAgent(); + hsa_agent_t hostAgent = const_cast (copyDeviceArg)->getHostAgent(); /* Determine src and dst pointer passed to ROCR runtime. * @@ -4946,7 +4948,7 @@ hsa_status_t HSACopy::hcc_memory_async_copy(Kalmar::hcCommandKind copyKind, cons hsa_agent_t srcAgent, dstAgent; switch (copyKind) { - case Kalmar::hcMemcpyHostToHost: + case detail::hcMemcpyHostToHost: srcAgent=hostAgent; dstAgent=hostAgent; /* H2H case @@ -4956,7 +4958,7 @@ hsa_status_t HSACopy::hcc_memory_async_copy(Kalmar::hcCommandKind copyKind, cons dstPtr = this->dst; srcPtr = const_cast(this->src); break; - case Kalmar::hcMemcpyHostToDevice: + case detail::hcMemcpyHostToDevice: srcAgent=hostAgent; dstAgent=copyAgent; /* H2D case @@ -4969,7 +4971,7 @@ hsa_status_t HSACopy::hcc_memory_async_copy(Kalmar::hcCommandKind copyKind, cons (reinterpret_cast(const_cast(this->src)) - reinterpret_cast(srcPtrInfo._hostPointer)); break; - case Kalmar::hcMemcpyDeviceToHost: + case detail::hcMemcpyDeviceToHost: srcAgent=copyAgent; dstAgent=hostAgent; /* D2H case @@ -4982,7 +4984,7 @@ hsa_status_t HSACopy::hcc_memory_async_copy(Kalmar::hcCommandKind copyKind, cons reinterpret_cast(dstPtrInfo._hostPointer)); srcPtr = const_cast(this->src); break; - case Kalmar::hcMemcpyDeviceToDevice: + case detail::hcMemcpyDeviceToDevice: this->isPeerToPeer = (dstPtrInfo._acc != srcPtrInfo._acc); srcAgent=copyAgent; dstAgent=copyAgent; @@ -4993,7 +4995,7 @@ hsa_status_t HSACopy::hcc_memory_async_copy(Kalmar::hcCommandKind copyKind, cons srcPtr = const_cast(this->src); break; default: - throw Kalmar::runtime_exception("bad copyKind in hcc_memory_async_copy", copyKind); + throw detail::runtime_exception("bad copyKind in hcc_memory_async_copy", copyKind); }; @@ -5021,7 +5023,7 @@ hsa_status_t HSACopy::hcc_memory_async_copy(Kalmar::hcCommandKind copyKind, cons status = hsa_amd_memory_async_copy(dstPtr, dstAgent, srcPtr, srcAgent, sizeBytes, depSignalCnt, depSignals, completion_signal); if (status != HSA_STATUS_SUCCESS) { - throw Kalmar::runtime_exception("hsa_amd_memory_async_copy error", status); + throw detail::runtime_exception("hsa_amd_memory_async_copy error", status); } @@ -5042,24 +5044,24 @@ hsa_status_t 
HSACopy::hcc_memory_async_copy(Kalmar::hcCommandKind copyKind, cons -static Kalmar::hcCommandKind resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem) +static detail::hcCommandKind resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem) { if (!srcInDeviceMem && !dstInDeviceMem) { - return Kalmar::hcMemcpyHostToHost; + return detail::hcMemcpyHostToHost; } else if (!srcInDeviceMem && dstInDeviceMem) { - return Kalmar::hcMemcpyHostToDevice; + return detail::hcMemcpyHostToDevice; } else if (srcInDeviceMem && !dstInDeviceMem) { - return Kalmar::hcMemcpyDeviceToHost; + return detail::hcMemcpyDeviceToHost; } else if (srcInDeviceMem && dstInDeviceMem) { - return Kalmar::hcMemcpyDeviceToDevice; + return detail::hcMemcpyDeviceToDevice; } else { // Invalid copy copyDir - should never reach here since we cover all 4 possible options above. - throw Kalmar::runtime_exception("invalid copy copyDir", 0); + throw detail::runtime_exception("invalid copy copyDir", 0); } } inline hsa_status_t -HSACopy::enqueueAsyncCopyCommand(const Kalmar::HSADevice *copyDevice, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo) { +HSACopy::enqueueAsyncCopyCommand(const detail::HSADevice *copyDevice, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo) { hsa_status_t status = HSA_STATUS_SUCCESS; @@ -5078,7 +5080,7 @@ HSACopy::enqueueAsyncCopyCommand(const Kalmar::HSADevice *copyDevice, const hc:: { // Create a signal to wait for the async copy command to finish. - std::pair ret = Kalmar::ctx.getSignal(); + std::pair ret = detail::ctx.getSignal(); _signal = ret.first; _signalIndex = ret.second; @@ -5173,11 +5175,11 @@ HSACopy::dispose() { LOG_PROFILE(this, start, end, "copy", getCopyCommandString(), "\t" << sizeBytes << " bytes;\t" << sizeBytes/1024.0/1024 << " MB;\t" << bw << " GB/s;"); } - Kalmar::ctx.releaseSignal(_signal, _signalIndex); + detail::ctx.releaseSignal(_signal, _signalIndex); } else { if (HCC_PROFILE & HCC_PROFILE_TRACE) { uint64_t start = apiStartTick; - uint64_t end = Kalmar::ctx.getSystemTicks(); + uint64_t end = detail::ctx.getSystemTicks(); double bw = (double)(sizeBytes)/(end-start) * (1000.0/1024.0) * (1000.0/1024.0); LOG_PROFILE(this, start, end, "copyslo", getCopyCommandString(), "\t" << sizeBytes << " bytes;\t" << sizeBytes/1024.0/1024 << " MB;\t" << bw << " GB/s;"); } @@ -5206,7 +5208,7 @@ HSACopy::getEndTimestamp() override { void -HSACopy::syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, const Kalmar::HSADevice *copyDevice, bool forceUnpinnedCopy) +HSACopy::syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, const detail::HSADevice *copyDevice, bool forceUnpinnedCopy) { bool srcInTracker = (srcPtrInfo._sizeBytes != 0); bool dstInTracker = (dstPtrInfo._sizeBytes != 0); @@ -5218,8 +5220,8 @@ HSACopy::syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrI int depSignalCnt = 0; - if ((copyDevice == nullptr) && (copyDir != Kalmar::hcMemcpyHostToHost) && (copyDir != Kalmar::hcMemcpyDeviceToDevice)) { - throw Kalmar::runtime_exception("Null copyDevice can only be used with HostToHost or DeviceToDevice copy", -1); + if ((copyDevice == nullptr) && (copyDir != detail::hcMemcpyHostToHost) && (copyDir != detail::hcMemcpyDeviceToDevice)) { + throw detail::runtime_exception("Null copyDevice can only be used with HostToHost or DeviceToDevice copy", -1); } @@ -5227,7 +5229,7 @@ 
HSACopy::syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrI bool useFastCopy = true; switch (copyDir) { - case Kalmar::hcMemcpyHostToDevice: + case detail::hcMemcpyHostToDevice: if (!srcInTracker || forceUnpinnedCopy) { DBOUT(DB_COPY,"HSACopy::syncCopyExt(), invoke UnpinnedCopyEngine::CopyHostToDevice()\n"); @@ -5237,7 +5239,7 @@ HSACopy::syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrI break; - case Kalmar::hcMemcpyDeviceToHost: + case detail::hcMemcpyDeviceToHost: if (!dstInTracker || forceUnpinnedCopy) { DBOUT(DB_COPY,"HSACopy::syncCopyExt(), invoke UnpinnedCopyEngine::CopyDeviceToHost()\n"); UnpinnedCopyEngine::CopyMode d2hCopyMode = copyDevice->copy_mode; @@ -5250,7 +5252,7 @@ HSACopy::syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrI }; break; - case Kalmar::hcMemcpyHostToHost: + case detail::hcMemcpyHostToHost: DBOUT(DB_COPY,"HSACopy::syncCopyExt(), invoke memcpy\n"); // Since this is sync copy, we assume here that the GPU has already drained younger commands. @@ -5259,7 +5261,7 @@ HSACopy::syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrI useFastCopy = false; break; - case Kalmar::hcMemcpyDeviceToDevice: + case detail::hcMemcpyDeviceToDevice: if (forceUnpinnedCopy) { // TODO - is this a same-device copy or a P2P? hsa_agent_t dstAgent = * (static_cast (dstPtrInfo._acc.get_hsa_agent())); @@ -5276,7 +5278,7 @@ HSACopy::syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrI break; default: - throw Kalmar::runtime_exception("unexpected copy type", HSA_STATUS_SUCCESS); + throw detail::runtime_exception("unexpected copy type", HSA_STATUS_SUCCESS); }; @@ -5287,7 +5289,7 @@ HSACopy::syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrI DBOUT(DB_COPY, "HSACopy::syncCopyExt(), useFastCopy=1, fetch and init a HSA signal\n"); // Get a signal and initialize it: - std::pair ret = Kalmar::ctx.getSignal(); + std::pair ret = detail::ctx.getSignal(); _signal = ret.first; _signalIndex = ret.second; @@ -5296,7 +5298,7 @@ HSACopy::syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrI DBOUT(DB_CMD, "HSACopy::syncCopyExt(), invoke hsa_amd_memory_async_copy()\n"); if (copyDevice == nullptr) { - throw Kalmar::runtime_exception("Null copyDevice reached call to hcc_memory_async_copy", -1); + throw detail::runtime_exception("Null copyDevice reached call to hcc_memory_async_copy", -1); } @@ -5309,7 +5311,7 @@ HSACopy::syncCopyExt(hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrI DBOUT(DB_COPY,"done!\n"); } else { DBOUT(DB_COPY, "HSACopy::syncCopyExt(), hsa_amd_memory_async_copy() returns: 0x" << std::hex << hsa_status << std::dec <<"\n"); - throw Kalmar::runtime_exception("hsa_amd_memory_async_copy error", hsa_status); + throw detail::runtime_exception("hsa_amd_memory_async_copy error", hsa_status); } } @@ -5366,11 +5368,11 @@ HSACopy::syncCopy() { // Resolve default to a specific Kind so we know which algorithm to use: setCommandKind (resolveMemcpyDirection(srcInDeviceMem, dstInDeviceMem)); - Kalmar::HSADevice *copyDevice; + detail::HSADevice *copyDevice; if (srcInDeviceMem) { // D2D, H2D - copyDevice = static_cast (srcPtrInfo._acc.get_dev_ptr()); + copyDevice = static_cast (srcPtrInfo._acc.get_dev_ptr()); }else if (dstInDeviceMem) { // D2H - copyDevice = static_cast (dstPtrInfo._acc.get_dev_ptr()); + copyDevice = static_cast (dstPtrInfo._acc.get_dev_ptr()); } else { copyDevice = nullptr; // H2D } @@ -5384,7 +5386,7 @@ HSACopy::syncCopy() { // 
---------------------------------------------------------------------- extern "C" void *GetContextImpl() { - return &Kalmar::ctx; + return &detail::ctx; } // op printer diff --git a/lib/hsa/unpinned_copy_engine.cpp b/lib/hsa/unpinned_copy_engine.cpp index 183baea5759..5ee131ed6d8 100644 --- a/lib/hsa/unpinned_copy_engine.cpp +++ b/lib/hsa/unpinned_copy_engine.cpp @@ -27,7 +27,7 @@ THE SOFTWARE. #include "unpinned_copy_engine.h" #include "hc_rt_debug.h" -#define THROW_ERROR(err, hsaErr) { hc::print_backtrace(); throw (Kalmar::runtime_exception("HCC unpinned copy engine error", hsaErr)); } +#define THROW_ERROR(err, hsaErr) { hc::print_backtrace(); throw (detail::runtime_exception("HCC unpinned copy engine error", hsaErr)); } void errorCheck(hsa_status_t hsa_error_code, int line_num, std::string str) { if ((hsa_error_code != HSA_STATUS_SUCCESS)&& (hsa_error_code != HSA_STATUS_INFO_BREAK)) { diff --git a/lib/mcwamp.cpp b/lib/mcwamp.cpp index 56c4844db9b..00803c08da8 100644 --- a/lib/mcwamp.cpp +++ b/lib/mcwamp.cpp @@ -68,7 +68,7 @@ struct RuntimeImpl { bool isCPU; }; -namespace Kalmar { +namespace detail { namespace CLAMP { //////////////////////////////////////////////////////////// @@ -266,7 +266,7 @@ static inline uint64_t Read8byteIntegerFromBuffer(const char *data, size_t pos) // Returns true if a compatible code object is found, and returns its size and // pointer to the code object. Returns false in case no compatible code object // is found. -inline bool DetermineAndGetProgram(KalmarQueue* pQueue, size_t* kernel_size, void** kernel_source) { +inline bool DetermineAndGetProgram(HCCQueue* pQueue, size_t* kernel_size, void** kernel_source) { bool FoundCompatibleKernel = false; @@ -328,7 +328,7 @@ inline bool DetermineAndGetProgram(KalmarQueue* pQueue, size_t* kernel_size, voi // only check bundles with HCC triple prefix string if (Triple.compare(0, HCC_TRIPLE_PREFIX_LENGTH, HCC_TRIPLE_PREFIX) == 0) { - // use KalmarDevice::IsCompatibleKernel to check + // use HCCDevice::IsCompatibleKernel to check size_t SizeST = (size_t)Size; void *Content = (unsigned char *)data + Offset; if (pQueue->getDev()->IsCompatibleKernel((void*)SizeST, Content)) { @@ -343,7 +343,7 @@ inline bool DetermineAndGetProgram(KalmarQueue* pQueue, size_t* kernel_size, voi return FoundCompatibleKernel; } -void LoadInMemoryProgram(KalmarQueue* pQueue) { +void LoadInMemoryProgram(HCCQueue* pQueue) { size_t kernel_size = 0; void* kernel_source = nullptr; @@ -356,25 +356,26 @@ void LoadInMemoryProgram(KalmarQueue* pQueue) { // used in parallel_for_each.h void* CreateKernel( const char* name, - KalmarQueue* pQueue, - const void* callable, + HCCQueue* pQueue, + std::unique_ptr callable, std::size_t callable_size) { // TODO - should create a HSAQueue:: CreateKernel member function that creates and returns a dispatch. 
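
DetermineAndGetProgram above walks a clang-offload-bundler blob; for reference, the layout it decodes is the documented bundle format: a magic string, an 8-byte entry count, then per entry an 8-byte code-object offset, an 8-byte size, an 8-byte triple length, and the triple itself. A standalone sketch under those assumptions (helper names are hypothetical; the caller would still run IsCompatibleKernel on the returned blob, like the code above):

#include <cstdint>
#include <cstring>
#include <string>

// Unaligned-safe read, mirroring Read8byteIntegerFromBuffer above.
static std::uint64_t read_u64(const char* data, std::size_t pos)
{
    std::uint64_t v;
    std::memcpy(&v, data + pos, sizeof(v));
    return v;
}

// Return the first bundle entry whose triple starts with want_prefix
// (the HCC_TRIPLE_PREFIX check above plays this role).
bool find_bundle_entry(const char* data, const std::string& want_prefix,
                       const char** blob, std::size_t* blob_size)
{
    static const char magic[] = "__CLANG_OFFLOAD_BUNDLE__";
    std::size_t pos = sizeof(magic) - 1; // skip the magic string

    const std::uint64_t entries = read_u64(data, pos); pos += 8;
    for (std::uint64_t i = 0; i != entries; ++i) {
        const std::uint64_t off  = read_u64(data, pos); pos += 8;
        const std::uint64_t size = read_u64(data, pos); pos += 8;
        const std::uint64_t tlen = read_u64(data, pos); pos += 8;
        const std::string triple{data + pos, static_cast<std::size_t>(tlen)};
        pos += tlen;

        if (triple.compare(0, want_prefix.size(), want_prefix) == 0) {
            *blob = data + off;
            *blob_size = size;
            return true;
        }
    }
    return false;
}
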
- return pQueue->getDev()->CreateKernel(name, pQueue, callable, callable_size); + return pQueue->getDev()->CreateKernel( + name, pQueue, std::move(callable), callable_size); } } // namespace CLAMP -KalmarContext *getContext() { - return static_cast(CLAMP::GetOrInitRuntime()->m_GetContextImpl()); +HCCContext *getContext() { + return static_cast(CLAMP::GetOrInitRuntime()->m_GetContextImpl()); } -// Kalmar runtime bootstrap logic -class KalmarBootstrap { +// detail runtime bootstrap logic +class HCCBootstrap { private: RuntimeImpl* runtime; public: - KalmarBootstrap() : runtime(nullptr) { + HCCBootstrap() : runtime(nullptr) { bool to_init = true; char* lazyinit_env = getenv("HCC_LAZYINIT"); if (lazyinit_env != nullptr) { @@ -390,15 +391,15 @@ class KalmarBootstrap { runtime = CLAMP::GetOrInitRuntime(); // get context - KalmarContext* context = static_cast(runtime->m_GetContextImpl()); + HCCContext* context = static_cast(runtime->m_GetContextImpl()); - const std::vector devices = context->getDevices(); + const std::vector devices = context->getDevices(); // load kernels on the default queue for each device for (auto dev = devices.begin(); dev != devices.end(); dev++) { // get default queue on the device - std::shared_ptr queue = (*dev)->get_default_queue(); + std::shared_ptr queue = (*dev)->get_default_queue(); // load kernels on the default queue for the device CLAMP::LoadInMemoryProgram(queue.get()); @@ -407,11 +408,11 @@ class KalmarBootstrap { } }; -} // namespace Kalmar +} // namespace detail extern "C" void __attribute__((constructor)) __hcc_shared_library_init() { // this would initialize kernels when the shared library get loaded - static Kalmar::KalmarBootstrap boot; + static detail::HCCBootstrap boot; } extern "C" void __attribute__((destructor)) __hcc_shared_library_fini() { From b12b5009206fc69dc7f93b74bfa411ffc6a6de54 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sun, 5 Aug 2018 18:23:12 +0300 Subject: [PATCH 003/134] No Kalmars allowed. 
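
For context on the class renamed by this patch: the load-time initialization that HCCBootstrap performs reduces to the shared-library-constructor idiom sketched below, honouring the HCC_LAZYINIT opt-out seen above. Names here are illustrative stand-ins, not the runtime's actual symbols.

#include <cstdlib>
#include <cstring>

// Hypothetical stand-in for GetOrInitRuntime() + LoadInMemoryProgram() above.
static void init_runtime_and_preload_kernels()
{
    // ... initialize the runtime, then pre-load kernels on each device's
    // default queue, as the bootstrap constructor does ...
}

struct Bootstrap {
    Bootstrap()
    {
        // An HCC_LAZYINIT-style switch defers runtime construction to first use.
        const char* lazy = std::getenv("HCC_LAZYINIT");
        if (lazy && std::strcmp(lazy, "ON") == 0) return;
        init_runtime_and_preload_kernels();
    }
};

extern "C" __attribute__((constructor)) void example_library_init()
{
    static Bootstrap boot; // constructed exactly once, at library load time
}
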
--- lib/hsa/mcwamp_hsa.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index b1eabd3032e..815369e50d2 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -1850,7 +1850,7 @@ class HSAQueue final : public HCCQueue STATUS_CHECK(status, __LINE__); sync_copy(data, *static_cast(getHostAgent()), ((char*)device) + offset, *agent, count); } else { - throw Kalmar::runtime_exception("host buffer allocation failed!", 0); + throw detail::runtime_exception("host buffer allocation failed!", 0); } return data; } else { @@ -2952,7 +2952,7 @@ class HSADevice final : public HCCDevice } } } else { - throw Kalmar::runtime_exception("HSA executable NOT built yet!", 0); + throw detail::runtime_exception("HSA executable NOT built yet!", 0); } return symbol_ptr; @@ -2977,7 +2977,7 @@ class HSADevice final : public HCCDevice STATUS_CHECK(status, __LINE__); } } else { - throw Kalmar::runtime_exception("HSA executable NOT built yet!", 0); + throw detail::runtime_exception("HSA executable NOT built yet!", 0); } } @@ -2987,7 +2987,7 @@ class HSADevice final : public HCCDevice unsigned long* symbol_ptr = (unsigned long*)getSymbolAddress(symbolName); memcpySymbol(symbol_ptr, hostptr, count, offset, kind); } else { - throw Kalmar::runtime_exception("HSA executable NOT built yet!", 0); + throw detail::runtime_exception("HSA executable NOT built yet!", 0); } } @@ -4400,7 +4400,7 @@ hsa_status_t HSADispatch::setLaunchConfiguration( int dynamicGroupSize) { assert((0 < dims) && (dims <= 3)); - DBOUT(DB_MISC, "static group segment size: " << kernel->static_group_segment_size + DBOUT(DB_MISC, "static group segment size: " << kernel_->static_group_segment_size << " dynamic group segment size: " << dynamicGroupSize << "\n"); // Set group dims From c8847a5e70a3ee91541343df28af0cd58f0d0d48 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 6 Aug 2018 13:46:09 +0300 Subject: [PATCH 004/134] Partial, WiP support for array_view. --- include/hc.hpp | 649 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 426 insertions(+), 223 deletions(-) diff --git a/include/hc.hpp b/include/hc.hpp index 5031a90cf84..27f573791af 100644 --- a/include/hc.hpp +++ b/include/hc.hpp @@ -37,7 +37,10 @@ #include #include #include +#include +#include #include +#include #include #ifndef __HC__ @@ -3416,7 +3419,8 @@ class tiled_index { * an index. The implicit conversion converts to the .global index * member. */ - operator const index<3>() const __CPU__ __HC__ { + operator index<3>() const [[cpu, hc]] + { return global; } @@ -3905,17 +3909,9 @@ void copy(const array &src, OutputIter destBegin); * @tparam T The element type of this array * @tparam N The dimensionality of the array, defaults to 1 if elided. */ -template -class array { - static_assert(!std::is_const{}, "array is not supported"); - static_assert( - std::is_trivially_copyable{}, - "Only trivially copyable types are supported."); - static_assert( - std::is_trivially_destructible{}, - "Only trivially destructible types are supported."); - +struct array_base{ struct Deleter { + template void operator()(T* ptr) { // TODO: this may throw in a dtor, which is bad. if (hsa_memory_free(ptr) != HSA_STATUS_SUCCESS) { @@ -3923,10 +3919,20 @@ class array { } } }; - using Guarded_locked_ptr = std::pair; + using Guarded_locked_ptr = std::pair; inline static constexpr std::size_t max_array_cnt_{65521u}; // Prime. 
inline static std::array locked_ptrs_{}; +}; +template +class array : private array_base { + static_assert(!std::is_const{}, "array is not supported"); + static_assert( + std::is_trivially_copyable{}, + "Only trivially copyable types are supported."); + static_assert( + std::is_trivially_destructible{}, + "Only trivially destructible types are supported."); accelerator_view owner_; accelerator_view associate_; @@ -3990,11 +3996,11 @@ class array { return n; } while (true); // TODO: add termination after a number of attempts. } - array* this_() const [[hc]] + array* const this_() const [[hc]] { const auto n = reinterpret_cast(this) % max_array_cnt_; - return locked_ptrs_[n].second; + return static_cast(locked_ptrs_[n].second); } public: /** @@ -5266,17 +5272,74 @@ class array { * over native CPU data. It exposes an indexing interface congruent to that of * array. */ +struct array_view_base { + inline static constexpr std::size_t max_array_view_cnt_{65536}; + + inline static std::mutex mutex_; // TODO: use shared_mutex if C++17 feasible + inline static std::unordered_map> cache_{}; + + static + const std::shared_ptr& cache_for_(void* ptr, std::size_t byte_cnt) + { + std::lock_guard lck{mutex_}; + + const auto it = cache_.find(ptr); + + if (it != cache_.cend()) return it->second; + + static const accelerator acc{}; + + void* tmp{nullptr}; + auto s = hsa_memory_allocate( + *static_cast(acc.get_hsa_am_system_region()), + byte_cnt, + &tmp); + + if (s != HSA_STATUS_SUCCESS) { + throw std::runtime_error{"Failed cache allocation for array_view."}; + } + + return cache_.emplace( + std::piecewise_construct, + std::make_tuple(ptr), + std::make_tuple(tmp, hsa_memory_free)).first->second; + } +}; + template -class array_view -{ -public: - typedef typename std::remove_const::type nc_T; -#if __HCC_ACCELERATOR__ == 1 - typedef detail::_data acc_buffer_t; -#else - typedef detail::_data_host acc_buffer_t; -#endif +class array_view : private array_view_base { + static_assert( + std::is_trivially_copyable{}, + "Only trivially copyable types are supported."); + static_assert( + std::is_trivially_destructible{}, + "Only trivially destructible types are supported."); + + std::shared_ptr data_; + accelerator_view owner_; + hc::extent extent_; + T* base_ptr_; + void* source_; + + template friend class array; + template friend class array_view; + template + friend + void copy(const array&, const array_view&); + template + friend + void copy(InputIter, InputIter, const array_view&); + template + friend + void copy(const array_view&, array&); + template + friend + void copy(const array_view&, OutputIter); + template + friend + void copy(const array_view&, const array_view&); +public: /** * The rank of this array. */ @@ -5302,13 +5365,22 @@ class array_view */ array_view(hc::array& src) [[cpu, hc]] : array_view{src.get_extent(), src.data()} - {} + { // TODO: refactor to pass owner directly to delegated to ctor. 
+ owner_ = src.get_accelerator_view(); + } - // FIXME: following interfaces were not implemented yet - // template - // explicit array_view::array_view(Container& src); - // template - // explicit array_view::array_view(value_type (&src) [Size]) __CPU__ __HC__; + template< + typename Container, + typename std::enable_if< + N == 1 && __is_container::value>::type* = nullptr> + explicit + array_view(Container& src) : array_view{hc::extent<1>(src.size()), src} + {} + template + explicit + array_view(value_type (&src)[m]) [[cpu, hc]] + : array_view{hc::extent<1>{m}, src} + {} /** * Constructs an array_view which is bound to the data contained in the @@ -5320,10 +5392,17 @@ class array_view * as std::vector or std::array) * @param[in] extent The extent of this array_view. */ - template ::value>::type> - array_view(const hc::extent& extent, Container& src) - : array_view(extent, src.data()) - { static_assert( std::is_same::value, "container element type and array view element type must match"); } + template< // TODO: redo the type predicates. + typename Container, + typename std::enable_if< + __is_container::value>::type* = nullptr> + array_view(const hc::extent& extent, Container& src) + : array_view{extent, src.data()} + { + static_assert( + std::is_same::value, + "container element type and array view element type must match"); + } /** * Constructs an array_view which is bound to the data contained in the @@ -5335,12 +5414,17 @@ class array_view * size of extent, the behavior is undefined. * @param[in] ext The extent of this array_view. */ - array_view(const hc::extent& ext, value_type* src) __CPU__ __HC__ -#if __HCC_ACCELERATOR__ == 1 - : cache((T *)(src)), extent(ext), extent_base(ext), offset(0) {} -#else - : cache(ext.size(), (T *)(src)), extent(ext), extent_base(ext), offset(0) {} -#endif + array_view(const hc::extent& ext, value_type* src) [[cpu]] + : + data_{cache_for_(src, ext.size() * sizeof(T))}, + owner_{accelerator{L"cpu"}.get_default_view()}, + extent_{ext}, + base_ptr_{static_cast(data_.get())}, + source_{src} + {} + array_view(const hc::extent& ext, value_type* src) [[hc]] + : data_{nullptr, [](void*){}}, extent_{ext}, base_ptr_{src} + {} /** * Constructs an array_view which is not bound to a data source. The extent @@ -5352,8 +5436,10 @@ class array_view * * @param[in] ext The extent of this array_view. 
*/ - explicit array_view(const hc::extent& ext) - : cache(ext.size()), extent(ext), extent_base(ext), offset(0) {} + explicit + array_view(const hc::extent& ext) + : array_view{ext, reinterpret_cast(this)} + {} /** * Equivalent to construction using @@ -5365,15 +5451,27 @@ class array_view * container that supports .data() and .size() members (such * as std::vector or std::array) */ - template ::value>::type> - array_view(int e0, Container& src) - : array_view(hc::extent(e0), src) {} - template ::value>::type> - array_view(int e0, int e1, Container& src) - : array_view(hc::extent(e0, e1), src) {} - template ::value>::type> - array_view(int e0, int e1, int e2, Container& src) - : array_view(hc::extent(e0, e1, e2), src) {} + template< + typename Container, + typename std::enable_if< + N == 1 && __is_container::value>::type* = nullptr> + array_view(int e0, Container& src) + : array_view{hc::extent{e0}, src} + {} + template< + typename Container, + typename std::enable_if< + N == 2 && __is_container::value>::type* = nullptr> + array_view(int e0, int e1, Container& src) + : array_view{hc::extent{e0, e1}, src} + {} + template< + typename Container, + typename std::enable_if< + N == 3 && __is_container::value>::type* = nullptr> + array_view(int e0, int e1, int e2, Container& src) + : array_view{hc::extent{e0, e1, e2}, src} + {} /** * Equivalent to construction using @@ -5385,12 +5483,18 @@ class array_view * to. If the number of elements pointed to is less than * the size of extent, the behavior is undefined. */ - array_view(int e0, value_type *src) __CPU__ __HC__ - : array_view(hc::extent(e0), src) {} - array_view(int e0, int e1, value_type *src) __CPU__ __HC__ - : array_view(hc::extent(e0, e1), src) {} - array_view(int e0, int e1, int e2, value_type *src) __CPU__ __HC__ - : array_view(hc::extent(e0, e1, e2), src) {} + template::type* = nullptr> + array_view(int e0, value_type *src) [[cpu, hc]] + : array_view{hc::extent{e0}, src} + {} + template::type* = nullptr> + array_view(int e0, int e1, value_type *src) [[cpu, hc]] + : array_view{hc::extent{e0, e1}, src} + {} + template::type* = nullptr> + array_view(int e0, int e1, int e2, value_type *src) [[cpu, hc]] + : array_view{hc::extent{e0, e1, e2}, src} + {} /** * Equivalent to construction using @@ -5399,11 +5503,17 @@ class array_view * @param[in] e0,e1,e2 The component values that will form the extent of * this array_view. */ - explicit array_view(int e0) : array_view(hc::extent(e0)) {} - explicit array_view(int e0, int e1) - : array_view(hc::extent(e0, e1)) {} - explicit array_view(int e0, int e1, int e2) - : array_view(hc::extent(e0, e1, e2)) {} + template::type* = nullptr> + explicit + array_view(int e0) : array_view{hc::extent{e0}} + {} + template::type* = nullptr> + array_view(int e0, int e1) : array_view{hc::extent{e0, e1}} + {} + template::type* = nullptr> + array_view(int e0, int e1, int e2) + : array_view{hc::extent{e0, e1, e2}} + {} /** * Copy constructor. Constructs an array_view from the supplied argument @@ -5413,13 +5523,35 @@ class array_view * array_view from which to initialize this * new array_view. */ - array_view(const array_view& other) __CPU__ __HC__ - : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {} + array_view(const array_view& other) [[cpu, hc]] = default; + + /** + * Move constructor. Constructs an array_view from the supplied argument + * other. 
+ * + * @param[in] other An object of type array_view or + * array_view from which to initialize this + * new array_view. + */ + array_view(array_view&& other) [[cpu, hc]] + : + data_{std::move(other.data_)}, + owner_{std::move(other.owner_)}, + extent_{std::move(other.extent_)}, + base_ptr_{other.base_ptr_}, + source_{other.source_} + { + other.base_ptr_ = nullptr; + other.source_ = nullptr; + } /** * Access the extent that defines the shape of this array_view. */ - hc::extent get_extent() const __CPU__ __HC__ { return extent; } + hc::extent get_extent() const [[cpu, hc]] + { + return extent_; + } /** * Access the accelerator_view where the data source of the array_view is @@ -5430,7 +5562,10 @@ class array_view * data source underlying the array_view is an array, the method returns * the accelerator_view where the source array is located. */ - accelerator_view get_source_accelerator_view() const { return cache.get_av(); } + accelerator_view get_source_accelerator_view() const + { + return owner_; + } /** * Assigns the contents of the array_view "other" to this array_view, using @@ -5440,17 +5575,23 @@ class array_view * into this array. * @return Returns *this. */ - array_view& operator=(const array_view& other) __CPU__ __HC__ { - if (this != &other) { - cache = other.cache; - extent = other.extent; - index_base = other.index_base; - extent_base = other.extent_base; - offset = other.offset; - } + array_view& operator=(const array_view& other) [[cpu, hc]] = default; + + /** + * Moves the contents of the array_view "other" to this array_view, leaving + * "other" in a moved-from state. + * + * @param[in] other An object of type array_view from which to move + * into this array. + * @return Returns *this. + */ + array_view& operator=(array_view&& other) + { + using std::swap; + swap(*this, other); + return *this; } - /** * Copies the data referred to by this array_view to the array given by * "dest", as if by calling "copy(*this, dest)" @@ -5458,14 +5599,8 @@ class array_view * @param[in] dest An object of type array to which to copy data from * this array. */ - void copy_to(array& dest) const { -#if __HCC_ACCELERATOR__ != 1 - for(int i= 0 ;i< N;i++) - { - if (dest.get_extent()[i] < this->extent[i]) - throw runtime_exception{"errorMsg_throw", 0}; - } -#endif + void copy_to(array& dest) const + { copy(*this, dest); } @@ -5476,7 +5611,10 @@ class array_view * @param[in] dest An object of type array_view to which to copy data * from this array. */ - void copy_to(const array_view& dest) const { copy(*this, dest); } + void copy_to(const array_view& dest) const + { + copy(*this, dest); + } /** * Returns a pointer to the first data element underlying this array_view. @@ -5489,17 +5627,16 @@ class array_view * view is created without a data source, the pointer returned by data() in * CPU context is ephemeral and is invalidated when the original data * source or any of its views are accessed on an accelerator_view through a - * parallel_for_each or a copy operation. + * parallel_for_each or a copy operation. * * @return A pointer to the first element in the linearised array. 
*/ - T* data() const __CPU__ __HC__ { + T* data() const [[cpu, hc]] + { + static_assert( + N == 1, "data() is only permissible on array views of rank 1"); -#if __HCC_ACCELERATOR__ != 1 - cache.get_cpu_access(true); -#endif - static_assert(N == 1, "data() is only permissible on array views of rank 1"); - return reinterpret_cast(cache.get() + offset + index_base[0]); + return base_ptr_; } /** @@ -5508,8 +5645,9 @@ class array_view * @return A (const) pointer to the first element in the array_view on the * device memory. */ - T* accelerator_pointer() const __CPU__ __HC__ { - return reinterpret_cast(cache.get_device_pointer() + offset + index_base[0]); + T* accelerator_pointer() const [[cpu, hc]] // TODO: this should also be removed. + { + return data(); } /** @@ -5517,7 +5655,18 @@ class array_view * memory has been modified outside the array_view interface. This will * render all cached information stale. */ - void refresh() const { cache.refresh(); } + void refresh() const + { + static const auto cpu_av = accelerator{L"cpu"}.get_default_view(); + + if (owner_ == cpu_av) return; + + auto s = hsa_memory_copy( + base_ptr_, source_, extent_.size() * sizeof(T)); + if (s != HSA_STATUS_SUCCESS) { + throw std::runtime_error{"Failed to refresh cache for array_view."}; + } + } /** * Calling this member function synchronizes any modifications made to the @@ -5552,8 +5701,20 @@ class array_view * type of access on the data source that the array_view is * synchronized for. */ - // FIXME: type parameter is not implemented - void synchronize() const { cache.get_cpu_access(); } + void synchronize(access_type type = access_type_read) const + { + static const auto cpu_av = accelerator{L"cpu"}.get_default_view(); + + if (owner_ == cpu_av) return; + if (type == access_type_none || type == access_type_write) return; + + auto s = hsa_memory_copy( + source_, base_ptr_, extent_.size() * sizeof(T)); + + if (s == HSA_STATUS_SUCCESS) return; + + throw std::runtime_error{"Failed to synchronise array_view."}; + } /** * An asynchronous version of synchronize, which returns a completion @@ -5565,10 +5726,13 @@ class array_view * used to chain other operations to be executed after the * completion of the asynchronous operation. */ - // FIXME: type parameter is not implemented - completion_future synchronize_async() const { - std::future fut = std::async([&]() mutable { synchronize(); }); - return completion_future(fut.share()); + completion_future synchronize_async( + access_type type = access_type_read) const + { + if (type == access_type_none || type == access_type_write) return {}; + + return completion_future{ + std::async([this]() { synchronize(); }).share()}; } /** @@ -5604,9 +5768,10 @@ class array_view * type of access on the data source that the array_view is * synchronized for. */ - // FIXME: type parameter is not implemented - void synchronize_to(const accelerator_view& av) const [[cpu]] { - cache.sync_to(av.pQueue); + void synchronize_to( + const accelerator_view& av, access_type type = access_type_read) const + { + if (av != owner_) synchronize(type); } /** @@ -5624,8 +5789,13 @@ class array_view * used to chain other operations to be executed after the * completion of the asynchronous operation. 
*/ - // FIXME: this method is not implemented yet - completion_future synchronize_to_async(const accelerator_view& av) const; + completion_future synchronize_to_async( + const accelerator_view& av, access_type type = access_type_read) const + { + if (type == access_type_none || type == access_type_write) return {}; + + if (av != owner_) return synchronize_async(type); + } /** * Indicates to the runtime that it may discard the current logical @@ -5634,10 +5804,9 @@ class array_view * accelerator_view, and its use is recommended if the existing content is * not needed. */ - void discard_data() const { -#if __HCC_ACCELERATOR__ != 1 - cache.discard(); -#endif + void discard_data() const + { + // Since we use system coarse grained, this is a NOP. } /** @{ */ @@ -5648,16 +5817,26 @@ class array_view * @param[in] idx An object of type index that specifies the location of * the element. */ - T& operator[] (const index& idx) const __CPU__ __HC__ { -#if __HCC_ACCELERATOR__ != 1 - cache.get_cpu_access(true); -#endif - T *ptr = reinterpret_cast(cache.get() + offset); - return ptr[detail::amp_helper, hc::extent>::flatten(idx + index_base, extent_base)]; + T& operator[](const index& idx) const [[cpu]] + { + return data()[detail::amp_helper, hc::extent>:: + flatten(idx, extent_)]; + } + T& operator[](const index& idx) const [[hc]] + { + return data()[detail::amp_helper, hc::extent>:: + flatten(idx, extent_)]; + } + template::type* = nullptr> + T& operator[](int i0) const [[cpu]][[hc]] + { + return operator[](index<1>{i0}); } - T& operator()(const index& idx) const __CPU__ __HC__ { - return (*this)[idx]; + + T& operator()(const index& idx) const [[cpu, hc]] + { + return operator[](idx); } /** @} */ @@ -5673,8 +5852,11 @@ class array_view * responsible to explicitly synchronize the array_view to the CPU before * calling this method. Failure to do so results in undefined behavior. */ - // FIXME: this method is not implemented - T& get_ref(const index& idx) const __CPU__ __HC__; + T& get_ref(const index& idx) const [[cpu, hc]] + { + return base_ptr_[detail::amp_helper, hc::extent>:: + flatten(idx, extent_)]; + } /** @{ */ /** @@ -5684,13 +5866,32 @@ class array_view * @param[in] i0,i1,i2 The component values that will form the index into * this array. */ - T& operator() (int i0, int i1) const __CPU__ __HC__ { - static_assert(N == 2, "T& array_view::operator()(int,int) is only permissible on array_view"); - return (*this)[index<2>(i0, i1)]; + T& operator()(int i0) const [[cpu, hc]] + { + static_assert( + N == 1, + "T& array_view::operator()(int) is only permissible on " + "array_view"); + + return operator[](index<1>{i0}); } - T& operator() (int i0, int i1, int i2) const __CPU__ __HC__ { - static_assert(N == 3, "T& array_view::operator()(int,int, int) is only permissible on array_view"); - return (*this)[index<3>(i0, i1, i2)]; + T& operator()(int i0, int i1) const [[cpu, hc]] + { + static_assert( + N == 2, + "T& array_view::operator()(int, int) is only permissible on " + "array_view"); + + return operator[](index<2>{i0, i1}); + } + T& operator()(int i0, int i1, int i2) const [[cpu, hc]] + { + static_assert( + N == 3, + "T& array_view::operator()(int, int, int) is only permissible on " + "array_view"); + + return operator[](index<3>{i0, i1, i2}); } /** @} */ @@ -5714,13 +5915,24 @@ class array_view * @return Returns an array_view whose dimension is one lower than that of * this array_view. 
*/ - typename projection_helper::result_type - operator[] (int i) const __CPU__ __HC__ { - return projection_helper::project(*this, i); - } - typename projection_helper::result_type - operator() (int i0) const __CPU__ __HC__ { return (*this)[i0]; } + template 1)>::type* = nullptr> + array_view operator[](int i0) const [[cpu, hc]] + { + hc::extent ext; + for (auto i = 1; i != N; ++i) ext[i - 1] = extent_[i]; + + array_view tmp{ext, static_cast(source_)}; // TODO: this is incorrect. + tmp.base_ptr_ += i0 * ext.size(); + tmp.source_ += i0 * ext.size(); + return tmp; + } + + template 1)>::type* = nullptr> + array_view operator()(int i0) const [[cpu, hc]] + { + return operator[](i0); + } /** @} */ /** @@ -5741,31 +5953,35 @@ class array_view * @return Returns a subsection of the source array at specified origin, * and with the specified extent. */ - array_view section(const index& idx, - const hc::extent& ext) const __CPU__ __HC__ { -#if __HCC_ACCELERATOR__ != 1 - if ( !detail::amp_helper, hc::extent>::contains(idx, ext,this->extent ) ) - throw runtime_exception{"errorMsg_throw", 0}; -#endif - array_view av(cache, ext, extent_base, idx + index_base, offset); - return av; + array_view section( + const index& idx, const hc::extent& ext) const [[cpu]] + { + // if (!detail::amp_helper, hc::extent>::contains(idx, ext, extent_)) + // throw runtime_exception{"errorMsg_throw", 0}; + + // array_view av(cache, ext, extent_base, idx + index_base, offset); + + // return av; + return *this; } /** * Equivalent to "section(idx, this->extent – idx)". */ - array_view section(const index& idx) const __CPU__ __HC__ { - hc::extent ext(extent); + array_view section(const index& idx) const [[cpu, hc]] + { + hc::extent ext{extent_}; detail::amp_helper, hc::extent>::minus(idx, ext); + return section(idx, ext); } /** * Equivalent to "section(index(), ext)". */ - array_view section(const hc::extent& ext) const __CPU__ __HC__ { - index idx; - return section(idx, ext); + array_view section(const hc::extent& ext) const [[cpu, hc]] + { + return section(index{}, ext); } /** @{ */ @@ -5778,19 +5994,26 @@ class array_view * @param[in] e0,e1,e2 The component values that will form the extent of * the section */ - array_view section(int i0, int e0) const __CPU__ __HC__ { - static_assert(N == 1, "Rank must be 1"); - return section(index<1>(i0), hc::extent<1>(e0)); + array_view section(int i0, int e0) const [[cpu, hc]] + { + static_assert(N == 1, "Rank must be 1."); + + return section(index<1>{i0}, hc::extent<1>{e0}); } - array_view section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ { - static_assert(N == 2, "Rank must be 2"); - return section(index<2>(i0, i1), hc::extent<2>(e0, e1)); + array_view section(int i0, int i1, int e0, int e1) const [[cpu, hc]] + { + static_assert(N == 2, "Rank must be 2."); + + return section(index<2>{i0, i1}, hc::extent<2>{e0, e1}); } - array_view section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ { - static_assert(N == 3, "Rank must be 3"); - return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2)); + array_view section( + int i0, int i1, int i2, int e0, int e1, int e2) const [[cpu, hc]] + { + static_assert(N == 3, "Rank must be 3."); + + return section(index<3>{i0, i1, i2}, hc::extent<3>{e0, e1, e2}); } /** @} */ @@ -5806,23 +6029,34 @@ class array_view * @return Returns an array_view from this array_view with the element * type reinterpreted from T to ElementType. 
*/ - template - array_view reinterpret_as() const __CPU__ __HC__ { - static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1"); -#if __HCC_ACCELERATOR__ != 1 - static_assert( ! (std::is_pointer::value ),"can't use pointer in the kernel"); - static_assert( ! (std::is_same::value ),"can't use short in the kernel"); - if ( (extent.size() * sizeof(T)) % sizeof(ElementType)) - throw runtime_exception{"errorMsg_throw", 0}; -#endif - int size = extent.size() * sizeof(T) / sizeof(ElementType); - using buffer_type = typename array_view::acc_buffer_t; - array_view av(buffer_type(cache), - extent<1>(size), - (offset + index_base[0])* sizeof(T) / sizeof(ElementType)); - return av; + template + array_view reinterpret_as() const [[cpu]] + { + static_assert( + N == 1, + "reinterpret_as is only permissible on array views of rank 1."); + + hc::extent<1> tmp{extent_.size() / sizeof(U)}; + + if (extent_.size() * sizeof(T) != tmp.size() * sizeof(U)) { + throw runtime_exception{"errorMsg_throw", 0}; } + if (source_) return array_view{tmp, source_}; + return array_view{tmp}; + } + template + array_view reinterpret_as() const [[hc]] + { + static_assert( + N == 1, + "reinterpret_as is only permissible on array views of rank 1."); + + hc::extent<1> tmp{extent_.size() / sizeof(U)}; + + return array_view{tmp, base_ptr_}; + } + /** * This member function is similar to "array::view_as", although it * only supports array_views of rank 1 (only those guarantee that all @@ -5831,68 +6065,37 @@ class array_view * @return Returns an array_view from this array_view with the rank * changed to K from 1. */ - template - array_view view_as(hc::extent viewExtent) const __CPU__ __HC__ { - static_assert(N == 1, "view_as is only permissible on array views of rank 1"); -#if __HCC_ACCELERATOR__ != 1 - if ( viewExtent.size() > extent.size()) - throw runtime_exception{"errorMsg_throw", 0}; -#endif - array_view av(cache, viewExtent, offset + index_base[0]); - return av; - } - - ~array_view() __CPU__ __HC__ = default; - - // FIXME: the following functions could be considered to move to private - const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; } - - int get_offset() const __CPU__ __HC__ { return offset; } + template + array_view view_as(const hc::extent& view_extent) const [[cpu]] + { + static_assert( + N == 1, "view_as is only permissible on array views of rank 1"); - index get_index_base() const __CPU__ __HC__ { return index_base; } + if (extent_.size() < view_extent.size()) { + throw runtime_exception{"errorMsg_throw", 0}; + } -private: - template friend struct projection_helper; - template friend struct array_projection_helper; - template friend class array; - template friend class array_view; + return array_view{view_extent, source_}; + } + template + array_view view_as(const hc::extent& view_extent) const [[hc]] + { + static_assert( + N == 1, "view_as is only permissible on array views of rank 1"); - template - friend - bool is_flat(const array_view&) noexcept; - template - friend - void copy(const array&, const array_view&); - template - friend - void copy(InputIter, InputIter, const array_view&); - template - friend - void copy(const array_view&, array&); - template - friend - void copy(const array_view&, OutputIter); - template - friend - void copy(const array_view&, const array_view&); + return array_view{view_extent, source_}; + } - // used by view_as and reinterpret_as - array_view(const acc_buffer_t& cache, const hc::extent& ext, - int offset) __CPU__ __HC__ - : cache(cache), 
extent(ext), extent_base(ext), offset(offset) {} + ~array_view() [[cpu]][[hc]] + { + #if __HCC_ACCELERATOR__ != 1 + synchronize(access_type_read_write); - // used by section and projection - array_view(const acc_buffer_t& cache, const hc::extent& ext_now, - const hc::extent& ext_b, - const index& idx_b, int off) __CPU__ __HC__ - : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), - offset(off) {} + std::lock_guard lck{mutex_}; - acc_buffer_t cache; - hc::extent extent; - hc::extent extent_base; - index index_base; - int offset; + if (data_.use_count() == 2) cache_.erase(source_); + #endif + } }; // ------------------------------------------------------------------------ From 73484e2d14ec7f43c590c00ecb5e213def65e133 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sun, 26 Aug 2018 23:16:09 +0100 Subject: [PATCH 005/134] Switch tests to use HC exclusively, pending C++AMP removal. --- .../AcceleratorViewCopy/avstress_0x18.cpp | 4 +- .../AcceleratorViewCopy/avstress_0xFF.cpp | 4 +- .../Negative/Test.04/test.cpp | 2 +- .../Overloading/Overloading.01/test.cpp | 6 +- .../Overloading/Overloading.04/test.cpp | 2 +- .../Overloading/Overloading.09/test.cpp | 2 +- .../Overloading/Overloading.12/test.cpp | 2 +- .../Overloading/Overloading.15/test.cpp | 2 +- .../Overloading/Overloading.66/test.cpp | 2 +- .../Overloading/Overloading.68/test.cpp | 2 +- .../Overloading/Overloading.69/test.cpp | 2 +- .../Overloading/Overloading.71/test.cpp | 2 +- .../Overloading/Overloading.72/test.cpp | 2 +- .../Overloading/Overloading.73/test.cpp | 2 +- .../typeid_operator/Test.01/test.cpp | 4 +- .../typeid_operator/Test.02/test.cpp | 4 +- tests/Unit/AMDGPU/ballot.cpp | 46 +++--- .../AcceleratorViewCopy/avcopy_classic.cpp | 4 +- .../AcceleratorViewCopy/copy_coherency.cpp | 2 +- .../AcceleratorViewCopy/copy_coherency2.cpp | 2 +- tests/Unit/AmpMath/amp_math_acos.cpp | 8 +- .../AmpMath/amp_math_acos_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_acosf.cpp | 8 +- .../AmpMath/amp_math_acosh_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_asin.cpp | 8 +- .../AmpMath/amp_math_asin_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_asinf.cpp | 8 +- .../AmpMath/amp_math_asinh_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_atan.cpp | 8 +- tests/Unit/AmpMath/amp_math_atan2.cpp | 8 +- .../AmpMath/amp_math_atan2_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_atan2f.cpp | 8 +- .../AmpMath/amp_math_atan2f_precise_math.cpp | 8 +- .../AmpMath/amp_math_atan_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_atanf.cpp | 8 +- .../AmpMath/amp_math_atanh_precise_math.cpp | 8 +- .../AmpMath/amp_math_cbrt_precise_math.cpp | 8 +- .../AmpMath/amp_math_cbrtf_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_ceil.cpp | 8 +- .../AmpMath/amp_math_ceil_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_ceilf.cpp | 8 +- .../amp_math_copysign_precise_math.cpp | 8 +- .../amp_math_copysignf_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_cos.cpp | 8 +- .../AmpMath/amp_math_cos_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_cosf.cpp | 8 +- tests/Unit/AmpMath/amp_math_cosh.cpp | 8 +- .../AmpMath/amp_math_cosh_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_coshf.cpp | 8 +- .../AmpMath/amp_math_coshf_precise_math.cpp | 8 +- .../AmpMath/amp_math_cospi_precise_math.cpp | 8 +- .../AmpMath/amp_math_cospif_precise_math.cpp | 8 +- .../AmpMath/amp_math_erf_precise_math.cpp | 8 +- .../AmpMath/amp_math_erfc_precise_math.cpp | 8 +- .../AmpMath/amp_math_erfcf_precise_math.cpp | 8 +- 
.../AmpMath/amp_math_erff_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_exp.cpp | 8 +- tests/Unit/AmpMath/amp_math_exp10.cpp | 8 +- .../AmpMath/amp_math_exp10_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_exp10f.cpp | 8 +- tests/Unit/AmpMath/amp_math_exp2.cpp | 8 +- .../AmpMath/amp_math_exp2_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_exp2f.cpp | 8 +- .../AmpMath/amp_math_exp_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_expf.cpp | 8 +- .../AmpMath/amp_math_expf_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_expm1.cpp | 8 +- .../AmpMath/amp_math_expm1_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_expm1f.cpp | 8 +- .../AmpMath/amp_math_fdim_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_floor.cpp | 8 +- .../AmpMath/amp_math_floor_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_floorf.cpp | 8 +- .../AmpMath/amp_math_fma_precise_math.cpp | 8 +- .../AmpMath/amp_math_fmaf_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_fmax.cpp | 8 +- .../AmpMath/amp_math_fmax_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_fmaxf.cpp | 8 +- tests/Unit/AmpMath/amp_math_fmin.cpp | 8 +- .../AmpMath/amp_math_fmin_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_fminf.cpp | 8 +- tests/Unit/AmpMath/amp_math_fmod.cpp | 8 +- .../AmpMath/amp_math_fmod_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_fmodf.cpp | 8 +- .../AmpMath/amp_math_hypot_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_ilogb.cpp | 8 +- .../AmpMath/amp_math_ilogb_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_ilogbf.cpp | 8 +- tests/Unit/AmpMath/amp_math_isfinite.cpp | 8 +- .../amp_math_isfinite_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_isinf.cpp | 8 +- .../AmpMath/amp_math_isinf_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_isnan.cpp | 8 +- .../AmpMath/amp_math_isnan_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_isnormal.cpp | 8 +- .../amp_math_isnormal_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_ldexp.cpp | 8 +- .../AmpMath/amp_math_ldexp_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_ldexpf.cpp | 8 +- .../AmpMath/amp_math_ldexpf_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_log.cpp | 8 +- tests/Unit/AmpMath/amp_math_log10.cpp | 8 +- .../AmpMath/amp_math_log10_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_log10f.cpp | 8 +- .../AmpMath/amp_math_log1p_precise_math.cpp | 8 +- .../AmpMath/amp_math_log1pf_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_log2.cpp | 8 +- .../AmpMath/amp_math_log2_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_log2f.cpp | 8 +- .../AmpMath/amp_math_log_precise_math.cpp | 8 +- .../AmpMath/amp_math_logb_precise_math.cpp | 8 +- .../AmpMath/amp_math_logbf_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_max.cpp | 8 +- .../AmpMath/amp_math_max_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_min.cpp | 8 +- .../AmpMath/amp_math_min_precise_math.cpp | 8 +- .../amp_math_nearbyint_precise_math.cpp | 8 +- .../amp_math_nextafter_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_pow.cpp | 8 +- .../AmpMath/amp_math_pow_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_powf.cpp | 8 +- .../AmpMath/amp_math_rcbrt_precise_math.cpp | 8 +- .../AmpMath/amp_math_rcbrtf_precise_math.cpp | 8 +- .../amp_math_remainder_precise_math.cpp | 8 +- .../amp_math_remainderf_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_round.cpp | 8 +- .../AmpMath/amp_math_round_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_roundf.cpp | 8 +- tests/Unit/AmpMath/amp_math_rsqrt.cpp | 8 +- .../AmpMath/amp_math_rsqrt_precise_math.cpp 
| 8 +- tests/Unit/AmpMath/amp_math_rsqrtf.cpp | 8 +- .../AmpMath/amp_math_scalb_precise_math.cpp | 8 +- .../AmpMath/amp_math_scalbn_precise_math.cpp | 8 +- .../AmpMath/amp_math_scalbnf_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_signbit.cpp | 8 +- .../AmpMath/amp_math_signbit_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_signbitf.cpp | 8 +- tests/Unit/AmpMath/amp_math_sin.cpp | 8 +- .../AmpMath/amp_math_sin_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_sinf.cpp | 8 +- tests/Unit/AmpMath/amp_math_sinh.cpp | 8 +- .../AmpMath/amp_math_sinh_precise_math.cpp | 8 +- .../AmpMath/amp_math_sinpi_precise_math.cpp | 8 +- .../AmpMath/amp_math_sinpif_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_sqrt.cpp | 8 +- .../AmpMath/amp_math_sqrt_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_sqrtf.cpp | 8 +- tests/Unit/AmpMath/amp_math_tan.cpp | 8 +- .../AmpMath/amp_math_tan_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_tanf.cpp | 8 +- tests/Unit/AmpMath/amp_math_tanh.cpp | 8 +- .../AmpMath/amp_math_tanh_precise_math.cpp | 8 +- .../AmpMath/amp_math_tanpi_precise_math.cpp | 8 +- .../AmpMath/amp_math_tgamma_precise_math.cpp | 8 +- .../AmpMath/amp_math_tgammaf_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_trunc.cpp | 8 +- .../AmpMath/amp_math_trunc_precise_math.cpp | 8 +- tests/Unit/AmpMath/amp_math_truncf.cpp | 8 +- .../amp_short_vectors_2files.cpp | 18 +-- .../amp_short_vectors_2files.h | 10 +- .../amp_short_vectors_2files_1.cpp | 18 +-- .../amp_short_vectors_double_3_addon.cpp | 14 +- .../amp_short_vectors_float_2_addon.cpp | 10 +- .../amp_short_vectors_int_4_addon.cpp | 14 +- .../amp_short_vectors_norm.cpp | 66 ++++----- .../amp_short_vectors_short_vector.cpp | 6 +- .../amp_short_vectors_short_vector_traits.cpp | 6 +- .../amp_short_vectors_uint_2_addon.cpp | 32 ++-- .../amp_short_vectors_unorm.cpp | 64 ++++---- .../hc_short_vector_device.cpp | 4 +- tests/Unit/AsyncPFE/accelerator_view_wait.cpp | 2 +- .../Unit/AsyncPFE/accelerator_view_wait2.cpp | 2 +- .../Unit/AsyncPFE/accelerator_view_wait3.cpp | 2 +- tests/Unit/AsyncPFE/async_array_add.cpp | 2 +- tests/Unit/AsyncPFE/async_array_add_2d.cpp | 2 +- tests/Unit/AsyncPFE/async_array_add_3d.cpp | 2 +- tests/Unit/AsyncPFE/async_array_add_4d.cpp | 2 +- .../AsyncPFE/async_array_add_multiple.cpp | 2 +- .../AsyncPFE/async_array_add_multiple_2d.cpp | 2 +- .../AsyncPFE/async_array_add_multiple_3d.cpp | 2 +- .../AsyncPFE/async_array_add_multiple_4d.cpp | 2 +- .../async_array_add_multiple_tiled.cpp | 2 +- .../async_array_add_multiple_tiled_2d.cpp | 2 +- .../async_array_add_multiple_tiled_3d.cpp | 2 +- tests/Unit/AsyncPFE/async_array_add_then.cpp | 6 +- tests/Unit/AsyncPFE/async_array_add_tiled.cpp | 2 +- .../AsyncPFE/async_array_add_tiled_2d.cpp | 2 +- .../AsyncPFE/async_array_add_tiled_3d.cpp | 2 +- tests/Unit/AsyncPFE/async_av_dependent1.cpp | 6 +- tests/Unit/AsyncPFE/async_av_dependent2.cpp | 6 +- tests/Unit/AsyncPFE/async_av_dependent3.cpp | 6 +- tests/Unit/AsyncPFE/async_av_dependent4.cpp | 6 +- tests/Unit/AsyncPFE/async_av_dependent5.cpp | 6 +- tests/Unit/AsyncPFE/async_av_dependent6.cpp | 6 +- tests/Unit/AsyncPFE/async_av_dependent7.cpp | 6 +- tests/Unit/AsyncPFE/async_av_dependent8.cpp | 6 +- tests/Unit/AsyncPFE/async_av_independent1.cpp | 6 +- tests/Unit/AsyncPFE/async_av_independent2.cpp | 6 +- tests/Unit/AsyncPFE/async_av_independent3.cpp | 6 +- tests/Unit/AsyncPFE/async_av_independent4.cpp | 6 +- .../Unit/AsyncPFE/completion_future_wait.cpp | 2 +- .../Unit/AsyncPFE/completion_future_wait2.cpp | 2 +- 
tests/Unit/Atomic/atomic_add_float_global.cpp | 6 +- tests/Unit/Atomic/atomic_add_float_local.cpp | 14 +- tests/Unit/Atomic/atomic_add_global.cpp | 6 +- tests/Unit/Atomic/atomic_add_local.cpp | 16 +- tests/Unit/Atomic/atomic_and_global.cpp | 6 +- tests/Unit/Atomic/atomic_and_local.cpp | 14 +- .../Atomic/atomic_compare_exchange_global.cpp | 6 +- .../Atomic/atomic_compare_exchange_local.cpp | 13 +- tests/Unit/Atomic/atomic_dec_global.cpp | 8 +- tests/Unit/Atomic/atomic_dec_local.cpp | 14 +- .../Atomic/atomic_exchange_float_global.cpp | 6 +- .../Atomic/atomic_exchange_float_local.cpp | 11 +- tests/Unit/Atomic/atomic_exchange_global.cpp | 6 +- tests/Unit/Atomic/atomic_exchange_local.cpp | 14 +- tests/Unit/Atomic/atomic_inc_global.cpp | 6 +- tests/Unit/Atomic/atomic_inc_local.cpp | 14 +- tests/Unit/Atomic/atomic_max_global.cpp | 6 +- tests/Unit/Atomic/atomic_max_local.cpp | 14 +- tests/Unit/Atomic/atomic_min_global.cpp | 6 +- tests/Unit/Atomic/atomic_min_local.cpp | 14 +- tests/Unit/Atomic/atomic_or_global.cpp | 6 +- tests/Unit/Atomic/atomic_or_local.cpp | 14 +- tests/Unit/Atomic/atomic_sub_float_global.cpp | 6 +- tests/Unit/Atomic/atomic_sub_float_local.cpp | 14 +- tests/Unit/Atomic/atomic_sub_global.cpp | 6 +- tests/Unit/Atomic/atomic_sub_local.cpp | 14 +- tests/Unit/Atomic/atomic_xor_global.cpp | 9 +- tests/Unit/Atomic/atomic_xor_local.cpp | 16 +- .../2_4_1_3_Comp_Type_OKCases.cpp | 23 --- tests/Unit/AutoRestricted/Case1.cpp | 42 ------ tests/Unit/AutoRestricted/MemberExpr.cpp | 50 ------- .../2_4_1_3_Comp_Type_Negative/1d.cpp | 39 ----- .../PointerArrayElementType.cpp | 30 ---- .../PointerToPointer.cpp | 30 ---- .../2_4_1_3_Comp_Type_Negative/bitfield.cpp | 38 ----- .../2_4_1_3_Comp_Type_Negative/bool_array.cpp | 30 ---- .../2_4_1_3_Comp_Type_Negative/char_array.cpp | 32 ---- .../2_4_1_3_Comp_Type_Negative/common.h | 17 --- .../function_pointer.cpp | 31 ---- .../function_reference.cpp | 34 ----- .../Negative/Stmt_Cases/CXXThrowExpr.cpp | 26 ---- .../Negative/Stmt_Cases/CXXTryStmt.cpp | 29 ---- .../Dtor_has_multiple_restrictions.cpp | 29 ---- .../Negative/Stmt_Cases/DynamicCastExpr.cpp | 30 ---- .../Negative/Stmt_Cases/Enum.cpp | 37 ----- .../Negative/Stmt_Cases/GotoStmt.cpp | 29 ---- .../Negative/Stmt_Cases/TypeidExpr.cpp | 29 ---- .../Negative/Stmt_Cases/Volatile.cpp | 39 ----- .../char_short_wchar_longlong_longdouble.cpp | 38 ----- .../after_mutable_keyword.cpp | 24 --- .../after_throw_and_mutable_keyword.cpp | 24 --- .../after_throw_keyword_1.cpp | 25 ---- .../after_throw_keyword_2.cpp | 27 ---- .../auto_in_function_prototype.cpp | 22 --- .../auto-on-wrong-place/before_CV.cpp | 27 ---- .../before_function_name.cpp | 27 ---- .../before_function_type.cpp | 22 --- .../auto-on-wrong-place/most_vexing_parse.cpp | 33 ----- .../on_more_declarations.cpp | 33 ----- .../Negative/declarator_not_definition.cpp | 16 -- .../diagnose_before_perform_inferring_AMP.cpp | 23 --- .../diagnose_before_perform_inferring_CPU.cpp | 21 --- .../Negative/function_reference.cpp | 31 ---- .../Negative/has_other_declarations.cpp | 15 -- .../Negative/infer_error_amp.cpp | 28 ---- .../Negative/infer_error_cpu.cpp | 24 --- ...uld_have_no_non-auto-restriction_added.cpp | 25 ---- tests/Unit/AutoRestricted/auto_auto.cpp | 43 ------ .../inferred_as_both_cpu_and_amp.cpp | 45 ------ tests/Unit/AutoRestricted/normal.cpp | 13 -- tests/Unit/AutoRestricted/on_lambda.cpp | 36 ----- tests/Unit/AutoRestricted/only_auto.cpp | 20 --- .../AutoRestricted/restriction_inferred.cpp | 36 ----- tests/Unit/CXXLangExt/array_array.cpp | 6 +- 
tests/Unit/CXXLangExt/array_pointer.cpp | 6 +- tests/Unit/CXXLangExt/enum.cpp | 6 +- .../function_declarator_Varargs.cpp | 4 +- tests/Unit/CXXLangExt/local_param_ret.cpp | 8 +- .../CXXLangExt/local_param_ret_half-float.cpp | 8 +- .../local_param_ret_pointer-to-function.cpp | 8 +- .../CXXLangExt/local_param_ret_pointer.cpp | 8 +- .../local_param_ret_ref-to-pointer.cpp | 8 +- tests/Unit/CXXLangExt/local_param_ret_ref.cpp | 8 +- .../local_param_ret_static-local.cpp | 8 +- tests/Unit/CXXLangExt/statement_asm.cpp | 8 +- .../CXXLangExt/statement_global-variable.cpp | 4 +- .../Unit/CXXLangExt/statement_goto_label.cpp | 8 +- tests/Unit/CXXLangExt/statement_recursion.cpp | 6 +- tests/Unit/CXXLangExt/struct_class_union.cpp | 6 +- .../struct_class_union_bitfields.cpp | 6 +- .../struct_class_union_half-float.cpp | 6 +- .../CXXLangExt/struct_class_union_pointer.cpp | 6 +- .../CXXLangExt/struct_class_union_ref.cpp | 6 +- .../struct_class_union_unaligned-member.cpp | 6 +- .../struct_class_virtual-base-class.cpp | 4 +- tests/Unit/CaptureByCopy/test1.cpp | 14 +- tests/Unit/CaptureByCopy/test2.cpp | 14 +- tests/Unit/CaptureByCopy/test3.cpp | 16 +- tests/Unit/CaptureByCopy/test4.cpp | 16 +- tests/Unit/CaptureByRef/test1.cpp | 6 +- tests/Unit/CaptureByRef/test10.cpp | 6 +- tests/Unit/CaptureByRef/test11.cpp | 6 +- tests/Unit/CaptureByRef/test12.cpp | 6 +- tests/Unit/CaptureByRef/test13.cpp | 16 +- tests/Unit/CaptureByRef/test14.cpp | 10 +- tests/Unit/CaptureByRef/test15.cpp | 6 +- tests/Unit/CaptureByRef/test2.cpp | 6 +- tests/Unit/CaptureByRef/test3.cpp | 6 +- tests/Unit/CaptureByRef/test4.cpp | 6 +- tests/Unit/CaptureByRef/test5.cpp | 6 +- tests/Unit/CaptureByRef/test6.cpp | 6 +- tests/Unit/CaptureByRef/test7.cpp | 6 +- tests/Unit/CaptureByRef/test8.cpp | 6 +- tests/Unit/CaptureByRef/test9.cpp | 6 +- .../Codegen/barrier_should_not_unwind.cpp | 12 +- .../Codegen/compile_error_for_arraytype.cpp | 4 +- tests/Unit/Codegen/index_operator_test.cpp | 4 +- tests/Unit/Codegen/opt_level0.cpp | 6 +- tests/Unit/Codegen/opt_level1.cpp | 6 +- tests/Unit/Codegen/restric_overload.cpp | 8 +- tests/Unit/Codegen/separate.cpp | 6 +- tests/Unit/Codegen/separate2.cpp | 10 +- tests/Unit/Codegen/tworef.cpp | 6 +- .../Codegen/vector_addition_using_array.cpp | 11 +- tests/Unit/Copy/copy.cpp | 23 +-- tests/Unit/DataContainers/array_view.cpp | 16 +- tests/Unit/DataContainers/array_view_2d.1.cpp | 6 +- tests/Unit/DataContainers/array_view_2d.2.cpp | 6 +- tests/Unit/DataContainers/array_view_2d.3.cpp | 6 +- tests/Unit/DataContainers/extent.cpp | 22 +-- tests/Unit/Design/2d.cpp | 134 +++++++++-------- tests/Unit/Design/5d.support.cpp | 6 +- tests/Unit/Design/addr_space.cpp | 10 +- tests/Unit/Design/array_view_extent.cpp | 6 +- tests/Unit/Design/array_view_extent_2d.cpp | 6 +- .../Unit/Design/array_view_extent_2d_tile.cpp | 15 +- .../Design/double_lamda_in_one_fuction.cpp | 8 +- tests/Unit/Design/lambda.cpp | 24 +-- tests/Unit/Design/lambda_tiled.cpp | 32 ++-- tests/Unit/Design/lambda_tiled_local.cpp | 32 ++-- tests/Unit/Design/overload.cpp | 15 +- tests/Unit/Design/pass_by_ref.cpp | 57 +++----- .../quick_prototype_vector_add_using_gmac.cpp | 50 +++---- tests/Unit/Design/transpose.cpp | 31 ++-- tests/Unit/Design/veccadd3.cpp | 6 +- .../Unit/DispatchAql/dispatch_hsa_kernel.cpp | 2 +- tests/Unit/DynamicTileStatic/test3.cpp | 94 +++++++----- tests/Unit/DynamicTileStatic/test8.cpp | 12 +- tests/Unit/DynamicTileStatic/test9.cpp | 26 ++-- tests/Unit/HC/auto_annotate_attribute.cpp | 138 ------------------ tests/Unit/HC/create_blocking_marker.cpp 
| 2 +- tests/Unit/HC/create_blocking_marker2.cpp | 2 +- tests/Unit/HC/test2.cpp | 6 +- tests/Unit/HC/wg_register_limit1.cpp | 2 +- tests/Unit/HC/wg_size_unsupported1.cpp | 2 +- tests/Unit/HC/wg_size_unsupported2.cpp | 4 +- tests/Unit/HC/wg_size_unsupported3.cpp | 6 +- tests/Unit/HC/wg_size_unsupported4.cpp | 6 +- tests/Unit/HSA/functor1.cpp | 11 +- tests/Unit/HSA/functor2.cpp | 10 +- tests/Unit/HSA/functor3.cpp | 12 +- tests/Unit/HSA/functor4.cpp | 8 +- tests/Unit/HSA/functor5.cpp | 10 +- tests/Unit/HSA/functor6.cpp | 12 +- tests/Unit/HSA/list.cpp | 6 +- tests/Unit/HSA/list2.cpp | 4 +- tests/Unit/HSA/no_printf.cpp | 2 +- tests/Unit/HSA/printf_minimal.cpp | 2 +- tests/Unit/HSA/sizeof.cpp | 6 +- tests/Unit/HSA/string.cpp | 6 +- tests/Unit/HSA/volatile_union.cpp | 8 +- tests/Unit/Indexing/extent.cpp | 6 +- tests/Unit/Indexing/index.cpp | 32 ++-- tests/Unit/Indexing/tile_index.cpp | 29 ++-- tests/Unit/InvalidLambda/empty_lambda2.cpp | 8 +- tests/Unit/InvalidLambda/qq.cpp | 24 +-- tests/Unit/Macro/check_hcc_accelerator.cpp | 6 +- tests/Unit/Macro/check_hcc_cpu.cpp | 6 +- ...Caller-amp-only-Callee-global-cpu-only.cpp | 6 +- tests/Unit/Overload/Disjoint_restrict.cpp | 26 ++-- ...ction_in_cpu_function_or_lambda_or_pfe.cpp | 10 +- .../Negative/call_amp_function_in_main.cpp | 6 +- .../Negative/call_amp_linking_error.cpp | 10 +- ...ntion_in_amp_function_or_lambda_or_pfe.cpp | 14 +- .../call_distinct_from_dual_context.cpp | 12 +- .../Unit/Overload/Negative/linking_error.cpp | 8 +- tests/Unit/Overload/Test_Overload.cpp | 26 ++-- .../Overload/amp-lambda_or_pfe_in_main.cpp | 10 +- ...a_cpu_or_cpu_elided_function_or_lambda.cpp | 24 +-- .../Overload/cpu_caller_distinct_callees.cpp | 6 +- .../cpu_function_or_lambda_in_main.cpp | 8 +- .../Overload/cpu_lambda_in_amp_function.cpp | 12 +- tests/Unit/Parse/amp_header_test.cpp | 2 +- tests/Unit/Parse/class_cross_referencing.cpp | 8 +- tests/Unit/Parse/function_declarator.cpp | 2 +- tests/Unit/Parse/lambda_attribute.cpp | 4 +- tests/Unit/Parse/lambda_attribute_hc.cpp | 4 +- tests/Unit/Parse/lambda_expr.amp.cpp | 2 +- tests/Unit/Parse/lambda_expr.both.cpp | 2 +- tests/Unit/Parse/lambda_expr.cpu.cpp | 2 +- tests/Unit/Parse/lambda_expr.cpu_only.cpp | 2 +- tests/Unit/Parse/lambda_expr.mixed.cpp | 2 +- tests/Unit/Parse/lambda_expr.recursive.cpp | 2 +- .../Unit/Parse/lambda_expr.without.params.cpp | 4 +- tests/Unit/Parse/method_declarator.cpp | 6 +- tests/Unit/PlatformAtomics/atomic_int.cpp | 6 +- tests/Unit/PlatformAtomics/pingpong.cpp | 6 +- tests/Unit/PlatformAtomics/sync_1way.cpp | 6 +- tests/Unit/PlatformAtomics/sync_2way.cpp | 6 +- tests/Unit/PlatformAtomics/syscall.cpp | 6 +- tests/Unit/RawPointer/array_add.cpp | 6 +- .../Negative/empty_restriction.cpp | 2 +- .../Negative/id_is_unrecognized.cpp | 2 +- .../Negative/non-comma_between_ids.cpp | 2 +- .../Negative/non-id_at_two_ends.cpp | 2 +- .../Negative/should_not_parse.cpp | 4 +- .../RestrictionSpecifier/Negative/space.cpp | 2 +- tests/Unit/RestrictionSpecifier/OKCase.cpp | 8 +- .../Override_Qualifier.cpp | 4 +- tests/Unit/SharedLibrary/shared_library2.cpp | 2 +- tests/Unit/SharedLibrary/shared_library3.cpp | 2 +- ...Specialization_Inheritate_Restrictions.cpp | 6 +- tests/Unit/decltype/TrailingReturn.cpp | 12 +- 425 files changed, 1706 insertions(+), 3187 deletions(-) delete mode 100644 tests/Unit/AutoRestricted/2_4_1_3_Comp_Type_OKCases.cpp delete mode 100644 tests/Unit/AutoRestricted/Case1.cpp delete mode 100644 tests/Unit/AutoRestricted/MemberExpr.cpp delete mode 100644 
tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/1d.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/PointerArrayElementType.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/PointerToPointer.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/bitfield.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/bool_array.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/char_array.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/common.h delete mode 100644 tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/function_pointer.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/function_reference.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/Stmt_Cases/CXXThrowExpr.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/Stmt_Cases/CXXTryStmt.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/Stmt_Cases/Dtor_has_multiple_restrictions.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/Stmt_Cases/DynamicCastExpr.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/Stmt_Cases/Enum.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/Stmt_Cases/GotoStmt.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/Stmt_Cases/TypeidExpr.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/Stmt_Cases/Volatile.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/Stmt_Cases/char_short_wchar_longlong_longdouble.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_mutable_keyword.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_and_mutable_keyword.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_keyword_1.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_keyword_2.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/auto_in_function_prototype.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_CV.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_function_name.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_function_type.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/most_vexing_parse.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/on_more_declarations.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/declarator_not_definition.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/diagnose_before_perform_inferring_AMP.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/diagnose_before_perform_inferring_CPU.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/function_reference.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/has_other_declarations.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/infer_error_amp.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/infer_error_cpu.cpp delete mode 100644 tests/Unit/AutoRestricted/Negative/restriction_inferred_should_have_no_non-auto-restriction_added.cpp delete mode 100644 tests/Unit/AutoRestricted/auto_auto.cpp delete mode 100644 tests/Unit/AutoRestricted/inferred_as_both_cpu_and_amp.cpp delete mode 100644 tests/Unit/AutoRestricted/normal.cpp delete mode 100644 
tests/Unit/AutoRestricted/on_lambda.cpp
delete mode 100644 tests/Unit/AutoRestricted/only_auto.cpp
delete mode 100644 tests/Unit/AutoRestricted/restriction_inferred.cpp
delete mode 100644 tests/Unit/HC/auto_annotate_attribute.cpp

diff --git a/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp b/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp
index b4e4f0f27e7..d325eef79e7 100644
--- a/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp
+++ b/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp
@@ -1,8 +1,8 @@
-// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 -DRUNMASK=0x18 && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out
+// RUN: %hc %s -o %t.out -lhc_am -I/home/alexv/Programming/ROCR-Runtime/src/inc -L/home/alexv/Programming/ROCR-Runtime/src/build -lhsa-runtime64 -DRUNMASK=0x18 && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out
 
 #include
 #include
-#include "/opt/rocm/include/hsa/hsa.h"
+#include <hsa.h>
 #include
 #include
diff --git a/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp b/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp
index b6ef2edcdb3..9788972dda1 100644
--- a/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp
+++ b/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp
@@ -1,8 +1,8 @@
-// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 -DRUNMASK=0xff && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out
+// RUN: %hc %s -o %t.out -lhc_am -I/home/alexv/Programming/ROCR-Runtime/src/inc -L/home/alexv/Programming/ROCR-Runtime/src/build -lhsa-runtime64 -DRUNMASK=0xff && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out
 
 #include
 #include
-#include "/opt/rocm/include/hsa/hsa.h"
+#include <hsa.h>
 #include
 #include
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_1_Synt/2_1_1_Func_Decl_Synt/Negative/Test.04/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_1_Synt/2_1_1_Func_Decl_Synt/Negative/Test.04/test.cpp
index 2766b6b07e2..ffb459f4d6b 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_1_Synt/2_1_1_Func_Decl_Synt/Negative/Test.04/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_1_Synt/2_1_1_Func_Decl_Synt/Negative/Test.04/test.cpp
@@ -10,7 +10,7 @@
 // RUN: %clang_cc1 -std=c++amp -fsyntax-only %ampneg -verify %s
 
-void test() throw(...) restrict(cpu) // expected-error {{exception specifier is not allowed in C++AMP context}}
+void test() throw(...) [[cpu]] // expected-error {{exception specifier is not allowed in C++AMP context}}
 {}
 
 // Main entry point
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.01/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.01/test.cpp
index c89e61f1a0c..1210ec9ac50 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.01/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.01/test.cpp
@@ -8,17 +8,17 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-int f(int) restrict(amp,cpu)
+int f(int) [[cpu, hc]]
 {
   return 1;
 }
 
-int f(float) restrict(amp,cpu)
+int f(float) [[cpu, hc]]
 {
   return 0;
 }
 
-bool test() restrict(amp,cpu)
+bool test() [[cpu, hc]]
 {
   bool passed = true;
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.04/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.04/test.cpp
index cb955d3062a..32c96627bd3 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.04/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.04/test.cpp
@@ -11,7 +11,7 @@ class c
 {
 public:
-  int f(int) restrict(amp)
+  int f(int) [[hc]]
   {
     return 1;
   }
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.09/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.09/test.cpp
index a3d0b43cb0e..0567dbbcb54 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.09/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.09/test.cpp
@@ -8,7 +8,7 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-int f(float) restrict(amp,cpu)
+int f(float) [[cpu, hc]]
 {
   return 1;
 }
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.12/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.12/test.cpp
index 7944860100a..7ff434a2a99 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.12/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.12/test.cpp
@@ -16,7 +16,7 @@ class c
     return 0;
   }
 
-  int f(const int *) restrict(amp,cpu)
+  int f(const int *) [[cpu, hc]]
   {
     return 1;
   }
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.15/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.15/test.cpp
index 23b891dc426..0c767b14a64 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.15/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.15/test.cpp
@@ -8,7 +8,7 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-int f(int &) restrict(amp)
+int f(int &) [[hc]]
 {
   return 0;
 }
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.66/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.66/test.cpp
index d4bf1f8f308..003b82ca71b 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.66/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.66/test.cpp
@@ -14,7 +14,7 @@ class c1
 {
 public:
-  operator c2() restrict(amp)
+  operator c2() [[hc]]
   {
     flag = 1;
     c2 o;
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.68/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.68/test.cpp
index 2a8a27e9d05..1771370db74 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.68/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.68/test.cpp
@@ -9,7 +9,7 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 #include
 
-void f(int &v) restrict(amp,cpu)
+void f(int &v) [[cpu, hc]]
 {
   v = 1;
 }
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.69/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.69/test.cpp
index 4896ef16b55..54d0940a4ac 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.69/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.69/test.cpp
@@ -13,7 +13,7 @@ class c
 {
 public:
-  void f(int &v) restrict(amp,cpu)
+  void f(int &v) [[cpu, hc]]
   {
     v = 1;
   }
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.71/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.71/test.cpp
index 52d48126682..66bbb452fdf 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.71/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.71/test.cpp
@@ -17,7 +17,7 @@ int f(const int c::*)
   return 0;
 }
 
-int f(int c::*) restrict(amp,cpu)
+int f(int c::*) [[cpu, hc]]
 {
   return 1;
 }
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.72/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.72/test.cpp
index 73c73082b09..a1d15a54057 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.72/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.72/test.cpp
@@ -21,7 +21,7 @@ class c
     return 0;
   }
 
-  int f(int c2::*) restrict(amp,cpu)
+  int f(int c2::*) [[cpu, hc]]
   {
     return 1;
   }
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.73/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.73/test.cpp
index bbf70f92720..63f5235ef0d 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.73/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.73/test.cpp
@@ -12,7 +12,7 @@ void f(const int & i)
 {
 }
 
-void f(int & i) restrict(amp,cpu)
+void f(int & i) [[cpu, hc]]
 {
   i = 1;
 }
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_x_general/typeid_operator/Test.01/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_x_general/typeid_operator/Test.01/test.cpp
index 4ea5da95ece..99e3b975b49 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_x_general/typeid_operator/Test.01/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_x_general/typeid_operator/Test.01/test.cpp
@@ -4,7 +4,7 @@
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache Version 2.0 License for specific language governing permissions and limitations under the License.
 /// P2
-/// Use typeid to compare two equal function pointers, one with restrict(cpu)
+/// Use typeid to compare two equal function pointers, one with [[cpu]]
 
 // RUN: %cxxamp %s -o %t.out && %t.out
 
@@ -19,6 +19,6 @@ int foo(float a, double b)
 int main()
 {
   int (*p1)(float a, double b) = &foo;
-  int (*p2)(float a, double b) restrict(cpu) = &foo;
+  int (*p2)(float a, double b) [[cpu]] = &foo;
   return typeid(p1) == typeid(p2) ? 0 : 1;
 }
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_x_general/typeid_operator/Test.02/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_x_general/typeid_operator/Test.02/test.cpp
index 2c259b069fc..c3a167abfd6 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_x_general/typeid_operator/Test.02/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_x_general/typeid_operator/Test.02/test.cpp
@@ -4,7 +4,7 @@
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache Version 2.0 License for specific language governing permissions and limitations under the License.
 /// P2
-/// Use typeid to compare two equal member function pointers, one with restrict(cpu)
+/// Use typeid to compare two equal member function pointers, one with [[cpu]]
 
 // RUN: %cxxamp %s -o %t.out && %t.out
 
@@ -20,7 +20,7 @@ struct S
 int main()
 {
   int (S::*p1)(float a, double b) = &S::foo;
-  int (S::*p2)(float a, double b) restrict(cpu) = &S::foo;
+  int (S::*p2)(float a, double b) [[cpu]] = &S::foo;
   return typeid(p1) == typeid(p2) ? 0 : 1;
 }
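// NOTE (editorial illustration, not part of this patch): the conformance-test
// hunks above all apply one mechanical rewrite: C++AMP restriction specifiers
// become hcc attribute spellings, i.e. restrict(amp) -> [[hc]],
// restrict(cpu) -> [[cpu]], restrict(amp,cpu) -> [[cpu, hc]]. A minimal
// hedged sketch of the before/after shape, assuming an hcc-style compiler
// that accepts these attributes on function declarators:
//
//   int f(int) restrict(amp,cpu) { return 1; }   // old C++AMP spelling
int f(int) [[cpu, hc]] { return 1; }              // new HC spelling

int main() { return f(0) == 1 ? 0 : 1; }          // overload still resolves as before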
diff --git a/tests/Unit/AMDGPU/ballot.cpp b/tests/Unit/AMDGPU/ballot.cpp
index 51f6dd419ee..f73c5d3a486 100644
--- a/tests/Unit/AMDGPU/ballot.cpp
+++ b/tests/Unit/AMDGPU/ballot.cpp
@@ -1,34 +1,34 @@
-// RUN: %hc %s -o %t.out && %t.out
+// RUN: %hc %s -g3 -o %t.out && %t.out
+
+#include <hc.hpp>
 
 #include
 #include
-#include <hc.hpp>
 
 #define WAVEFRONT_SIZE (64) // as of now, all HSA agents have wavefront size of 64
 
 #define TEST_DEBUG (0)
 
-int main() {
-
-  hc::array_view<uint64_t> a(WAVEFRONT_SIZE);
-
-  hc::extent<1> e(WAVEFRONT_SIZE);
-
-  int errors = 0;
-  for (int i = 0; i < WAVEFRONT_SIZE; i++) {
-    hc::parallel_for_each(e,[=](hc::index<1> idx) [[hc]] {
-      uint64_t d = hc::__ballot(1);
-      if (idx[0]==i)
-        a[0] = d;
-    }).wait();
-    if (a[0] != 0xFFFFFFFFFFFFFFFF) {
-      errors++;
+int main()
+{
+    hc::array_view<uint64_t> a{WAVEFRONT_SIZE};
+
+    hc::extent<1> e{WAVEFRONT_SIZE};
+
+    int errors = 0;
+    for (int i = 0; i != WAVEFRONT_SIZE; ++i) {
+        hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] {
+            uint64_t d = hc::__ballot(1);
+            if (idx[0] == i) a[0] = d;
+        });
+        if (a[0] != 0xFFFFFFFFFFFFFFFF) {
+            ++errors;
+        }
+        //#if TEST_DEBUG
+        std::cout << "(i=" << i << "): 0x" << std::hex << a[0] << std::endl;
+        std::cout << std::dec;
+        //#endif
     }
-#if TEST_DEBUG
-    std::cout << "(i=" << i << "): 0x" << std::hex << a[0] << std::endl;
-    std::cout << std::dec;
-#endif
-  }
-  return !(errors==0);
+    return errors != 0;
 }
diff --git a/tests/Unit/AcceleratorViewCopy/avcopy_classic.cpp b/tests/Unit/AcceleratorViewCopy/avcopy_classic.cpp
index e919a5acd0d..a95a6ef608f 100644
--- a/tests/Unit/AcceleratorViewCopy/avcopy_classic.cpp
+++ b/tests/Unit/AcceleratorViewCopy/avcopy_classic.cpp
@@ -1,4 +1,4 @@
-// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 && %t.out
+// RUN: %hc %s -o %t.out -lhc_am -L/home/alexv/Programming/ROCR-Runtime/src/build -lhsa-runtime64 && %t.out
 //
 // Test "classic" GPU pattern of H2D copies, followed by Kernels, followed by
 // D2H.
@@ -7,8 +7,6 @@
 #include
 #include
-#include "/opt/rocm/include/hsa/hsa.h"
-
 #include
 #include
 #include
diff --git a/tests/Unit/AcceleratorViewCopy/copy_coherency.cpp b/tests/Unit/AcceleratorViewCopy/copy_coherency.cpp
index e22e2ba4f6a..3ff07da25f2 100644
--- a/tests/Unit/AcceleratorViewCopy/copy_coherency.cpp
+++ b/tests/Unit/AcceleratorViewCopy/copy_coherency.cpp
@@ -1,4 +1,4 @@
-// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 && %t.out
+// RUN: %hc %s -o %t.out -lhc_am -L/home/alexv/Programming/ROCR-Runtime/src/build -lhsa-runtime64 && %t.out
 //
 // Test coherency and flushes. Need to flush GPU caches before H2D copy
diff --git a/tests/Unit/AcceleratorViewCopy/copy_coherency2.cpp b/tests/Unit/AcceleratorViewCopy/copy_coherency2.cpp
index 3862f2fa90a..8eb16441c01 100644
--- a/tests/Unit/AcceleratorViewCopy/copy_coherency2.cpp
+++ b/tests/Unit/AcceleratorViewCopy/copy_coherency2.cpp
@@ -1,4 +1,4 @@
-// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 && %t.out
+// RUN: %hc %s -o %t.out -lhc_am -L/home/alexv/Programming/ROCR-Runtime/src/build -lhsa-runtime64 && %t.out
 //
 // Test coherency and flushes. Need to flush GPU caches before H2D copy
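// NOTE (editorial illustration, not part of this patch): every AmpMath diff
// below applies the same migration: <amp.h>/<amp_math.h> become
// <hc.hpp>/<hc_math.hpp>, `using namespace concurrency;` becomes
// `using namespace hc;`, and the kernel lambda's restrict(amp) becomes [[hc]].
// A minimal sketch of the migrated test shape, assuming hcc's hc::array_view,
// hc::parallel_for_each and hc::fast_math behave like their C++AMP
// counterparts (the names below are assumptions drawn from these tests, not a
// definitive API reference):
#include <hc.hpp>
#include <hc_math.hpp>

#include <cmath>
#include <cstdlib>
#include <vector>

int main()
{
    std::vector<float> in(64);
    for (auto& x : in) x = static_cast<float>(std::rand()) / RAND_MAX;

    hc::array_view<float, 1> ga(64, in);  // input, bound to the host vector
    hc::array_view<float, 1> gc(64);      // output, runtime-managed storage

    hc::parallel_for_each(gc.get_extent(), [=](hc::index<1> idx) [[hc]] {
        gc[idx] = hc::fast_math::sqrt(ga[idx]);  // device-side math call
    });

    // Host reads of gc synchronize implicitly; compare against libm.
    for (int i = 0; i != 64; ++i)
        if (std::fabs(gc[i] - std::sqrt(in[i])) > 1e-4f) return 1;
    return 0;
}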
diff --git a/tests/Unit/AmpMath/amp_math_acos.cpp b/tests/Unit/AmpMath/amp_math_acos.cpp
index 99801383045..9abb24a7e96 100644
--- a/tests/Unit/AmpMath/amp_math_acos.cpp
+++ b/tests/Unit/AmpMath/amp_math_acos.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -33,7 +33,7 @@ bool test() {
   }
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::acos(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_acos_precise_math.cpp b/tests/Unit/AmpMath/amp_math_acos_precise_math.cpp
index c5fe4a649f9..238fbb3ef92 100644
--- a/tests/Unit/AmpMath/amp_math_acos_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_acos_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -33,7 +33,7 @@ bool test() {
   }
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::acos(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_acosf.cpp b/tests/Unit/AmpMath/amp_math_acosf.cpp
index 3434e000145..5ec24ef47ac 100644
--- a/tests/Unit/AmpMath/amp_math_acosf.cpp
+++ b/tests/Unit/AmpMath/amp_math_acosf.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -33,7 +33,7 @@ bool test() {
   }
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::acosf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_acosh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_acosh_precise_math.cpp
index 6ab14aff9ed..d4a10f01fe4 100644
--- a/tests/Unit/AmpMath/amp_math_acosh_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_acosh_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -34,7 +34,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::acosh(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_asin.cpp b/tests/Unit/AmpMath/amp_math_asin.cpp
index dc0824eb569..9680e1ea4bd 100644
--- a/tests/Unit/AmpMath/amp_math_asin.cpp
+++ b/tests/Unit/AmpMath/amp_math_asin.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -33,7 +33,7 @@ bool test() {
   }
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::asin(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_asin_precise_math.cpp b/tests/Unit/AmpMath/amp_math_asin_precise_math.cpp
index 9dee528ea5c..6a4e9c3a9de 100644
--- a/tests/Unit/AmpMath/amp_math_asin_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_asin_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
   }
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::asin(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_asinf.cpp b/tests/Unit/AmpMath/amp_math_asinf.cpp
index 96517b3470d..a9089332eb1 100644
--- a/tests/Unit/AmpMath/amp_math_asinf.cpp
+++ b/tests/Unit/AmpMath/amp_math_asinf.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -33,7 +33,7 @@ bool test() {
   }
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::asinf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_asinh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_asinh_precise_math.cpp
index 8be7cf5949c..04ec1f3e58e 100644
--- a/tests/Unit/AmpMath/amp_math_asinh_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_asinh_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -33,7 +33,7 @@ bool test() {
   }
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::asinh(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_atan.cpp b/tests/Unit/AmpMath/amp_math_atan.cpp
index 1d4ab536164..3b4451b174f 100644
--- a/tests/Unit/AmpMath/amp_math_atan.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::atan(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_atan2.cpp b/tests/Unit/AmpMath/amp_math_atan2.cpp
index 006c1a41b41..cc4cfe95c44 100644
--- a/tests/Unit/AmpMath/amp_math_atan2.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan2.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -40,7 +40,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::atan2(ga[idx], gb[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_atan2_precise_math.cpp b/tests/Unit/AmpMath/amp_math_atan2_precise_math.cpp
index 88b05d2f83c..3e8c4cf25b6 100644
--- a/tests/Unit/AmpMath/amp_math_atan2_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan2_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -40,7 +40,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::atan2(ga[idx], gb[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_atan2f.cpp b/tests/Unit/AmpMath/amp_math_atan2f.cpp
index 83553eb6411..07570bdcea8 100644
--- a/tests/Unit/AmpMath/amp_math_atan2f.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan2f.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -40,7 +40,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::atan2f(ga[idx], gb[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_atan2f_precise_math.cpp b/tests/Unit/AmpMath/amp_math_atan2f_precise_math.cpp
index 3224cc3a091..589d2c8f613 100644
--- a/tests/Unit/AmpMath/amp_math_atan2f_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan2f_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -40,7 +40,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::atan2f(ga[idx], gb[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_atan_precise_math.cpp b/tests/Unit/AmpMath/amp_math_atan_precise_math.cpp
index 2ec4ec8705d..80ecfcc9710 100644
--- a/tests/Unit/AmpMath/amp_math_atan_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::atan(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_atanf.cpp b/tests/Unit/AmpMath/amp_math_atanf.cpp
index e6264e994af..2dda8882c30 100644
--- a/tests/Unit/AmpMath/amp_math_atanf.cpp
+++ b/tests/Unit/AmpMath/amp_math_atanf.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::atanf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_atanh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_atanh_precise_math.cpp
index 4e20481342d..9ba2c066c3f 100644
--- a/tests/Unit/AmpMath/amp_math_atanh_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_atanh_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -34,7 +34,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::atanh(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_cbrt_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cbrt_precise_math.cpp
index d2a5f096f29..177130cf982 100644
--- a/tests/Unit/AmpMath/amp_math_cbrt_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cbrt_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::cbrt(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_cbrtf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cbrtf_precise_math.cpp
index 9c0d34d1622..096132089c3 100644
--- a/tests/Unit/AmpMath/amp_math_cbrtf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cbrtf_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::cbrtf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_ceil.cpp b/tests/Unit/AmpMath/amp_math_ceil.cpp
index 07c040e611d..9fa9408e35a 100644
--- a/tests/Unit/AmpMath/amp_math_ceil.cpp
+++ b/tests/Unit/AmpMath/amp_math_ceil.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -36,7 +36,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::ceil(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_ceil_precise_math.cpp b/tests/Unit/AmpMath/amp_math_ceil_precise_math.cpp
index f156b3c43d4..85ab3f9ada9 100644
--- a/tests/Unit/AmpMath/amp_math_ceil_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_ceil_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -36,7 +36,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::ceil(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_ceilf.cpp b/tests/Unit/AmpMath/amp_math_ceilf.cpp
index 7551e54100f..5380c04bdc8 100644
--- a/tests/Unit/AmpMath/amp_math_ceilf.cpp
+++ b/tests/Unit/AmpMath/amp_math_ceilf.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -33,7 +33,7 @@ bool test() {
   }
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::ceilf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_copysign_precise_math.cpp b/tests/Unit/AmpMath/amp_math_copysign_precise_math.cpp
index dcb18d896f5..8c73534f785 100644
--- a/tests/Unit/AmpMath/amp_math_copysign_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_copysign_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -39,7 +39,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::copysign(ga[idx], gb[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_copysignf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_copysignf_precise_math.cpp
index 5a0a880ea1c..a75eb7abb50 100644
--- a/tests/Unit/AmpMath/amp_math_copysignf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_copysignf_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -39,7 +39,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::copysignf(ga[idx], gb[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_cos.cpp b/tests/Unit/AmpMath/amp_math_cos.cpp
index 858857f5486..75455e3c889 100644
--- a/tests/Unit/AmpMath/amp_math_cos.cpp
+++ b/tests/Unit/AmpMath/amp_math_cos.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::cos(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_cos_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cos_precise_math.cpp
index 0faa5321a8f..ea14cb4c3df 100644
--- a/tests/Unit/AmpMath/amp_math_cos_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cos_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::cos(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_cosf.cpp b/tests/Unit/AmpMath/amp_math_cosf.cpp
index 35ceeae52aa..3cd40409fe8 100644
--- a/tests/Unit/AmpMath/amp_math_cosf.cpp
+++ b/tests/Unit/AmpMath/amp_math_cosf.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::cosf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_cosh.cpp b/tests/Unit/AmpMath/amp_math_cosh.cpp
index 0636ee59f6a..e687b424f44 100644
--- a/tests/Unit/AmpMath/amp_math_cosh.cpp
+++ b/tests/Unit/AmpMath/amp_math_cosh.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::cosh(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_cosh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cosh_precise_math.cpp
index 80beb25db85..477e4382119 100644
--- a/tests/Unit/AmpMath/amp_math_cosh_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cosh_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::cosh(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_coshf.cpp b/tests/Unit/AmpMath/amp_math_coshf.cpp
index 1e4e5a00481..972c95d3360 100644
--- a/tests/Unit/AmpMath/amp_math_coshf.cpp
+++ b/tests/Unit/AmpMath/amp_math_coshf.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::coshf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_coshf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_coshf_precise_math.cpp
index 10e807bd682..eb2df4ae66e 100644
--- a/tests/Unit/AmpMath/amp_math_coshf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_coshf_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::coshf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_cospi_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cospi_precise_math.cpp
index d2e0849e550..1663ce5a165 100644
--- a/tests/Unit/AmpMath/amp_math_cospi_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cospi_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::cospi(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_cospif_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cospif_precise_math.cpp
index d2e0849e550..1663ce5a165 100644
--- a/tests/Unit/AmpMath/amp_math_cospif_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cospif_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::cospi(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_erf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_erf_precise_math.cpp
index 41d360b491f..cce847c3789 100644
--- a/tests/Unit/AmpMath/amp_math_erf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_erf_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::erf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_erfc_precise_math.cpp b/tests/Unit/AmpMath/amp_math_erfc_precise_math.cpp
index 49520ff51e7..4c7a576ce13 100644
--- a/tests/Unit/AmpMath/amp_math_erfc_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_erfc_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::erfc(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_erfcf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_erfcf_precise_math.cpp
index 71993158a28..d4fa4a617f1 100644
--- a/tests/Unit/AmpMath/amp_math_erfcf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_erfcf_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::erfcf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_erff_precise_math.cpp b/tests/Unit/AmpMath/amp_math_erff_precise_math.cpp
index 383f3111185..ad216c20aae 100644
--- a/tests/Unit/AmpMath/amp_math_erff_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_erff_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::erff(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_exp.cpp b/tests/Unit/AmpMath/amp_math_exp.cpp
index 1a09d78b04f..fd834fac3d6 100644
--- a/tests/Unit/AmpMath/amp_math_exp.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::exp(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_exp10.cpp b/tests/Unit/AmpMath/amp_math_exp10.cpp
index a51708ba2bc..66650f0d006 100644
--- a/tests/Unit/AmpMath/amp_math_exp10.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp10.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::exp10(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_exp10_precise_math.cpp b/tests/Unit/AmpMath/amp_math_exp10_precise_math.cpp
index 435ee87a5cf..b99e6fce951 100644
--- a/tests/Unit/AmpMath/amp_math_exp10_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp10_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::exp10(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_exp10f.cpp b/tests/Unit/AmpMath/amp_math_exp10f.cpp
index 9b7f8e489dc..df81ec74530 100644
--- a/tests/Unit/AmpMath/amp_math_exp10f.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp10f.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::exp10f(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_exp2.cpp b/tests/Unit/AmpMath/amp_math_exp2.cpp
index 9885e1e6ca9..30ad66243d3 100644
--- a/tests/Unit/AmpMath/amp_math_exp2.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp2.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::exp2(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_exp2_precise_math.cpp b/tests/Unit/AmpMath/amp_math_exp2_precise_math.cpp
index ad8d48a112b..a763caa8a43 100644
--- a/tests/Unit/AmpMath/amp_math_exp2_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp2_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::exp2(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_exp2f.cpp b/tests/Unit/AmpMath/amp_math_exp2f.cpp
index 83740e37685..daf80343061 100644
--- a/tests/Unit/AmpMath/amp_math_exp2f.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp2f.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::exp2f(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_exp_precise_math.cpp b/tests/Unit/AmpMath/amp_math_exp_precise_math.cpp
index 5dc2dbd7d4c..c22643b5218 100644
--- a/tests/Unit/AmpMath/amp_math_exp_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp_precise_math.cpp
@@ -5,15 +5,15 @@
 // random failure on fiji. Re-enable it after JIRA
 // ticket 136805 is fixed.
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -41,7 +41,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::exp(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_expf.cpp b/tests/Unit/AmpMath/amp_math_expf.cpp
index 144468201bd..a75ab23e944 100644
--- a/tests/Unit/AmpMath/amp_math_expf.cpp
+++ b/tests/Unit/AmpMath/amp_math_expf.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = fast_math::expf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_expf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_expf_precise_math.cpp
index a99373c9f29..2cb11c0efcc 100644
--- a/tests/Unit/AmpMath/amp_math_expf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_expf_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::expf(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_expm1.cpp b/tests/Unit/AmpMath/amp_math_expm1.cpp
index 917721a6384..37c26d55550 100644
--- a/tests/Unit/AmpMath/amp_math_expm1.cpp
+++ b/tests/Unit/AmpMath/amp_math_expm1.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::expm1(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_expm1_precise_math.cpp b/tests/Unit/AmpMath/amp_math_expm1_precise_math.cpp
index f87a0868a7b..7edd5c46923 100644
--- a/tests/Unit/AmpMath/amp_math_expm1_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_expm1_precise_math.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-4)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::expm1(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_expm1f.cpp b/tests/Unit/AmpMath/amp_math_expm1f.cpp
index 0a9fb795c70..989188c4c6b 100644
--- a/tests/Unit/AmpMath/amp_math_expm1f.cpp
+++ b/tests/Unit/AmpMath/amp_math_expm1f.cpp
@@ -1,13 +1,13 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <amp.h>
-#include <amp_math.h>
+#include <hc.hpp>
+#include <hc_math.hpp>
 #include
 #include
 #include
 #include
 
-using namespace concurrency;
+using namespace hc;
 
 #define ERROR_THRESHOLD (1e-1)
@@ -35,7 +35,7 @@ bool test() {
 
   parallel_for_each(
     e,
-    [=](index<1> idx) restrict(amp) {
+    [=](index<1> idx) [[hc]] {
      gc[idx] = precise_math::expm1f(ga[idx]);
     });
diff --git a/tests/Unit/AmpMath/amp_math_fdim_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fdim_precise_math.cpp
index 54d3fca0af4..2d16239175d 100644
---
a/tests/Unit/AmpMath/amp_math_fdim_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_fdim_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::fdim(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_floor.cpp b/tests/Unit/AmpMath/amp_math_floor.cpp index ba4febc3af5..3fafec888c3 100644 --- a/tests/Unit/AmpMath/amp_math_floor.cpp +++ b/tests/Unit/AmpMath/amp_math_floor.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::floor(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_floor_precise_math.cpp b/tests/Unit/AmpMath/amp_math_floor_precise_math.cpp index 6789260348b..7979f24978d 100644 --- a/tests/Unit/AmpMath/amp_math_floor_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_floor_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::floor(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_floorf.cpp b/tests/Unit/AmpMath/amp_math_floorf.cpp index 8ae5044579c..b0b6aab026c 100644 --- a/tests/Unit/AmpMath/amp_math_floorf.cpp +++ b/tests/Unit/AmpMath/amp_math_floorf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::floorf(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_fma_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fma_precise_math.cpp index 45b785dd691..671e68e9ae3 100644 --- a/tests/Unit/AmpMath/amp_math_fma_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_fma_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -41,7 +41,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gd[idx] = precise_math::fma(ga[idx], gb[idx], gc[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_fmaf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fmaf_precise_math.cpp index 64335954f8a..b6372b0cb85 100644 --- a/tests/Unit/AmpMath/amp_math_fmaf_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_fmaf_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -41,7 +41,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { 
+ [=](index<1> idx) [[hc]] { gd[idx] = precise_math::fmaf(ga[idx], gb[idx], gc[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_fmax.cpp b/tests/Unit/AmpMath/amp_math_fmax.cpp index 1e4a6dfe68c..607e24b4bf9 100644 --- a/tests/Unit/AmpMath/amp_math_fmax.cpp +++ b/tests/Unit/AmpMath/amp_math_fmax.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::fmax(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_fmax_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fmax_precise_math.cpp index e20f9322052..67cdb67443b 100644 --- a/tests/Unit/AmpMath/amp_math_fmax_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_fmax_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::fmax(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_fmaxf.cpp b/tests/Unit/AmpMath/amp_math_fmaxf.cpp index 2cb87c79a90..ffc61ddc861 100644 --- a/tests/Unit/AmpMath/amp_math_fmaxf.cpp +++ b/tests/Unit/AmpMath/amp_math_fmaxf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::fmaxf(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_fmin.cpp b/tests/Unit/AmpMath/amp_math_fmin.cpp index db8d76291c9..fdd7268a429 100644 --- a/tests/Unit/AmpMath/amp_math_fmin.cpp +++ b/tests/Unit/AmpMath/amp_math_fmin.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::fmin(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_fmin_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fmin_precise_math.cpp index 574383baab4..8d29f3e07ac 100644 --- a/tests/Unit/AmpMath/amp_math_fmin_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_fmin_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::fmin(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_fminf.cpp b/tests/Unit/AmpMath/amp_math_fminf.cpp index 693d8396f32..71e29460c01 100644 --- a/tests/Unit/AmpMath/amp_math_fminf.cpp +++ b/tests/Unit/AmpMath/amp_math_fminf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace 
concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::fminf(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_fmod.cpp b/tests/Unit/AmpMath/amp_math_fmod.cpp index 24665d79f93..a8f90298d3a 100644 --- a/tests/Unit/AmpMath/amp_math_fmod.cpp +++ b/tests/Unit/AmpMath/amp_math_fmod.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::fmod(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_fmod_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fmod_precise_math.cpp index 18afafc098a..35e308a1524 100644 --- a/tests/Unit/AmpMath/amp_math_fmod_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_fmod_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::fmod(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_fmodf.cpp b/tests/Unit/AmpMath/amp_math_fmodf.cpp index 8946979e6d1..eca54443b20 100644 --- a/tests/Unit/AmpMath/amp_math_fmodf.cpp +++ b/tests/Unit/AmpMath/amp_math_fmodf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::fmodf(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_hypot_precise_math.cpp b/tests/Unit/AmpMath/amp_math_hypot_precise_math.cpp index 66ae354f455..9ab5e532f78 100644 --- a/tests/Unit/AmpMath/amp_math_hypot_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_hypot_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::hypot(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_ilogb.cpp b/tests/Unit/AmpMath/amp_math_ilogb.cpp index 0a6f00b0316..5812a69ac0a 100644 --- a/tests/Unit/AmpMath/amp_math_ilogb.cpp +++ b/tests/Unit/AmpMath/amp_math_ilogb.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::ilogb(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_ilogb_precise_math.cpp b/tests/Unit/AmpMath/amp_math_ilogb_precise_math.cpp index bf8bb05710f..37d8c8621ad 100644 --- a/tests/Unit/AmpMath/amp_math_ilogb_precise_math.cpp +++ 
b/tests/Unit/AmpMath/amp_math_ilogb_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::ilogb(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_ilogbf.cpp b/tests/Unit/AmpMath/amp_math_ilogbf.cpp index c07d70938c7..bd7de28f7e6 100644 --- a/tests/Unit/AmpMath/amp_math_ilogbf.cpp +++ b/tests/Unit/AmpMath/amp_math_ilogbf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::ilogbf(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_isfinite.cpp b/tests/Unit/AmpMath/amp_math_isfinite.cpp index 5b04df11f22..c3798778b51 100644 --- a/tests/Unit/AmpMath/amp_math_isfinite.cpp +++ b/tests/Unit/AmpMath/amp_math_isfinite.cpp @@ -1,11 +1,11 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -#include +#include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 2; @@ -20,7 +20,7 @@ int main(void) { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { out[idx] = fast_math::isfinite(6.5f/in[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_isfinite_precise_math.cpp b/tests/Unit/AmpMath/amp_math_isfinite_precise_math.cpp index d1e47ffdda9..ed27f4d845f 100644 --- a/tests/Unit/AmpMath/amp_math_isfinite_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_isfinite_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -#include +#include #include #include -using namespace concurrency; +using namespace hc; template bool test() { @@ -25,7 +25,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { out[idx] = precise_math::isfinite(6.5/in[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_isinf.cpp b/tests/Unit/AmpMath/amp_math_isinf.cpp index 3a5508b0d24..c88e2eee8c0 100644 --- a/tests/Unit/AmpMath/amp_math_isinf.cpp +++ b/tests/Unit/AmpMath/amp_math_isinf.cpp @@ -1,12 +1,12 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -#include +#include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 2; @@ -21,7 +21,7 @@ int main(void) { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { out[idx] = fast_math::isinf(6.5f/in[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_isinf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_isinf_precise_math.cpp index d8c225368f2..638de6ff1bb 100644 --- a/tests/Unit/AmpMath/amp_math_isinf_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_isinf_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -#include +#include #include #include -using namespace concurrency; +using namespace hc; template bool test() { @@ -23,7 +23,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { out[idx] = 
precise_math::isinf(6.5/in[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_isnan.cpp b/tests/Unit/AmpMath/amp_math_isnan.cpp index d354857a1c6..fec6d69fa2a 100644 --- a/tests/Unit/AmpMath/amp_math_isnan.cpp +++ b/tests/Unit/AmpMath/amp_math_isnan.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -#include +#include #include #include -using namespace concurrency; +using namespace hc; template bool test() { @@ -24,7 +24,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { out[idx] = fast_math::isnan(in[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_isnan_precise_math.cpp b/tests/Unit/AmpMath/amp_math_isnan_precise_math.cpp index f04f5f5b561..d9bb1851170 100644 --- a/tests/Unit/AmpMath/amp_math_isnan_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_isnan_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -#include +#include #include #include -using namespace concurrency; +using namespace hc; template bool test() { @@ -24,7 +24,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { out[idx] = precise_math::isnan(in[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_isnormal.cpp b/tests/Unit/AmpMath/amp_math_isnormal.cpp index 619884754a3..fca610102fc 100644 --- a/tests/Unit/AmpMath/amp_math_isnormal.cpp +++ b/tests/Unit/AmpMath/amp_math_isnormal.cpp @@ -1,11 +1,11 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -#include +#include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 2; @@ -20,7 +20,7 @@ int main(void) { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { out[idx] = precise_math::isnormal(in[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_isnormal_precise_math.cpp b/tests/Unit/AmpMath/amp_math_isnormal_precise_math.cpp index 1e16a270913..c7781acba7e 100644 --- a/tests/Unit/AmpMath/amp_math_isnormal_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_isnormal_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -#include +#include #include #include -using namespace concurrency; +using namespace hc; template bool test() { @@ -25,7 +25,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { out[idx] = precise_math::isnormal(in[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_ldexp.cpp b/tests/Unit/AmpMath/amp_math_ldexp.cpp index c09d636adb1..e38f9aff1ae 100644 --- a/tests/Unit/AmpMath/amp_math_ldexp.cpp +++ b/tests/Unit/AmpMath/amp_math_ldexp.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gb[idx] = fast_math::ldexp(ga[idx], gexp[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_ldexp_precise_math.cpp b/tests/Unit/AmpMath/amp_math_ldexp_precise_math.cpp index 8acac0b1b19..d3084093adc 100644 --- a/tests/Unit/AmpMath/amp_math_ldexp_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_ldexp_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include 
#include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gb[idx] = precise_math::ldexp(ga[idx], gexp[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_ldexpf.cpp b/tests/Unit/AmpMath/amp_math_ldexpf.cpp index 17d05800cc5..d777d3e03b6 100644 --- a/tests/Unit/AmpMath/amp_math_ldexpf.cpp +++ b/tests/Unit/AmpMath/amp_math_ldexpf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gb[idx] = fast_math::ldexpf(ga[idx], gexp[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_ldexpf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_ldexpf_precise_math.cpp index d28514eb1de..71c3bd4cd1c 100644 --- a/tests/Unit/AmpMath/amp_math_ldexpf_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_ldexpf_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gb[idx] = precise_math::ldexpf(ga[idx], gexp[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_log.cpp b/tests/Unit/AmpMath/amp_math_log.cpp index 41b06c15f68..9b4b814ac6d 100644 --- a/tests/Unit/AmpMath/amp_math_log.cpp +++ b/tests/Unit/AmpMath/amp_math_log.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::log(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_log10.cpp b/tests/Unit/AmpMath/amp_math_log10.cpp index 87bc9046142..e075cbefbf4 100644 --- a/tests/Unit/AmpMath/amp_math_log10.cpp +++ b/tests/Unit/AmpMath/amp_math_log10.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::log10(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_log10_precise_math.cpp b/tests/Unit/AmpMath/amp_math_log10_precise_math.cpp index ab5aaa886e9..eb19cab2a30 100644 --- a/tests/Unit/AmpMath/amp_math_log10_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_log10_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::log10(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_log10f.cpp b/tests/Unit/AmpMath/amp_math_log10f.cpp index cb218e2aa7c..21b3d1489bb 100644 --- a/tests/Unit/AmpMath/amp_math_log10f.cpp +++ 
b/tests/Unit/AmpMath/amp_math_log10f.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::log10f(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_log1p_precise_math.cpp b/tests/Unit/AmpMath/amp_math_log1p_precise_math.cpp index cb3b71095b3..b87d0152527 100644 --- a/tests/Unit/AmpMath/amp_math_log1p_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_log1p_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::log1p(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_log1pf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_log1pf_precise_math.cpp index 5a4b0ab6507..c2fe37042d8 100644 --- a/tests/Unit/AmpMath/amp_math_log1pf_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_log1pf_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::log1pf(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_log2.cpp b/tests/Unit/AmpMath/amp_math_log2.cpp index acf9d22c5fc..b07e8c7634c 100644 --- a/tests/Unit/AmpMath/amp_math_log2.cpp +++ b/tests/Unit/AmpMath/amp_math_log2.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::log2(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_log2_precise_math.cpp b/tests/Unit/AmpMath/amp_math_log2_precise_math.cpp index 07e820c6bc7..53a05051165 100644 --- a/tests/Unit/AmpMath/amp_math_log2_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_log2_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::log2(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_log2f.cpp b/tests/Unit/AmpMath/amp_math_log2f.cpp index a9b89d86ff4..701f2b0e410 100644 --- a/tests/Unit/AmpMath/amp_math_log2f.cpp +++ b/tests/Unit/AmpMath/amp_math_log2f.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::log2f(ga[idx]); }); diff --git 
a/tests/Unit/AmpMath/amp_math_log_precise_math.cpp b/tests/Unit/AmpMath/amp_math_log_precise_math.cpp index 128e4208f2b..605d05ab85f 100644 --- a/tests/Unit/AmpMath/amp_math_log_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_log_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::log(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_logb_precise_math.cpp b/tests/Unit/AmpMath/amp_math_logb_precise_math.cpp index b1fff79aa58..3d1e95f0545 100644 --- a/tests/Unit/AmpMath/amp_math_logb_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_logb_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::logb(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_logbf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_logbf_precise_math.cpp index 8b1e30d44fd..760befeaeaf 100644 --- a/tests/Unit/AmpMath/amp_math_logbf_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_logbf_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::logbf(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_max.cpp b/tests/Unit/AmpMath/amp_math_max.cpp index 9f37cf1aafe..1425ee96dd4 100644 --- a/tests/Unit/AmpMath/amp_math_max.cpp +++ b/tests/Unit/AmpMath/amp_math_max.cpp @@ -2,15 +2,15 @@ #if !DISABLED_PENDING_REMOVAL // RUN: %cxxamp %s -o %t.out && %t.out - #include - #include + #include + #include #include #include #include #include - using namespace concurrency; + using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -42,7 +42,7 @@ parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::max(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_max_precise_math.cpp b/tests/Unit/AmpMath/amp_math_max_precise_math.cpp index 50c16238e12..21c226bbfb8 100644 --- a/tests/Unit/AmpMath/amp_math_max_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_max_precise_math.cpp @@ -2,15 +2,15 @@ #if !DISABLED_PENDING_REMOVAL // RUN: %cxxamp %s -o %t.out && %t.out - #include - #include + #include + #include #include #include #include #include - using namespace concurrency; + using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -42,7 +42,7 @@ parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::max(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_min.cpp b/tests/Unit/AmpMath/amp_math_min.cpp index c85533973a3..a90ac8705b0 100644 --- a/tests/Unit/AmpMath/amp_math_min.cpp +++ b/tests/Unit/AmpMath/amp_math_min.cpp @@ -2,15 +2,15 @@ #if !DISABLED_PENDING_REMOVAL // RUN: %cxxamp %s -o %t.out && %t.out - #include - #include + #include + #include #include #include 
#include #include - using namespace concurrency; + using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -42,7 +42,7 @@ parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::min(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_min_precise_math.cpp b/tests/Unit/AmpMath/amp_math_min_precise_math.cpp index e9f8dbf4caa..bde8bd48892 100644 --- a/tests/Unit/AmpMath/amp_math_min_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_min_precise_math.cpp @@ -2,15 +2,15 @@ #if !DISABLED_PENDING_REMOVAL // RUN: %cxxamp %s -o %t.out && %t.out - #include - #include + #include + #include #include #include #include #include - using namespace concurrency; + using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -42,7 +42,7 @@ parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::min(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_nearbyint_precise_math.cpp b/tests/Unit/AmpMath/amp_math_nearbyint_precise_math.cpp index dc47c0f92e9..227696d658b 100644 --- a/tests/Unit/AmpMath/amp_math_nearbyint_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_nearbyint_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::nearbyint(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_nextafter_precise_math.cpp b/tests/Unit/AmpMath/amp_math_nextafter_precise_math.cpp index afdf61d8a6d..f8f6b33d4fb 100644 --- a/tests/Unit/AmpMath/amp_math_nextafter_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_nextafter_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::nextafter(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_pow.cpp b/tests/Unit/AmpMath/amp_math_pow.cpp index 569ae8fdbdd..dcadf3ef8dc 100644 --- a/tests/Unit/AmpMath/amp_math_pow.cpp +++ b/tests/Unit/AmpMath/amp_math_pow.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::pow(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_pow_precise_math.cpp b/tests/Unit/AmpMath/amp_math_pow_precise_math.cpp index aa785339bff..ba151313935 100644 --- a/tests/Unit/AmpMath/amp_math_pow_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_pow_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::pow(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_powf.cpp 
b/tests/Unit/AmpMath/amp_math_powf.cpp index ca7a39655f2..bb40e3474aa 100644 --- a/tests/Unit/AmpMath/amp_math_powf.cpp +++ b/tests/Unit/AmpMath/amp_math_powf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::powf(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_rcbrt_precise_math.cpp b/tests/Unit/AmpMath/amp_math_rcbrt_precise_math.cpp index db92e750ff6..417c7f5ee03 100644 --- a/tests/Unit/AmpMath/amp_math_rcbrt_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_rcbrt_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::rcbrt(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_rcbrtf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_rcbrtf_precise_math.cpp index 63d9032f477..c8a5e51a649 100644 --- a/tests/Unit/AmpMath/amp_math_rcbrtf_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_rcbrtf_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::rcbrtf(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_remainder_precise_math.cpp b/tests/Unit/AmpMath/amp_math_remainder_precise_math.cpp index 83a935f5432..c5b07f5b0d5 100644 --- a/tests/Unit/AmpMath/amp_math_remainder_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_remainder_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::remainder(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_remainderf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_remainderf_precise_math.cpp index 8acc3ad8fee..97c5620ac71 100644 --- a/tests/Unit/AmpMath/amp_math_remainderf_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_remainderf_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::remainderf(ga[idx], gb[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_round.cpp b/tests/Unit/AmpMath/amp_math_round.cpp index 1e63c0b5524..c7d856d448a 100644 --- a/tests/Unit/AmpMath/amp_math_round.cpp +++ b/tests/Unit/AmpMath/amp_math_round.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include 
-using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::round(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_round_precise_math.cpp b/tests/Unit/AmpMath/amp_math_round_precise_math.cpp index f13a9d22cf0..11684ab785c 100644 --- a/tests/Unit/AmpMath/amp_math_round_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_round_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::round(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_roundf.cpp b/tests/Unit/AmpMath/amp_math_roundf.cpp index 4d5762e7b3c..89a902810ac 100644 --- a/tests/Unit/AmpMath/amp_math_roundf.cpp +++ b/tests/Unit/AmpMath/amp_math_roundf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::roundf(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_rsqrt.cpp b/tests/Unit/AmpMath/amp_math_rsqrt.cpp index 0c72f61cab0..4392363bc22 100644 --- a/tests/Unit/AmpMath/amp_math_rsqrt.cpp +++ b/tests/Unit/AmpMath/amp_math_rsqrt.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::rsqrt(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_rsqrt_precise_math.cpp b/tests/Unit/AmpMath/amp_math_rsqrt_precise_math.cpp index a63f3e973ba..57951c36293 100644 --- a/tests/Unit/AmpMath/amp_math_rsqrt_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_rsqrt_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::rsqrt(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_rsqrtf.cpp b/tests/Unit/AmpMath/amp_math_rsqrtf.cpp index 6d40437afc5..42a09cf3afe 100644 --- a/tests/Unit/AmpMath/amp_math_rsqrtf.cpp +++ b/tests/Unit/AmpMath/amp_math_rsqrtf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::rsqrtf(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_scalb_precise_math.cpp b/tests/Unit/AmpMath/amp_math_scalb_precise_math.cpp index 7c265fb9b7e..cad62bb926e 100644 --- a/tests/Unit/AmpMath/amp_math_scalb_precise_math.cpp +++ 
b/tests/Unit/AmpMath/amp_math_scalb_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gb[idx] = precise_math::scalb(ga[idx], gexp[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_scalbn_precise_math.cpp b/tests/Unit/AmpMath/amp_math_scalbn_precise_math.cpp index 4852197cbff..fb09922fd79 100644 --- a/tests/Unit/AmpMath/amp_math_scalbn_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_scalbn_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gb[idx] = precise_math::scalbn(ga[idx], gexp[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_scalbnf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_scalbnf_precise_math.cpp index 6369c9707d7..41992683df1 100644 --- a/tests/Unit/AmpMath/amp_math_scalbnf_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_scalbnf_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -39,7 +39,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gb[idx] = precise_math::scalbnf(ga[idx], gexp[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_signbit.cpp b/tests/Unit/AmpMath/amp_math_signbit.cpp index 3dccbf56e4e..380682bcded 100644 --- a/tests/Unit/AmpMath/amp_math_signbit.cpp +++ b/tests/Unit/AmpMath/amp_math_signbit.cpp @@ -1,11 +1,11 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -#include +#include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 3; @@ -21,7 +21,7 @@ int main(void) { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { out[idx] = fast_math::signbit(in[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_signbit_precise_math.cpp b/tests/Unit/AmpMath/amp_math_signbit_precise_math.cpp index 0b4626e3d7f..03d41914cf0 100644 --- a/tests/Unit/AmpMath/amp_math_signbit_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_signbit_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -#include +#include #include #include -using namespace concurrency; +using namespace hc; template bool test() { @@ -24,7 +24,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { out[idx] = precise_math::signbit(in[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_signbitf.cpp b/tests/Unit/AmpMath/amp_math_signbitf.cpp index 998d412a52f..4be8d8d012f 100644 --- a/tests/Unit/AmpMath/amp_math_signbitf.cpp +++ b/tests/Unit/AmpMath/amp_math_signbitf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -#include +#include #include #include -using namespace concurrency; +using namespace hc; template bool test() { @@ -24,7 +24,7 @@ bool test() { parallel_for_each( e, - [=](index<1> 
idx) restrict(amp) { + [=](index<1> idx) [[hc]] { out[idx] = fast_math::signbit(in[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_sin.cpp b/tests/Unit/AmpMath/amp_math_sin.cpp index ce55f4a3172..b77a65de3e1 100644 --- a/tests/Unit/AmpMath/amp_math_sin.cpp +++ b/tests/Unit/AmpMath/amp_math_sin.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::sin(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_sin_precise_math.cpp b/tests/Unit/AmpMath/amp_math_sin_precise_math.cpp index 035add0d6f3..26db8298551 100644 --- a/tests/Unit/AmpMath/amp_math_sin_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_sin_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::sin(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_sinf.cpp b/tests/Unit/AmpMath/amp_math_sinf.cpp index 9d7d98fa19b..bdb53ac477b 100644 --- a/tests/Unit/AmpMath/amp_math_sinf.cpp +++ b/tests/Unit/AmpMath/amp_math_sinf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::sinf(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_sinh.cpp b/tests/Unit/AmpMath/amp_math_sinh.cpp index 932c6bb4690..ada64d00a6a 100644 --- a/tests/Unit/AmpMath/amp_math_sinh.cpp +++ b/tests/Unit/AmpMath/amp_math_sinh.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::sinh(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_sinh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_sinh_precise_math.cpp index cc9c804d9fe..b214938bafd 100644 --- a/tests/Unit/AmpMath/amp_math_sinh_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_sinh_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::sinh(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_sinpi_precise_math.cpp b/tests/Unit/AmpMath/amp_math_sinpi_precise_math.cpp index 8971ce71da1..1890e848c35 100644 --- a/tests/Unit/AmpMath/amp_math_sinpi_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_sinpi_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace 
concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::sinpi(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_sinpif_precise_math.cpp b/tests/Unit/AmpMath/amp_math_sinpif_precise_math.cpp index 3efd8b385ac..e1d2b2b8131 100644 --- a/tests/Unit/AmpMath/amp_math_sinpif_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_sinpif_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::sinpif(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_sqrt.cpp b/tests/Unit/AmpMath/amp_math_sqrt.cpp index cea2d5dca96..5846a7b1d75 100644 --- a/tests/Unit/AmpMath/amp_math_sqrt.cpp +++ b/tests/Unit/AmpMath/amp_math_sqrt.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::sqrt(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_sqrt_precise_math.cpp b/tests/Unit/AmpMath/amp_math_sqrt_precise_math.cpp index cbea2cd2758..4102cd6e5cf 100644 --- a/tests/Unit/AmpMath/amp_math_sqrt_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_sqrt_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::sqrt(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_sqrtf.cpp b/tests/Unit/AmpMath/amp_math_sqrtf.cpp index 4c34b4540bf..437b4d87ac9 100644 --- a/tests/Unit/AmpMath/amp_math_sqrtf.cpp +++ b/tests/Unit/AmpMath/amp_math_sqrtf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::sqrtf(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_tan.cpp b/tests/Unit/AmpMath/amp_math_tan.cpp index 5562d13deba..2b2ca7a0f3b 100644 --- a/tests/Unit/AmpMath/amp_math_tan.cpp +++ b/tests/Unit/AmpMath/amp_math_tan.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::tan(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_tan_precise_math.cpp b/tests/Unit/AmpMath/amp_math_tan_precise_math.cpp index 85842147ae6..5bae874aa49 100644 --- a/tests/Unit/AmpMath/amp_math_tan_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_tan_precise_math.cpp @@ -1,13 
+1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::tan(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_tanf.cpp b/tests/Unit/AmpMath/amp_math_tanf.cpp index c924fde18d1..ccb20c00a3d 100644 --- a/tests/Unit/AmpMath/amp_math_tanf.cpp +++ b/tests/Unit/AmpMath/amp_math_tanf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::tanf(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_tanh.cpp b/tests/Unit/AmpMath/amp_math_tanh.cpp index ac4617d3548..24c5b1c156c 100644 --- a/tests/Unit/AmpMath/amp_math_tanh.cpp +++ b/tests/Unit/AmpMath/amp_math_tanh.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -36,7 +36,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = fast_math::tanh(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_tanh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_tanh_precise_math.cpp index a0cb3c2ba74..7878e43976e 100644 --- a/tests/Unit/AmpMath/amp_math_tanh_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_tanh_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -36,7 +36,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::tanh(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_tanpi_precise_math.cpp b/tests/Unit/AmpMath/amp_math_tanpi_precise_math.cpp index 981bfd621ec..2afbda1df50 100644 --- a/tests/Unit/AmpMath/amp_math_tanpi_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_tanpi_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -34,7 +34,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::tanpi(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_tgamma_precise_math.cpp b/tests/Unit/AmpMath/amp_math_tgamma_precise_math.cpp index 502a5f68729..9228636cc93 100644 --- a/tests/Unit/AmpMath/amp_math_tgamma_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_tgamma_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-2) @@ -36,7 +36,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::tgamma(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_tgammaf_precise_math.cpp 
b/tests/Unit/AmpMath/amp_math_tgammaf_precise_math.cpp index 37493a3bfba..3264f253573 100644 --- a/tests/Unit/AmpMath/amp_math_tgammaf_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_tgammaf_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -36,7 +36,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::tgammaf(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_trunc.cpp b/tests/Unit/AmpMath/amp_math_trunc.cpp index c04a0901730..2f9006c319b 100644 --- a/tests/Unit/AmpMath/amp_math_trunc.cpp +++ b/tests/Unit/AmpMath/amp_math_trunc.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::trunc(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_trunc_precise_math.cpp b/tests/Unit/AmpMath/amp_math_trunc_precise_math.cpp index d758d95a439..be0507ae015 100644 --- a/tests/Unit/AmpMath/amp_math_trunc_precise_math.cpp +++ b/tests/Unit/AmpMath/amp_math_trunc_precise_math.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-4) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::trunc(ga[idx]); }); diff --git a/tests/Unit/AmpMath/amp_math_truncf.cpp b/tests/Unit/AmpMath/amp_math_truncf.cpp index c6ea2daef5b..dcdb8dcca17 100644 --- a/tests/Unit/AmpMath/amp_math_truncf.cpp +++ b/tests/Unit/AmpMath/amp_math_truncf.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; #define ERROR_THRESHOLD (1e-1) @@ -35,7 +35,7 @@ bool test() { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = precise_math::truncf(ga[idx]); }); diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_2files.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_2files.cpp index 68e500bad98..d6dc9caffa5 100644 --- a/tests/Unit/AmpShortVectors/amp_short_vectors_2files.cpp +++ b/tests/Unit/AmpShortVectors/amp_short_vectors_2files.cpp @@ -10,10 +10,10 @@ void add(const array_view &gbIn,const array_view &gbOut) { - Concurrency::extent<2> grdExt(64, 1); - Concurrency::tiled_extent<64, 1> t_ext(grdExt); + hc::extent<2> grdExt(64, 1); + hc::tiled_extent<64, 1> t_ext(grdExt); - Concurrency::parallel_for_each(t_ext, [=] (Concurrency::tiled_index<64,1> tidx) restrict(amp) + hc::parallel_for_each(t_ext, [=] (hc::tiled_index<64,1> tidx) [[hc]] { unsigned int me = tidx.global[0]; @@ -39,8 +39,8 @@ int main() gbOut[i].set_y(i + 1); } - const Concurrency::array_view gbInA(64, gbIn); - const Concurrency::array_view gbOutAB(64, gbOut); + const hc::array_view gbInA(64, gbIn); + const hc::array_view gbOutAB(64, gbOut); add(gbInA, gbOutAB); @@ -59,14 +59,14 @@ int main() #else #include "amp_short_vectors_2files.h" -concurrency::array_view *gbOutA; +hc::array_view 
*gbOutA; void sub(const array_view &gbIn,const array_view &gbOut) { - Concurrency::extent<2> grdExt(64, 1); - Concurrency::tiled_extent<64, 1> t_ext(grdExt); + hc::extent<2> grdExt(64, 1); + hc::tiled_extent<64, 1> t_ext(grdExt); - Concurrency::parallel_for_each(t_ext, [=] (Concurrency::tiled_index<64,1> tidx) restrict(amp) + hc::parallel_for_each(t_ext, [=] (hc::tiled_index<64,1> tidx) [[hc]] { unsigned int me = tidx.global[0]; diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_2files.h b/tests/Unit/AmpShortVectors/amp_short_vectors_2files.h index 60b497c56ad..54baa7d7ff6 100644 --- a/tests/Unit/AmpShortVectors/amp_short_vectors_2files.h +++ b/tests/Unit/AmpShortVectors/amp_short_vectors_2files.h @@ -1,11 +1,11 @@ -#include -#include +#include +#include #include -using namespace Concurrency; -using namespace Concurrency::graphics; +using namespace hc; +using namespace hc::graphics; -extern Concurrency::array_view *gbOutA; +extern hc::array_view *gbOutA; extern void add(const array_view &gbIn,const array_view &gbOut); diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_2files_1.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_2files_1.cpp index 54f7f5a95fb..246d465c681 100644 --- a/tests/Unit/AmpShortVectors/amp_short_vectors_2files_1.cpp +++ b/tests/Unit/AmpShortVectors/amp_short_vectors_2files_1.cpp @@ -10,10 +10,10 @@ void add(const array_view &gbIn,const array_view &gbOut) { - Concurrency::extent<2> grdExt(64, 1); - Concurrency::tiled_extent<64, 1> t_ext(grdExt); + hc::extent<2> grdExt(64, 1); + hc::tiled_extent<64, 1> t_ext(grdExt); - Concurrency::parallel_for_each(t_ext, [=] (Concurrency::tiled_index<64,1> tidx) restrict(amp) + hc::parallel_for_each(t_ext, [=] (hc::tiled_index<64,1> tidx) [[hc]] { unsigned int me = tidx.global[0]; @@ -39,8 +39,8 @@ int main() gbOut[i].y = i + 1; } - const Concurrency::array_view gbInA(64, gbIn); - const Concurrency::array_view gbOutAB(64, gbOut); + const hc::array_view gbInA(64, gbIn); + const hc::array_view gbOutAB(64, gbOut); add(gbInA, gbOutAB); @@ -59,14 +59,14 @@ int main() #else #include "amp_short_vectors_2files.h" -concurrency::array_view *gbOutA; +hc::array_view *gbOutA; void sub(const array_view &gbIn,const array_view &gbOut) { - Concurrency::extent<2> grdExt(64, 1); - Concurrency::tiled_extent<64, 1> t_ext(grdExt); + hc::extent<2> grdExt(64, 1); + hc::tiled_extent<64, 1> t_ext(grdExt); - Concurrency::parallel_for_each(t_ext, [=] (Concurrency::tiled_index<64,1> tidx) restrict(amp) + hc::parallel_for_each(t_ext, [=] (hc::tiled_index<64,1> tidx) [[hc]] { unsigned int me = tidx.global[0]; diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_double_3_addon.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_double_3_addon.cpp index 662191dd75f..a14ca631f3c 100644 --- a/tests/Unit/AmpShortVectors/amp_short_vectors_double_3_addon.cpp +++ b/tests/Unit/AmpShortVectors/amp_short_vectors_double_3_addon.cpp @@ -1,14 +1,14 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include -using namespace concurrency; -using namespace concurrency::graphics; +using namespace hc; +using namespace hc::graphics; int main(void) { // Two-component Access - // double_2 get_Sxz() const restrict(cpu, amp); + // double_2 get_Sxz() const [[cpu, hc]]; { double a = 1.2f, b = 3.4f, c = -5.6f; double_3 d(a, b, c); @@ -25,7 +25,7 @@ int main(void) { assert(e == f); } - // void set_Sxz(double_2 v) restrict(cpu, amp); + // void set_Sxz(double_2 v) [[cpu, hc]]; { double a = 1.2f, b = 3.4f, c = -5.6f; double_3 d(a, b, c), e(a, b, b); @@ -44,7 +44,7 
@@ int main(void) { // Three-component Access - // double_3 get_Sxyz() const restrict(cpu, amp); + // double_3 get_Sxyz() const [[cpu, hc]]; { double a = -1.2f, b = 3.4f, c = -5.6f; double_3 d(a, b, c), e(a, b, c), f; @@ -87,7 +87,7 @@ int main(void) { assert(e == f); } - // void set_Sxyz(double_3 v) restrict(cpu, amp); + // void set_Sxyz(double_3 v) [[cpu, hc]]; { double a = -1.2f, b = 3.4f, c = -5.6f; double_3 d(a, b, c), e(a, b, c), f; diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_float_2_addon.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_float_2_addon.cpp index 226fe9b1a96..f95035ab934 100644 --- a/tests/Unit/AmpShortVectors/amp_short_vectors_float_2_addon.cpp +++ b/tests/Unit/AmpShortVectors/amp_short_vectors_float_2_addon.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include -using namespace concurrency; -using namespace concurrency::graphics; +using namespace hc; +using namespace hc::graphics; int main(void) { - // float_2 operator=(const float_2& other) restrict(cpu, amp); + // float_2 operator=(const float_2& other) [[cpu, hc]]; { float_2 a(1.0f); float_2 b = a; @@ -16,7 +16,7 @@ int main(void) { // Unary Negation - // float_2 operator-() const restrict(cpu, amp); + // float_2 operator-() const [[cpu, hc]]; { float a = 2.0f; float b = -a; diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_int_4_addon.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_int_4_addon.cpp index 5918d81e0aa..5b55340ca57 100644 --- a/tests/Unit/AmpShortVectors/amp_short_vectors_int_4_addon.cpp +++ b/tests/Unit/AmpShortVectors/amp_short_vectors_int_4_addon.cpp @@ -1,14 +1,14 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include -using namespace concurrency; -using namespace concurrency::graphics; +using namespace hc; +using namespace hc::graphics; int main(void) { // Three-component Access - // int_3 get_Sxyw() const restrict(cpu, amp); + // int_3 get_Sxyw() const [[cpu, hc]]; { int a = -1, b = 2, c = -3, d = 4; int_4 e(a, b, c, d); @@ -57,7 +57,7 @@ int main(void) { assert(f == g); } - // void set_Sxyw() restrict(cpu, amp); + // void set_Sxyw() [[cpu, hc]]; { int a = -1, b = 2, c = -3, d = 4; int_4 e(a, b, c, d), f(a, b, c, c); @@ -108,7 +108,7 @@ int main(void) { // Four-component Access - // int_4 get_Sxyzw() const restrict(cpu, amp); + // int_4 get_Sxyzw() const [[cpu, hc]]; { int a = -1, b = 2, c = -3, d = 4; int_4 e(a, b, c, d), f(a, b, c, d), g; @@ -193,7 +193,7 @@ int main(void) { assert(f == g); } - // void set_Sxyzw(int_4 v) restrict(cpu, amp); + // void set_Sxyzw(int_4 v) [[cpu, hc]]; { int a = -1, b = 2, c = -3, d = 4; int_4 e(a, b, c, d), f(a, b, c, d), g; diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_norm.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_norm.cpp index 6ac0e0f6cc7..fe131acba93 100644 --- a/tests/Unit/AmpShortVectors/amp_short_vectors_norm.cpp +++ b/tests/Unit/AmpShortVectors/amp_short_vectors_norm.cpp @@ -1,40 +1,40 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include -using namespace concurrency; -using namespace concurrency::graphics; +using namespace hc; +using namespace hc::graphics; int main(void) { // Constructor - // norm() restrict(cpu, amp); + // norm() [[cpu, hc]]; { norm a; } - // explicit norm(float v) restrict(cpu, amp); + // explicit norm(float v) [[cpu, hc]]; { norm a(-2.0f), b(-1.0f), c(-0.5f), d(0.0f), e(0.5f), f(1.0f), g(2.0f); assert(a == b); assert(f == g); } - // explicit norm(unsigned int v) restrict(cpu, amp); + // explicit 
norm(unsigned int v) [[cpu, hc]]; { norm a(0u), b(1u), c(2u); assert(b == c); } - // explicit norm(int v) restrict(cpu, amp); + // explicit norm(int v) [[cpu, hc]]; { norm a(-2), b(-1), c(0), d(1), e(2); assert(a == b); assert(d == e); } - // explicit norm(double v) restrict(cpu, amp); + // explicit norm(double v) [[cpu, hc]]; { double a = -2.0f, b = -1.0f, c = -0.5f, d = 0.0f, e = 0.5f, f = 1.0f, g = 2.0f; norm h(a), i(b), j(c), k(d), l(e), m(f), n(g); @@ -42,35 +42,35 @@ int main(void) { assert(m == n); } - // norm(const norm& other) restrict(cpu, amp); + // norm(const norm& other) [[cpu, hc]]; { norm a(-0.3f); norm b(a); assert(a == b); } - // norm(const unorm& other) restrict(cpu, amp); + // norm(const unorm& other) [[cpu, hc]]; { unorm a(0.4f); norm b(a); assert(a == b); } - // norm& operator=(const norm& other) restrict(cpu, amp); + // norm& operator=(const norm& other) [[cpu, hc]]; { norm a(0.8f), b; b = a; assert(a == b); } - // operator float(void) const restrict(cpu, amp); + // operator float(void) const [[cpu, hc]]; { norm a(0.8f); float b = static_cast<float>(a); assert(b == 0.8f); } - // norm& operator+=(const norm& other) restrict(cpu, amp); + // norm& operator+=(const norm& other) [[cpu, hc]]; { norm a(0.8f), b(0.4f); a += b; @@ -78,7 +78,7 @@ int main(void) { assert(c == 1.0f); } - // norm& operator-=(const norm& other) restrict(cpu, amp); + // norm& operator-=(const norm& other) [[cpu, hc]]; { norm a(0.8f); a -= a; @@ -86,7 +86,7 @@ int main(void) { assert(b == 0.0f); } - // norm& operator*=(const norm& other) restrict(cpu, amp); + // norm& operator*=(const norm& other) [[cpu, hc]]; { norm a(1.0f), b(2.0f); a *= b; @@ -94,7 +94,7 @@ int main(void) { assert(c == 1.0f); } - // norm& operator/=(const norm& other) restrict(cpu, amp); + // norm& operator/=(const norm& other) [[cpu, hc]]; { norm a(1.0f), b(-1.0f); a /= b; @@ -102,7 +102,7 @@ int main(void) { assert(c == -1.0f); } - // norm& operator++() restrict(cpu, amp); + // norm& operator++() [[cpu, hc]]; { norm a(0.5f); ++a; @@ -110,7 +110,7 @@ int main(void) { assert(b == 1.0f); } - // norm& operator++(int) restrict(cpu, amp); + // norm& operator++(int) [[cpu, hc]]; { norm a(0.5f); a++; @@ -118,7 +118,7 @@ int main(void) { assert(b == 1.0f); } - // norm& operator--() restrict(cpu, amp); + // norm& operator--() [[cpu, hc]]; { norm a(-0.5f); --a; @@ -126,7 +126,7 @@ int main(void) { assert(b == -1.0f); } - // norm& operator--(int) restrict(cpu, amp); + // norm& operator--(int) [[cpu, hc]]; { norm a(-0.5f); a--; @@ -134,14 +134,14 @@ int main(void) { assert(b == -1.0f); } - // norm operator-() restrict(cpu, amp); + // norm operator-() [[cpu, hc]]; { norm a(-2.0f); float b = static_cast<float>(-a); assert(b == 1.0f); } - // norm operator+(const norm& lhs, const norm& rhs) restrict(cpu, amp); + // norm operator+(const norm& lhs, const norm& rhs) [[cpu, hc]]; { norm a(0.5f), b(0.6f); norm c = a + b; @@ -149,7 +149,7 @@ int main(void) { assert(d == 1.0f); } - // norm operator-(const norm& lhs, const norm& rhs) restrict(cpu, amp); + // norm operator-(const norm& lhs, const norm& rhs) [[cpu, hc]]; { norm a(0.5f), b(0.5f); norm c = a - b; @@ -157,7 +157,7 @@ int main(void) { assert(d == 0.0f); } - // norm operator*(const norm& lhs, const norm& rhs) restrict(cpu, amp); + // norm operator*(const norm& lhs, const norm& rhs) [[cpu, hc]]; { norm a(1.0f), b(-1.0f); norm c = a * b; @@ -165,7 +165,7 @@ int main(void) { assert(d == -1.0f); } - // norm operator/(const norm& lhs, const norm& rhs) restrict(cpu, amp); + // norm operator/(const norm& lhs,
const norm& rhs) [[cpu, hc]]; { norm a(1.0f), b(-1.0f); norm c = a / b; @@ -173,38 +173,38 @@ int main(void) { assert(d == -1.0f); } - // bool operator==(const norm& lhs, const norm& rhs) restrict(cpu, amp); + // bool operator==(const norm& lhs, const norm& rhs) [[cpu, hc]]; { norm a(0.5f), b(0.5f); assert(a == b); } - // bool operator!=(const norm& lhs, const norm& rhs) restrict(cpu, amp); + // bool operator!=(const norm& lhs, const norm& rhs) [[cpu, hc]]; { norm a(0.5f), b(0.6f); assert(a != b); } - // bool operator>(const norm& lhs, const norm& rhs) restrict(cpu, amp); + // bool operator>(const norm& lhs, const norm& rhs) [[cpu, hc]]; { norm a(0.6f), b(-0.7f); assert(a > b); } - // bool operator<(const norm& lhs, const norm& rhs) restrict(cpu, amp); + // bool operator<(const norm& lhs, const norm& rhs) [[cpu, hc]]; { norm a(-0.6f), b(2.0f); assert(a < b); } - // bool operator>=(const norm& lhs, const norm& rhs) restrict(cpu, amp); + // bool operator>=(const norm& lhs, const norm& rhs) [[cpu, hc]]; { norm a(0.6f), b(-0.4f), c(-0.4f); assert(a >= b); assert(b >= c); } - // bool operator<=(const norm& lhs, const norm& rhs) restrict(cpu, amp); + // bool operator<=(const norm& lhs, const norm& rhs) [[cpu, hc]]; { norm a(0.6f), b(1.5f), c(2.0f); assert(a <= b); @@ -258,7 +258,7 @@ int main(void) { } parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = -ga[idx]; gc[idx] += (ga[idx] + gb[idx]); gc[idx] -= (ga[idx] - gb[idx]); diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector.cpp index 2b8be36888e..71be8401aae 100644 --- a/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector.cpp +++ b/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include -using namespace concurrency; -using namespace concurrency::graphics; +using namespace hc; +using namespace hc::graphics; // type trait and helper function template struct is_same diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector_traits.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector_traits.cpp index 40b4b2594db..2fe97451ae5 100644 --- a/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector_traits.cpp +++ b/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector_traits.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include -using namespace concurrency; -using namespace concurrency::graphics; +using namespace hc; +using namespace hc::graphics; // type trait and helper function template struct is_same diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_uint_2_addon.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_uint_2_addon.cpp index 43a83545d6b..88f6ae1b44c 100644 --- a/tests/Unit/AmpShortVectors/amp_short_vectors_uint_2_addon.cpp +++ b/tests/Unit/AmpShortVectors/amp_short_vectors_uint_2_addon.cpp @@ -1,14 +1,14 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include -using namespace concurrency; -using namespace concurrency::graphics; +using namespace hc; +using namespace hc::graphics; int main(void) { // More Integer Operators - // uint_2 operator~() const restrict(cpu, amp); + // uint_2 operator~() const [[cpu, hc]]; { unsigned int a = 5u; unsigned int b = ~a; @@ -17,7 +17,7 @@ int main(void) { assert(d == e); } - // uint_2& operator%=(const uint_2& rhs) restrict(cpu, amp); + // uint_2& operator%=(const uint_2& rhs) [[cpu, hc]]; {
unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); @@ -27,7 +27,7 @@ int main(void) { assert(c == e); } - // uint_2& operator^=(const uint_2& rhs) restrict(cpu, amp); + // uint_2& operator^=(const uint_2& rhs) [[cpu, hc]]; { unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); @@ -37,7 +37,7 @@ int main(void) { assert(c == e); } - // uint_2& operator|=(const uint_2& rhs) restrict(cpu, amp); + // uint_2& operator|=(const uint_2& rhs) [[cpu, hc]]; { unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); @@ -47,7 +47,7 @@ int main(void) { assert(c == e); } - // uint_2& operator&=(const uint_2& rhs) restrict(cpu, amp); + // uint_2& operator&=(const uint_2& rhs) [[cpu, hc]]; { unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); @@ -57,7 +57,7 @@ int main(void) { assert(c == e); } - // uint_2& operator>>=(const uint_2& rhs) restrict(cpu, amp); + // uint_2& operator>>=(const uint_2& rhs) [[cpu, hc]]; { unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); @@ -67,7 +67,7 @@ int main(void) { assert(c == e); } - // uint_2& operator<<=(const uint_2& rhs) restrict(cpu, amp); + // uint_2& operator<<=(const uint_2& rhs) [[cpu, hc]]; { unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); @@ -78,7 +78,7 @@ int main(void) { } // uint_2 operator%(const uint_2& lhs, const uint_2& rhs) - // restrict(cpu, amp); + // [[cpu, hc]]; { unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); @@ -88,7 +88,7 @@ int main(void) { } // uint_2 operator^(const uint_2& lhs, const uint_2& rhs) - // restrict(cpu, amp); + // [[cpu, hc]]; { unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); @@ -98,7 +98,7 @@ int main(void) { } // uint_2 operator|(const uint_2& lhs, const uint_2& rhs) - // restrict(cpu, amp); + // [[cpu, hc]]; { unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); @@ -108,7 +108,7 @@ int main(void) { } // uint_2 operator&(const uint_2& lhs, const uint_2& rhs) - // restrict(cpu, amp); + // [[cpu, hc]]; { unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); @@ -118,7 +118,7 @@ int main(void) { } // uint_2 operator<<(const uint_2& lhs, const uint_2& rhs) - // restrict(cpu, amp); + // [[cpu, hc]]; { unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); @@ -128,7 +128,7 @@ int main(void) { } // uint_2 operator>>(const uint_2& lhs, const uint_2& rhs) - // restrict(cpu, amp); + // [[cpu, hc]]; { unsigned int a = 5u, b = 10u; uint_2 c(a), d(b); diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_unorm.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_unorm.cpp index 61dfcc06c34..cfb48f33757 100644 --- a/tests/Unit/AmpShortVectors/amp_short_vectors_unorm.cpp +++ b/tests/Unit/AmpShortVectors/amp_short_vectors_unorm.cpp @@ -1,40 +1,40 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include +#include #include -using namespace concurrency; -using namespace concurrency::graphics; +using namespace hc; +using namespace hc::graphics; int main(void) { // Constructor - // unorm() restrict(cpu, amp); + // unorm() [[cpu, hc]]; { unorm a; } - // explicit unorm(float v) restrict(cpu, amp); + // explicit unorm(float v) [[cpu, hc]]; { unorm a(-0.5f), b(0.0f), c(0.5f), d(1.0f), e(2.0f); assert(a == b); assert(d == e); } - // explicit unorm(unsigned int v) restrict(cpu, amp); + // explicit unorm(unsigned int v) [[cpu, hc]]; { unorm a(0u), b(1u), c(2u); assert(b == c); } - // explicit unorm(int v) restrict(cpu, amp); + // explicit unorm(int v) [[cpu, hc]]; { unorm a(-1), b(0), c(1), d(2); assert(a == b); assert(c == d); } - // explicit unorm(double v) restrict(cpu, amp); + // explicit unorm(double v) [[cpu, hc]]; { double a = -0.5f, b = 0.0f, c = 0.5f, d = 1.0f, 
e = 2.0f; unorm f(a), g(b), h(c), i(d), j(e); @@ -42,14 +42,14 @@ int main(void) { assert(i == j); } - // unorm(const unorm& other) restrict(cpu, amp); + // unorm(const unorm& other) [[cpu, hc]]; { unorm a(0.3f); unorm b(a); assert(a == b); } - // explicit unorm(const norm& other) restrict(cpu, amp); + // explicit unorm(const norm& other) [[cpu, hc]]; { norm a(0.4f), b(-0.3f); unorm c(a), d(b); @@ -57,21 +57,21 @@ int main(void) { assert(b != d); } - // unorm& operator=(const unorm& other) restrict(cpu, amp); + // unorm& operator=(const unorm& other) [[cpu, hc]]; { unorm a(0.8f), b; b = a; assert(a == b); } - // operator float(void) const restrict(cpu, amp); + // operator float(void) const [[cpu, hc]]; { unorm a(0.8f); float b = static_cast<float>(a); assert(b == 0.8f); } - // unorm& operator+=(const unorm& other) restrict(cpu, amp); + // unorm& operator+=(const unorm& other) [[cpu, hc]]; { unorm a(0.8f), b(0.4f); a += b; @@ -79,7 +79,7 @@ int main(void) { assert(c == 1.0f); } - // unorm& operator-=(const unorm& other) restrict(cpu, amp); + // unorm& operator-=(const unorm& other) [[cpu, hc]]; { unorm a(0.8f); a -= a; @@ -87,7 +87,7 @@ int main(void) { assert(b == 0.0f); } - // unorm& operator*=(const unorm& other) restrict(cpu, amp); + // unorm& operator*=(const unorm& other) [[cpu, hc]]; { unorm a(1.0f), b(2.0f); a *= b; @@ -95,7 +95,7 @@ int main(void) { assert(c == 1.0f); } - // unorm& operator/=(const unorm& other) restrict(cpu, amp); + // unorm& operator/=(const unorm& other) [[cpu, hc]]; { unorm a(1.0f), b(2.0f); a /= b; @@ -103,7 +103,7 @@ int main(void) { assert(c == 1.0f); } - // unorm& operator++() restrict(cpu, amp); + // unorm& operator++() [[cpu, hc]]; { unorm a(0.5f); ++a; @@ -111,7 +111,7 @@ int main(void) { assert(b == 1.0f); } - // unorm& operator++(int) restrict(cpu, amp); + // unorm& operator++(int) [[cpu, hc]]; { unorm a(0.5f); a++; @@ -119,7 +119,7 @@ int main(void) { assert(b == 1.0f); } - // unorm& operator--() restrict(cpu, amp); + // unorm& operator--() [[cpu, hc]]; { unorm a(0.5f); --a; @@ -127,7 +127,7 @@ int main(void) { assert(b == 0.0f); } - // unorm& operator--(int) restrict(cpu, amp); + // unorm& operator--(int) [[cpu, hc]]; { unorm a(0.5f); a--; @@ -135,7 +135,7 @@ int main(void) { assert(b == 0.0f); } - // unorm operator+(const unorm& lhs, const unorm& rhs) restrict(cpu, amp); + // unorm operator+(const unorm& lhs, const unorm& rhs) [[cpu, hc]]; { unorm a(0.5f), b(0.6f); unorm c = a + b; @@ -143,7 +143,7 @@ int main(void) { assert(d == 1.0f); } - // unorm operator-(const unorm& lhs, const unorm& rhs) restrict(cpu, amp); + // unorm operator-(const unorm& lhs, const unorm& rhs) [[cpu, hc]]; { unorm a(0.5f), b(0.5f); unorm c = a - b; @@ -151,7 +151,7 @@ int main(void) { assert(d == 0.0f); } - // unorm operator*(const unorm& lhs, const unorm& rhs) restrict(cpu, amp); + // unorm operator*(const unorm& lhs, const unorm& rhs) [[cpu, hc]]; { unorm a(1.0f), b(-1.0f); unorm c = a * b; @@ -159,7 +159,7 @@ int main(void) { assert(d == 0.0f); } - // unorm operator/(const unorm& lhs, const unorm& rhs) restrict(cpu, amp); + // unorm operator/(const unorm& lhs, const unorm& rhs) [[cpu, hc]]; { unorm a(1.0f), b(0.5f); unorm c = a / b; @@ -167,38 +167,38 @@ int main(void) { assert(d == 1.0f); } - // bool operator==(const unorm& lhs, const unorm& rhs) restrict(cpu, amp); + // bool operator==(const unorm& lhs, const unorm& rhs) [[cpu, hc]]; { unorm a(0.5f), b(0.5f); assert(a == b); } - // bool operator!=(const unorm& lhs, const unorm& rhs) restrict(cpu, amp); + // bool
operator!=(const unorm& lhs, const unorm& rhs) [[cpu, hc]]; { unorm a(0.5f), b(0.6f); assert(a != b); } - // bool operator>(const unorm& lhs, const unorm& rhs) restrict(cpu, amp); + // bool operator>(const unorm& lhs, const unorm& rhs) [[cpu, hc]]; { unorm a(0.6f), b(0.3f); assert(a > b); } - // bool operator<(const unorm& lhs, const unorm& rhs) restrict(cpu, amp); + // bool operator<(const unorm& lhs, const unorm& rhs) [[cpu, hc]]; { unorm a(-0.6f), b(2.0f); assert(a < b); } - // bool operator>=(const unorm& lhs, const unorm& rhs) restrict(cpu, amp); + // bool operator>=(const unorm& lhs, const unorm& rhs) [[cpu, hc]]; { unorm a(0.6f), b(-0.4f), c(-0.4f); assert(a >= b); assert(b >= c); } - // bool operator<=(const unorm& lhs, const unorm& rhs) restrict(cpu, amp); + // bool operator<=(const unorm& lhs, const unorm& rhs) [[cpu, hc]]; { unorm a(0.6f), b(1.5f), c(2.0f); assert(a <= b); @@ -251,7 +251,7 @@ int main(void) { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = ga[idx]; gc[idx] += (ga[idx] + gb[idx]); gc[idx] -= (ga[idx] - gb[idx]); diff --git a/tests/Unit/AmpShortVectors/hc_short_vector_device.cpp b/tests/Unit/AmpShortVectors/hc_short_vector_device.cpp index 5c33fec597d..956187fdb5c 100644 --- a/tests/Unit/AmpShortVectors/hc_short_vector_device.cpp +++ b/tests/Unit/AmpShortVectors/hc_short_vector_device.cpp @@ -11,7 +11,7 @@ template bool test_norm() { extent<1> ex(GRID_SIZE); array_view av(GRID_SIZE); - parallel_for_each(ex, [=](index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=](index<1>& idx) [[hc]] { T val; av[idx] = (int)val; }).wait(); @@ -24,7 +24,7 @@ template bool test() { extent<1> ex(GRID_SIZE); array_view av(GRID_SIZE); - parallel_for_each(ex, [=](index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=](index<1>& idx) [[hc]] { T val; av[idx] = (int)(val.get_x()); }).wait(); diff --git a/tests/Unit/AsyncPFE/accelerator_view_wait.cpp b/tests/Unit/AsyncPFE/accelerator_view_wait.cpp index da1bfa74e50..61ee2fffd13 100644 --- a/tests/Unit/AsyncPFE/accelerator_view_wait.cpp +++ b/tests/Unit/AsyncPFE/accelerator_view_wait.cpp @@ -25,7 +25,7 @@ hc::completion_future execute(hc::array_view& av1, hc::array_view& av2, hc::array_view& av3) { // run HC parallel_for_each - return hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) restrict(amp) { + return hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) [[hc]] { for (int i = 0; i < LOOP_COUNT; ++i) { av3(idx) = av1(idx) + av2(idx); } diff --git a/tests/Unit/AsyncPFE/accelerator_view_wait2.cpp b/tests/Unit/AsyncPFE/accelerator_view_wait2.cpp index f3fcda0393e..7b055dd14d3 100644 --- a/tests/Unit/AsyncPFE/accelerator_view_wait2.cpp +++ b/tests/Unit/AsyncPFE/accelerator_view_wait2.cpp @@ -15,7 +15,7 @@ void execute(hc::array_view& av1, hc::array_view& av2, hc::array_view& av3) { // run HC parallel_for_each - hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) [[hc]] { for (int i = 0; i < LOOP_COUNT; ++i) { av3(idx) = av1(idx) + av2(idx); } diff --git a/tests/Unit/AsyncPFE/accelerator_view_wait3.cpp b/tests/Unit/AsyncPFE/accelerator_view_wait3.cpp index ecec1765b78..69f4ee939a6 100644 --- a/tests/Unit/AsyncPFE/accelerator_view_wait3.cpp +++ b/tests/Unit/AsyncPFE/accelerator_view_wait3.cpp @@ -16,7 +16,7 @@ void execute(hc::array_view& av1, 
hc::array_view& av2, hc::array_view& av3) { // run HC parallel_for_each - hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) [[hc]] { for (int i = 0; i < LOOP_COUNT; ++i) { av3(idx) = av1(idx) + av2(idx); } diff --git a/tests/Unit/AsyncPFE/async_array_add.cpp b/tests/Unit/AsyncPFE/async_array_add.cpp index c6ed9dc214d..0fb8ec0a85f 100644 --- a/tests/Unit/AsyncPFE/async_array_add.cpp +++ b/tests/Unit/AsyncPFE/async_array_add.cpp @@ -36,7 +36,7 @@ bool test() { hc::extent<1> e(vecSize); hc::completion_future fut = hc::parallel_for_each( e, - [=](hc::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { for (int i = 0; i < LOOP_COUNT; ++i) p_c[idx[0]] = p_a[idx[0]] + p_b[idx[0]]; diff --git a/tests/Unit/AsyncPFE/async_array_add_2d.cpp b/tests/Unit/AsyncPFE/async_array_add_2d.cpp index 5672a76d5c8..622a6d015ad 100644 --- a/tests/Unit/AsyncPFE/async_array_add_2d.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_2d.cpp @@ -37,7 +37,7 @@ bool test() { hc::extent<2> e(dimSize, dimSize); hc::completion_future fut = hc::parallel_for_each( e, - [=](hc::index<2> idx) restrict(amp) { + [=](hc::index<2> idx) [[hc]] { int fidx = idx[0] * dimSize + idx[1]; for (int i = 0; i < LOOP_COUNT; ++i) p_c[fidx] = p_a[fidx] + p_b[fidx]; diff --git a/tests/Unit/AsyncPFE/async_array_add_3d.cpp b/tests/Unit/AsyncPFE/async_array_add_3d.cpp index 50fcf66dad7..8c3581a97ad 100644 --- a/tests/Unit/AsyncPFE/async_array_add_3d.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_3d.cpp @@ -37,7 +37,7 @@ bool test() { hc::extent<3> e(dimSize, dimSize, dimSize); hc::completion_future fut = hc::parallel_for_each( e, - [=](hc::index<3> idx) restrict(amp) { + [=](hc::index<3> idx) [[hc]] { int fidx = idx[0] * dimSize * dimSize + idx[1] * dimSize + idx[2]; for (int i = 0; i < LOOP_COUNT; ++i) p_c[fidx] = p_a[fidx] + p_b[fidx]; diff --git a/tests/Unit/AsyncPFE/async_array_add_4d.cpp b/tests/Unit/AsyncPFE/async_array_add_4d.cpp index 2fd7ec0b370..8fdab391577 100644 --- a/tests/Unit/AsyncPFE/async_array_add_4d.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_4d.cpp @@ -38,7 +38,7 @@ bool test() { hc::extent<4> e(dim); hc::completion_future fut = hc::parallel_for_each( e, - [=](hc::index<4> idx) restrict(amp) { + [=](hc::index<4> idx) [[hc]] { int fidx = idx[0] * dimSize * dimSize * dimSize + idx[1] * dimSize * dimSize + idx[2] * dimSize + idx[3]; for (int i = 0; i < LOOP_COUNT; ++i) p_c[fidx] = p_a[fidx] + p_b[fidx]; diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple.cpp index 655a3951e41..c1a3ff39cd9 100644 --- a/tests/Unit/AsyncPFE/async_array_add_multiple.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_multiple.cpp @@ -44,7 +44,7 @@ bool test() { #define ASYNC_KERNEL_DISPATCH(x, y) \ hc::parallel_for_each( \ e, \ - [=](hc::index<1> idx) restrict(amp) { \ + [=](hc::index<1> idx) [[hc]] { \ for (int i = 0; i < LOOP_COUNT; ++i) \ p_c[idx[0] + vecSize/(x)*(y)] = p_a[idx[0] + vecSize/(x)*(y)] + p_b[idx[0] + vecSize/(x)*(y)]; \ }) diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_2d.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_2d.cpp index d5d6f310cb7..61dac054291 100644 --- a/tests/Unit/AsyncPFE/async_array_add_multiple_2d.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_multiple_2d.cpp @@ -47,7 +47,7 @@ bool test() { #define ASYNC_KERNEL_DISPATCH(x, y) \ hc::parallel_for_each( \ e, \ - [=](hc::index<2> idx) restrict(amp) 
{ \ + [=](hc::index<2> idx) [[hc]] { \ const int offset = vecSize / (x) * (y); \ const int fidx = idx[0] * dimSize + idx[1]; \ for (int i = 0; i < LOOP_COUNT; ++i) \ diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_3d.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_3d.cpp index edd25505f38..55bd22a5488 100644 --- a/tests/Unit/AsyncPFE/async_array_add_multiple_3d.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_multiple_3d.cpp @@ -47,7 +47,7 @@ bool test() { #define ASYNC_KERNEL_DISPATCH(x, y) \ hc::parallel_for_each( \ e, \ - [=](hc::index<3> idx) restrict(amp) { \ + [=](hc::index<3> idx) [[hc]] { \ const int offset = vecSize / (x) * (y); \ const int fidx = idx[0] * dimSize * dimSize + idx[1] * dimSize + idx[2]; \ for (int i = 0; i < LOOP_COUNT; ++i) \ diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_4d.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_4d.cpp index 142bf1c48e3..abe25da5a8e 100644 --- a/tests/Unit/AsyncPFE/async_array_add_multiple_4d.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_multiple_4d.cpp @@ -48,7 +48,7 @@ bool test() { #define ASYNC_KERNEL_DISPATCH(x, y) \ hc::parallel_for_each( \ e, \ - [=](hc::index<4> idx) restrict(amp) { \ + [=](hc::index<4> idx) [[hc]] { \ const int offset = vecSize / (x) * (y); \ const int fidx = idx[0] * dimSize * dimSize * dimSize + idx[1] * dimSize * dimSize + idx[2] * dimSize + idx[3]; \ for (int i = 0; i < LOOP_COUNT; ++i) \ diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled.cpp index 488f4f1930f..9fdf0e055d0 100644 --- a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled.cpp @@ -44,7 +44,7 @@ bool test() { #define ASYNC_KERNEL_DISPATCH(x, y) \ hc::parallel_for_each( \ e.tile(256), \ - [=](hc::tiled_index<1> idx) restrict(amp) { \ + [=](hc::tiled_index<1> idx) [[hc]] { \ const int offset = vecSize/(x)*(y); \ const int fidx = idx.global[0]; \ for (int i = 0; i < LOOP_COUNT; ++i) \ diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_2d.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_2d.cpp index 1eb108fc232..ae58fc0b103 100644 --- a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_2d.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_2d.cpp @@ -47,7 +47,7 @@ bool test() { #define ASYNC_KERNEL_DISPATCH(x, y) \ hc::parallel_for_each( \ e.tile(4,4), \ - [=](hc::tiled_index<2> idx) restrict(amp) { \ + [=](hc::tiled_index<2> idx) [[hc]] { \ const int offset = vecSize/(x)*(y); \ const int fidx = idx.global[0] * dimSize + idx.global[1]; \ for (int i = 0; i < LOOP_COUNT; ++i) \ diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_3d.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_3d.cpp index 919cfea3cd5..e19add7e735 100644 --- a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_3d.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_3d.cpp @@ -47,7 +47,7 @@ bool test() { #define ASYNC_KERNEL_DISPATCH(x, y) \ hc::parallel_for_each( \ e.tile(2,2,2), \ - [=](hc::tiled_index<3> idx) restrict(amp) { \ + [=](hc::tiled_index<3> idx) [[hc]] { \ const int offset = vecSize/(x)*(y); \ const int fidx = idx.global[0] * dimSize * dimSize + idx.global[1] * dimSize + idx.global[2]; \ for (int i = 0; i < LOOP_COUNT; ++i) \ diff --git a/tests/Unit/AsyncPFE/async_array_add_then.cpp b/tests/Unit/AsyncPFE/async_array_add_then.cpp index a276ccd4a5b..bf944d64ce3 100644 --- a/tests/Unit/AsyncPFE/async_array_add_then.cpp +++ 
b/tests/Unit/AsyncPFE/async_array_add_then.cpp @@ -33,7 +33,7 @@ bool test() { hc::extent<1> e(vecSize); hc::completion_future fut = hc::parallel_for_each( e, - [=](hc::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { p_c[idx[0]] = p_a[idx[0]] + p_b[idx[0]]; }); @@ -44,7 +44,7 @@ bool test() { std::cout << "async launch the 2nd kernel\n"; hc::completion_future fut2 = hc::parallel_for_each( e, - [=](hc::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { p_c[idx[0]] += p_a[idx[0]] + p_b[idx[0]]; }); @@ -53,7 +53,7 @@ bool test() { std::cout << "sync launch the 3rd kernel\n"; parallel_for_each( e, - [=](hc::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { p_c[idx[0]] += p_a[idx[0]] + p_b[idx[0]]; }); done_promise.set_value(); diff --git a/tests/Unit/AsyncPFE/async_array_add_tiled.cpp b/tests/Unit/AsyncPFE/async_array_add_tiled.cpp index 4a8161f6fe1..811894e6d1a 100644 --- a/tests/Unit/AsyncPFE/async_array_add_tiled.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_tiled.cpp @@ -36,7 +36,7 @@ bool test() { hc::extent<1> e(vecSize); hc::completion_future fut = hc::parallel_for_each( e.tile(256), - [=](hc::tiled_index<1> idx) restrict(amp) { + [=](hc::tiled_index<1> idx) [[hc]] { int fidx = idx.global[0]; for (int i = 0; i < LOOP_COUNT; ++i) p_c[fidx] = p_a[fidx] + p_b[fidx]; diff --git a/tests/Unit/AsyncPFE/async_array_add_tiled_2d.cpp b/tests/Unit/AsyncPFE/async_array_add_tiled_2d.cpp index 9b71b8f71ce..a7e996e907b 100644 --- a/tests/Unit/AsyncPFE/async_array_add_tiled_2d.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_tiled_2d.cpp @@ -37,7 +37,7 @@ bool test() { hc::extent<2> e(dimSize, dimSize); hc::completion_future fut = hc::parallel_for_each( e.tile(4, 4), - [=](hc::tiled_index<2> idx) restrict(amp) { + [=](hc::tiled_index<2> idx) [[hc]] { int fidx = idx.global[0] * dimSize + idx.global[1]; for (int i = 0; i < LOOP_COUNT; ++i) p_c[fidx] = p_a[fidx] + p_b[fidx]; diff --git a/tests/Unit/AsyncPFE/async_array_add_tiled_3d.cpp b/tests/Unit/AsyncPFE/async_array_add_tiled_3d.cpp index cb4df0beb26..feb08c8cadc 100644 --- a/tests/Unit/AsyncPFE/async_array_add_tiled_3d.cpp +++ b/tests/Unit/AsyncPFE/async_array_add_tiled_3d.cpp @@ -37,7 +37,7 @@ bool test() { hc::extent<3> e(dimSize, dimSize, dimSize); hc::completion_future fut = hc::parallel_for_each( e.tile(2, 2, 2), - [=](hc::tiled_index<3> idx) restrict(amp) { + [=](hc::tiled_index<3> idx) [[hc]] { int fidx = idx.global[0] * dimSize * dimSize + idx.global[1] * dimSize + idx.global[2]; for (int i = 0; i < LOOP_COUNT; ++i) p_c[fidx] = p_a[fidx] + p_b[fidx]; diff --git a/tests/Unit/AsyncPFE/async_av_dependent1.cpp b/tests/Unit/AsyncPFE/async_av_dependent1.cpp index c310cfa1018..fd2dcb4a5cd 100644 --- a/tests/Unit/AsyncPFE/async_av_dependent1.cpp +++ b/tests/Unit/AsyncPFE/async_av_dependent1.cpp @@ -48,7 +48,7 @@ bool test1D() { std::cout << "launch pfe1\n"; #endif - hc::completion_future fut1 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut1 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -71,7 +71,7 @@ bool test1D() { // this kernel dispatch shall implicitly wait for the previous one to complete // because they access the same array_view instances and write to them - hc::completion_future fut2 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut2 = 
hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av1 = i * 3 for (int i = 0; i < LOOP_COUNT; ++i) av1(idx) = av2(idx) + av3(idx); @@ -98,7 +98,7 @@ bool test1D() { // this kernel dispatch shall implicitly wait for the previous one to complete // because they access the same array_view instances and write to them - hc::completion_future fut3 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut3 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av2 = i * 5 for (int i = 0; i < LOOP_COUNT; ++i) av2(idx) = av1(idx) + av3(idx); diff --git a/tests/Unit/AsyncPFE/async_av_dependent2.cpp b/tests/Unit/AsyncPFE/async_av_dependent2.cpp index 715e7f27041..aa877a2def7 100644 --- a/tests/Unit/AsyncPFE/async_av_dependent2.cpp +++ b/tests/Unit/AsyncPFE/async_av_dependent2.cpp @@ -48,7 +48,7 @@ bool test1D() { std::cout << "launch pfe1\n"; #endif - hc::completion_future fut1 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut1 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -71,7 +71,7 @@ bool test1D() { // this kernel dispatch shall implicitly wait for the previous one to complete // because they access the same array_view instances and write to them - hc::completion_future fut2 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut2 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av1 = i * 3 for (int i = 0; i < LOOP_COUNT; ++i) av1(idx) = av2(idx) + av3(idx); @@ -98,7 +98,7 @@ bool test1D() { // this kernel dispatch shall implicitly wait for the previous one to complete // because they access the same array_view instances and write to them - hc::completion_future fut3 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut3 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av2 = i * 5 for (int i = 0; i < LOOP_COUNT; ++i) av2(idx) = av1(idx) + av3(idx); diff --git a/tests/Unit/AsyncPFE/async_av_dependent3.cpp b/tests/Unit/AsyncPFE/async_av_dependent3.cpp index 893840c2cae..ba6881d5fca 100644 --- a/tests/Unit/AsyncPFE/async_av_dependent3.cpp +++ b/tests/Unit/AsyncPFE/async_av_dependent3.cpp @@ -41,7 +41,7 @@ bool test1D() { std::cout << "launch pfe1\n"; #endif - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -57,7 +57,7 @@ bool test1D() { // this kernel dispatch shall implicitly wait for the previous one to complete // because they access the same array_view instances and write to them - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av1 = i * 3 for (int i = 0; i < LOOP_COUNT; ++i) av1(idx) = av2(idx) + av3(idx); @@ -73,7 +73,7 @@ bool test1D() { // this kernel dispatch shall implicitly wait for the previous one to complete // because they access the same array_view instances and write to them - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) 
restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av2 = i * 5 for (int i = 0; i < LOOP_COUNT; ++i) av2(idx) = av1(idx) + av3(idx); diff --git a/tests/Unit/AsyncPFE/async_av_dependent4.cpp b/tests/Unit/AsyncPFE/async_av_dependent4.cpp index f08e471d66b..445bbaff5b1 100644 --- a/tests/Unit/AsyncPFE/async_av_dependent4.cpp +++ b/tests/Unit/AsyncPFE/async_av_dependent4.cpp @@ -52,7 +52,7 @@ bool test1D() { std::cout << "launch pfe1\n"; #endif - hc::completion_future fut1 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut1 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -75,7 +75,7 @@ bool test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because av1 and av2 are read-only, and this kernel writes to av4, NOT av3 - hc::completion_future fut2 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut2 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av4 = 0 for (int i = 0; i < LOOP_COUNT; ++i) av4(idx) = av1(idx) - av2(idx); @@ -100,7 +100,7 @@ bool test1D() { // this kernel dispatch shall implicitly wait for the previous one to complete // because they access the same array_view instances and write to them - hc::completion_future fut3 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut3 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av5 = 0 for (int i = 0; i < LOOP_COUNT; ++i) av5(idx) = av3(idx) * av4(idx); diff --git a/tests/Unit/AsyncPFE/async_av_dependent5.cpp b/tests/Unit/AsyncPFE/async_av_dependent5.cpp index 2e24484db69..06132e15bed 100644 --- a/tests/Unit/AsyncPFE/async_av_dependent5.cpp +++ b/tests/Unit/AsyncPFE/async_av_dependent5.cpp @@ -52,7 +52,7 @@ bool test1D() { std::cout << "launch pfe1\n"; #endif - hc::completion_future fut1 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut1 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -75,7 +75,7 @@ bool test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because av1 and av2 are read-only, and this kernel writes to av4, NOT av3 - hc::completion_future fut2 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut2 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av4 = 0 for (int i = 0; i < LOOP_COUNT; ++i) av4(idx) = av1(idx) - av2(idx); @@ -100,7 +100,7 @@ bool test1D() { // this kernel dispatch shall implicitly wait for the previous one to complete // because they access the same array_view instances and write to them - hc::completion_future fut3 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut3 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av5 = 0 for (int i = 0; i < LOOP_COUNT; ++i) av5(idx) = av3(idx) * av4(idx); diff --git a/tests/Unit/AsyncPFE/async_av_dependent6.cpp b/tests/Unit/AsyncPFE/async_av_dependent6.cpp index 
90a397e338d..5b6059173b6 100644 --- a/tests/Unit/AsyncPFE/async_av_dependent6.cpp +++ b/tests/Unit/AsyncPFE/async_av_dependent6.cpp @@ -45,7 +45,7 @@ bool test1D() { std::cout << "launch pfe1\n"; #endif - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -61,7 +61,7 @@ bool test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because av1 and av2 are read-only, and this kernel writes to av4, NOT av3 - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av4 = 0 for (int i = 0; i < LOOP_COUNT; ++i) av4(idx) = av1(idx) - av2(idx); @@ -77,7 +77,7 @@ bool test1D() { // this kernel dispatch shall implicitly wait for the previous two to complete // because they access the same array_view instances and write to them - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av5 = 0 for (int i = 0; i < LOOP_COUNT; ++i) av5(idx) = av3(idx) * av4(idx); diff --git a/tests/Unit/AsyncPFE/async_av_dependent7.cpp b/tests/Unit/AsyncPFE/async_av_dependent7.cpp index 9955d23c60a..4872eda95c4 100644 --- a/tests/Unit/AsyncPFE/async_av_dependent7.cpp +++ b/tests/Unit/AsyncPFE/async_av_dependent7.cpp @@ -40,7 +40,7 @@ void test1D() { std::cout << "launch pfe1\n"; #endif - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -56,7 +56,7 @@ void test1D() { // this kernel dispatch shall implicitly wait for the previous one to complete // because they access the same array_view instances and write to them - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av1 = i * 3 for (int i = 0; i < LOOP_COUNT; ++i) av1(idx) = av2(idx) + av3(idx); @@ -72,7 +72,7 @@ void test1D() { // this kernel dispatch shall implicitly wait for the previous one to complete // because they access the same array_view instances and write to them - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av2 = i * 5 for (int i = 0; i < LOOP_COUNT; ++i) av2(idx) = av1(idx) + av3(idx); diff --git a/tests/Unit/AsyncPFE/async_av_dependent8.cpp b/tests/Unit/AsyncPFE/async_av_dependent8.cpp index b1fba7f4b14..db3f080e95b 100644 --- a/tests/Unit/AsyncPFE/async_av_dependent8.cpp +++ b/tests/Unit/AsyncPFE/async_av_dependent8.cpp @@ -47,7 +47,7 @@ void test1D() { std::cout << "launch pfe1\n"; #endif - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -65,7 +65,7 @@ void test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because av1 and av2 are read-only, and this kernel writes to av4, NOT av3 - 
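// The comments in these async_av_dependent hunks state the scheduling rule
// the tests exercise: a later parallel_for_each implicitly waits on an
// earlier one only when the two dispatches touch a common array_view and at
// least one of them writes it; kernels over disjoint array_views are free to
// overlap. A sketch of both cases, using only the hc.hpp names that appear
// in the hunks; the int element type and the helper name are assumptions
// made for illustration:

#include <hc.hpp>

void dependency_sketch(hc::array_view<int, 1> av1, hc::array_view<int, 1> av2,
                       hc::array_view<int, 1> av3, hc::array_view<int, 1> av4) {
  hc::extent<1> e(av1.get_extent());

  // writes av2: anything that later touches av2 is ordered after fut1
  hc::completion_future fut1 = hc::parallel_for_each(
      e, [=](hc::index<1> idx) [[hc]] { av2[idx] = av1[idx] + 1; });

  // reads av2 and writes av3: implicitly ordered after the first kernel
  hc::completion_future fut2 = hc::parallel_for_each(
      e, [=](hc::index<1> idx) [[hc]] { av3[idx] = av2[idx] * 2; });

  // only shares the read-only av1 and writes av4, which the kernels above
  // never touch, so it need not wait on either of them
  hc::parallel_for_each(
      e, [=](hc::index<1> idx) [[hc]] { av4[idx] = av1[idx] - 1; });

  fut2.wait();  // explicit waiting is still available when ordering matters
}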
hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av4 = 0 for (int i = 0; i < LOOP_COUNT; ++i) av4(idx) = av1(idx) - av2(idx); @@ -90,7 +90,7 @@ void test1D() { // this kernel dispatch shall implicitly wait for the previous two to complete // because they access the same array_view instances and write to them - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av5 = 0 for (int i = 0; i < LOOP_COUNT; ++i) av5(idx) = av3(idx) * av4(idx); diff --git a/tests/Unit/AsyncPFE/async_av_independent1.cpp b/tests/Unit/AsyncPFE/async_av_independent1.cpp index 61ada7c4b0f..118a7b2c0dd 100644 --- a/tests/Unit/AsyncPFE/async_av_independent1.cpp +++ b/tests/Unit/AsyncPFE/async_av_independent1.cpp @@ -56,7 +56,7 @@ bool test1D() { std::cout << "launch pfe1\n"; #endif - hc::completion_future fut1 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut1 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -78,7 +78,7 @@ bool test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because the array_view written is NOT used by the previous kernels - hc::completion_future fut2 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut2 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av4 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av4(idx) = av1(idx) + av2(idx); @@ -96,7 +96,7 @@ bool test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because the array_view written is NOT used by the previous kernels - hc::completion_future fut3 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut3 = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av5 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av5(idx) = av1(idx) + av2(idx); diff --git a/tests/Unit/AsyncPFE/async_av_independent2.cpp b/tests/Unit/AsyncPFE/async_av_independent2.cpp index 1f46cfdd26d..08315767a5e 100644 --- a/tests/Unit/AsyncPFE/async_av_independent2.cpp +++ b/tests/Unit/AsyncPFE/async_av_independent2.cpp @@ -49,7 +49,7 @@ bool test1D() { std::cout << "launch pfe1\n"; #endif - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -65,7 +65,7 @@ bool test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because the array_view written is NOT used by the previous kernels - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av4 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av4(idx) = av1(idx) + av2(idx); @@ -81,7 +81,7 @@ bool test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because the array_view written is NOT used by the previous kernels - 
hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av5 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av5(idx) = av1(idx) + av2(idx); diff --git a/tests/Unit/AsyncPFE/async_av_independent3.cpp b/tests/Unit/AsyncPFE/async_av_independent3.cpp index 61cf10c0728..20bca907d97 100644 --- a/tests/Unit/AsyncPFE/async_av_independent3.cpp +++ b/tests/Unit/AsyncPFE/async_av_independent3.cpp @@ -49,7 +49,7 @@ bool test1D() { std::cout << "launch pfe1\n"; #endif - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -65,7 +65,7 @@ bool test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because the array_view written is NOT used by the previous kernels - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av4 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av4(idx) = av1(idx) + av2(idx); @@ -81,7 +81,7 @@ bool test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because the array_view written is NOT used by the previous kernels - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av5 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av5(idx) = av1(idx) + av2(idx); diff --git a/tests/Unit/AsyncPFE/async_av_independent4.cpp b/tests/Unit/AsyncPFE/async_av_independent4.cpp index f0d4b95e61c..ceb2bd9c236 100644 --- a/tests/Unit/AsyncPFE/async_av_independent4.cpp +++ b/tests/Unit/AsyncPFE/async_av_independent4.cpp @@ -52,7 +52,7 @@ void test1D() { std::cout << "launch pfe1\n"; #endif - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av3 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); @@ -68,7 +68,7 @@ void test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because the array_view written is NOT used by the previous kernels - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av4 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av4(idx) = av1(idx) + av2(idx); @@ -84,7 +84,7 @@ void test1D() { // this kernel dispatch shall NOT implicitly wait for the previous one to complete // because the array_view written is NOT used by the previous kernels - hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { // av5 = i * 2 for (int i = 0; i < LOOP_COUNT; ++i) av5(idx) = av1(idx) + av2(idx); diff --git a/tests/Unit/AsyncPFE/completion_future_wait.cpp b/tests/Unit/AsyncPFE/completion_future_wait.cpp index 9e06d096fd5..d7b1118e00d 100644 --- a/tests/Unit/AsyncPFE/completion_future_wait.cpp +++ b/tests/Unit/AsyncPFE/completion_future_wait.cpp @@ -25,7 +25,7 @@ hc::completion_future execute(hc::array_view& av1, hc::array_view& av2, hc::array_view& av3) { // run HC 
parallel_for_each - return hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) restrict(amp) { + return hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) [[hc]] { for (int i = 0; i < LOOP_COUNT; ++i) { av3(idx) = av1(idx) + av2(idx); } diff --git a/tests/Unit/AsyncPFE/completion_future_wait2.cpp b/tests/Unit/AsyncPFE/completion_future_wait2.cpp index 2213f4c9357..0e5b50bb127 100644 --- a/tests/Unit/AsyncPFE/completion_future_wait2.cpp +++ b/tests/Unit/AsyncPFE/completion_future_wait2.cpp @@ -17,7 +17,7 @@ hc::completion_future execute(hc::array_view& av1, hc::array_view& av2, hc::array_view& av3) { // run HC parallel_for_each - return hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) restrict(amp) { + return hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) [[hc]] { for (int i = 0; i < LOOP_COUNT; ++i) { av3(idx) = av1(idx) + av2(idx); } diff --git a/tests/Unit/Atomic/atomic_add_float_global.cpp b/tests/Unit/Atomic/atomic_add_float_global.cpp index f3823b232bd..ee64993f865 100644 --- a/tests/Unit/Atomic/atomic_add_float_global.cpp +++ b/tests/Unit/Atomic/atomic_add_float_global.cpp @@ -1,12 +1,12 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include #include #include -using namespace concurrency; +using namespace hc; #define T float #define TOLERANCE 1e-5 @@ -19,7 +19,7 @@ int main(void) { std::vector<T> init(vecSize, INIT); array count(vecSize, init.begin()); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { for(unsigned i = 0; i < vecSize; i++) { atomic_fetch_add(&count[i], INIT); } diff --git a/tests/Unit/Atomic/atomic_add_float_local.cpp b/tests/Unit/Atomic/atomic_add_float_local.cpp index df53dc757b0..f28e55a642b 100644 --- a/tests/Unit/Atomic/atomic_add_float_local.cpp +++ b/tests/Unit/Atomic/atomic_add_float_local.cpp @@ -1,10 +1,10 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; #define T float #define INIT 0.5f @@ -20,12 +20,10 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile<tile_size, tile_size>(), [=] (tiled_index<tile_size, tile_size> tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static T localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -34,7 +32,7 @@ } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); // accumulate tile_size * tile_size times diff --git a/tests/Unit/Atomic/atomic_add_global.cpp b/tests/Unit/Atomic/atomic_add_global.cpp index d76aa0f092b..52bd2acf3bd 100644 --- a/tests/Unit/Atomic/atomic_add_global.cpp +++ b/tests/Unit/Atomic/atomic_add_global.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -12,7 +12,7 @@ int main(void) { int init[vecSize] { 0 }; array count(vecSize, std::begin(init)); -
parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { for(unsigned i = 0; i < vecSize; i++) { atomic_fetch_add(&count[i], 1); } diff --git a/tests/Unit/Atomic/atomic_add_local.cpp b/tests/Unit/Atomic/atomic_add_local.cpp index d04d36a5303..bc14f4511cb 100644 --- a/tests/Unit/Atomic/atomic_add_local.cpp +++ b/tests/Unit/Atomic/atomic_add_local.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -18,21 +18,19 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static unsigned localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { for(int j = 0; j < tile_size; j++) { - atomic_fetch_add(&(localA[i][j]), 1); + atomic_fetch_add(&(localA[i][j]), 1u); } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(unsigned i = 0; i < vecSize; i++) { diff --git a/tests/Unit/Atomic/atomic_and_global.cpp b/tests/Unit/Atomic/atomic_and_global.cpp index 425ffcc06b7..13d1ad75cbb 100644 --- a/tests/Unit/Atomic/atomic_and_global.cpp +++ b/tests/Unit/Atomic/atomic_and_global.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -15,7 +15,7 @@ int main(void) { } array count(vecSize, std::begin(init)); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { for(int i = 0; i < vecSize; i++) { atomic_fetch_and(&count[i], 1); } diff --git a/tests/Unit/Atomic/atomic_and_local.cpp b/tests/Unit/Atomic/atomic_and_local.cpp index cd60929d834..a2ae892b51c 100644 --- a/tests/Unit/Atomic/atomic_and_local.cpp +++ b/tests/Unit/Atomic/atomic_and_local.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -18,12 +18,10 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static int localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -32,7 +30,7 @@ int main(void) { } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(unsigned i = 0; i < vecSize; i++) { diff --git a/tests/Unit/Atomic/atomic_compare_exchange_global.cpp b/tests/Unit/Atomic/atomic_compare_exchange_global.cpp index 
1b3db0ebdf9..b800e8a9516 100644 --- a/tests/Unit/Atomic/atomic_compare_exchange_global.cpp +++ b/tests/Unit/Atomic/atomic_compare_exchange_global.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -15,7 +15,7 @@ int main(void) { } array count(vecSize, std::begin(init)); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { // 0 -> 2 // 1 -> 1 int v = 0; diff --git a/tests/Unit/Atomic/atomic_compare_exchange_local.cpp b/tests/Unit/Atomic/atomic_compare_exchange_local.cpp index a4fa4fe6a10..5839e61da34 100644 --- a/tests/Unit/Atomic/atomic_compare_exchange_local.cpp +++ b/tests/Unit/Atomic/atomic_compare_exchange_local.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -18,13 +18,12 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { int v = 0; tile_static int localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -33,7 +32,7 @@ int main(void) { } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(int i = 0; i < vecSize; ++i) { diff --git a/tests/Unit/Atomic/atomic_dec_global.cpp b/tests/Unit/Atomic/atomic_dec_global.cpp index 62015a3014f..73c2c4b8357 100644 --- a/tests/Unit/Atomic/atomic_dec_global.cpp +++ b/tests/Unit/Atomic/atomic_dec_global.cpp @@ -1,18 +1,18 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; // Alloc & init input data - int init[vecSize] { 0 }; + int init[vecSize]{}; array count(vecSize, std::begin(init)); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { for(unsigned i = 0; i < vecSize; i++) { atomic_fetch_dec(&count[i]); } diff --git a/tests/Unit/Atomic/atomic_dec_local.cpp b/tests/Unit/Atomic/atomic_dec_local.cpp index ba93c0685bb..87929b25e19 100644 --- a/tests/Unit/Atomic/atomic_dec_local.cpp +++ b/tests/Unit/Atomic/atomic_dec_local.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -18,12 +18,10 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static int localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; 
i++) { @@ -32,7 +30,7 @@ int main(void) { } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(unsigned i = 0; i < vecSize; i++) { diff --git a/tests/Unit/Atomic/atomic_exchange_float_global.cpp b/tests/Unit/Atomic/atomic_exchange_float_global.cpp index a62622745dd..a2c92214e9e 100644 --- a/tests/Unit/Atomic/atomic_exchange_float_global.cpp +++ b/tests/Unit/Atomic/atomic_exchange_float_global.cpp @@ -1,12 +1,12 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include #include #include -using namespace concurrency; +using namespace hc; #define T float #define INIT 0.5f @@ -19,7 +19,7 @@ int main(void) { std::vector init(vecSize, INIT); array count(vecSize, init.begin()); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { atomic_exchange(&count(idx), NEW_VALUE); }); diff --git a/tests/Unit/Atomic/atomic_exchange_float_local.cpp b/tests/Unit/Atomic/atomic_exchange_float_local.cpp index 3d3d3dd4d74..3f918099204 100644 --- a/tests/Unit/Atomic/atomic_exchange_float_local.cpp +++ b/tests/Unit/Atomic/atomic_exchange_float_local.cpp @@ -1,10 +1,10 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; #define T float #define INIT 0.5f @@ -20,12 +20,13 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { index<2> localIdx = tidx.local; index<2> globalIdx = tidx.global; tile_static T localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -34,7 +35,7 @@ int main(void) { } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(int i = 0; i < vecSize; ++i) { diff --git a/tests/Unit/Atomic/atomic_exchange_global.cpp b/tests/Unit/Atomic/atomic_exchange_global.cpp index 7b9e7503c13..2ef397a0e42 100644 --- a/tests/Unit/Atomic/atomic_exchange_global.cpp +++ b/tests/Unit/Atomic/atomic_exchange_global.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -12,7 +12,7 @@ int main(void) { int init[vecSize] { 0 }; array count(vecSize, std::begin(init)); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { atomic_exchange(&count(idx), 1); }); diff --git a/tests/Unit/Atomic/atomic_exchange_local.cpp b/tests/Unit/Atomic/atomic_exchange_local.cpp index 45804b4f36a..cb70b31f841 100644 --- a/tests/Unit/Atomic/atomic_exchange_local.cpp +++ b/tests/Unit/Atomic/atomic_exchange_local.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -18,12 +18,10 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index 
tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static int localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -32,7 +30,7 @@ int main(void) { } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(int i = 0; i < vecSize; ++i) { diff --git a/tests/Unit/Atomic/atomic_inc_global.cpp b/tests/Unit/Atomic/atomic_inc_global.cpp index bc593bc0983..63e0dc3d0da 100644 --- a/tests/Unit/Atomic/atomic_inc_global.cpp +++ b/tests/Unit/Atomic/atomic_inc_global.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -12,7 +12,7 @@ int main(void) { int init[vecSize] { 0 }; array count(vecSize, std::begin(init)); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { for(unsigned i = 0; i < vecSize; i++) { atomic_fetch_inc(&count[i]); } diff --git a/tests/Unit/Atomic/atomic_inc_local.cpp b/tests/Unit/Atomic/atomic_inc_local.cpp index 8c0f14cd630..c4883185a46 100644 --- a/tests/Unit/Atomic/atomic_inc_local.cpp +++ b/tests/Unit/Atomic/atomic_inc_local.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -18,12 +18,10 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static unsigned localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -32,7 +30,7 @@ int main(void) { } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(unsigned i = 0; i < vecSize; i++) { diff --git a/tests/Unit/Atomic/atomic_max_global.cpp b/tests/Unit/Atomic/atomic_max_global.cpp index d25c464e4c8..b45d672e705 100644 --- a/tests/Unit/Atomic/atomic_max_global.cpp +++ b/tests/Unit/Atomic/atomic_max_global.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -15,7 +15,7 @@ int main(void) { } array count(vecSize, std::begin(init)); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { for(int i = 0; i < vecSize; i++) { atomic_fetch_max(&count[i], vecSize / 2); } diff --git a/tests/Unit/Atomic/atomic_max_local.cpp b/tests/Unit/Atomic/atomic_max_local.cpp index 7156b4bdf04..3851f607955 100644 --- a/tests/Unit/Atomic/atomic_max_local.cpp +++ b/tests/Unit/Atomic/atomic_max_local.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && 
%t.out -#include +#include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -18,12 +18,10 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static int localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -32,7 +30,7 @@ int main(void) { } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(unsigned i = 0; i < vecSize; i++) { diff --git a/tests/Unit/Atomic/atomic_min_global.cpp b/tests/Unit/Atomic/atomic_min_global.cpp index e88218cf179..858668cf405 100644 --- a/tests/Unit/Atomic/atomic_min_global.cpp +++ b/tests/Unit/Atomic/atomic_min_global.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -15,7 +15,7 @@ int main(void) { } array count(vecSize, std::begin(init)); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { for(int i = 0; i < vecSize; i++) { atomic_fetch_min(&count[i], vecSize / 2); } diff --git a/tests/Unit/Atomic/atomic_min_local.cpp b/tests/Unit/Atomic/atomic_min_local.cpp index 523307a58be..231d39ce11a 100644 --- a/tests/Unit/Atomic/atomic_min_local.cpp +++ b/tests/Unit/Atomic/atomic_min_local.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -18,12 +18,10 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static int localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -32,7 +30,7 @@ int main(void) { } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(unsigned i = 0; i < vecSize; i++) { diff --git a/tests/Unit/Atomic/atomic_or_global.cpp b/tests/Unit/Atomic/atomic_or_global.cpp index 8f60007ba60..35e3e41736e 100644 --- a/tests/Unit/Atomic/atomic_or_global.cpp +++ b/tests/Unit/Atomic/atomic_or_global.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -15,7 +15,7 @@ int main(void) { } array count(vecSize, std::begin(init)); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { for(int i = 0; i < vecSize; i++) { atomic_fetch_or(&count[i], 1); } diff 
--git a/tests/Unit/Atomic/atomic_or_local.cpp b/tests/Unit/Atomic/atomic_or_local.cpp index 5d8f01bd9da..dd58cbdfbbd 100644 --- a/tests/Unit/Atomic/atomic_or_local.cpp +++ b/tests/Unit/Atomic/atomic_or_local.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -18,12 +18,10 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static int localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -32,7 +30,7 @@ int main(void) { } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(unsigned i = 0; i < vecSize; i++) { diff --git a/tests/Unit/Atomic/atomic_sub_float_global.cpp b/tests/Unit/Atomic/atomic_sub_float_global.cpp index c49390ae6f2..92fb93dd001 100644 --- a/tests/Unit/Atomic/atomic_sub_float_global.cpp +++ b/tests/Unit/Atomic/atomic_sub_float_global.cpp @@ -1,12 +1,12 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include #include #include -using namespace concurrency; +using namespace hc; #define T float #define TOLERANCE 1e-5 @@ -19,7 +19,7 @@ int main(void) { std::vector init(vecSize, INIT); array count(vecSize, init.begin()); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { for(unsigned i = 0; i < vecSize; i++) { atomic_fetch_sub(&count[i], INIT); } diff --git a/tests/Unit/Atomic/atomic_sub_float_local.cpp b/tests/Unit/Atomic/atomic_sub_float_local.cpp index c07139adc38..ce86e915cb2 100644 --- a/tests/Unit/Atomic/atomic_sub_float_local.cpp +++ b/tests/Unit/Atomic/atomic_sub_float_local.cpp @@ -1,10 +1,10 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace hc; #define T float #define INIT 0.5f @@ -20,12 +20,10 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static T localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -34,7 +32,7 @@ int main(void) { } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); // accumlate tile_size * tile_size times diff --git a/tests/Unit/Atomic/atomic_sub_global.cpp b/tests/Unit/Atomic/atomic_sub_global.cpp index 54ea95c39a6..aec8d22eaaa 100644 --- a/tests/Unit/Atomic/atomic_sub_global.cpp +++ b/tests/Unit/Atomic/atomic_sub_global.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include #include -using namespace concurrency; +using namespace 
hc; int main(void) { const int vecSize = 100; @@ -12,7 +12,7 @@ int main(void) { int init[vecSize] { 0 }; array count(vecSize, std::begin(init)); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { for(unsigned i = 0; i < vecSize; i++) { atomic_fetch_sub(&count[i], 1); } diff --git a/tests/Unit/Atomic/atomic_sub_local.cpp b/tests/Unit/Atomic/atomic_sub_local.cpp index 1acf278af26..f4c5d4572dc 100644 --- a/tests/Unit/Atomic/atomic_sub_local.cpp +++ b/tests/Unit/Atomic/atomic_sub_local.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -18,12 +18,10 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static int localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -32,7 +30,7 @@ int main(void) { } } tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(unsigned i = 0; i < vecSize; i++) { diff --git a/tests/Unit/Atomic/atomic_xor_global.cpp b/tests/Unit/Atomic/atomic_xor_global.cpp index e28ce7a025b..c00f474186c 100644 --- a/tests/Unit/Atomic/atomic_xor_global.cpp +++ b/tests/Unit/Atomic/atomic_xor_global.cpp @@ -1,9 +1,10 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include + #include -#include #include -using namespace concurrency; + +using namespace hc; int main(void) { const int vecSize = 100; @@ -15,7 +16,7 @@ int main(void) { } array count(vecSize, std::begin(init)); - parallel_for_each(count.get_extent(), [=, &count](index<1> idx) restrict(amp) { + parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] { atomic_fetch_xor(&count(idx), 1); }); diff --git a/tests/Unit/Atomic/atomic_xor_local.cpp b/tests/Unit/Atomic/atomic_xor_local.cpp index c7921ad336b..198b8b704f2 100644 --- a/tests/Unit/Atomic/atomic_xor_local.cpp +++ b/tests/Unit/Atomic/atomic_xor_local.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include -using namespace concurrency; +using namespace hc; int main(void) { const int vecSize = 100; @@ -18,12 +18,10 @@ int main(void) { array_view av_a(e_a, va); extent<2> compute_domain(e_a); - parallel_for_each(compute_domain.tile(), [=] (tiled_index tidx) restrict(amp,cpu) { - index<2> localIdx = tidx.local; - index<2> globalIdx = tidx.global; - + parallel_for_each( + compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] { tile_static int localA[tile_size][tile_size]; - localA[localIdx[0]][localIdx[1]] = 0; + localA[tidx.local[0]][tidx.local[1]] = 0; tidx.barrier.wait(); for(int i = 0; i < tile_size; i++) { @@ -31,8 +29,8 @@ int main(void) { atomic_fetch_xor(&(localA[i][j]), 1); } } - tidx.barrier.wait(); - av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]]; + tidx.barrier.wait(); + av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]]; }); for(unsigned i = 0; i < vecSize; i++) { diff --git 
a/tests/Unit/AutoRestricted/2_4_1_3_Comp_Type_OKCases.cpp b/tests/Unit/AutoRestricted/2_4_1_3_Comp_Type_OKCases.cpp deleted file mode 100644 index 8375284d38d..00000000000 --- a/tests/Unit/AutoRestricted/2_4_1_3_Comp_Type_OKCases.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: %cxxamp %s -o %t.out && %t.out -#include -using std::vector; -using namespace concurrency; - -//2_Cxx_Lang_Exte/2_4_amp_Rest_Modi/2_4_1_Rest_on_Type/2_4_1_3_Comp_Type/Negative/BoolPointer/test.cpp -void f_boolpointer() restrict(auto) // Not a negative test anymore since pointer to bool is now supported -{ - bool b; - bool * pb = &b; - *pb = true; -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) -{ - f_boolpointer(); // OK -} - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Case1.cpp b/tests/Unit/AutoRestricted/Case1.cpp deleted file mode 100644 index ad2b884932f..00000000000 --- a/tests/Unit/AutoRestricted/Case1.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// RUN: %cxxamp %s -o %t.out && %t.out -#include - -class B -{ -public: - float f1(int &flag) - { - flag = 1; - return 0.0; - } -}; - -bool test() restrict(auto) -{ - bool passed = true; - int flag = 0; - - class D: public B - { - public: - float f2(int &flag) {return 0.0;} - }; - - D o; - - o.f1(flag); // OK since test is inferred as CPU - - if (flag != 1) - { - return false; - } - - return passed; -} - -int main(int argc, char **argv) -{ - return test() ? 0 : 1; -} - - diff --git a/tests/Unit/AutoRestricted/MemberExpr.cpp b/tests/Unit/AutoRestricted/MemberExpr.cpp deleted file mode 100644 index 4d76160af18..00000000000 --- a/tests/Unit/AutoRestricted/MemberExpr.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// RUN: %cxxamp %s -o %t.out && %t.out -#include - -class c2 -{ -public: - int f(int) restrict(cpu) - { - return 1; - } - - int f(float) restrict(cpu,amp) - { - return 0; - } - -}; - -class c1 -{ -public: - int b(int) restrict(auto) // Use 'auto' to select good compilation path - { - c2 o; // Check SMF is after the 'auto' inferring - - int i; - - return o.f(i); // if not inferred, undefined reference to `c2::f(int)' - } -}; - -bool test() -{ - c1 o; - - int i = 0; - - int flag = o.b(i); - - return ((flag == 1) ? 
true : false); -} - -int main(int argc, char **argv) -{ - int ret = test(); - - return ret?0:1; -} - - diff --git a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/1d.cpp b/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/1d.cpp deleted file mode 100644 index 8877134d0e0..00000000000 --- a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/1d.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include > -#include "common.h" - -using std::vector; -using namespace concurrency; - -// From 2_Cxx_Lang_Exte/2_4_amp_Rest_Modi/2_4_1_Rest_on_Type/2_4_1_3_Comp_Type/Negative/1d/Test01/test.cpp -void f_1d() restrict(auto) { - struct s1 - { - s1(array_view a) restrict(cpu,amp) : m(a) {} - ~s1() restrict(cpu,amp) {} - - array_view &m; - }; -} -// CHECK: 1d.cpp:[[@LINE-3]]:22: error: pointer or reference is not allowed as pointed to type, array element type or data member type (except reference to concurrency::array/texture) -// CHECK-NEXT: array_view &m; -// CHECK-NEXT: ^ - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - f_1d(); -} -// CHECK: 1d.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: f_1d(); -// CHECK-NEXT: ^ - -int main(void) -{ - exit(1); - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/PointerArrayElementType.cpp b/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/PointerArrayElementType.cpp deleted file mode 100644 index cc320be4777..00000000000 --- a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/PointerArrayElementType.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -#include "common.h" - -using std::vector; -using namespace concurrency; - -//2_Cxx_Lang_Exte/2_4_amp_Rest_Modi/2_4_1_Rest_on_Type/2_4_1_3_Comp_Type/Negative/PointerArrayElementType/test.cpp -void PointerArrayElmentTypeNotSupported(int x) __AUTO { - int * arr[5]; -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - PointerArrayElmentTypeNotSupported(1); -} -// CHECK: PointerArrayElementType.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: PointerArrayElmentTypeNotSupported(1); -// CHECK-NEXT: ^ - -int main(void) -{ - exit(1); - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/PointerToPointer.cpp b/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/PointerToPointer.cpp deleted file mode 100644 index 8fd7cf7a689..00000000000 --- a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/PointerToPointer.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - 
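// --- Editor's sketch (not part of the patch): the tests deleted in this
// region all exercised C++ AMP's restrict(auto) inference. The HC-style
// hunks earlier in this patch replace restriction specifiers with the
// [[hc]] attribute, which leaves nothing for auto-inference to do, hence
// the wholesale removal. Minimal illustration, assuming an HCC toolchain;
// grid_size and the empty kernel body are placeholders:
#include <hc.hpp>
constexpr int grid_size = 64;
void annotation_migration() {
    // Removed spelling: [=](hc::index<1>& idx) restrict(amp) { ... }
    // New spelling:
    hc::parallel_for_each(hc::extent<1>(grid_size),
                          [=](hc::index<1>& idx) [[hc]] { /* kernel body */ });
}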
-////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -#include "common.h" - -using std::vector; -using namespace concurrency; - -void PointerToPointerNotSupported(int x) __AUTO { - int ** ptr; - return; -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - PointerToPointerNotSupported(1); -} -// CHECK: PointerToPointer.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: PointerToPointerNotSupported(1); -// CHECK-NEXT: ^ - -int main(void) -{ - exit(1); - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/bitfield.cpp b/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/bitfield.cpp deleted file mode 100644 index 21bf01b29b6..00000000000 --- a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/bitfield.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -#include "common.h" - -using std::vector; -using namespace concurrency; - -// From 2_Cxx_Lang_Exte/2_4_amp_Rest_Modi/2_4_1_Rest_on_Type/2_4_1_3_Comp_Type/Negative/BitField/test.cpp -struct MyBitField -{ - unsigned number: 31; -}; - - -void f_BitfieldNotSupported(int x) restrict(auto) -{ - MyBitField bField; -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - f_BitfieldNotSupported(1); -} -// CHECK: bitfield.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: f_BitfieldNotSupported(1); -// CHECK-NEXT: ^ - - -int main(void) -{ - exit(1); - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/bool_array.cpp b/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/bool_array.cpp deleted file mode 100644 index 5fe705674cf..00000000000 --- a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/bool_array.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -#include "common.h" - -using std::vector; -using namespace concurrency; - -void BoolNotAllowedAsArrayElementType(int x) restrict(auto) -{ - bool arr[5]; // expected error{{bool is not allowed element type of array in amp restricted code}} -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - BoolNotAllowedAsArrayElementType(1); -} -// CHECK: bool_array.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: BoolNotAllowedAsArrayElementType(1); -// CHECK-NEXT: ^ - -int main(void) -{ - exit(1); - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/char_array.cpp 
b/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/char_array.cpp deleted file mode 100644 index 7584efd062a..00000000000 --- a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/char_array.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -#include "common.h" - -using std::vector; -using namespace concurrency; - -// From 2_Cxx_Lang_Exte/2_4_amp_Rest_Modi/2_4_1_Rest_on_Type/2_4_1_3_Comp_Type/Negative/CharArray/test.cpp -void f_chararray() __AUTO -{ - char ca[10]; - ca[2] = 'c'; -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - f_chararray(); -} -// CHECK: char_array.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: f_chararray(); -// CHECK-NEXT: ^ - -int main(void) -{ - exit(1); - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/common.h b/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/common.h deleted file mode 100644 index ef9dfcf91c5..00000000000 --- a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/common.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#ifndef __GPU_ONLY -#define __GPU_ONLY restrict(amp) -#endif - -#ifndef __CPU_ONLY -#define __CPU_ONLY restrict(cpu) -#endif - -#ifndef __GPU -#define __GPU restrict(amp,cpu) -#endif - -#ifndef __AUTO -#define __AUTO restrict(auto) -#endif diff --git a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/function_pointer.cpp b/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/function_pointer.cpp deleted file mode 100644 index 652afd8fd4d..00000000000 --- a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/function_pointer.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -#include "common.h" - -using std::vector; -using namespace concurrency; - -// From 2_Cxx_Lang_Exte/2_4_amp_Rest_Modi/2_4_1_Rest_on_Type/2_4_1_3_Comp_Type/Negative/FunctionPointer/test.cpp -void FunctionPointerNotSupported(int x) __AUTO -{ - int (*pt2Function)(float, char, char) = NULL; -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - FunctionPointerNotSupported(1); -} -// CHECK: function_pointer.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: FunctionPointerNotSupported(1); -// CHECK-NEXT: ^ - -int main(void) -{ - exit(1); - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/function_reference.cpp b/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/function_reference.cpp deleted file mode 100644 index d499d6c0eb7..00000000000 --- a/tests/Unit/AutoRestricted/Negative/2_4_1_3_Comp_Type_Negative/function_reference.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c 
-S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -#include "common.h" - -using std::vector; -using namespace concurrency; - -// From 2_Cxx_Lang_Exte/2_4_amp_Rest_Modi/2_4_1_Rest_on_Type/2_4_1_3_Comp_Type/Negative/FunctionReference/test.cpp -void FunctionReferenceNotSupported(int x) __AUTO -{ - int (&pt2Function)(float); -} -// CHECK: function_reference.cpp:[[@LINE-2]]:9: error: declaration of reference variable 'pt2Function' requires an initializer -// CHECK-NEXT: int (&pt2Function)(float); -// CHECK-NEXT: ^~~~~~~~~~~ - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - FunctionReferenceNotSupported(1); -} -// CHECK: function_reference.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: FunctionReferenceNotSupported(1); -// CHECK-NEXT: ^ - -int main(void) -{ - exit(1); - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/CXXThrowExpr.cpp b/tests/Unit/AutoRestricted/Negative/Stmt_Cases/CXXThrowExpr.cpp deleted file mode 100644 index a0019210064..00000000000 --- a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/CXXThrowExpr.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -// CXXThrowExpr -void f_throw() restrict(auto) { - throw 1; -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - f_throw(); -} -// CHECK: CXXThrowExpr.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: f_throw(); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/CXXTryStmt.cpp b/tests/Unit/AutoRestricted/Negative/Stmt_Cases/CXXTryStmt.cpp deleted file mode 100644 index e9669924606..00000000000 --- a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/CXXTryStmt.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -// CXXTryStmt -void f_try_catch() restrict(auto) { - try { - } - catch(...){ - } -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - f_try_catch(); -} -// CHECK: CXXTryStmt.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: f_try_catch(); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/Dtor_has_multiple_restrictions.cpp b/tests/Unit/AutoRestricted/Negative/Stmt_Cases/Dtor_has_multiple_restrictions.cpp deleted file mode 100644 index dafdb75f3e4..00000000000 --- 
a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/Dtor_has_multiple_restrictions.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -int f_dtor_mulitple() restrict(auto) { - class MyClass - { - public: - MyClass() {} - MyClass() restrict(amp) {} - - ~MyClass(); - }; - MyClass A; -} -// CHECK: Dtor_has_multiple_restrictions.cpp:[[@LINE-4]]:7: error: Destructor's restriction specifiers must cover the union of restrictions on all constructors -// CHECK-NEXT: ~MyClass(); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/DynamicCastExpr.cpp b/tests/Unit/AutoRestricted/Negative/Stmt_Cases/DynamicCastExpr.cpp deleted file mode 100644 index ae4c5880fd5..00000000000 --- a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/DynamicCastExpr.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -// DynamicCast -class A {}; -class B : public A {}; - -void f_dynamiccast() restrict(auto) -{ - B * b = 0; - A * a = dynamic_cast(b); -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - f_dynamiccast(); -} -// CHECK: DynamicCastExpr.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: f_dynamiccast(); -// CHECK-NEXT: ^ -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/Enum.cpp b/tests/Unit/AutoRestricted/Negative/Stmt_Cases/Enum.cpp deleted file mode 100644 index a2a9d48c5cc..00000000000 --- a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/Enum.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -// test enum with illegal underlying type -enum Suit : char { - Diamonds, - Hearts, - Clubs, - Spades -}; - -bool foo_enum(Suit suit) restrict(auto) -{ - if (suit == Diamonds) - return true; - else - return false; -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - foo_enum(Hearts); -} -// CHECK: Enum.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: foo_enum(Hearts); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/GotoStmt.cpp b/tests/Unit/AutoRestricted/Negative/Stmt_Cases/GotoStmt.cpp deleted file mode 100644 index 2cec13aefa3..00000000000 --- 
a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/GotoStmt.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -// GotoStmt, LabelStmt -void f_goto_label() restrict(auto) -{ - goto _label; -_label: - ; -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - f_goto_label(); -} -// CHECK: GotoStmt.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: f_goto_label(); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/TypeidExpr.cpp b/tests/Unit/AutoRestricted/Negative/Stmt_Cases/TypeidExpr.cpp deleted file mode 100644 index 827f4ddb66d..00000000000 --- a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/TypeidExpr.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -//typeid -void f_typeid() restrict(auto) -{ - int n; - int & r = n; - typeid(r); -} -void AMP_AND_CPU_Func() restrict(cpu,amp) -{ - f_typeid(); -} -// CHECK: TypeidExpr.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: f_typeid(); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/Volatile.cpp b/tests/Unit/AutoRestricted/Negative/Stmt_Cases/Volatile.cpp deleted file mode 100644 index 280ad84acd4..00000000000 --- a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/Volatile.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -// volatile is not allowed in amp context -class s_volatile -{ -public: - int i; - double d; - unsigned long ul; - float f; -}; - -void f_volatile() restrict(auto) -{ - int a = 0; - double d = 0; - volatile int &pi1 = (volatile int&)a; - volatile double &pd1 = (volatile double&)d; -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - f_volatile(); -} -// CHECK: Volatile.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: f_volatile(); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/char_short_wchar_longlong_longdouble.cpp b/tests/Unit/AutoRestricted/Negative/Stmt_Cases/char_short_wchar_longlong_longdouble.cpp deleted file mode 100644 index 2eddf0831bc..00000000000 --- 
a/tests/Unit/AutoRestricted/Negative/Stmt_Cases/char_short_wchar_longlong_longdouble.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -short foo_short(unsigned short s) restrict(amp) { - return (s + 2); -} -// CHECK: char_short_wchar_longlong_longdouble.cpp:[[@LINE-3]]:1: error: short type can't be used as function return type in AMP-restricted functions -// CHECK-NEXT: short foo_short(unsigned short s) restrict(amp) { -// CHECK-NEXT: ^ - -int f_char_short_wchar_longlong_longdouble() restrict(auto) -{ - char c = 65; - long double ld = 6LL; - long long ll = 6LL; - foo_short(2); - wchar_t c1 = 65; - return 0; -} - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - f_char_short_wchar_longlong_longdouble(); -} -// CHECK: char_short_wchar_longlong_longdouble.cpp:[[@LINE-2]]:3: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: f_char_short_wchar_longlong_longdouble(); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_mutable_keyword.cpp b/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_mutable_keyword.cpp deleted file mode 100644 index b21ba78ecc6..00000000000 --- a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_mutable_keyword.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -void f_wrong_order_of_trailing_return() { - // error: inner lambda has incorrect lamda-declarator clause - parallel_for_each(extent<1>(1), [&](index<1> idx) restrict(amp) { - []() mutable -> void restrict(auto) {}(); // expected_error{{expected body of lambda expression}} - }); -} -// CHECK: after_mutable_keyword.cpp:[[@LINE-3]]:25: error: expected body of lambda expression -// CHECK-NEXT: []() mutable -> void restrict(auto) {}(); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_and_mutable_keyword.cpp b/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_and_mutable_keyword.cpp deleted file mode 100644 index e797abf5f51..00000000000 --- a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_and_mutable_keyword.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -void f_wrong_order_of_mutable_throw() { - // error: 
inner lambda has incorrect lamda-declarator clause - parallel_for_each(extent<1>(1), [&](index<1> idx) restrict(amp) { - []() mutable throw() -> void restrict(auto) {}(); - }); -} -// CHECK: after_throw_and_mutable_keyword.cpp:[[@LINE-3]]:17: error: exception specifier is not allowed in C++AMP context -// CHECK-NEXT: []() mutable throw() -> void restrict(auto) {}(); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_keyword_1.cpp b/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_keyword_1.cpp deleted file mode 100644 index b4e59abfa89..00000000000 --- a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_keyword_1.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -//restriction after throw -void f_after_throw() throw(...) restrct(auto) -{} -// CHECK: after_throw_keyword_1.cpp:[[@LINE-2]]:33: error: expected function body after function declarator -// CHECK-NEXT:void f_after_throw() throw(...) restrct(auto) -// CHECK-NEXT: ^ - -void AMP_AND_CPU_Func() restrict(cpu,amp) { - f_throw(); -} - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_keyword_2.cpp b/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_keyword_2.cpp deleted file mode 100644 index 9758a94fae0..00000000000 --- a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/after_throw_keyword_2.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -void f_wrong_order_of_throw() { - // error: inner lambda has incorrect lamda-declarator clause - parallel_for_each(extent<1>(1), [&](index<1> idx) restrict(amp) { - []() throw() -> void restrict(auto) {}(); - }); -} -// CHECK: after_throw_keyword_2.cpp:[[@LINE-3]]:9: error: exception specifier is not allowed in C++AMP context -// CHECK-NEXT: []() throw() -> void restrict(auto) {}(); -// CHECK-NEXT: ^ -// CHECK-NEXT: after_throw_keyword_2.cpp:[[@LINE-6]]:25: error: expected body of lambda expression -// CHECK-NEXT: []() throw() -> void restrict(auto) {}(); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/auto_in_function_prototype.cpp b/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/auto_in_function_prototype.cpp deleted file mode 100644 index 3589a3d1467..00000000000 --- a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/auto_in_function_prototype.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - 
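// --- Editor's sketch (not part of the patch): every negative test removed
// here follows the same template: compile for the device, pipe the
// diagnostics into FileCheck, and match them with line-relative
// [[@LINE-N]] patterns. Hypothetical example of the shape (file name,
// column, and message are illustrative):
//
//   // RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s ... 2>&1 | %FileCheck %s
//   void bad() restrict(auto) { throw 1; }
//   // CHECK: example.cpp:[[@LINE-1]]:{{[0-9]+}}: error: {{.*}}
//
// [[@LINE-1]] expands to the line number directly above the CHECK line,
// which is why each of these files warns "Do not delete or add any line".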
-////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include - -using namespace concurrency; - -int test(int (*p)(int, int) restrict(auto)) // expected-error{{'auto' not allowed in function prototype}} -{ - return 1; -} -// CHECK: auto_in_function_prototype.cpp:[[@LINE-4]]:42: error: 'auto' restriction specifier is only allowed on function definition -// CHECK-NEXT:int test(int (*p)(int, int) restrict(auto)) -// CHECK-NEXT: ^ -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_CV.cpp b/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_CV.cpp deleted file mode 100644 index 298312c5992..00000000000 --- a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_CV.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include - -using namespace concurrency; - -// before VC -int const_FUNC() restrict(auto) const {return 1;} -// CHECK: before_CV.cpp:[[@LINE-1]]:31: error: 'auto' restriction specifier is only allowed on function definition -// CHECK-NEXT:int const_FUNC() restrict(auto) const {return 1;} -// CHECK-NEXT: ^ -// CHECK-NEXT:before_CV.cpp:[[@LINE-4]]:32: error: expected ';' after top level declarator -// CHECK-NEXT:int const_FUNC() restrict(auto) const {return 1;} -// CHECK-NEXT: ^ -// CHECK-NEXT: ; -// CHECK-NEXT:before_CV.cpp:[[@LINE-8]]:39: error: expected unqualified-id -// CHECK-NEXT:int const_FUNC() restrict(auto) const {return 1;} - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_function_name.cpp b/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_function_name.cpp deleted file mode 100644 index b4785b744ce..00000000000 --- a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_function_name.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -// before function name -int restrict(auto) f_before_function_name() { - return 1; -} -// CHECK: before_function_name.cpp:[[@LINE-3]]:14: error: 'auto' not allowed in function prototype -// CHECK-NEXT:int restrict(auto) f_before_function_name() { -// CHECK-NEXT: ^~~~ -// CHECK-NEXT:before_function_name.cpp:[[@LINE-6]]:20: error: expected 'restrict' specifier -// CHECK-NEXT:int restrict(auto) f_before_function_name() { -// CHECK-NEXT: ^ -// CHECK-NEXT:before_function_name.cpp:[[@LINE-9]]:13: error: function cannot return function type 'int ()' -// CHECK-NEXT:int restrict(auto) f_before_function_name() { - -int main(void) -{ - return 0; -} - diff --git 
a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_function_type.cpp b/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_function_type.cpp deleted file mode 100644 index 6bc1048eb83..00000000000 --- a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/before_function_type.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -// before function type -restrict(auto) int f_before_function_type() restrict(amp) {return 1;} -// CHECK: before_function_type.cpp:[[@LINE-1]]:10: error: 'auto' not allowed in function prototype -// CHECK-NEXT:restrict(auto) int f_before_function_type() restrict(amp) {return 1;} -// CHECK-NEXT: ^~~~ -// CHECK-NEXT:before_function_type.cpp:[[@LINE-4]]:1: error: C++ requires a type specifier for all declarations -// CHECK-NEXT:restrict(auto) int f_before_function_type() restrict(amp) {return 1;} - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/most_vexing_parse.cpp b/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/most_vexing_parse.cpp deleted file mode 100644 index 49f8b346712..00000000000 --- a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/most_vexing_parse.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -//Most vexing parse -struct S_vex { - S_vex(int) {}; -}; - -int f_most_vexing_parse() { - int a = 1; - S_vex foo((int) restrict(auto) a); -// CHECK: most_vexing_parse.cpp:[[@LINE-1]]:30: error: expected expression -// CHECK-NEXT: S_vex foo((int) restrict(auto) a); -// CHECK-NEXT: ^ - S_vex foo1((int)a) restrict(auto); // expected_error{{expected ';' at end of declaration}} -// CHECK: most_vexing_parse.cpp:[[@LINE-1]]:23: error: expected ';' at end of declaration -// CHECK-NEXT: S_vex foo1((int)a) restrict(auto); -// CHECK-NEXT: ^ -// CHECK-NEXT: ; - return 1; -} - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/on_more_declarations.cpp b/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/on_more_declarations.cpp deleted file mode 100644 index 03d08c46749..00000000000 --- a/tests/Unit/AutoRestricted/Negative/auto-on-wrong-place/on_more_declarations.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -// different restriction specifier on function 
declaration and definition -struct S -{ - int test() restrict(amp); -}; - -int S::test() restrict(auto) { - return 1; -} -// CHECK: on_more_declarations.cpp:[[@LINE-3]]:28: error: 'test': expected no other declaration since it is auto restricted -// CHECK-NEXT:int S::test() restrict(auto) -// CHECK-NEXT: ^ -// CHECK-NEXT:note: previous declaration is here -// CHECK-NEXT: int test() restrict(amp); -// CHECK-NEXT: ^ -// CHECK-NEXT:on_more_declarations.cpp:[[@LINE-9]]:8: error: out-of-line definition of 'test' does not match any declaration in 'S' -// CHECK-NEXT:int S::test() restrict(auto) -// CHECK-NEXT ^~~~ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/declarator_not_definition.cpp b/tests/Unit/AutoRestricted/Negative/declarator_not_definition.cpp deleted file mode 100644 index 72d9807cc7c..00000000000 --- a/tests/Unit/AutoRestricted/Negative/declarator_not_definition.cpp +++ /dev/null @@ -1,16 +0,0 @@ -//XFAIL:* -// RUN: %cxxamp %s -o %t.out && %t.out -#include - -int f1() restrict(cpu,amp) {return 1;} -int f2xx() restrict(cpu,auto); // expected-error{{'auto' restriction specifier is only allowed on function definition}} -int f2xx() restrict(cpu) -{ - return f1(); -} -int main(void) -{ - f2xx(); - return 0; // should not compile -} - diff --git a/tests/Unit/AutoRestricted/Negative/diagnose_before_perform_inferring_AMP.cpp b/tests/Unit/AutoRestricted/Negative/diagnose_before_perform_inferring_AMP.cpp deleted file mode 100644 index 61f31f7d808..00000000000 --- a/tests/Unit/AutoRestricted/Negative/diagnose_before_perform_inferring_AMP.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: %cxxamp %s -o %t.out 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include - -int f1() restrict(amp) {return 1;} -int f2() restrict(cpu,auto) { - return f1(); -} -// CHECK: diagnose_before_perform_inferring_AMP.cpp:[[@LINE-2]]:12: error: 'f1': no overloaded function has restriction specifiers that are compatible with the ambient context 'f2' -// CHECK-NEXT: return f1(); -// CHECK-NEXT: ^ - - -int main(void) -{ - f2(); - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/diagnose_before_perform_inferring_CPU.cpp b/tests/Unit/AutoRestricted/Negative/diagnose_before_perform_inferring_CPU.cpp deleted file mode 100644 index 84f8700f0fb..00000000000 --- a/tests/Unit/AutoRestricted/Negative/diagnose_before_perform_inferring_CPU.cpp +++ /dev/null @@ -1,21 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include - -int f1() restrict(cpu) {return 1;} -int f2() restrict(amp,auto) { - return f1(); -} -// CHECK: diagnose_before_perform_inferring_CPU.cpp:[[@LINE-2]]:10: error: call from AMP-restricted function to CPU-restricted function -// CHECK-NEXT: return f1(); -// CHECK-NEXT: ^ -int main(void) -{ - f2(); - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/function_reference.cpp b/tests/Unit/AutoRestricted/Negative/function_reference.cpp 
deleted file mode 100644 index ab7cfe50953..00000000000 --- a/tests/Unit/AutoRestricted/Negative/function_reference.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// RUN: %cxxamp %s -o %t.out 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; - -//initialize function reference with a function with incompatible restriction specifier -int glorp(int x) restrict(amp) { - return 668 + x; -} - -int f_func_ref() restrict(auto) { - typedef int FT(int); - FT& p = glorp; - return 1; -} - -void CPU_Func() restrict(cpu) -{ - f_func_ref(); -} - -int main(void) -{ - return 0; -} -// CHECK: In function `f_func_ref()': -// CHECK: undefined reference to `glorp(int)' diff --git a/tests/Unit/AutoRestricted/Negative/has_other_declarations.cpp b/tests/Unit/AutoRestricted/Negative/has_other_declarations.cpp deleted file mode 100644 index 69a5f97fd8b..00000000000 --- a/tests/Unit/AutoRestricted/Negative/has_other_declarations.cpp +++ /dev/null @@ -1,15 +0,0 @@ -//XFAIL:* -// RUN: %cxxamp %s -o %t.out && %t.out -#include - -int f1() restrict(cpu,amp) {return 1;} -int f2() restrict(cpu); // expected-note{{previous declaration is here}} -int f2() restrict(cpu,auto) { // expected-error{{'f2': expected no other declaration since it is auto restricted}} - return f1(); -} -int main(void) -{ - f2(); - return 0; // should not compile -} - diff --git a/tests/Unit/AutoRestricted/Negative/infer_error_amp.cpp b/tests/Unit/AutoRestricted/Negative/infer_error_amp.cpp deleted file mode 100644 index 26096ad89bb..00000000000 --- a/tests/Unit/AutoRestricted/Negative/infer_error_amp.cpp +++ /dev/null @@ -1,28 +0,0 @@ -// RUN: %cxxamp %s -o %t.out 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include - -int f1() restrict(amp) {return 1;} -int f2() restrict(auto) { - return f1(); -} -// CHECK: infer_error_amp.cpp:[[@LINE-2]]:12: error: 'f1': no overloaded function has restriction specifiers that are compatible with the ambient context 'f2' -// CHECK-NEXT: return f1(); -// CHECK-NEXT: ^ - -int CPU_Func() restrict(cpu) { - return f2(); -} -// CHECK: infer_error_amp.cpp:[[@LINE-2]]:12: error: 'f2': no overloaded function has restriction specifiers that are compatible with the ambient context 'CPU_Func' -// CHECK-NEXT: return f2(); -// CHECK-NEXT: ^ - -int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/Negative/infer_error_cpu.cpp b/tests/Unit/AutoRestricted/Negative/infer_error_cpu.cpp deleted file mode 100644 index 93588868abf..00000000000 --- a/tests/Unit/AutoRestricted/Negative/infer_error_cpu.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s - -////////////////////////////////////////////////////////////////////////////////// -// Do not delete or add any line; it is referred to by absolute line number in the -// FileCheck lines below -////////////////////////////////////////////////////////////////////////////////// -#include - -int f1() 
restrict(cpu) {return 1;}
-int f2() restrict(auto) {
- return f1();
-}
-
-int AMP_Func() restrict(amp) {
- return f2();
-}
-// CHECK: infer_error_cpu.cpp:[[@LINE-2]]:10: error: call from AMP-restricted function to CPU-restricted function
-// CHECK-NEXT: return f2();
-// CHECK-NEXT: ^
-int main(void)
-{
- return 0;
-}
-
diff --git a/tests/Unit/AutoRestricted/Negative/restriction_inferred_should_have_no_non-auto-restriction_added.cpp b/tests/Unit/AutoRestricted/Negative/restriction_inferred_should_have_no_non-auto-restriction_added.cpp
deleted file mode 100644
index 21c7b25c8f1..00000000000
--- a/tests/Unit/AutoRestricted/Negative/restriction_inferred_should_have_no_non-auto-restriction_added.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
-
-//////////////////////////////////////////////////////////////////////////////////
-// Do not delete or add any line; it is referred to by absolute line number in the
-// FileCheck lines below
-//////////////////////////////////////////////////////////////////////////////////
-#include
-
-int f1() restrict(cpu) {return 1;}
-int f2() restrict(cpu,auto) {
- return f1();
-}
-
-int AMP_Func() restrict(amp) {
- return f2();
-}
-// CHECK: restriction_inferred_should_have_no_non-auto-restriction_added.cpp:[[@LINE-2]]:10: error: call from AMP-restricted function to CPU-restricted function
-// CHECK-NEXT: return f2();
-// CHECK-NEXT: ^
-
-int main(void)
-{
- return 0;
-}
-
diff --git a/tests/Unit/AutoRestricted/auto_auto.cpp b/tests/Unit/AutoRestricted/auto_auto.cpp
deleted file mode 100644
index a320a850d18..00000000000
--- a/tests/Unit/AutoRestricted/auto_auto.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// RUN: %hc -Xclang -fauto-auto %s -o %t.out && %t.out
-
-#include
-#include
-#include
-
-using namespace concurrency;
-
-bool CopyArray() {
- std::vector va{1, 2, 3, 4, 5};
- std::vector vb{6, 7, 8, 9, 10};
-
- array a(va.size(), va.data());
- array b(vb.size(), vb.data());
- array c(va.size());
-
- parallel_for_each(
- extent<1>(va.size()),
- [&](index<1> idx)
- {
- c(idx) = a(idx) + b(idx);
- }
- );
-
- std::vector vsum = c;
-
- // verify
- for (int i = 0; i < 5; i++) {
- if (vsum[i] != va[i] + vb[i]) {
- return false;
- }
- }
- return true;
-}
-
-int main() {
- bool ret = true;
-
- ret &= CopyArray();
-
- return !(ret == true);
-}
-
diff --git a/tests/Unit/AutoRestricted/inferred_as_both_cpu_and_amp.cpp b/tests/Unit/AutoRestricted/inferred_as_both_cpu_and_amp.cpp
deleted file mode 100644
index a8aa7d318de..00000000000
--- a/tests/Unit/AutoRestricted/inferred_as_both_cpu_and_amp.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-// RUN: %cxxamp %s -o %t.out && %t.out
-#include
-#include
-using namespace std;
-
-int f1() restrict(cpu) {return 1;}
-int f1() restrict(amp) {return 2;}
-
-int f2() restrict(auto) {
- return f1();
-}
-
-// If not inferred or wrongly inferred
-int CPU_Func() restrict(cpu)
-{
- if(f2() != 1) // if inferred to be amp only, expected-error{{call from CPU-restricted function to AMP-restricted function}}
- std::cout<<"Fail to verify result of f2() in CPU path!\n";
-
- return f2();
-}
-
-
-// If not inferred or wrongly inferred
-int AMP_Func() restrict(amp)
-{
- if(f2() != 2) // if inferred to be cpu only, expected-error{{call from AMP-restricted function to CPU-restricted function}}
- {
- std::cout<<"Fail to verify result of f2() in GPU path!\n";
- exit(1);
- }
-
- return f2();
-}
-
-int AMP_AND_CPU_Func() restrict(cpu,amp)
-{
- return f2(); // OK
-}
-
-
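// Sketch of the intended inference in the test above (an explanatory note;
// the behaviour is implied by the test rather than stated in it): f2() is
// restrict(auto) and its body calls f1(), which has both a cpu and an amp
// overload, so the set inferred for f2 is (cpu,amp). Each calling context
// then binds f1 to its own overload -- the cpu path sees f1() == 1 and the
// amp path sees f1() == 2, which is exactly what CPU_Func() and AMP_Func()
// verify.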
-int main(void) -{ - return 0; -} - diff --git a/tests/Unit/AutoRestricted/normal.cpp b/tests/Unit/AutoRestricted/normal.cpp deleted file mode 100644 index 2bcbed11b74..00000000000 --- a/tests/Unit/AutoRestricted/normal.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: %cxxamp %s -o %t.out && %t.out -#include - -int f1() restrict(cpu,amp) {return 1;} -int f2() restrict(cpu,auto) { - return f1(); -} -int main(void) -{ - f2(); - return 0; // expected: success -} - diff --git a/tests/Unit/AutoRestricted/on_lambda.cpp b/tests/Unit/AutoRestricted/on_lambda.cpp deleted file mode 100644 index 71cbea2bbc5..00000000000 --- a/tests/Unit/AutoRestricted/on_lambda.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// RUN: %cxxamp %s -o %t.out && %t.out -#include -#include -#include -int main(void){ - const int vecSize = 100; - - // Alloc & init input data - Concurrency::extent<1> e(vecSize); - Concurrency::array a(vecSize); - Concurrency::array b(vecSize); - Concurrency::array c(vecSize); - int sum = 0; - Concurrency::array_view ga(a); - Concurrency::array_view gb(b); - Concurrency::array_view gc(c); - for (Concurrency::index<1> i(0); i[0] < vecSize; i++) { - ga[i] = 100.0f * rand() / RAND_MAX; - gb[i] = 100.0f * rand() / RAND_MAX; - sum += a[i] + b[i]; - } - - Concurrency::parallel_for_each( - e, - [=](Concurrency::index<1> idx) restrict(amp,auto) { - gc[idx] = ga[idx]+gb[idx]; - }); - - int error = 0; - for(unsigned i = 0; i < vecSize; i++) { - error += gc[i] - (ga[i] + gb[i]); - } - return (error != 0); -} -// SPIR code generation test -// CHECK: metadata !{metadata !"kernel_arg_addr_space", i32 0, diff --git a/tests/Unit/AutoRestricted/only_auto.cpp b/tests/Unit/AutoRestricted/only_auto.cpp deleted file mode 100644 index ed38a3e05db..00000000000 --- a/tests/Unit/AutoRestricted/only_auto.cpp +++ /dev/null @@ -1,20 +0,0 @@ -// RUN: %cxxamp %s -o %t.out && %t.out -#include - -int f1() restrict(cpu,amp) {return 1;} -int f2() restrict(auto) { - static int i; - return f1(); -} - -int AMP_AND_CPU_Func() restrict(cpu,amp) -{ - f2(); // OK. 
'auto' is inferred to (cpu,amp) - return 1; -} - -int main(void) -{ - return 0; // expected: success -} - diff --git a/tests/Unit/AutoRestricted/restriction_inferred.cpp b/tests/Unit/AutoRestricted/restriction_inferred.cpp deleted file mode 100644 index efd2c2b5cbe..00000000000 --- a/tests/Unit/AutoRestricted/restriction_inferred.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// RUN: %cxxamp %s -o %t.out && %t.out -#include - -int f1() restrict(cpu,amp) {return 1;} -// DeclRefExpr -int f2() restrict(cpu,auto) { - return f1(); -} - -// null -void f_null() restrict(cpu,auto) { -} - - -// ReturnStmt -int f_return() restrict(cpu,auto) { - return 1; -} - -// CXXTryStmt -// GotoStmt -// LabelStmt - - -int AMP_CPU_Func() restrict(cpu,amp) -{ - f2(); // OK, 'auto' is inferred to amp, so f2 is both (cpu,amp) restricted - f_null(); // OK - f_return(); // OK -} - -int main(void) -{ - return 0; // expected: success -} - diff --git a/tests/Unit/CXXLangExt/array_array.cpp b/tests/Unit/CXXLangExt/array_array.cpp index e305ecce2b1..09a4bf3454a 100644 --- a/tests/Unit/CXXLangExt/array_array.cpp +++ b/tests/Unit/CXXLangExt/array_array.cpp @@ -2,7 +2,7 @@ // RUN: %hc %s -o %t.out && %t.out #include -#include +#include // added for checking HSA profile #include @@ -19,8 +19,8 @@ bool test() int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { int arr[vecSize][vecSize]; diff --git a/tests/Unit/CXXLangExt/array_pointer.cpp b/tests/Unit/CXXLangExt/array_pointer.cpp index 14c3e532d9a..43531acce7b 100644 --- a/tests/Unit/CXXLangExt/array_pointer.cpp +++ b/tests/Unit/CXXLangExt/array_pointer.cpp @@ -2,7 +2,7 @@ // RUN: %hc %s -o %t.out && %t.out #include -#include +#include // added for checking HSA profile #include @@ -18,8 +18,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { int var1 = idx[0]; int var2 = idx[0] * 2; diff --git a/tests/Unit/CXXLangExt/enum.cpp b/tests/Unit/CXXLangExt/enum.cpp index 241edc4f376..609b142c508 100644 --- a/tests/Unit/CXXLangExt/enum.cpp +++ b/tests/Unit/CXXLangExt/enum.cpp @@ -21,7 +21,7 @@ // RUN: %hc -DTYPE="unsigned long long" %s -o %t.out && %t.out #include -#include +#include // added for checking HSA profile #include @@ -49,8 +49,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { p_ans[idx[0]] = (int)E::ZERO + (int)EC::ZERO + (int)ES::ZERO; }); diff --git a/tests/Unit/CXXLangExt/function_declarator_Varargs.cpp b/tests/Unit/CXXLangExt/function_declarator_Varargs.cpp index f730aeebab4..aa5ad0fc04f 100644 --- a/tests/Unit/CXXLangExt/function_declarator_Varargs.cpp +++ b/tests/Unit/CXXLangExt/function_declarator_Varargs.cpp @@ -1,9 +1,9 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include -void NoEllipsisAllowed(int x, ...) restrict(amp) {} +void NoEllipsisAllowed(int x, ...) 
[[hc]] {} int main() { diff --git a/tests/Unit/CXXLangExt/local_param_ret.cpp b/tests/Unit/CXXLangExt/local_param_ret.cpp index 492483d3ec4..e7fec540c5e 100644 --- a/tests/Unit/CXXLangExt/local_param_ret.cpp +++ b/tests/Unit/CXXLangExt/local_param_ret.cpp @@ -30,7 +30,7 @@ // RUN: %hc -DTYPE="wchar_t" %s -o %t.out && %t.out -#include +#include // added for checking HSA profile #include @@ -40,7 +40,7 @@ // test C++AMP with fine-grained SVM // requires HSA Full Profile to operate successfully -TYPE func(TYPE arg) restrict(amp) +TYPE func(TYPE arg) [[hc]] { TYPE local = arg; return local; @@ -54,8 +54,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { p_ans[idx[0]] = func((TYPE)idx[0]); }); diff --git a/tests/Unit/CXXLangExt/local_param_ret_half-float.cpp b/tests/Unit/CXXLangExt/local_param_ret_half-float.cpp index 53aa2fdfeff..710112f7d18 100644 --- a/tests/Unit/CXXLangExt/local_param_ret_half-float.cpp +++ b/tests/Unit/CXXLangExt/local_param_ret_half-float.cpp @@ -2,9 +2,9 @@ // RUN: %hc -DTYPE="half float" %s -o %t.out && %t.out #include -#include +#include -TYPE func(TYPE arg) restrict(amp) +TYPE func(TYPE arg) [[hc]] { TYPE local = arg; return local; @@ -20,8 +20,8 @@ int main () int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { p_ans[idx[0]] = func((TYPE)idx[0]); }); diff --git a/tests/Unit/CXXLangExt/local_param_ret_pointer-to-function.cpp b/tests/Unit/CXXLangExt/local_param_ret_pointer-to-function.cpp index 4056646031f..181999a4ffc 100644 --- a/tests/Unit/CXXLangExt/local_param_ret_pointer-to-function.cpp +++ b/tests/Unit/CXXLangExt/local_param_ret_pointer-to-function.cpp @@ -2,7 +2,7 @@ // RUN: %hc %s -o %t.out && %t.out #include -#include +#include // added for checking HSA profile #include @@ -10,7 +10,7 @@ // test C++AMP with fine-grained SVM // requires HSA Full Profile to operate successfully -int func(float arg1, char arg2, char arg3) restrict(amp, cpu) +int func(float arg1, char arg2, char arg3) [[cpu, hc]] { return (int)(arg2 + arg3); } @@ -23,8 +23,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { int (*pt2Function)(float, char, char) = &func; p_ans[idx[0]] = (*pt2Function)(0, (char)idx[0], (char)idx[0]); diff --git a/tests/Unit/CXXLangExt/local_param_ret_pointer.cpp b/tests/Unit/CXXLangExt/local_param_ret_pointer.cpp index 7868656f4ba..33aac7f0b13 100644 --- a/tests/Unit/CXXLangExt/local_param_ret_pointer.cpp +++ b/tests/Unit/CXXLangExt/local_param_ret_pointer.cpp @@ -30,7 +30,7 @@ // RUN: %hc -DTYPE="wchar_t" %s -o %t.out && %t.out -#include +#include // added for checking HSA profile #include @@ -40,7 +40,7 @@ // test C++AMP with fine-grained SVM // requires HSA Full Profile to operate successfully -TYPE * func(TYPE * arg) restrict(amp) +TYPE * func(TYPE * arg) [[hc]] { TYPE * local = arg; return local; @@ -54,8 +54,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { TYPE var = (TYPE)idx[0]; p_ans[idx[0]] = *(func(&var)); diff --git a/tests/Unit/CXXLangExt/local_param_ret_ref-to-pointer.cpp 
b/tests/Unit/CXXLangExt/local_param_ret_ref-to-pointer.cpp index fe0379b98d5..f86eeb0e762 100644 --- a/tests/Unit/CXXLangExt/local_param_ret_ref-to-pointer.cpp +++ b/tests/Unit/CXXLangExt/local_param_ret_ref-to-pointer.cpp @@ -30,7 +30,7 @@ // RUN: %hc -DTYPE="wchar_t" %s -o %t.out && %t.out -#include +#include // added for checking HSA profile #include @@ -40,7 +40,7 @@ // test C++AMP with fine-grained SVM // requires HSA Full Profile to operate successfully -TYPE *& func(TYPE *& arg) restrict(amp) +TYPE *& func(TYPE *& arg) [[hc]] { TYPE *& local = arg; return local; @@ -54,8 +54,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { TYPE var = (TYPE)idx[0]; TYPE * p_var = &var; diff --git a/tests/Unit/CXXLangExt/local_param_ret_ref.cpp b/tests/Unit/CXXLangExt/local_param_ret_ref.cpp index 83d30e1714c..9ef2038bee9 100644 --- a/tests/Unit/CXXLangExt/local_param_ret_ref.cpp +++ b/tests/Unit/CXXLangExt/local_param_ret_ref.cpp @@ -30,7 +30,7 @@ // RUN: %hc -DTYPE="wchar_t" %s -o %t.out && %t.out -#include +#include // added for checking HSA profile #include @@ -40,7 +40,7 @@ // test C++AMP with fine-grained SVM // requires HSA Full Profile to operate successfully -TYPE & func(TYPE & arg) restrict(amp) +TYPE & func(TYPE & arg) [[hc]] { TYPE & local = arg; return local; @@ -54,8 +54,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { TYPE var = (TYPE)idx[0]; p_ans[idx[0]] = func(var); diff --git a/tests/Unit/CXXLangExt/local_param_ret_static-local.cpp b/tests/Unit/CXXLangExt/local_param_ret_static-local.cpp index 45f5039cbdf..c10e3e1fd52 100644 --- a/tests/Unit/CXXLangExt/local_param_ret_static-local.cpp +++ b/tests/Unit/CXXLangExt/local_param_ret_static-local.cpp @@ -28,7 +28,7 @@ // RUN: %hc -DTYPE="wchar_t" %s -o %t.out && %t.out -#include +#include // added for checking HSA profile #include @@ -39,7 +39,7 @@ // test C++AMP with fine-grained SVM // requires HSA Full Profile to operate successfully -TYPE func(TYPE arg) restrict(amp) +TYPE func(TYPE arg) [[hc]] { static TYPE local = 0; local += arg; @@ -54,8 +54,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { func((TYPE)idx[0]); p_ans[idx[0]] = func((TYPE)idx[0]); diff --git a/tests/Unit/CXXLangExt/statement_asm.cpp b/tests/Unit/CXXLangExt/statement_asm.cpp index 720e594fdb3..00f500324d0 100644 --- a/tests/Unit/CXXLangExt/statement_asm.cpp +++ b/tests/Unit/CXXLangExt/statement_asm.cpp @@ -2,9 +2,9 @@ // RUN: %hc %s -o %t.out && %t.out #include -#include +#include -void func () restrict(amp) +void func () [[hc]] { asm("ret"); } @@ -19,8 +19,8 @@ int main () int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { func(); p_ans[idx[0]] = idx[0]; diff --git a/tests/Unit/CXXLangExt/statement_global-variable.cpp b/tests/Unit/CXXLangExt/statement_global-variable.cpp index 07e208c9767..5a03208c98e 100644 --- a/tests/Unit/CXXLangExt/statement_global-variable.cpp +++ b/tests/Unit/CXXLangExt/statement_global-variable.cpp @@ -2,11 +2,11 @@ // RUN: %hc %s -o %t.out && %t.out #include 
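// The recurring C++AMP -> HC migration applied throughout these hunks, side
// by side (the restriction spellings are taken from the hunks themselves;
// the header names are an assumption inferred from the hc:: usages, since
// the include targets are not visible in this copy):
//
//   // C++AMP                                // HC
//   #include <amp.h>                         #include <hc.hpp>
//   using namespace concurrency;             using namespace hc;
//   void f() restrict(amp) {}                void f() [[hc]] {}
//   void g() restrict(amp, cpu) {}           void g() [[cpu, hc]] {}
//   [=](index<1> i) restrict(amp) {}         [=](index<1> i) [[hc]] {}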
-#include +#include [[hc]] int flag; -void foo(bool set) restrict(amp, cpu) +void foo(bool set) [[cpu, hc]] { flag = set ? 1 : 0; } diff --git a/tests/Unit/CXXLangExt/statement_goto_label.cpp b/tests/Unit/CXXLangExt/statement_goto_label.cpp index d0aa1eaa0ae..5aff19ae8ed 100644 --- a/tests/Unit/CXXLangExt/statement_goto_label.cpp +++ b/tests/Unit/CXXLangExt/statement_goto_label.cpp @@ -2,7 +2,7 @@ // RUN: %hc %s -o %t.out && %t.out #include -#include +#include // added for checking HSA profile #include @@ -10,7 +10,7 @@ // test C++AMP with fine-grained SVM // requires HSA Full Profile to operate successfully -void foo() restrict(amp) +void foo() [[hc]] { goto L; L: @@ -25,8 +25,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { foo(); p_ans[idx[0]] = idx[0]; diff --git a/tests/Unit/CXXLangExt/statement_recursion.cpp b/tests/Unit/CXXLangExt/statement_recursion.cpp index 168a9919fe5..991a160fb84 100644 --- a/tests/Unit/CXXLangExt/statement_recursion.cpp +++ b/tests/Unit/CXXLangExt/statement_recursion.cpp @@ -2,7 +2,7 @@ // RUN: %hc %s -o %t.out && %t.out #include -#include +#include int fib(int x) restrict (amp, cpu) { if (x == 0) return 0; @@ -20,8 +20,8 @@ int main () int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { p_ans[idx[0]] = fib(idx[0]); }); diff --git a/tests/Unit/CXXLangExt/struct_class_union.cpp b/tests/Unit/CXXLangExt/struct_class_union.cpp index ef592675047..cd6f7315ea7 100644 --- a/tests/Unit/CXXLangExt/struct_class_union.cpp +++ b/tests/Unit/CXXLangExt/struct_class_union.cpp @@ -29,7 +29,7 @@ // RUN: %hc -DTYPE="bool" %s -o %t.out && %t.out // RUN: %hc -DTYPE="wchar_t" %s -o %t.out && %t.out -#include +#include // added for checking HSA profile #include @@ -60,8 +60,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { S s; s.var = (TYPE)idx[0]; diff --git a/tests/Unit/CXXLangExt/struct_class_union_bitfields.cpp b/tests/Unit/CXXLangExt/struct_class_union_bitfields.cpp index cbacfa7b94c..2ac0e5700bd 100644 --- a/tests/Unit/CXXLangExt/struct_class_union_bitfields.cpp +++ b/tests/Unit/CXXLangExt/struct_class_union_bitfields.cpp @@ -2,7 +2,7 @@ // RUN: %hc %s -o %t.out && %t.out #include -#include +#include // added for checking HSA profile #include @@ -31,8 +31,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { S s; s.bit = 7; diff --git a/tests/Unit/CXXLangExt/struct_class_union_half-float.cpp b/tests/Unit/CXXLangExt/struct_class_union_half-float.cpp index 2ded3c1dae3..9cb2e53e379 100644 --- a/tests/Unit/CXXLangExt/struct_class_union_half-float.cpp +++ b/tests/Unit/CXXLangExt/struct_class_union_half-float.cpp @@ -2,7 +2,7 @@ // RUN: %hc -DTYPE="half float" %s -o %t.out && %t.out #include -#include +#include struct S { TYPE var; @@ -27,8 +27,8 @@ int main () int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { S s; s.var = (TYPE)idx[0]; diff --git 
a/tests/Unit/CXXLangExt/struct_class_union_pointer.cpp b/tests/Unit/CXXLangExt/struct_class_union_pointer.cpp index 490b558a3dd..b8697ad48e2 100644 --- a/tests/Unit/CXXLangExt/struct_class_union_pointer.cpp +++ b/tests/Unit/CXXLangExt/struct_class_union_pointer.cpp @@ -30,7 +30,7 @@ // RUN: %hc -DTYPE="wchar_t" %s -o %t.out && %t.out -#include +#include // added for checking HSA profile #include @@ -61,8 +61,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { TYPE var = (TYPE)idx[0]; S s; diff --git a/tests/Unit/CXXLangExt/struct_class_union_ref.cpp b/tests/Unit/CXXLangExt/struct_class_union_ref.cpp index 5176e799e3f..64cd0e48bda 100644 --- a/tests/Unit/CXXLangExt/struct_class_union_ref.cpp +++ b/tests/Unit/CXXLangExt/struct_class_union_ref.cpp @@ -30,7 +30,7 @@ // RUN: %hc -DTYPE="wchar_t" %s -o %t.out && %t.out -#include +#include // added for checking HSA profile #include @@ -59,8 +59,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { TYPE var = (TYPE)idx[0]; S s(var); diff --git a/tests/Unit/CXXLangExt/struct_class_union_unaligned-member.cpp b/tests/Unit/CXXLangExt/struct_class_union_unaligned-member.cpp index cbce20080d3..e2c4ed63233 100644 --- a/tests/Unit/CXXLangExt/struct_class_union_unaligned-member.cpp +++ b/tests/Unit/CXXLangExt/struct_class_union_unaligned-member.cpp @@ -2,7 +2,7 @@ // RUN: %hc %s -o %t.out && %t.out #include -#include +#include // added for checking HSA profile #include @@ -26,8 +26,8 @@ bool test() { int *p_ans = &ans[0]; parallel_for_each( - Concurrency::extent<1>(vecSize), - [=](Concurrency::index<1> idx) restrict(amp) { + hc::extent<1>(vecSize), + [=](hc::index<1> idx) [[hc]] { S s; s.a = (int)idx[0]; diff --git a/tests/Unit/CXXLangExt/struct_class_virtual-base-class.cpp b/tests/Unit/CXXLangExt/struct_class_virtual-base-class.cpp index 8793d3ba589..0d854eeac0d 100644 --- a/tests/Unit/CXXLangExt/struct_class_virtual-base-class.cpp +++ b/tests/Unit/CXXLangExt/struct_class_virtual-base-class.cpp @@ -2,7 +2,7 @@ // RUN: %hc %s -o %t.out && %t.out #include -#include +#include class MyBaseClass { @@ -12,7 +12,7 @@ class MyBaseClass class MyDerivedClass : virtual public MyBaseClass {}; -void VirtualBaseClassNotAllowed(int x) restrict(amp) +void VirtualBaseClassNotAllowed(int x) [[hc]] { MyDerivedClass obj; } diff --git a/tests/Unit/CaptureByCopy/test1.cpp b/tests/Unit/CaptureByCopy/test1.cpp index 02df146ef15..fb1b6b009d5 100644 --- a/tests/Unit/CaptureByCopy/test1.cpp +++ b/tests/Unit/CaptureByCopy/test1.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -15,13 +15,13 @@ #define SIZE (128) -using namespace concurrency; +using namespace hc; class user_functor { public: - user_functor() restrict(amp,cpu) {} + user_functor() [[cpu, hc]] {} - long value(const int& i) const restrict(amp,cpu) { return i + 1; } + long value(const int& i) const [[cpu, hc]] { return i + 1; } }; // test get the result from the functor, store the value on stack and use it @@ -37,7 +37,7 @@ bool test1(const user_functor& functor) { *accumulator = 0; extent<1> ex(SIZE); - parallel_for_each(ex, [=] (index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { long t = functor.value(idx[0]); terms[idx[0]] = t; 
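// Note: accumulator is assumed to point at a std::atomic counter placed in
// fine-grained SVM (its declaration sits outside this hunk); the fetch_add
// on the next line folds each work-item's contribution in without a data
// race.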
accumulator->fetch_add(t); @@ -76,7 +76,7 @@ bool test2(const user_functor& functor) { *accumulator = 0; extent<1> ex(SIZE); - parallel_for_each(ex, [=] (index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { terms[idx[0]] = functor.value(idx[0]); accumulator->fetch_add(terms[idx[0]]); }); @@ -114,7 +114,7 @@ bool test3(const user_functor& functor) { *accumulator = 0; extent<1> ex(SIZE); - parallel_for_each(ex, [=] (index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { long t = idx[0] + 1; terms[idx[0]] = t; accumulator->fetch_add(t); diff --git a/tests/Unit/CaptureByCopy/test2.cpp b/tests/Unit/CaptureByCopy/test2.cpp index c02ae5383a2..ee30b9eaba7 100644 --- a/tests/Unit/CaptureByCopy/test2.cpp +++ b/tests/Unit/CaptureByCopy/test2.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -16,14 +16,14 @@ #define SIZE (128) -using namespace concurrency; +using namespace hc; template class user_functor { public: - user_functor() restrict(amp,cpu) {} + user_functor() [[cpu, hc]] {} - _Tp value(const _Tp& i) const restrict(amp,cpu) { return i + 1; } + _Tp value(const _Tp& i) const [[cpu, hc]] { return i + 1; } }; // test get the result from the functor, store the value on stack and use it @@ -40,7 +40,7 @@ bool test1(const user_functor<_Tp>& functor) { *accumulator = _Tp{}; extent<1> ex(N); - parallel_for_each(ex, [=] (index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { _Tp t = functor.value(idx[0]); terms[idx[0]] = t; accumulator->fetch_add(t); @@ -80,7 +80,7 @@ bool test2(const user_functor<_Tp>& functor) { *accumulator = _Tp{}; extent<1> ex(N); - parallel_for_each(ex, [=] (index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { terms[idx[0]] = functor.value(idx[0]); accumulator->fetch_add(terms[idx[0]]); }); @@ -119,7 +119,7 @@ bool test3(const user_functor<_Tp>& functor) { *accumulator = _Tp{}; extent<1> ex(SIZE); - parallel_for_each(ex, [=] (index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { _Tp t = idx[0] + 1; terms[idx[0]] = t; accumulator->fetch_add(t); diff --git a/tests/Unit/CaptureByCopy/test3.cpp b/tests/Unit/CaptureByCopy/test3.cpp index 49938ef2127..eb8bb87b389 100644 --- a/tests/Unit/CaptureByCopy/test3.cpp +++ b/tests/Unit/CaptureByCopy/test3.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -16,16 +16,16 @@ #define SIZE (128) -using namespace concurrency; +using namespace hc; class user_functor { long val; public: - user_functor(const user_functor& other) restrict(amp,cpu) : val(other.val) {} + user_functor(const user_functor& other) [[cpu, hc]] : val(other.val) {} - user_functor(long v) restrict(amp,cpu) : val(v) {} + user_functor(long v) [[cpu, hc]] : val(v) {} - long value(const int& i) const restrict(amp,cpu) { return static_cast(i) + val; } + long value(const int& i) const [[cpu, hc]] { return static_cast(i) + val; } }; // test get the result from the functor, store the value on stack and use it @@ -41,7 +41,7 @@ bool test1(const user_functor& functor, long val) { *accumulator = 0; extent<1> ex(SIZE); - parallel_for_each(ex, [=] (index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { long t = functor.value(idx[0]); terms[idx[0]] = t; accumulator->fetch_add(t); @@ -80,7 +80,7 @@ bool test2(const user_functor& functor, long val) { *accumulator = 0; extent<1> ex(SIZE); - parallel_for_each(ex, [=] (index<1>& idx) 
restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { terms[idx[0]] = functor.value(idx[0]); accumulator->fetch_add(terms[idx[0]]); }); @@ -118,7 +118,7 @@ bool test3(const user_functor& functor, long val) { *accumulator = 0; extent<1> ex(SIZE); - parallel_for_each(ex, [=] (index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { long t = idx[0] + val; terms[idx[0]] = t; accumulator->fetch_add(t); diff --git a/tests/Unit/CaptureByCopy/test4.cpp b/tests/Unit/CaptureByCopy/test4.cpp index 950d5b309e4..911ee980964 100644 --- a/tests/Unit/CaptureByCopy/test4.cpp +++ b/tests/Unit/CaptureByCopy/test4.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -17,17 +17,17 @@ #define SIZE (128) -using namespace concurrency; +using namespace hc; template class user_functor { _Tp val; public: - user_functor(const user_functor& other) restrict(amp,cpu) : val(other.val) {} + user_functor(const user_functor& other) [[cpu, hc]] : val(other.val) {} - user_functor(_Tp v) restrict(amp,cpu) : val(v) {} + user_functor(_Tp v) [[cpu, hc]] : val(v) {} - _Tp value(const _Tp& i) const restrict(amp,cpu) { return i + val; } + _Tp value(const _Tp& i) const [[cpu, hc]] { return i + val; } }; // test get the result from the functor, store the value on stack and use it @@ -44,7 +44,7 @@ bool test1(const user_functor<_Tp>& functor, _Tp val) { *accumulator = _Tp{}; extent<1> ex(N); - parallel_for_each(ex, [=] (index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { _Tp t = functor.value(idx[0]); terms[idx[0]] = t; accumulator->fetch_add(t); @@ -84,7 +84,7 @@ bool test2(const user_functor<_Tp>& functor, _Tp val) { *accumulator = _Tp{}; extent<1> ex(N); - parallel_for_each(ex, [=] (index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { terms[idx[0]] = functor.value(idx[0]); accumulator->fetch_add(terms[idx[0]]); }); @@ -123,7 +123,7 @@ bool test3(const user_functor<_Tp>& functor, _Tp val) { *accumulator = _Tp{}; extent<1> ex(SIZE); - parallel_for_each(ex, [=] (index<1>& idx) restrict(amp) { + parallel_for_each(ex, [=] (index<1>& idx) [[hc]] { _Tp t = idx[0] + val; terms[idx[0]] = t; accumulator->fetch_add(t); diff --git a/tests/Unit/CaptureByRef/test1.cpp b/tests/Unit/CaptureByRef/test1.cpp index f1125fb7c31..5241c4ac94f 100644 --- a/tests/Unit/CaptureByRef/test1.cpp +++ b/tests/Unit/CaptureByRef/test1.cpp @@ -1,6 +1,6 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -13,7 +13,7 @@ #define VECTOR_SIZE (1024) bool test() { - using namespace Concurrency; + using namespace hc; int table[VECTOR_SIZE]; for (int i = 0; i < VECTOR_SIZE; ++i) { @@ -24,7 +24,7 @@ bool test() { extent<1> ex(VECTOR_SIZE); array_view av(ex, table); - parallel_for_each(av.get_extent(), [&, av](index<1> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [&, av](index<1> idx) [[hc]] { // capture scalar type by reference av[idx] *= (val * val); }); diff --git a/tests/Unit/CaptureByRef/test10.cpp b/tests/Unit/CaptureByRef/test10.cpp index 1264868f5d4..f06ed28dca8 100644 --- a/tests/Unit/CaptureByRef/test10.cpp +++ b/tests/Unit/CaptureByRef/test10.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -19,7 +19,7 @@ struct POD { }; bool test() { - using namespace Concurrency; + using namespace hc; int table[VECTOR_SIZE]; for (int i = 0; i < VECTOR_SIZE; ++i) { @@ -32,7 +32,7 @@ bool test() { extent<1> ex(VECTOR_SIZE); array_view av(ex, table); - 
parallel_for_each(av.get_extent(), [&, av](index<1> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [&, av](index<1> idx) [[hc]] { // capture POD type by reference av[idx] *= (p.foo + p.bar); }); diff --git a/tests/Unit/CaptureByRef/test11.cpp b/tests/Unit/CaptureByRef/test11.cpp index 687f37728bd..1b036fe9791 100644 --- a/tests/Unit/CaptureByRef/test11.cpp +++ b/tests/Unit/CaptureByRef/test11.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -33,7 +33,7 @@ class POD3 { }; bool test() { - using namespace Concurrency; + using namespace hc; int table[VECTOR_SIZE]; for (int i = 0; i < VECTOR_SIZE; ++i) { @@ -57,7 +57,7 @@ bool test() { extent<1> ex(VECTOR_SIZE); array_view av(ex, table); - parallel_for_each(av.get_extent(), [&, av](index<1> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [&, av](index<1> idx) [[hc]] { // capture multitple POD types by reference av[idx] *= ((p.foo + p.bar) + (p2.foo + p2.bar + p2.baz) + (p3.foo + p3.bar + p3.baz + p3.qux)); }); diff --git a/tests/Unit/CaptureByRef/test12.cpp b/tests/Unit/CaptureByRef/test12.cpp index 6b0d387ff46..2fe0b880581 100644 --- a/tests/Unit/CaptureByRef/test12.cpp +++ b/tests/Unit/CaptureByRef/test12.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -19,7 +19,7 @@ struct POD { }; bool test() { - using namespace Concurrency; + using namespace hc; int table[VECTOR_SIZE]; for (int i = 0; i < VECTOR_SIZE; ++i) { @@ -31,7 +31,7 @@ bool test() { p.bar = rand() % 15 + 1; extent<1> ex(VECTOR_SIZE); - parallel_for_each(ex, [&](index<1> idx) restrict(amp) { + parallel_for_each(ex, [&](index<1> idx) [[hc]] { // capture array type, and POD type by reference table[idx[0]] *= (p.foo * p.bar); }); diff --git a/tests/Unit/CaptureByRef/test13.cpp b/tests/Unit/CaptureByRef/test13.cpp index 0328c0a20f7..4f570985f13 100644 --- a/tests/Unit/CaptureByRef/test13.cpp +++ b/tests/Unit/CaptureByRef/test13.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -15,18 +15,18 @@ class POD { public: - int getFoo() restrict(cpu,amp) { return foo; } - int getBar() restrict(cpu,amp) { return bar; } - int getFooCrossBar() restrict(cpu,amp) { return foo * bar; } - void setFoo(int f) restrict(cpu) { foo = f; } - void setBar(int b) restrict(cpu) { bar = b; } + int getFoo() [[cpu, hc]] { return foo; } + int getBar() [[cpu, hc]] { return bar; } + int getFooCrossBar() [[cpu, hc]] { return foo * bar; } + void setFoo(int f) [[cpu]] { foo = f; } + void setBar(int b) [[cpu]] { bar = b; } private: int foo; int bar; }; bool test() { - using namespace Concurrency; + using namespace hc; int table[VECTOR_SIZE]; for (int i = 0; i < VECTOR_SIZE; ++i) { @@ -38,7 +38,7 @@ bool test() { p.setBar(rand() % 15 + 1); extent<1> ex(VECTOR_SIZE); - parallel_for_each(ex, [&](index<1> idx) restrict(amp) { + parallel_for_each(ex, [&](index<1> idx) [[hc]] { // capture array type, and POD type by reference // use member function to access POD type table[idx[0]] *= (p.getFoo() * p.getBar()); diff --git a/tests/Unit/CaptureByRef/test14.cpp b/tests/Unit/CaptureByRef/test14.cpp index 4cafdd6dc77..8d2cda5bb93 100644 --- a/tests/Unit/CaptureByRef/test14.cpp +++ b/tests/Unit/CaptureByRef/test14.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -15,14 +15,14 @@ class Cell { public: - int get() restrict(cpu,amp) { return value; } - void set(int v) restrict(cpu,amp) { value = v; } + int get() [[cpu, hc]] { return 
value; } + void set(int v) [[cpu, hc]] { value = v; } private: int value; }; bool test() { - using namespace Concurrency; + using namespace hc; Cell matrixA[VECTOR_SIZE][VECTOR_SIZE]; Cell matrixB[VECTOR_SIZE][VECTOR_SIZE]; @@ -35,7 +35,7 @@ bool test() { } extent<2> ex(VECTOR_SIZE, VECTOR_SIZE); - parallel_for_each(ex, [&](index<2> idx) restrict(amp) { + parallel_for_each(ex, [&](index<2> idx) [[hc]] { // capture array type, and POD type by reference // use member function to access POD type int result = 0; diff --git a/tests/Unit/CaptureByRef/test15.cpp b/tests/Unit/CaptureByRef/test15.cpp index cebfd0509b4..b246f034e30 100644 --- a/tests/Unit/CaptureByRef/test15.cpp +++ b/tests/Unit/CaptureByRef/test15.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -30,7 +30,7 @@ class POD3 : public POD2 { }; bool test() { - using namespace Concurrency; + using namespace hc; int table[VECTOR_SIZE]; for (int i = 0; i < VECTOR_SIZE; ++i) { @@ -44,7 +44,7 @@ bool test() { p.qux = rand() % 15 + 1; extent<1> ex(VECTOR_SIZE); - parallel_for_each(ex, [&](index<1> idx) restrict(amp) { + parallel_for_each(ex, [&](index<1> idx) [[hc]] { // capture array type, and an inherited type by reference table[idx[0]] = (p.foo * p.bar * p.baz * p.qux); }); diff --git a/tests/Unit/CaptureByRef/test2.cpp b/tests/Unit/CaptureByRef/test2.cpp index ebd5e8efa7d..ad22024d8b1 100644 --- a/tests/Unit/CaptureByRef/test2.cpp +++ b/tests/Unit/CaptureByRef/test2.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -14,7 +14,7 @@ #define VECTOR_SIZE (1024) bool test() { - using namespace Concurrency; + using namespace hc; int table[VECTOR_SIZE]; for (int i = 0; i < VECTOR_SIZE; ++i) { @@ -26,7 +26,7 @@ bool test() { extent<1> ex(VECTOR_SIZE); array_view av(ex, table); - parallel_for_each(av.get_extent(), [&, av](index<1> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [&, av](index<1> idx) [[hc]] { // capture multiple scalar types by reference av[idx] *= (val + val2); }); diff --git a/tests/Unit/CaptureByRef/test3.cpp b/tests/Unit/CaptureByRef/test3.cpp index 0cdf1b553ad..db7d0b8495a 100644 --- a/tests/Unit/CaptureByRef/test3.cpp +++ b/tests/Unit/CaptureByRef/test3.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -14,7 +14,7 @@ #define VECTOR_SIZE (1024) bool test() { - using namespace Concurrency; + using namespace hc; int table[VECTOR_SIZE]; for (int i = 0; i < VECTOR_SIZE; ++i) { @@ -24,7 +24,7 @@ bool test() { int val = rand() % 15 + 1; extent<1> ex(VECTOR_SIZE); - parallel_for_each(ex, [&](index<1> idx) restrict(amp) { + parallel_for_each(ex, [&](index<1> idx) [[hc]] { // capture array type, and scalar type by reference table[idx[0]] *= (val * val); }); diff --git a/tests/Unit/CaptureByRef/test4.cpp b/tests/Unit/CaptureByRef/test4.cpp index 396af92b655..2386def9df9 100644 --- a/tests/Unit/CaptureByRef/test4.cpp +++ b/tests/Unit/CaptureByRef/test4.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -14,7 +14,7 @@ #define VECTOR_SIZE (1024) bool test() { - using namespace Concurrency; + using namespace hc; int table[VECTOR_SIZE]; for (int i = 0; i < VECTOR_SIZE; ++i) { @@ -25,7 +25,7 @@ bool test() { int val2 = rand() % 15 + 1; extent<1> ex(VECTOR_SIZE); - parallel_for_each(ex, [&](index<1> idx) restrict(amp) { + parallel_for_each(ex, [&](index<1> idx) [[hc]] { // capture multiple scalar types by reference table[idx[0]] *= (val + val2); }); diff 
--git a/tests/Unit/CaptureByRef/test5.cpp b/tests/Unit/CaptureByRef/test5.cpp index f4b87facf5b..0a570cff190 100644 --- a/tests/Unit/CaptureByRef/test5.cpp +++ b/tests/Unit/CaptureByRef/test5.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -14,7 +14,7 @@ #define VECTOR_SIZE (1024) bool test() { - using namespace Concurrency; + using namespace hc; int table[VECTOR_SIZE]; int table2[VECTOR_SIZE]; @@ -27,7 +27,7 @@ bool test() { int val2 = rand() % 15 + 1; extent<1> ex(VECTOR_SIZE); - parallel_for_each(ex, [&](index<1> idx) restrict(amp) { + parallel_for_each(ex, [&](index<1> idx) [[hc]] { // capture multiple scalar types by reference table[idx[0]] += table2[idx[0]]; }); diff --git a/tests/Unit/CaptureByRef/test6.cpp b/tests/Unit/CaptureByRef/test6.cpp index 6bc2592f0ec..1395cec6486 100644 --- a/tests/Unit/CaptureByRef/test6.cpp +++ b/tests/Unit/CaptureByRef/test6.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -14,7 +14,7 @@ #define VECTOR_SIZE (1024) bool test() { - using namespace Concurrency; + using namespace hc; int p1 = rand() % 15 + 1; int p2 = rand() % 15 + 1; @@ -28,7 +28,7 @@ bool test() { } extent<1> ex(VECTOR_SIZE); - parallel_for_each(ex, [&](index<1> idx) restrict(amp) { + parallel_for_each(ex, [&](index<1> idx) [[hc]] { // capture multiple array types and scalar types by reference table3[idx[0]] = (p1 * table1[idx[0]]) + (p2 * table2[idx[0]]); }); diff --git a/tests/Unit/CaptureByRef/test7.cpp b/tests/Unit/CaptureByRef/test7.cpp index 81e28c3c4d0..3fba370dbb2 100644 --- a/tests/Unit/CaptureByRef/test7.cpp +++ b/tests/Unit/CaptureByRef/test7.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -14,7 +14,7 @@ #define VECTOR_SIZE (256) bool test() { - using namespace Concurrency; + using namespace hc; int p = rand() % 15 + 1; @@ -27,7 +27,7 @@ bool test() { } extent<2> ex(VECTOR_SIZE, VECTOR_SIZE); - parallel_for_each(ex, [&](index<2> idx) restrict(amp) { + parallel_for_each(ex, [&](index<2> idx) [[hc]] { // capture multiple 2D array types and scalar type by reference table2[idx[0]][idx[1]] = table[idx[0]][idx[1]] * p; }); diff --git a/tests/Unit/CaptureByRef/test8.cpp b/tests/Unit/CaptureByRef/test8.cpp index f247abe1e82..95411117af3 100644 --- a/tests/Unit/CaptureByRef/test8.cpp +++ b/tests/Unit/CaptureByRef/test8.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -14,7 +14,7 @@ #define VECTOR_SIZE (64) bool test() { - using namespace Concurrency; + using namespace hc; int p = rand() % 15 + 1; @@ -29,7 +29,7 @@ bool test() { } extent<3> ex(VECTOR_SIZE, VECTOR_SIZE, VECTOR_SIZE); - parallel_for_each(ex, [&](index<3> idx) restrict(amp) { + parallel_for_each(ex, [&](index<3> idx) [[hc]] { // capture multiple 3D array types and scalar type by reference table2[idx[0]][idx[1]][idx[2]] = table[idx[0]][idx[1]][idx[2]] * p; }); diff --git a/tests/Unit/CaptureByRef/test9.cpp b/tests/Unit/CaptureByRef/test9.cpp index 7014ff4472a..e2045e9e36f 100644 --- a/tests/Unit/CaptureByRef/test9.cpp +++ b/tests/Unit/CaptureByRef/test9.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -14,7 +14,7 @@ #define VECTOR_SIZE (16) bool test() { - using namespace Concurrency; + using namespace hc; int p = rand() % 15 + 1; @@ -32,7 +32,7 @@ bool test() { int dim[4] { VECTOR_SIZE, VECTOR_SIZE, VECTOR_SIZE, VECTOR_SIZE }; extent<4> ex(dim); - parallel_for_each(ex, [&](index<4> idx) 
restrict(amp) { + parallel_for_each(ex, [&](index<4> idx) [[hc]] { // capture multiple 4D array types and scalar type by reference table2[idx[0]][idx[1]][idx[2]][idx[3]] = table[idx[0]][idx[1]][idx[2]][idx[3]] * p; }); diff --git a/tests/Unit/Codegen/barrier_should_not_unwind.cpp b/tests/Unit/Codegen/barrier_should_not_unwind.cpp index 65e942d4f04..62a695bdc71 100644 --- a/tests/Unit/Codegen/barrier_should_not_unwind.cpp +++ b/tests/Unit/Codegen/barrier_should_not_unwind.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -using namespace Concurrency; +#include +using namespace hc; void -FwdPass0(const array_view &twiddles, Concurrency::tiled_index<64, 1> tidx) restrict(amp) +FwdPass0(const array_view &twiddles, hc::tiled_index<2> tidx) [[hc]] { tidx.barrier.wait(); } @@ -12,9 +12,9 @@ int main() { int num[1]; const array_view& twiddles = array_view(1, num); - Concurrency::extent<2> grdExt( 64, 1 ); - Concurrency::tiled_extent< 64, 1> t_ext(grdExt); - Concurrency::parallel_for_each(t_ext, [=] (Concurrency::tiled_index<64, 1> tidx) restrict(amp) { + hc::extent<2> grdExt( 64, 1 ); + hc::tiled_extent<2> t_ext(grdExt.tile(64, 1)); + hc::parallel_for_each(t_ext, [=] (hc::tiled_index<2> tidx) [[hc]] { FwdPass0(twiddles,tidx); FwdPass0(twiddles,tidx); }); diff --git a/tests/Unit/Codegen/compile_error_for_arraytype.cpp b/tests/Unit/Codegen/compile_error_for_arraytype.cpp index 0c34a9d3047..27210b977ee 100644 --- a/tests/Unit/Codegen/compile_error_for_arraytype.cpp +++ b/tests/Unit/Codegen/compile_error_for_arraytype.cpp @@ -6,7 +6,7 @@ ////////////////////////////////////////////////////////////////////////////////// class baz { public: - void cho(void) restrict(amp) {}; + void cho(void) [[hc]] {}; int bar; int* n[10]; }; @@ -15,7 +15,7 @@ class baz { // CHECK-NEXT: ^ -int kerker(void) restrict(amp,cpu) { +int kerker(void) [[cpu, hc]] { baz bl; return 0; } diff --git a/tests/Unit/Codegen/index_operator_test.cpp b/tests/Unit/Codegen/index_operator_test.cpp index cb8371caca9..81f57c81e25 100644 --- a/tests/Unit/Codegen/index_operator_test.cpp +++ b/tests/Unit/Codegen/index_operator_test.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include int main(void) { - concurrency::index<1> a(1), b; + hc::index<1> a(1), b; a = b + 5566; return 0; } diff --git a/tests/Unit/Codegen/opt_level0.cpp b/tests/Unit/Codegen/opt_level0.cpp index a6c92ac44ae..3bfe1063632 100644 --- a/tests/Unit/Codegen/opt_level0.cpp +++ b/tests/Unit/Codegen/opt_level0.cpp @@ -3,8 +3,8 @@ #include #include #include -#include -using namespace Concurrency; +#include +using namespace hc; #define N 10 @@ -24,7 +24,7 @@ void vectorAdd_by_array(const std::vector& vecA, const std::vector extent<1> e(N); parallel_for_each(e, - [=](index<1> idx) restrict(amp) { cv[idx] = av[idx] + bv[idx]; }); + [=](index<1> idx) [[hc]] { cv[idx] = av[idx] + bv[idx]; }); } int main(void) diff --git a/tests/Unit/Codegen/opt_level1.cpp b/tests/Unit/Codegen/opt_level1.cpp index 47447a9544b..9ff7ebe1163 100644 --- a/tests/Unit/Codegen/opt_level1.cpp +++ b/tests/Unit/Codegen/opt_level1.cpp @@ -3,8 +3,8 @@ #include #include #include -#include -using namespace Concurrency; +#include +using namespace hc; #define N 10 @@ -24,7 +24,7 @@ void vectorAdd_by_array(const std::vector& vecA, const std::vector extent<1> e(N); parallel_for_each(e, - [=](index<1> idx) restrict(amp) { cv[idx] = av[idx] + bv[idx]; }); + [=](index<1> idx) [[hc]] { cv[idx] = av[idx] + bv[idx]; }); } int main(void) diff --git 
a/tests/Unit/Codegen/restric_overload.cpp b/tests/Unit/Codegen/restric_overload.cpp index d591d2218af..573d57f1fcd 100644 --- a/tests/Unit/Codegen/restric_overload.cpp +++ b/tests/Unit/Codegen/restric_overload.cpp @@ -5,14 +5,14 @@ #endif class baz { public: - void foo(void) restrict(amp) {bar = 1;} - void foo(void) restrict(cpu) {bar = 2;} + void foo(void) [[hc]] {bar = 1;} + void foo(void) [[cpu]] {bar = 2;} int bar; }; -int fake_use(void) restrict(cpu,amp) { +int fake_use(void) [[cpu, hc]] { baz baz_cpu; - baz_cpu.foo(); //call the one with restrict(cpu) + baz_cpu.foo(); //call the one with [[cpu]] return baz_cpu.bar; } #ifndef __KALMAR_ACCELERATOR__ diff --git a/tests/Unit/Codegen/separate.cpp b/tests/Unit/Codegen/separate.cpp index 9e2279e25e7..41956f59ee5 100644 --- a/tests/Unit/Codegen/separate.cpp +++ b/tests/Unit/Codegen/separate.cpp @@ -4,7 +4,7 @@ extern "C" { int foo(void) { return 42; } -int bar(void) restrict(amp) { +int bar(void) [[hc]] { return 43; } } @@ -15,11 +15,11 @@ class baz { return 44; } __attribute__((noinline)) - int cho(void) restrict(amp) { + int cho(void) [[hc]] { return 45; } }; -int kerker(void) restrict(amp,cpu) { +int kerker(void) [[cpu, hc]] { baz b1; return b1.cho()+b1.bzzt(); } diff --git a/tests/Unit/Codegen/separate2.cpp b/tests/Unit/Codegen/separate2.cpp index d53a2558b29..fda04ceebe5 100644 --- a/tests/Unit/Codegen/separate2.cpp +++ b/tests/Unit/Codegen/separate2.cpp @@ -1,25 +1,25 @@ // RUN: %cxxamp -emit-llvm -S -c %s -o -|%FileCheck %s extern "C" { #if 0 -int foo(void) restrict(cpu, amp) { +int foo(void) [[cpu, hc]] { return 42; } #endif -int bar(void) restrict(amp) { +int bar(void) [[hc]] { return 43; } } class baz { public: - int bzzt(void) restrict(cpu) { + int bzzt(void) [[cpu]] { return 44; } - int cho(void) restrict(amp) { + int cho(void) [[hc]] { return 45; } }; -int kerker(void) restrict(amp,cpu) { +int kerker(void) [[cpu, hc]] { baz b1; return b1.cho()+b1.bzzt(); } diff --git a/tests/Unit/Codegen/tworef.cpp b/tests/Unit/Codegen/tworef.cpp index 281c954a2ea..03768436ab5 100644 --- a/tests/Unit/Codegen/tworef.cpp +++ b/tests/Unit/Codegen/tworef.cpp @@ -1,7 +1,7 @@ // RUN: %amp_device -D__KALMAR_ACCELERATOR__ -c -S -emit-llvm %s -#include +#include -using namespace concurrency; +using namespace hc; int main() { @@ -9,6 +9,6 @@ int main() array temp(length); array data(length); extent<1> cdomain_transpose(16); - parallel_for_each (cdomain_transpose, [=, &data, &temp] (index<1> tidx) restrict(amp) {}); + parallel_for_each (cdomain_transpose, [=, &data, &temp] (index<1> tidx) [[hc]] {}); return 0; } diff --git a/tests/Unit/Codegen/vector_addition_using_array.cpp b/tests/Unit/Codegen/vector_addition_using_array.cpp index 40b04bbda04..c3e7d48fdad 100644 --- a/tests/Unit/Codegen/vector_addition_using_array.cpp +++ b/tests/Unit/Codegen/vector_addition_using_array.cpp @@ -3,8 +3,8 @@ #include #include #include -#include -using namespace Concurrency; +#include +using namespace hc; #define N 10 @@ -23,11 +23,12 @@ void vectorAdd_by_array(const std::vector& vecA, const std::vector array_view cv(C); extent<1> e(N); - parallel_for_each(e, - [=](index<1> idx) restrict(amp) { cv[idx] = av[idx] + bv[idx]; }); + parallel_for_each(e, [=](index<1> idx) [[hc]] { + cv[idx] = av[idx] + bv[idx]; + }); } -int main(void) +int main() { std::vector vecA(N); std::vector vecB(N); diff --git a/tests/Unit/Copy/copy.cpp b/tests/Unit/Copy/copy.cpp index a9cca0bdddc..1bde5632afe 100644 --- a/tests/Unit/Copy/copy.cpp +++ b/tests/Unit/Copy/copy.cpp @@ -1,12 +1,13 @@ // RUN: 
%cxxamp %s -o %t.out && %t.out -#include -#include +#include + +#include +#include #include -#include #include -#include + -using namespace concurrency; +using namespace hc; #define T int #define INIT 50 @@ -34,7 +35,8 @@ int main(void) { // Run in a separate thread std::thread t([&]() { - parallel_for_each(gpu_av, dest.get_extent(), [=, &dest, &tgt](index<1> idx) restrict(amp) { + parallel_for_each( + gpu_av, dest.get_extent(), [=, &dest, &tgt](index<1> idx) [[hc]] { for(unsigned i = 0; i < vecSize; i++) for (unsigned j = 0; j < vecSize; j++) tgt[idx] = dest[i]; @@ -42,10 +44,11 @@ }); t.join(); - // At this point, the copying needs to wait for availability of dest in thread t - // otherwise, undefined behavior happens in PFE since dest[i] is not deterministic + // At this point, the copying needs to wait for availability of dest in thread + // t; otherwise, undefined behavior happens in the PFE since dest[i] is not + // deterministic. copy(src, dest); - + // Verify tgt on CPU array_view<T> av(tgt); bool ret = true; @@ -56,4 +59,4 @@ } } return !(ret == true); -} +} \ No newline at end of file diff --git a/tests/Unit/DataContainers/array_view.cpp b/tests/Unit/DataContainers/array_view.cpp index 8ec6a35fd8b..0b254be034d 100644 --- a/tests/Unit/DataContainers/array_view.cpp +++ b/tests/Unit/DataContainers/array_view.cpp @@ -3,12 +3,16 @@ // What's in the comment above indicates it will build this file using // -std=c++amp and all other necessary flags to build. Then the system will // run the built program and check its results with all google test cases. -#include -#include +#include + #include +#include + #define N0 5000 +using namespace hc; + int init1D(std::vector<int>& vec) { int n = N0; for (int i = 0; i < n; ++i) { @@ -22,14 +26,14 @@ TEST(ClassArrayView, Constructor) { int old_vec0 = vec[0]; // Testing line 2251 of C++AMP Language and Programming Model version 1.0 { - Concurrency::array_view<int> av(sizeVec, vec); + array_view<int> av(sizeVec, vec); EXPECT_EQ(vec[0], av[0]); av[0]+=1234; } // Synchronize back at destruction time EXPECT_EQ(old_vec0+1234, vec[0]); { - Concurrency::array_view<int> av(sizeVec, vec); + array_view<int> av(sizeVec, vec); EXPECT_EQ(vec[0], av[0]); old_vec0 = vec[0]++; av.refresh(); @@ -38,10 +42,10 @@ // Testing line 2554 of C++AMP LPM v 1.0 { int foo[]={123, 456, 789}; - Concurrency::array_view<int> av(3, foo); + array_view<int> av(3, foo); EXPECT_EQ(foo[2], av[2]); { - Concurrency::array_view<int> bv(av); + array_view<int> bv(av); EXPECT_EQ(av[1], bv[1]); } // Line 2178 of C++AMP LPM v 1.0 diff --git a/tests/Unit/DataContainers/array_view_2d.1.cpp b/tests/Unit/DataContainers/array_view_2d.1.cpp index 3626b7ea27b..22ae034d6c1 100644 --- a/tests/Unit/DataContainers/array_view_2d.1.cpp +++ b/tests/Unit/DataContainers/array_view_2d.1.cpp @@ -1,15 +1,15 @@ // RUN: %cxxamp %s -o %t.out && %t.out #include -#include <amp.h> -using namespace concurrency; +#include <hc.hpp> +using namespace hc; int main() { int v[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; extent<2> e(5, 2); { array_view<int, 2> av(e, v); - parallel_for_each(av.get_extent(), [=](index<2> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [=](index<2> idx) [[hc]] { av[idx] -= 1; }); assert(av.get_extent() == e); diff --git a/tests/Unit/DataContainers/array_view_2d.2.cpp b/tests/Unit/DataContainers/array_view_2d.2.cpp index d7e78dfe8e5..21984fc9154 100644 --- a/tests/Unit/DataContainers/array_view_2d.2.cpp +++ b/tests/Unit/DataContainers/array_view_2d.2.cpp @@ -1,15 +1,15 @@ // RUN: %cxxamp %s -o
%t.out && %t.out #include -#include <amp.h> -using namespace concurrency; +#include <hc.hpp> +using namespace hc; int main() { int v[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; extent<2> e(5, 2); { array_view<int, 2> av(e, v); - parallel_for_each(av.get_extent(), [=](index<2> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [=](index<2> idx) [[hc]] { av(idx) -= 1; }); assert(av.get_extent() == e); diff --git a/tests/Unit/DataContainers/array_view_2d.3.cpp b/tests/Unit/DataContainers/array_view_2d.3.cpp index 166fd4c79ca..aaeadd808b8 100644 --- a/tests/Unit/DataContainers/array_view_2d.3.cpp +++ b/tests/Unit/DataContainers/array_view_2d.3.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out #include -#include <amp.h> +#include <hc.hpp> #include -using namespace concurrency; +using namespace hc; int main() { std::vector<int> vv(10); @@ -13,7 +13,7 @@ int main() extent<2> e(5, 2); { array_view<int, 2> av(5, 2, vv); - parallel_for_each(av.get_extent(), [=](index<2> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [=](index<2> idx) [[hc]] { av(idx) -= 1; }); assert(av.get_extent() == e); diff --git a/tests/Unit/DataContainers/extent.cpp b/tests/Unit/DataContainers/extent.cpp index a8fed66bf05..701dc10acc5 100644 --- a/tests/Unit/DataContainers/extent.cpp +++ b/tests/Unit/DataContainers/extent.cpp @@ -3,7 +3,7 @@ // What's in the comment above indicates it will build this file using // -std=c++amp and all other necessary flags to build. Then the system will // run the built program and check its results with all google test cases. -#include <amp.h> +#include <hc.hpp> #include #define N0 10 @@ -12,7 +12,7 @@ TEST(ClassExtent, Extent1D) { int n0 = N0; - Concurrency::extent<1> ext(n0); + hc::extent<1> ext(n0); EXPECT_EQ(n0, ext[0]); } @@ -20,7 +20,7 @@ TEST(ClassExtent, Extent2D) { int n0 = N0; int n1 = N1; - Concurrency::extent<2> ext(n0, n1); + hc::extent<2> ext(n0, n1); EXPECT_EQ(n0, ext[0]); EXPECT_EQ(n1, ext[1]); @@ -29,9 +29,9 @@ TEST(ClassExtent, Extent2DSub) { int n0 = N0; int n1 = N1; - Concurrency::extent<2> ext(n0, n1); - Concurrency::extent<2> sub(1, 1); - Concurrency::extent<2> ext2 = ext - sub; + hc::extent<2> ext(n0, n1); + hc::extent<2> sub(1, 1); + hc::extent<2> ext2 = ext - sub; EXPECT_EQ(n0-1, ext2[0]); EXPECT_EQ(n1-1, ext2[1]); @@ -41,7 +41,7 @@ TEST(ClassExtent, Extent3D) { int n0 = N0; int n1 = N1; int n2 = N2; - Concurrency::extent<3> ext(n0, n1, n2); + hc::extent<3> ext(n0, n1, n2); EXPECT_EQ(n0, ext[0]); EXPECT_EQ(n1, ext[1]); @@ -49,10 +49,10 @@ } TEST(ClassExtent, ExtentContains) { - Concurrency::index<2> i(1234, 5678); - Concurrency::index<2> j(5000, 1234); - Concurrency::index<2> k(4999, 6001); - Concurrency::extent<2> foo(5000, 6000); + hc::index<2> i(1234, 5678); + hc::index<2> j(5000, 1234); + hc::index<2> k(4999, 6001); + hc::extent<2> foo(5000, 6000); EXPECT_EQ(true, foo.contains(i)); EXPECT_EQ(false, foo.contains(j)); EXPECT_EQ(false, foo.contains(k)); diff --git a/tests/Unit/Design/2d.cpp b/tests/Unit/Design/2d.cpp index 9cdc0bf7915..b0567b6b56b 100644 --- a/tests/Unit/Design/2d.cpp +++ b/tests/Unit/Design/2d.cpp @@ -1,77 +1,75 @@ -//_view RUN: %gtest_amp %s -o %t.out && %t.out + //_view RUN: %gtest_amp %s -o %t.out && %t.out + +#include -#include -#include -#include -#ifndef __KALMAR_ACCELERATOR__ #include -#endif + +#include +#include +#include + +using namespace hc; class myVecAdd { - public: - // CPU-side constructor.
Written by the user - myVecAdd(Concurrency::array_view& a, - Concurrency::array_view &b, - Concurrency::array_view &c): - a_(a), b_(b), c_(c) { - } - void operator() (Concurrency::index<2> idx) restrict(amp) { - c_[idx] = a_[idx]+b_[idx]; - } - void operator() (Concurrency::tiled_index<4, 4> idx) restrict(amp) { - c_[idx] = a_[idx]+b_[idx]; - } - private: - Concurrency::array_view &c_; - Concurrency::array_view a_, b_; + array_view a_, b_, c_; +public: + // CPU-side constructor. Written by the user + myVecAdd( + array_view& a, array_view& b, array_view& c) + : a_(a), b_(b), c_(c) + {} + + void operator()(index<2> idx) const [[hc]] { c_[idx] = a_[idx]+b_[idx]; } + void operator()(tiled_index<2> idx) const [[hc]] + { + c_[idx] = a_[idx] + b_[idx]; + } }; -void bar(void) restrict(amp,cpu) { - int* foo = reinterpret_cast(&myVecAdd::__cxxamp_trampoline); -} -#ifndef __KALMAR_ACCELERATOR__ + #define M 20 #define N 40 -TEST(Design, Final) { - std::vector vector_a(M*N), - vector_b(M*N); - for (int i = 0; i < M*N; i++) { - vector_a[i] = 100.0f * rand() / RAND_MAX; - vector_b[i] = 100.0f * rand() / RAND_MAX; - } - Concurrency::extent<2> e(M, N); - concurrency::array_view av(e, vector_a); - EXPECT_EQ(vector_a[2], av(0,2)); - concurrency::array_view bv(e, vector_b); - { // Test untiled version - concurrency::array_view c(e); - myVecAdd mf(av, bv, c); - Concurrency::parallel_for_each(e, mf); - int error=0; - for(int i = 0; i < M; i++) { - for(int j = 0; j < N; j++) { - std::cout << "av[" < vector_a(M * N), vector_b(M * N); + + for (int i = 0; i < M * N; i++) { + vector_a[i] = 100.0f * rand() / RAND_MAX; + vector_b[i] = 100.0f * rand() / RAND_MAX; + } + extent<2> e(M, N); + array_view av(e, vector_a); + EXPECT_EQ(vector_a[2], av(0, 2)); + array_view bv(e, vector_b); + { // Test untiled version + array_view c(e); + myVecAdd mf(av, bv, c); + parallel_for_each(e, mf); + int error=0; + for(int i = 0; i < M; i++) { + for(int j = 0; j < N; j++) { + std::cout << "av[" < c(e); - myVecAdd mf(av, bv, c); - Concurrency::parallel_for_each(e.tile<4, 4>(), mf); - int error=0; - for(int i = 0; i < M; i++) { - for(int j = 0; j < N; j++) { - std::cout << "av[" < c(e); + myVecAdd mf(av, bv, c); + parallel_for_each(e.tile(4, 4), mf); + int error=0; + for(int i = 0; i < M; i++) { + for(int j = 0; j < N; j++) { + std::cout << "av[" < -using namespace Concurrency; +#include +using namespace hc; template bool test_array_rank(int extval = _rank) @@ -12,7 +12,7 @@ bool test_array_rank(int extval = _rank) extent<_rank> e(data); array<_type, _rank> a1(e); - parallel_for_each(e, [&](index<_rank> idx) restrict(amp) { + parallel_for_each(e, [&](index<_rank> idx) [[hc]] { a1[idx] = 1; }); diff --git a/tests/Unit/Design/addr_space.cpp b/tests/Unit/Design/addr_space.cpp index e36a12ecccb..e7468591950 100644 --- a/tests/Unit/Design/addr_space.cpp +++ b/tests/Unit/Design/addr_space.cpp @@ -1,13 +1,13 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include -#include +#include #include -using namespace concurrency; +using namespace hc; -float x(float *p) restrict(amp) { +float x(float *p) [[hc]] { return fast_math::sin(*p); } @@ -29,7 +29,7 @@ int main(void) { parallel_for_each( e, - [=](index<1> idx) restrict(amp) { + [=](index<1> idx) [[hc]] { gc[idx] = x(&ga[idx]); }); diff --git a/tests/Unit/Design/array_view_extent.cpp b/tests/Unit/Design/array_view_extent.cpp index da2db5d75a4..7c15cb4f31e 100644 --- a/tests/Unit/Design/array_view_extent.cpp +++ b/tests/Unit/Design/array_view_extent.cpp @@ -1,13 +1,13 @@ // 
RUN: %cxxamp %s -o %t.out && %t.out #include -#include <amp.h> -using namespace concurrency; +#include <hc.hpp> +using namespace hc; int main() { int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'}; array_view<int> av(11, v); - parallel_for_each(av.get_extent(), [=](index<1> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [=](index<1> idx) [[hc]] { av[idx] += 1; }); diff --git a/tests/Unit/Design/array_view_extent_2d.cpp b/tests/Unit/Design/array_view_extent_2d.cpp index e03d6cfa061..ee2c6d71da4 100644 --- a/tests/Unit/Design/array_view_extent_2d.cpp +++ b/tests/Unit/Design/array_view_extent_2d.cpp @@ -1,14 +1,14 @@ // RUN: %cxxamp %s -o %t.out && %t.out #include -#include <amp.h> -using namespace concurrency; +#include <hc.hpp> +using namespace hc; int main() { int v[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; extent<2> e(5, 2); array_view<int, 2> av(e, v); - parallel_for_each(av.get_extent(), [=](index<2> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [=](index<2> idx) [[hc]] { av[idx] -= 1; }); assert(av.get_extent() == e); diff --git a/tests/Unit/Design/array_view_extent_2d_tile.cpp b/tests/Unit/Design/array_view_extent_2d_tile.cpp index 92630e690f6..bad950e6a9c 100644 --- a/tests/Unit/Design/array_view_extent_2d_tile.cpp +++ b/tests/Unit/Design/array_view_extent_2d_tile.cpp @@ -1,18 +1,17 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include -using namespace concurrency; +#include +#include +using namespace hc; -int main() +int main() { int v[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; extent<2> e(5, 2); - array_view<int, 2> av(e, v); + array_view<int, 2> av(e, v); assert(av.get_extent() == e); // Testing tiled_index - parallel_for_each(av.get_extent().tile<1,2>(), - [=](tiled_index<1,2> idx) restrict(amp) { - av[idx] -= 1; + parallel_for_each(av.get_extent().tile(1, 2), [=](tiled_index<2> idx) [[hc]] { + av[idx] -= 1; }); assert(av.get_extent() == e); for(unsigned int i = 0; i < av.get_extent()[0]; i++) diff --git a/tests/Unit/Design/double_lamda_in_one_fuction.cpp b/tests/Unit/Design/double_lamda_in_one_fuction.cpp index ef6d0395460..76f13089c60 100644 --- a/tests/Unit/Design/double_lamda_in_one_fuction.cpp +++ b/tests/Unit/Design/double_lamda_in_one_fuction.cpp @@ -1,16 +1,16 @@ // RUN: %cxxamp %s -o %t.out && %t.out #include -#include <amp.h> -using namespace concurrency; +#include <hc.hpp> +using namespace hc; int main() { int v[11] = {0,1,2,3,4,5,6,7,8,9,10}; int expexted_v[11] = {11,12,13,14,15,16,17,18,19,20,21}; array_view<int> av(11, v); - parallel_for_each(av.get_extent(), [=](index<1> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [=](index<1> idx) [[hc]] { av[idx] +=1 ; }); - parallel_for_each(av.get_extent(), [=](index<1> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [=](index<1> idx) [[hc]] { av[idx] += 10; }); diff --git a/tests/Unit/Design/lambda.cpp b/tests/Unit/Design/lambda.cpp index 0a7a7edee9b..c54472b4ec7 100644 --- a/tests/Unit/Design/lambda.cpp +++ b/tests/Unit/Design/lambda.cpp @@ -1,28 +1,28 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include <amp.h> +#include <hc.hpp> #include #include int main(void){ const int vecSize = 100; // Alloc & init input data - Concurrency::extent<1> e(vecSize); - Concurrency::array a(vecSize); - Concurrency::array b(vecSize); - Concurrency::array c(vecSize); + hc::extent<1> e(vecSize); + hc::array a(vecSize); + hc::array b(vecSize); + hc::array c(vecSize); int sum = 0; - Concurrency::array_view ga(a); - Concurrency::array_view gb(b); - Concurrency::array_view gc(c); - for (Concurrency::index<1> i(0); i[0] < vecSize; i++) { + hc::array_view ga(a); + hc::array_view gb(b); +
hc::array_view gc(c); + for (hc::index<1> i(0); i[0] < vecSize; i++) { ga[i] = 100.0f * rand() / RAND_MAX; gb[i] = 100.0f * rand() / RAND_MAX; - sum += a[i] + b[i]; + sum += ga[i] + gb[i]; } - Concurrency::parallel_for_each( + hc::parallel_for_each( e, - [=](Concurrency::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { gc[idx] = ga[idx]+gb[idx]; }); diff --git a/tests/Unit/Design/lambda_tiled.cpp b/tests/Unit/Design/lambda_tiled.cpp index 800d5155f6b..591f53f827d 100644 --- a/tests/Unit/Design/lambda_tiled.cpp +++ b/tests/Unit/Design/lambda_tiled.cpp @@ -1,32 +1,30 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include int main(void){ const int vecSize = 128; // Alloc & init input data - Concurrency::extent<1> e(vecSize); - Concurrency::tiled_extent<16> et(e); - Concurrency::tiled_extent<16> et2 = e.tile<16>(); - assert(et.tile_dim0 == 16); - assert(et2.tile_dim0 == 16); - Concurrency::array a(vecSize); - Concurrency::array b(vecSize); - Concurrency::array c(vecSize); + hc::extent<1> e(vecSize); + hc::tiled_extent<1> et(e.tile(16)); + hc::tiled_extent<1> et2 = e.tile(16); + assert(et.tile_dim[0] == 16); + assert(et2.tile_dim[0] == 16); + hc::array a(vecSize); + hc::array b(vecSize); + hc::array c(vecSize); int sum = 0; - Concurrency::array_view ga(a); - Concurrency::array_view gb(b); - Concurrency::array_view gc(c); - for (Concurrency::index<1> i(0); i[0] < vecSize; i++) { + hc::array_view ga(a); + hc::array_view gb(b); + hc::array_view gc(c); + for (hc::index<1> i(0); i[0] < vecSize; i++) { ga[i] = 100.0f * rand() / RAND_MAX; gb[i] = 100.0f * rand() / RAND_MAX; - sum += a[i] + b[i]; + sum += ga[i] + gb[i]; } - Concurrency::parallel_for_each( - et, - [=](Concurrency::tiled_index<16> idx) restrict(amp) { + hc::parallel_for_each(et, [=](hc::tiled_index<1> idx) [[hc]] { gc[idx] = ga[idx]+gb[idx]; }); diff --git a/tests/Unit/Design/lambda_tiled_local.cpp b/tests/Unit/Design/lambda_tiled_local.cpp index 53f75820b59..34ca1d9a0e7 100644 --- a/tests/Unit/Design/lambda_tiled_local.cpp +++ b/tests/Unit/Design/lambda_tiled_local.cpp @@ -1,32 +1,30 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include int main(void){ const int vecSize = 1280; #define TILE 128 // Alloc & init input data - Concurrency::extent<1> e(vecSize); - Concurrency::tiled_extent et(e); - Concurrency::tiled_extent et2 = e.tile(); - assert(et.tile_dim0 == TILE); - assert(et2.tile_dim0 == TILE); - Concurrency::array a(vecSize); - Concurrency::array b(vecSize); - Concurrency::array c(vecSize); + hc::extent<1> e(vecSize); + hc::tiled_extent<1> et(e.tile(TILE)); + hc::tiled_extent<1> et2 = e.tile(TILE); + assert(et.tile_dim[0] == TILE); + assert(et2.tile_dim[0] == TILE); + hc::array a(vecSize); + hc::array b(vecSize); + hc::array c(vecSize); int sum = 0; - Concurrency::array_view ga(a); - Concurrency::array_view gb(b); - Concurrency::array_view gc(c); - for (Concurrency::index<1> i(0); i[0] < vecSize; i++) { + hc::array_view ga(a); + hc::array_view gb(b); + hc::array_view gc(c); + for (hc::index<1> i(0); i[0] < vecSize; i++) { ga[i] = 100.0f * rand() / RAND_MAX; gb[i] = 100.0f * rand() / RAND_MAX; - sum += a[i] + b[i]; + sum += ga[i] + gb[i]; } - Concurrency::parallel_for_each( - et, - [=](Concurrency::tiled_index idx) restrict(amp) { + hc::parallel_for_each(et, [=](hc::tiled_index<1> idx) [[hc]] { tile_static int shm[TILE]; shm[idx.local[0]] = ga[idx]; idx.barrier.wait(); diff --git a/tests/Unit/Design/overload.cpp b/tests/Unit/Design/overload.cpp index 
974774392f3..77c0496eb19 100644 --- a/tests/Unit/Design/overload.cpp +++ b/tests/Unit/Design/overload.cpp @@ -1,10 +1,11 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -using namespace Concurrency; +#include -int f() restrict(amp) { return 55; } -int f() restrict(cpu) { return 66; } -int g() restrict(amp,cpu) { return f(); } +using namespace hc; + +int f() [[hc]] { return 55; } +int f() [[cpu]] { return 66; } +int g() [[cpu, hc]] { return f(); } bool TestOnHost() { @@ -16,9 +17,7 @@ bool TestOnDevice() array a((extent<1>(1))); array_view A(a); extent<1> ex(1); - parallel_for_each(ex, [&](index<1> idx) restrict(amp,cpu) { - A(idx) = g(); - }); + parallel_for_each(ex, [=](index<1> idx) [[hc]] { A(idx) = g(); }); return A[0] == 55; } diff --git a/tests/Unit/Design/pass_by_ref.cpp b/tests/Unit/Design/pass_by_ref.cpp index 00c431c872a..66f1a0fda80 100644 --- a/tests/Unit/Design/pass_by_ref.cpp +++ b/tests/Unit/Design/pass_by_ref.cpp @@ -1,58 +1,49 @@ -// RUN: %gtest_amp %s -o %t.out +// RUN: %gtest_amp %s -o %t.out // RUN: %t.out -#include -#include -#include -#ifndef __KALMAR_ACCELERATOR__ +#include + #include -#endif + +#include +#include class myVecAdd { public: // CPU-side constructor. Written by the user - myVecAdd(Concurrency::array_view& a, - Concurrency::array_view &b, - Concurrency::array_view &c): - a_(a), b_(b), c_(c) { - } - void operator() (Concurrency::index<1> idx) restrict(amp) { - c_[idx] = a_[idx]+b_[idx]; - } + myVecAdd( + hc::array_view& a, hc::array_view &b, hc::array_view &c) + : a_(a), b_(b), c_(c) + {} + void operator()(hc::index<1> idx) const [[hc]] { c_[idx] = a_[idx]+b_[idx]; } private: - Concurrency::array_view a_, b_; - Concurrency::array_view& c_; + hc::array_view a_, b_, c_; }; -void bar(void) restrict(amp,cpu) { - int* foo = reinterpret_cast(&myVecAdd::__cxxamp_trampoline); -} -#ifndef __KALMAR_ACCELERATOR__ + TEST(Design, Final) { const int vecSize = 100; // Alloc & init input data - Concurrency::extent<1> e(vecSize); - Concurrency::array_view a(vecSize); - Concurrency::array_view b(vecSize); - Concurrency::array_view c(vecSize); + hc::extent<1> e(vecSize); + hc::array_view a(vecSize); + hc::array_view b(vecSize); + hc::array_view c(vecSize); int sum = 0; - Concurrency::array_view ga(a); - Concurrency::array_view gb(b); + hc::array_view ga(a); + hc::array_view gb(b); myVecAdd mf(ga, gb, c); - for (Concurrency::index<1> i(0); i[0] < vecSize; i++) { + for (hc::index<1> i(0); i[0] < vecSize; i++) { ga[i] = 100.0f * rand() / RAND_MAX; gb[i] = 100.0f * rand() / RAND_MAX; - sum += a[i] + b[i]; + sum += ga[i] + gb[i]; } - Concurrency::parallel_for_each( - e, - mf); + hc::parallel_for_each(e, mf); int error = 0; for(unsigned i = 0; i < vecSize; i++) { - error += c[Concurrency::index<1>(i)] - (ga[i] + gb[i]); + error += c[hc::index<1>(i)] - (ga[i] + gb[i]); } EXPECT_EQ(error, 0); } -#endif + diff --git a/tests/Unit/Design/quick_prototype_vector_add_using_gmac.cpp b/tests/Unit/Design/quick_prototype_vector_add_using_gmac.cpp index c638d862e2e..58dabb5adc2 100644 --- a/tests/Unit/Design/quick_prototype_vector_add_using_gmac.cpp +++ b/tests/Unit/Design/quick_prototype_vector_add_using_gmac.cpp @@ -1,58 +1,52 @@ // RUN: %gtest_amp %s -o %t.out && %t.out -#include -#include -#include -#ifndef __KALMAR_ACCELERATOR__ +#include + #include -#endif + +#include +#include class myVecAdd { public: // CPU-side constructor. 
Written by the user - myVecAdd(Concurrency::array_view& a, - Concurrency::array_view &b, - Concurrency::array_view &c): + myVecAdd(hc::array_view& a, + hc::array_view &b, + hc::array_view &c): a_(a), b_(b), c_(c) { } - void operator() (Concurrency::index<1> idx) restrict(amp) { + void operator() (hc::index<1> idx) const [[hc]] { c_[idx] = a_[idx]+b_[idx]; } private: - Concurrency::array_view a_, b_, c_; + hc::array_view a_, b_, c_; }; -void bar(void) restrict(amp,cpu) { - int* foo = reinterpret_cast(&myVecAdd::__cxxamp_trampoline); -} -#ifndef __KALMAR_ACCELERATOR__ + TEST(Design, Final) { const int vecSize = 100; // Alloc & init input data - Concurrency::extent<1> e(vecSize); - Concurrency::array a(vecSize); - Concurrency::array b(vecSize); - Concurrency::array c(vecSize); + hc::extent<1> e(vecSize); + hc::array a(vecSize); + hc::array b(vecSize); + hc::array c(vecSize); int sum = 0; - Concurrency::array_view ga(a); - Concurrency::array_view gb(b); - Concurrency::array_view gc(c); - for (Concurrency::index<1> i(0); i[0] < vecSize; i++) { + hc::array_view ga(a); + hc::array_view gb(b); + hc::array_view gc(c); + for (hc::index<1> i(0); i[0] < vecSize; i++) { ga[i] = 100.0f * rand() / RAND_MAX; gb[i] = 100.0f * rand() / RAND_MAX; - sum += a[i] + b[i]; + sum += ga[i] + gb[i]; } myVecAdd mf(ga, gb, gc); - Concurrency::parallel_for_each( - e, - mf); + hc::parallel_for_each(e, mf); int error = 0; for(unsigned i = 0; i < vecSize; i++) { error += gc[i] - (ga[i] + gb[i]); } EXPECT_EQ(error, 0); -} -#endif +} \ No newline at end of file diff --git a/tests/Unit/Design/transpose.cpp b/tests/Unit/Design/transpose.cpp index 50463c2c19a..7d8716730cd 100644 --- a/tests/Unit/Design/transpose.cpp +++ b/tests/Unit/Design/transpose.cpp @@ -5,21 +5,21 @@ // Implement C++ AMP version of matrix transpose //---------------------------------------------------------------------------- -#include +#include #include #include #include #include -using namespace concurrency; +using namespace hc; //----------------------------------------------------------------------------- // Common utility functions and definitions //----------------------------------------------------------------------------- template -_2d_index_type transpose(const _2d_index_type& idx) restrict(cpu, amp) { +_2d_index_type transpose(const _2d_index_type& idx) [[cpu, hc]] { return _2d_index_type(idx[1], idx[0]); } @@ -32,7 +32,7 @@ void transpose_simple(const array_view& data, assert(data.get_extent() == transpose(data_transpose.get_extent())); data_transpose.discard_data(); - parallel_for_each(data.get_extent(), [=] (index<2> idx) restrict(amp) { + parallel_for_each(data.get_extent(), [=] (index<2> idx) [[hc]] { data_transpose[transpose(idx)] = data[idx]; }); } @@ -51,8 +51,8 @@ void transpose_tiled_even(const array_view& data, data_transpose.discard_data(); extent<2> e = data.get_extent(); - parallel_for_each(e.tile<_tile_size, _tile_size>(), - [=] (tiled_index<_tile_size, _tile_size> tidx) restrict(amp) { + parallel_for_each(e.tile(_tile_size, _tile_size), + [=] (tiled_index<2> tidx) [[hc]] { tile_static _value_type t1[_tile_size][_tile_size]; t1[tidx.local[1]][tidx.local[0]] = data[tidx.global]; @@ -72,14 +72,14 @@ void transpose_tiled_even(const array_view& data, //----------------------------------------------------------------------------- template _value_type guarded_read(const array_view& data, - const index<2>& idx) restrict(amp) { + const index<2>& idx) [[hc]] { auto e = data.get_extent(); return e.contains(idx) ? 
data[idx] : _value_type(); } template void guarded_write(const array_view<_value_type, 2>& data, const index<2>& idx, - const _value_type& val) restrict(amp) { + const _value_type& val) [[hc]] { auto e = data.get_extent(); if(e.contains(idx)) data[idx] = val; @@ -92,8 +92,8 @@ void transpose_tiled_pad(const array_view& data, data_transpose.discard_data(); extent<2> e = data.get_extent(); - parallel_for_each(e.tile<_tile_size, _tile_size>().pad(), - [=] (tiled_index<_tile_size, _tile_size> tidx) restrict(amp) { + parallel_for_each(e.tile(_tile_size, _tile_size).pad(), + [=] (tiled_index<2> tidx) [[hc]] { tile_static _value_type t1[_tile_size][_tile_size]; t1[tidx.local[1]][tidx.local[0]] = guarded_read(data, tidx.global); @@ -157,12 +157,12 @@ void transpose_tiled_truncate_option_a( const array_view& data, const array_view<_value_type, 2>& data_transpose) { extent<2> e = data.get_extent(); - tiled_extent<_tile_size, _tile_size> e_truncated(e.tile<_tile_size, - _tile_size>().truncate()); + tiled_extent<_tile_size, _tile_size> e_truncated(e.tile(_tile_size, + _tile_size).truncate()); data_transpose.discard_data(); parallel_for_each(e_truncated, - [=] (tiled_index<_tile_size, _tile_size> tidx) restrict(amp) { + [=] (tiled_index<2> tidx) [[hc]] { // Normal processing tile_static _value_type t1[_tile_size][_tile_size]; t1[tidx.local[1]][tidx.local[0]] = data[tidx.global]; @@ -215,9 +215,8 @@ void transpose_tiled_truncate_option_b( const array_view& data, const array_view<_value_type, 2>& data_transpose) { extent<2> e = data.get_extent(); - tiled_extent<_tile_size, _tile_size> e_tiled(e.tile<_tile_size, - _tile_size>()); - tiled_extent<_tile_size, _tile_size> e_truncated(e_tiled.truncate()); + tiled_extent<2> e_tiled(e.tile(_tile_size, _tile_size)); + tiled_extent<2> e_truncated(e_tiled.truncate()); // Transform matrix to be multiple of 16*16 and transpose. 
auto b = data.section(index<2>(0,0), e_truncated); diff --git a/tests/Unit/Design/veccadd3.cpp b/tests/Unit/Design/veccadd3.cpp index 0166cb22025..074c960797a 100644 --- a/tests/Unit/Design/veccadd3.cpp +++ b/tests/Unit/Design/veccadd3.cpp @@ -1,8 +1,8 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include -using namespace concurrency; +using namespace hc; void vecAdd(float* A, float* B, float* C, int n) @@ -14,7 +14,7 @@ void vecAdd(float* A, float* B, float* C, int n) copy(A,AA); copy(B,BA); parallel_for_each(view, CA.get_extent(), - [&AA,&BA,&CA](index<1> i) restrict(amp) { + [&AA,&BA,&CA](index<1> i) [[hc]] { CA[i] = AA[i] + BA[i]; }); copy(CA,C); diff --git a/tests/Unit/DispatchAql/dispatch_hsa_kernel.cpp b/tests/Unit/DispatchAql/dispatch_hsa_kernel.cpp index a41b11a3325..3f9c443cfc0 100644 --- a/tests/Unit/DispatchAql/dispatch_hsa_kernel.cpp +++ b/tests/Unit/DispatchAql/dispatch_hsa_kernel.cpp @@ -1,4 +1,4 @@ -// RUN: %hc %s %S/hsacodelib.CPP -I/opt/rocm/include -L/opt/rocm/lib -lhsa-runtime64 -lhc_am -o %t.out && %t.out %S/vcpy_isa.hsaco +// RUN: %hc %s %S/hsacodelib.CPP -I/home/alexv/Programming/ROCR-Runtime/src/inc -L/home/alexv/Programming/ROCR-Runtime/src/build -lhsa-runtime64 -lhc_am -o %t.out && %t.out %S/vcpy_isa.hsaco #include diff --git a/tests/Unit/DynamicTileStatic/test3.cpp b/tests/Unit/DynamicTileStatic/test3.cpp index 8f229f88f14..2483b42217d 100644 --- a/tests/Unit/DynamicTileStatic/test3.cpp +++ b/tests/Unit/DynamicTileStatic/test3.cpp @@ -1,7 +1,6 @@ // RUN: %hc %s -o %t.out && %t.out -#include #include #include @@ -17,12 +16,14 @@ bool test1D() { std::vector table2(grid_size); std::vector table3(grid_size); std::vector table4(grid_size); - Concurrency::array_view av1(grid_size, table1); - Concurrency::array_view av2(grid_size, table2); - Concurrency::array_view av3(grid_size, table3); - Concurrency::array_view av4(grid_size, table4); - - Concurrency::parallel_for_each(Concurrency::extent<1>(grid_size).tile(), [=](Concurrency::tiled_index& idx) restrict(amp) { + hc::array_view av1(grid_size, table1); + hc::array_view av2(grid_size, table2); + hc::array_view av3(grid_size, table3); + hc::array_view av4(grid_size, table4); + + hc::parallel_for_each( + hc::extent<1>(grid_size).tile(tile_size), + [=](hc::tiled_index<1>& idx) [[hc]] { av1(idx) = idx.global[0]; av2(idx) = idx.local[0]; av3(idx) = idx.tile[0]; @@ -41,7 +42,9 @@ bool test1D() { hc::array_view av7(grid_size, table7); hc::array_view av8(grid_size, table8); - hc::completion_future fut = hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) restrict(amp) { + hc::completion_future fut = hc::parallel_for_each( + hc::tiled_extent<1>(grid_size, tile_size), + [=](hc::tiled_index<1>& idx) [[hc]] { av5(idx) = idx.global[0]; av6(idx) = idx.local[0]; av7(idx) = idx.tile[0]; @@ -113,16 +116,18 @@ bool test2D() { std::vector table6(grid_size_0 * grid_size_1); std::vector table7(grid_size_0 * grid_size_1); std::vector table8(grid_size_0 * grid_size_1); - Concurrency::array_view av1(grid_size_0, grid_size_1, table1); - Concurrency::array_view av2(grid_size_0, grid_size_1, table2); - Concurrency::array_view av3(grid_size_0, grid_size_1, table3); - Concurrency::array_view av4(grid_size_0, grid_size_1, table4); - Concurrency::array_view av5(grid_size_0, grid_size_1, table5); - Concurrency::array_view av6(grid_size_0, grid_size_1, table6); - Concurrency::array_view av7(grid_size_0, grid_size_1, table7); - Concurrency::array_view av8(grid_size_0, grid_size_1, table8); - - 
Concurrency::parallel_for_each(Concurrency::extent<2>(grid_size_0, grid_size_1).tile(), [=](Concurrency::tiled_index& idx) restrict(amp) { + hc::array_view av1(grid_size_0, grid_size_1, table1); + hc::array_view av2(grid_size_0, grid_size_1, table2); + hc::array_view av3(grid_size_0, grid_size_1, table3); + hc::array_view av4(grid_size_0, grid_size_1, table4); + hc::array_view av5(grid_size_0, grid_size_1, table5); + hc::array_view av6(grid_size_0, grid_size_1, table6); + hc::array_view av7(grid_size_0, grid_size_1, table7); + hc::array_view av8(grid_size_0, grid_size_1, table8); + + hc::parallel_for_each( + hc::extent<2>(grid_size_0, grid_size_1).tile(tile_size_0, tile_size_1), + [=](hc::tiled_index<2>& idx) [[hc]] { av1(idx) = idx.global[0]; av2(idx) = idx.global[1]; av3(idx) = idx.local[0]; @@ -152,7 +157,9 @@ bool test2D() { hc::array_view av15(grid_size_0, grid_size_1, table15); hc::array_view av16(grid_size_0, grid_size_1, table16); - hc::completion_future fut = hc::parallel_for_each(hc::tiled_extent<2>(grid_size_0, grid_size_1, tile_size_0, tile_size_1), [=](hc::tiled_index<2>& idx) restrict(amp) { + hc::completion_future fut = hc::parallel_for_each( + hc::tiled_extent<2>(grid_size_0, grid_size_1, tile_size_0, tile_size_1), + [=](hc::tiled_index<2>& idx) [[hc]] { av9(idx) = idx.global[0]; av10(idx) = idx.global[1]; av11(idx) = idx.local[0]; @@ -222,7 +229,13 @@ bool test2D() { } /// test HC parallel_for_each interface -template +template< + size_t grid_size_0, + size_t grid_size_1, + size_t grid_size_2, + size_t tile_size_0, + size_t tile_size_1, + size_t tile_size_2> bool test3D() { bool ret = true; @@ -241,20 +254,23 @@ bool test3D() { std::vector table10(grid_size_0 * grid_size_1 * grid_size_2); std::vector table11(grid_size_0 * grid_size_1 * grid_size_2); std::vector table12(grid_size_0 * grid_size_1 * grid_size_2); - Concurrency::array_view av1(grid_size_0, grid_size_1, grid_size_2, table1); - Concurrency::array_view av2(grid_size_0, grid_size_1, grid_size_2, table2); - Concurrency::array_view av3(grid_size_0, grid_size_1, grid_size_2, table3); - Concurrency::array_view av4(grid_size_0, grid_size_1, grid_size_2, table4); - Concurrency::array_view av5(grid_size_0, grid_size_1, grid_size_2, table5); - Concurrency::array_view av6(grid_size_0, grid_size_1, grid_size_2, table6); - Concurrency::array_view av7(grid_size_0, grid_size_1, grid_size_2, table7); - Concurrency::array_view av8(grid_size_0, grid_size_1, grid_size_2, table8); - Concurrency::array_view av9(grid_size_0, grid_size_1, grid_size_2, table9); - Concurrency::array_view av10(grid_size_0, grid_size_1, grid_size_2, table10); - Concurrency::array_view av11(grid_size_0, grid_size_1, grid_size_2, table11); - Concurrency::array_view av12(grid_size_0, grid_size_1, grid_size_2, table12); - - Concurrency::parallel_for_each(Concurrency::extent<3>(grid_size_0, grid_size_1, grid_size_2).tile(), [=](Concurrency::tiled_index& idx) restrict(amp) { + hc::array_view av1(grid_size_0, grid_size_1, grid_size_2, table1); + hc::array_view av2(grid_size_0, grid_size_1, grid_size_2, table2); + hc::array_view av3(grid_size_0, grid_size_1, grid_size_2, table3); + hc::array_view av4(grid_size_0, grid_size_1, grid_size_2, table4); + hc::array_view av5(grid_size_0, grid_size_1, grid_size_2, table5); + hc::array_view av6(grid_size_0, grid_size_1, grid_size_2, table6); + hc::array_view av7(grid_size_0, grid_size_1, grid_size_2, table7); + hc::array_view av8(grid_size_0, grid_size_1, grid_size_2, table8); + hc::array_view av9(grid_size_0, 
grid_size_1, grid_size_2, table9); + hc::array_view av10(grid_size_0, grid_size_1, grid_size_2, table10); + hc::array_view av11(grid_size_0, grid_size_1, grid_size_2, table11); + hc::array_view av12(grid_size_0, grid_size_1, grid_size_2, table12); + + hc::parallel_for_each( + hc::extent<3>(grid_size_0, grid_size_1, grid_size_2).tile( + tile_size_0, tile_size_1, tile_size_2), + [=](hc::tiled_index<3>& idx) [[hc]] { av1(idx) = idx.global[0]; av2(idx) = idx.global[1]; av3(idx) = idx.global[2]; @@ -296,7 +312,15 @@ bool test3D() { hc::array_view av23(grid_size_0, grid_size_1, grid_size_2, table23); hc::array_view av24(grid_size_0, grid_size_1, grid_size_2, table24); - hc::completion_future fut = hc::parallel_for_each(hc::tiled_extent<3>(grid_size_0, grid_size_1, grid_size_2, tile_size_0, tile_size_1, tile_size_2), [=](hc::tiled_index<3>& idx) restrict(amp) { + hc::completion_future fut = hc::parallel_for_each( + hc::tiled_extent<3>( + grid_size_0, + grid_size_1, + grid_size_2, + tile_size_0, + tile_size_1, + tile_size_2), + [=](hc::tiled_index<3>& idx) [[hc]] { av13(idx) = idx.global[0]; av14(idx) = idx.global[1]; av15(idx) = idx.global[2]; diff --git a/tests/Unit/DynamicTileStatic/test8.cpp b/tests/Unit/DynamicTileStatic/test8.cpp index e0d38a5fe51..d41a1f7e4f0 100644 --- a/tests/Unit/DynamicTileStatic/test8.cpp +++ b/tests/Unit/DynamicTileStatic/test8.cpp @@ -23,7 +23,7 @@ bool test1D() { array_view av3(grid_size, table3); array_view av4(grid_size, table4); - completion_future fut1 = parallel_for_each(extent<1>(grid_size).tile(tile_size), [=](tiled_index<1>& idx) restrict(amp) { + completion_future fut1 = parallel_for_each(extent<1>(grid_size).tile(tile_size), [=](tiled_index<1>& idx) [[hc]] { av1(idx) = idx.global[0]; av2(idx) = idx.local[0]; av3(idx) = idx.tile[0]; @@ -40,7 +40,7 @@ bool test1D() { array_view av7(grid_size, table7); array_view av8(grid_size, table8); - completion_future fut2 = parallel_for_each(tiled_extent<1>(grid_size, tile_size), [=](tiled_index<1>& idx) restrict(amp) { + completion_future fut2 = parallel_for_each(tiled_extent<1>(grid_size, tile_size), [=](tiled_index<1>& idx) [[hc]] { av5(idx) = idx.global[0]; av6(idx) = idx.local[0]; av7(idx) = idx.tile[0]; @@ -121,7 +121,7 @@ bool test2D() { array_view av7(grid_size_0, grid_size_1, table7); array_view av8(grid_size_0, grid_size_1, table8); - completion_future fut1 = parallel_for_each(extent<2>(grid_size_0, grid_size_1).tile(tile_size_0, tile_size_1), [=](tiled_index<2>& idx) restrict(amp) { + completion_future fut1 = parallel_for_each(extent<2>(grid_size_0, grid_size_1).tile(tile_size_0, tile_size_1), [=](tiled_index<2>& idx) [[hc]] { av1(idx) = idx.global[0]; av2(idx) = idx.global[1]; av3(idx) = idx.local[0]; @@ -150,7 +150,7 @@ bool test2D() { array_view av15(grid_size_0, grid_size_1, table15); array_view av16(grid_size_0, grid_size_1, table16); - completion_future fut2 = parallel_for_each(tiled_extent<2>(grid_size_0, grid_size_1, tile_size_0, tile_size_1), [=](tiled_index<2>& idx) restrict(amp) { + completion_future fut2 = parallel_for_each(tiled_extent<2>(grid_size_0, grid_size_1, tile_size_0, tile_size_1), [=](tiled_index<2>& idx) [[hc]] { av9(idx) = idx.global[0]; av10(idx) = idx.global[1]; av11(idx) = idx.local[0]; @@ -252,7 +252,7 @@ bool test3D() { array_view av11(grid_size_0, grid_size_1, grid_size_2, table11); array_view av12(grid_size_0, grid_size_1, grid_size_2, table12); - completion_future fut1 = parallel_for_each(extent<3>(grid_size_0, grid_size_1, grid_size_2).tile(tile_size_0, tile_size_1, 
tile_size_2), [=](tiled_index<3>& idx) restrict(amp) { + completion_future fut1 = parallel_for_each(extent<3>(grid_size_0, grid_size_1, grid_size_2).tile(tile_size_0, tile_size_1, tile_size_2), [=](tiled_index<3>& idx) [[hc]] { av1(idx) = idx.global[0]; av2(idx) = idx.global[1]; av3(idx) = idx.global[2]; @@ -294,7 +294,7 @@ bool test3D() { array_view av23(grid_size_0, grid_size_1, grid_size_2, table23); array_view av24(grid_size_0, grid_size_1, grid_size_2, table24); - completion_future fut2 = parallel_for_each(tiled_extent<3>(grid_size_0, grid_size_1, grid_size_2, tile_size_0, tile_size_1, tile_size_2), [=](tiled_index<3>& idx) restrict(amp) { + completion_future fut2 = parallel_for_each(tiled_extent<3>(grid_size_0, grid_size_1, grid_size_2, tile_size_0, tile_size_1, tile_size_2), [=](tiled_index<3>& idx) [[hc]] { av13(idx) = idx.global[0]; av14(idx) = idx.global[1]; av15(idx) = idx.global[2]; diff --git a/tests/Unit/DynamicTileStatic/test9.cpp b/tests/Unit/DynamicTileStatic/test9.cpp index 1fa112d6c6c..b9513233c76 100644 --- a/tests/Unit/DynamicTileStatic/test9.cpp +++ b/tests/Unit/DynamicTileStatic/test9.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include #include @@ -14,9 +14,9 @@ bool test1D() { // first run normal C++AMP parallel_for_each std::vector table1(grid_size); - Concurrency::array_view av1(grid_size, table1); + hc::array_view av1(grid_size, table1); - Concurrency::parallel_for_each(Concurrency::extent<1>(grid_size), [=](Concurrency::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { av1(idx) = idx[0]; }); @@ -25,7 +25,7 @@ bool test1D() { std::vector table5(grid_size); hc::array_view av5(grid_size, table5); - hc::completion_future fut = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) restrict(amp) { + hc::completion_future fut = hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] { av5(idx) = idx[0]; }); @@ -76,10 +76,10 @@ bool test2D() { std::vector table1(grid_size_0 * grid_size_1); std::vector table2(grid_size_0 * grid_size_1); - Concurrency::array_view av1(grid_size_0, grid_size_1, table1); - Concurrency::array_view av2(grid_size_0, grid_size_1, table2); + hc::array_view av1(grid_size_0, grid_size_1, table1); + hc::array_view av2(grid_size_0, grid_size_1, table2); - Concurrency::parallel_for_each(Concurrency::extent<2>(grid_size_0, grid_size_1), [=](Concurrency::index<2>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<2>(grid_size_0, grid_size_1), [=](hc::index<2>& idx) [[hc]] { av1(idx) = idx[0]; av2(idx) = idx[1]; }); @@ -91,7 +91,7 @@ bool test2D() { hc::array_view av9(grid_size_0, grid_size_1, table9); hc::array_view av10(grid_size_0, grid_size_1, table10); - hc::completion_future fut = hc::parallel_for_each(hc::extent<2>(grid_size_0, grid_size_1), [=](hc::index<2>& idx) restrict(amp) { + hc::completion_future fut = hc::parallel_for_each(hc::extent<2>(grid_size_0, grid_size_1), [=](hc::index<2>& idx) [[hc]] { av9(idx) = idx[0]; av10(idx) = idx[1]; }); @@ -147,11 +147,11 @@ bool test3D() { std::vector table1(grid_size_0 * grid_size_1 * grid_size_2); std::vector table2(grid_size_0 * grid_size_1 * grid_size_2); std::vector table3(grid_size_0 * grid_size_1 * grid_size_2); - Concurrency::array_view av1(grid_size_0, grid_size_1, grid_size_2, table1); - Concurrency::array_view av2(grid_size_0, grid_size_1, grid_size_2, table2); - Concurrency::array_view av3(grid_size_0, grid_size_1, grid_size_2, table3); + hc::array_view 
av1(grid_size_0, grid_size_1, grid_size_2, table1); + hc::array_view av2(grid_size_0, grid_size_1, grid_size_2, table2); + hc::array_view av3(grid_size_0, grid_size_1, grid_size_2, table3); - Concurrency::parallel_for_each(Concurrency::extent<3>(grid_size_0, grid_size_1, grid_size_2), [=](Concurrency::index<3>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<3>(grid_size_0, grid_size_1, grid_size_2), [=](hc::index<3>& idx) [[hc]] { av1(idx) = idx[0]; av2(idx) = idx[1]; av3(idx) = idx[2]; @@ -166,7 +166,7 @@ bool test3D() { hc::array_view av14(grid_size_0, grid_size_1, grid_size_2, table14); hc::array_view av15(grid_size_0, grid_size_1, grid_size_2, table15); - hc::completion_future fut = hc::parallel_for_each(hc::extent<3>(grid_size_0, grid_size_1, grid_size_2), [=](hc::index<3>& idx) restrict(amp) { + hc::completion_future fut = hc::parallel_for_each(hc::extent<3>(grid_size_0, grid_size_1, grid_size_2), [=](hc::index<3>& idx) [[hc]] { av13(idx) = idx[0]; av14(idx) = idx[1]; av15(idx) = idx[2]; diff --git a/tests/Unit/HC/auto_annotate_attribute.cpp b/tests/Unit/HC/auto_annotate_attribute.cpp deleted file mode 100644 index 6f81faa0568..00000000000 --- a/tests/Unit/HC/auto_annotate_attribute.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// RUN: %hc -Xclang -fauto-compile-for-accelerator %s -o %t.out && %t.out - -#include - -#include -#include - -// foo is a global function which doesn't have [[hc]] attribute -// if compiled with -Xclang -fauto-compile-for-accelerator, [[hc]] would be -// annotated automatically -int foo() { - return 1; -} - -template -bool test1() { - using namespace hc; - bool ret = true; - array table(GRID_SIZE); - extent<1> ex(GRID_SIZE); - parallel_for_each(ex, [&](index<1>& idx) [[hc]] { - table[idx] = foo(); - }).wait(); - - std::vector result = table; - for (int i = 0; i < GRID_SIZE; ++i) { - if (result[i] != 1) { - std::cerr << "Verify failed at index: " << i << " , expected: " << 1 << " , actual: " << result[i] << "\n"; - ret = false; - break; - } - } - return ret; -} - -// bar is a static function which doesn't have [[hc]] attribute -// if compiled with -Xclang -fauto-compile-for-accelerator, [[hc]] would be -// annotated automatically -static int bar() { - return 1; -} - -template -bool test2() { - using namespace hc; - bool ret = true; - array table(GRID_SIZE); - extent<1> ex(GRID_SIZE); - parallel_for_each(ex, [&](index<1>& idx) [[hc]] { - table[idx] = bar(); - }).wait(); - - std::vector result = table; - for (int i = 0; i < GRID_SIZE; ++i) { - if (result[i] != 1) { - std::cerr << "Verify failed at index: " << i << " , expected: " << 1 << " , actual: " << result[i] << "\n"; - ret = false; - break; - } - } - return ret; -} - -// baz is a class with a member function test() which doesn't have [[hc]] attribute -// if compiled with -Xclang -fauto-compile-for-accelerator, [[hc]] would be -// annotated automatically -class baz { -public: - int test() { - return 1; - } - - static int test2() { - return 1; - } -}; - -template -bool test3() { - using namespace hc; - bool ret = true; - array table(GRID_SIZE); - extent<1> ex(GRID_SIZE); - baz obj; - parallel_for_each(ex, [&](index<1>& idx) [[hc]] { - table[idx] = obj.test(); - }).wait(); - - std::vector result = table; - for (int i = 0; i < GRID_SIZE; ++i) { - if (result[i] != 1) { - std::cerr << "Verify failed at index: " << i << " , expected: " << 1 << " , actual: " << result[i] << "\n"; - ret = false; - break; - } - } - return ret; -} - -template -bool test4() { - using namespace hc; - bool ret = true; - array 
table(GRID_SIZE); - extent<1> ex(GRID_SIZE); - parallel_for_each(ex, [&](index<1>& idx) [[hc]] { - table[idx] = baz::test2(); - }).wait(); - - std::vector result = table; - for (int i = 0; i < GRID_SIZE; ++i) { - if (result[i] != 1) { - std::cerr << "Verify failed at index: " << i << " , expected: " << 1 << " , actual: " << result[i] << "\n"; - ret = false; - break; - } - } - return ret; -} - -int main() { - bool ret = true; - - // test with global function - ret &= test1<64>(); - - // test with static function - ret &= test2<64>(); - - // test with member function - ret &= test3<64>(); - - // test with static member function - ret &= test4<64>(); - - return !(ret == true); -} - diff --git a/tests/Unit/HC/create_blocking_marker.cpp b/tests/Unit/HC/create_blocking_marker.cpp index ac6f9f67d29..ce660d71793 100644 --- a/tests/Unit/HC/create_blocking_marker.cpp +++ b/tests/Unit/HC/create_blocking_marker.cpp @@ -1,4 +1,4 @@ -// RUN: %hc %s -I/opt/rocm/hsa/include -L/opt/rocm/lib -lhsa-runtime64 -o %t.out && %t.out +// RUN: %hc %s -I/home/alexv/Programming/ROCR-Runtime/src/inc -L/home/alexv/Programming/ROCR-Runtime/src/build -lhsa-runtime64 -o %t.out && %t.out #include diff --git a/tests/Unit/HC/create_blocking_marker2.cpp b/tests/Unit/HC/create_blocking_marker2.cpp index cea356012e6..66d7936958e 100644 --- a/tests/Unit/HC/create_blocking_marker2.cpp +++ b/tests/Unit/HC/create_blocking_marker2.cpp @@ -1,4 +1,4 @@ -// RUN: %hc %s -I/opt/rocm/hsa/include -L/opt/rocm/lib -lhsa-runtime64 -o %t.out && %t.out +// RUN: %hc %s -I/home/alexv/Programming/ROCR-Runtime/src/inc -L/home/alexv/Programming/ROCR-Runtime/src/build -lhsa-runtime64 -o %t.out && %t.out #include diff --git a/tests/Unit/HC/test2.cpp b/tests/Unit/HC/test2.cpp index 820a716070f..9b707f5fa98 100644 --- a/tests/Unit/HC/test2.cpp +++ b/tests/Unit/HC/test2.cpp @@ -38,21 +38,21 @@ int main() { hc::accelerator_view accelerator_view = hc::accelerator().get_default_view(); // do 3 kernel dispatches + 3 barriers - hc::parallel_for_each(hc::extent<1>(GRID_SIZE), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(GRID_SIZE), [=](hc::index<1>& idx) [[hc]] { for (int i = 0; i < LOOP_COUNT; ++i) av3(idx) = av1(idx) + av2(idx); }); accelerator_view.create_marker(); - hc::parallel_for_each(hc::extent<1>(GRID_SIZE), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(GRID_SIZE), [=](hc::index<1>& idx) [[hc]] { for (int i = 0; i < LOOP_COUNT; ++i) av4(idx) = av1(idx) + av2(idx); }); accelerator_view.create_marker(); - hc::parallel_for_each(hc::extent<1>(GRID_SIZE), [=](hc::index<1>& idx) restrict(amp) { + hc::parallel_for_each(hc::extent<1>(GRID_SIZE), [=](hc::index<1>& idx) [[hc]] { for (int i = 0; i < LOOP_COUNT; ++i) av5(idx) = av1(idx) + av2(idx); }); diff --git a/tests/Unit/HC/wg_register_limit1.cpp b/tests/Unit/HC/wg_register_limit1.cpp index c9a9a54a8d5..5c95184d148 100644 --- a/tests/Unit/HC/wg_register_limit1.cpp +++ b/tests/Unit/HC/wg_register_limit1.cpp @@ -46,7 +46,7 @@ int main() { } }); - } catch (Kalmar::runtime_exception e) { + } catch (detail::runtime_exception e) { std::string err_str = e.what(); pass = err_str.find("The number of work items") != std::string::npos && err_str.find("per work group exceeds the limit") != std::string::npos; diff --git a/tests/Unit/HC/wg_size_unsupported1.cpp b/tests/Unit/HC/wg_size_unsupported1.cpp index c58691be80a..32e011fb351 100644 --- a/tests/Unit/HC/wg_size_unsupported1.cpp +++ b/tests/Unit/HC/wg_size_unsupported1.cpp @@ -9,7 +9,7 @@ int main() { try 
{ // We expect the runtime will fire an exception due to a large work group size hc::parallel_for_each(hc::extent<1>(8192).tile(8192), [](hc::tiled_index<1> i) [[hc]] {}); - } catch (Kalmar::runtime_exception e) { + } catch (detail::runtime_exception e) { std::string err_str = e.what(); pass = err_str.find("The extent of the tile") != std::string::npos && err_str.find("exceeds the device limit") != std::string::npos; diff --git a/tests/Unit/HC/wg_size_unsupported2.cpp b/tests/Unit/HC/wg_size_unsupported2.cpp index 4c7e8f032bc..a691f4cff9c 100644 --- a/tests/Unit/HC/wg_size_unsupported2.cpp +++ b/tests/Unit/HC/wg_size_unsupported2.cpp @@ -10,7 +10,7 @@ int main() { try { // We expect the runtime will fire an exception due to a large work group size hc::parallel_for_each(hc::extent<2>(8192,1).tile(8192,1), [](hc::tiled_index<2> i) [[hc]] {}); - } catch (Kalmar::runtime_exception e) { + } catch (detail::runtime_exception e) { std::string err_str = e.what(); pass = err_str.find("The extent of the tile") != std::string::npos && err_str.find("exceeds the device limit") != std::string::npos; @@ -19,7 +19,7 @@ int main() { try { // We expect the runtime will fire an exception due to a large work group size hc::parallel_for_each(hc::extent<2>(1,8192).tile(1,8192), [](hc::tiled_index<2> i) [[hc]] {}); - } catch (Kalmar::runtime_exception e) { + } catch (detail::runtime_exception e) { std::string err_str = e.what(); pass &= err_str.find("The extent of the tile") != std::string::npos && err_str.find("exceeds the device limit") != std::string::npos; diff --git a/tests/Unit/HC/wg_size_unsupported3.cpp b/tests/Unit/HC/wg_size_unsupported3.cpp index 06e5420e588..64b27370c24 100644 --- a/tests/Unit/HC/wg_size_unsupported3.cpp +++ b/tests/Unit/HC/wg_size_unsupported3.cpp @@ -10,7 +10,7 @@ int main() { try { // We expect the runtime will fire an exception due to a large work group size hc::parallel_for_each(hc::extent<3>(8192,1,1).tile(8192,1,1), [](hc::tiled_index<3> i) [[hc]] {}); - } catch (Kalmar::runtime_exception e) { + } catch (detail::runtime_exception e) { std::string err_str = e.what(); pass = err_str.find("The extent of the tile") != std::string::npos && err_str.find("exceeds the device limit") != std::string::npos; @@ -19,7 +19,7 @@ int main() { try { // We expect the runtime will fire an exception due to a large work group size hc::parallel_for_each(hc::extent<3>(1,8192,1).tile(1,8192,1), [](hc::tiled_index<3> i) [[hc]] {}); - } catch (Kalmar::runtime_exception e) { + } catch (detail::runtime_exception e) { std::string err_str = e.what(); pass = err_str.find("The extent of the tile") != std::string::npos && err_str.find("exceeds the device limit") != std::string::npos; @@ -28,7 +28,7 @@ int main() { try { // We expect the runtime will fire an exception due to a large work group size hc::parallel_for_each(hc::extent<3>(1,1,8192).tile(1,1,8192), [](hc::tiled_index<3> i) [[hc]] {}); - } catch (Kalmar::runtime_exception e) { + } catch (detail::runtime_exception e) { std::string err_str = e.what(); pass = err_str.find("The extent of the tile") != std::string::npos && err_str.find("exceeds the device limit") != std::string::npos; diff --git a/tests/Unit/HC/wg_size_unsupported4.cpp b/tests/Unit/HC/wg_size_unsupported4.cpp index 069128f4748..250d505454a 100644 --- a/tests/Unit/HC/wg_size_unsupported4.cpp +++ b/tests/Unit/HC/wg_size_unsupported4.cpp @@ -9,7 +9,7 @@ int main() { try { hc::parallel_for_each(hc::extent<3>(16,16,16).tile(32,1,1), [](hc::tiled_index<3> i) [[hc]] {}); - } catch 
(Kalmar::runtime_exception e) { + } catch (detail::runtime_exception e) { std::string err_str = e.what(); pass = err_str.find("The extent of the tile") != std::string::npos && err_str.find("exceeds the compute grid extent") != std::string::npos; @@ -17,7 +17,7 @@ int main() { try { hc::parallel_for_each(hc::extent<3>(16,16,16).tile(1,32,1), [](hc::tiled_index<3> i) [[hc]] {}); - } catch (Kalmar::runtime_exception e) { + } catch (detail::runtime_exception e) { std::string err_str = e.what(); pass = err_str.find("The extent of the tile") != std::string::npos && err_str.find("exceeds the compute grid extent") != std::string::npos; @@ -25,7 +25,7 @@ int main() { try { hc::parallel_for_each(hc::extent<3>(16,16,16).tile(1,1,32), [](hc::tiled_index<3> i) [[hc]] {}); - } catch (Kalmar::runtime_exception e) { + } catch (detail::runtime_exception e) { std::string err_str = e.what(); pass = err_str.find("The extent of the tile") != std::string::npos && err_str.find("exceeds the compute grid extent") != std::string::npos; diff --git a/tests/Unit/HSA/functor1.cpp b/tests/Unit/HSA/functor1.cpp index 403cb67924c..a2535ca1f33 100644 --- a/tests/Unit/HSA/functor1.cpp +++ b/tests/Unit/HSA/functor1.cpp @@ -1,28 +1,25 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include -// added for checking HSA profile -#include - // test C++AMP with fine-grained SVM // requires HSA Full Profile to operate successfully #define SIZE (16) -using namespace concurrency; +using namespace hc; // test supply a class with operator() to parallel_for_each class prog { int (&input)[SIZE]; public: - prog(int (&t)[SIZE]) restrict(amp,cpu) : input(t) {} + prog(int (&t)[SIZE]) [[cpu, hc]] : input(t) {} - void operator() (index<1>& idx) restrict(amp) { + void operator()(index<1>& idx) const [[hc]] { input[idx[0]] = idx[0]; } diff --git a/tests/Unit/HSA/functor2.cpp b/tests/Unit/HSA/functor2.cpp index 8adca7e05dd..5f1a539c246 100644 --- a/tests/Unit/HSA/functor2.cpp +++ b/tests/Unit/HSA/functor2.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include @@ -13,13 +13,13 @@ #define SIZE (16) -using namespace concurrency; +using namespace hc; // test supply a class with operator() to parallel_for_each // the class will call a separate functor class user_functor { public: - void operator() (index<1>& idx, int (&input)[SIZE]) restrict(amp) { + void operator()(index<1>& idx, int (&input)[SIZE]) const [[hc]] { input[idx[0]] = idx[0]; } }; @@ -29,10 +29,10 @@ class prog { user_functor& kernel; public: - prog(int (&t)[SIZE], user_functor& f) restrict(amp,cpu) : input(t), kernel(f) { + prog(int (&t)[SIZE], user_functor& f) [[cpu, hc]] : input(t), kernel(f) { } - void operator() (index<1>& idx) restrict(amp) { + void operator()(index<1>& idx) const [[hc]] { kernel(idx, input); } diff --git a/tests/Unit/HSA/functor3.cpp b/tests/Unit/HSA/functor3.cpp index 113d5d66e62..091d75a6af3 100644 --- a/tests/Unit/HSA/functor3.cpp +++ b/tests/Unit/HSA/functor3.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include @@ -13,7 +13,7 @@ #define SIZE (16) -using namespace concurrency; +using namespace hc; // test supply a class with operator() to parallel_for_each // the class will call a separate functor with a customized ctor @@ -21,9 +21,9 @@ class user_functor { public: int (&input)[SIZE]; - user_functor(int (&t)[SIZE]) restrict(amp,cpu) : input(t) {} + user_functor(int (&t)[SIZE]) [[cpu, hc]] : input(t) {} - void operator() (index<1>& idx) restrict(amp) { + void operator() (index<1>& idx) const 
[[hc]] { input[idx[0]] = idx[0]; } }; @@ -32,10 +32,10 @@ class prog { user_functor& kernel; public: - prog(user_functor& f) restrict(amp,cpu) : kernel(f) { + prog(user_functor& f) [[cpu, hc]] : kernel(f) { } - void operator() (index<1>& idx) restrict(amp) { + void operator() (index<1>& idx) const [[hc]] { kernel(idx); } diff --git a/tests/Unit/HSA/functor4.cpp b/tests/Unit/HSA/functor4.cpp index 36ee50e0c3e..95b6ff86f22 100644 --- a/tests/Unit/HSA/functor4.cpp +++ b/tests/Unit/HSA/functor4.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include @@ -13,7 +13,7 @@ #define SIZE (16) -using namespace concurrency; +using namespace hc; // test supply a template class with operator() to parallel_for_each template @@ -21,10 +21,10 @@ class prog { _Tp (&input)[N]; public: - prog(_Tp (&t)[N]) restrict(amp,cpu) : input(t) { + prog(_Tp (&t)[N]) [[cpu, hc]] : input(t) { } - void operator() (index<1>& idx) restrict(amp) { + void operator() (index<1>& idx) const [[hc]] { input[idx[0]] = idx[0]; } diff --git a/tests/Unit/HSA/functor5.cpp b/tests/Unit/HSA/functor5.cpp index 0ad51d630ee..5e456766f02 100644 --- a/tests/Unit/HSA/functor5.cpp +++ b/tests/Unit/HSA/functor5.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include @@ -13,7 +13,7 @@ #define SIZE (16) -using namespace concurrency; +using namespace hc; // test supply a class with operator() to parallel_for_each // the template class will call a separate template functor @@ -21,7 +21,7 @@ using namespace concurrency; template class user_functor { public: - void operator() (index<1>& idx, _Tp (&input)[N]) restrict(amp) { + void operator() (index<1>& idx, _Tp (&input)[N]) [[hc]] { input[idx[0]] = idx[0]; } }; @@ -32,10 +32,10 @@ class prog { user_functor<_Tp, N>& kernel; public: - prog(_Tp (&t)[N], user_functor<_Tp, N>& f) restrict(amp,cpu) : input(t), kernel(f) { + prog(_Tp (&t)[N], user_functor<_Tp, N>& f) [[cpu, hc]] : input(t), kernel(f) { } - void operator() (index<1>& idx) restrict(amp) { + void operator() (index<1>& idx) const [[hc]] { kernel(idx, input); } diff --git a/tests/Unit/HSA/functor6.cpp b/tests/Unit/HSA/functor6.cpp index 6e6014426d7..0b34ed81c8a 100644 --- a/tests/Unit/HSA/functor6.cpp +++ b/tests/Unit/HSA/functor6.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include @@ -13,7 +13,7 @@ #define SIZE (16) -using namespace concurrency; +using namespace hc; // test supply a class with operator() to parallel_for_each // the template class will call a separate template functor with a customized ctor @@ -23,9 +23,9 @@ class user_functor { public: _Tp (&input)[N]; - user_functor(_Tp (&t)[N]) restrict(amp,cpu) : input(t) {} + user_functor(_Tp (&t)[N]) [[cpu, hc]] : input(t) {} - void operator() (index<1>& idx) restrict(amp) { + void operator() (index<1>& idx) const [[hc]] { input[idx[0]] = idx[0]; } }; @@ -35,10 +35,10 @@ class prog { _Tp& kernel; public: - prog(_Tp& f) restrict(amp,cpu) : kernel(f) { + prog(_Tp& f) [[cpu, hc]] : kernel(f) { } - void operator() (index<1>& idx) restrict(amp) { + void operator() (index<1>& idx) const [[hc]] { kernel(idx); } diff --git a/tests/Unit/HSA/list.cpp b/tests/Unit/HSA/list.cpp index e9e8def4514..dab10a719b5 100644 --- a/tests/Unit/HSA/list.cpp +++ b/tests/Unit/HSA/list.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include // added for checking HSA profile #include @@ -11,7 +11,7 @@ // test C++AMP with fine-grained SVM // requires HSA Full Profile to operate successfully -using namespace concurrency; +using 
namespace hc; class List { public: @@ -34,7 +34,7 @@ bool test() { int n = nodes.size(); // test on GPU - parallel_for_each(extent<1>(1),[=, &sum_gpu](index<1> idx) restrict(amp) { + parallel_for_each(extent<1>(1),[=, &sum_gpu](index<1> idx) [[hc]] { List* l = head; for (int i = 0; i < n; ++i) { sum_gpu += l->data; diff --git a/tests/Unit/HSA/list2.cpp b/tests/Unit/HSA/list2.cpp index 877014c7ca2..bac288fe92b 100644 --- a/tests/Unit/HSA/list2.cpp +++ b/tests/Unit/HSA/list2.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include @@ -83,7 +83,7 @@ bool test() { list_data *newdata = (list_data*) malloc (sizeof(list_data)); newdata->data16 = 10; - parallel_for_each(concurrency::extent<1>(1),[=, &sum_gpu](concurrency::index<1> idx) restrict(amp) { + parallel_for_each(hc::extent<1>(1),[=, &sum_gpu](hc::index<1> idx) [[hc]] { list_head* l = llist; list_insert_new(llist, newitem, newdata, NUM_LIST_NODES-1); for (int i = 0; i <= NUM_LIST_NODES; ++i) { diff --git a/tests/Unit/HSA/no_printf.cpp b/tests/Unit/HSA/no_printf.cpp index f627a979055..e9a9d9458db 100644 --- a/tests/Unit/HSA/no_printf.cpp +++ b/tests/Unit/HSA/no_printf.cpp @@ -4,7 +4,7 @@ #include int main() { - hc::parallel_for_each(hc::extent<1>(1), []() [[hc]] { + hc::parallel_for_each(hc::extent<1>(1), [](hc::index<1>) [[hc]] { hc::printf("Accelerator: Hello World!\n"); }).wait(); return 0; diff --git a/tests/Unit/HSA/printf_minimal.cpp b/tests/Unit/HSA/printf_minimal.cpp index a06d50a61a0..475605c17ca 100644 --- a/tests/Unit/HSA/printf_minimal.cpp +++ b/tests/Unit/HSA/printf_minimal.cpp @@ -4,7 +4,7 @@ #include int main() { - hc::parallel_for_each(hc::extent<1>(1), []() [[hc]] { + hc::parallel_for_each(hc::extent<1>(1), [](hc::index<1>) [[hc]] { hc::printf("Accelerator: Hello World!\n"); }).wait(); return 0; diff --git a/tests/Unit/HSA/sizeof.cpp b/tests/Unit/HSA/sizeof.cpp index 853b494cbd3..2aadd180328 100644 --- a/tests/Unit/HSA/sizeof.cpp +++ b/tests/Unit/HSA/sizeof.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include #include @@ -13,11 +13,11 @@ template bool test() { - using namespace concurrency; + using namespace hc; int width = 0; - auto k = [&width] (const index<1>& idx) restrict(amp) { + auto k = [&width] (const index<1>& idx) [[hc]] { width = sizeof(T); }; diff --git a/tests/Unit/HSA/string.cpp b/tests/Unit/HSA/string.cpp index f3319c8cdaa..3858ee8c7f9 100644 --- a/tests/Unit/HSA/string.cpp +++ b/tests/Unit/HSA/string.cpp @@ -2,7 +2,7 @@ // RUN: %hc %s -o %t.out && %t.out #include -#include +#include // added for checking HSA profile #include @@ -10,7 +10,7 @@ // test C++AMP with fine-grained SVM // requires HSA Full Profile to operate successfully -using namespace concurrency; +using namespace hc; class List { public: @@ -30,7 +30,7 @@ bool test() { int sum_cpu = 0; // test on GPU - parallel_for_each(extent<1>(1),[=,&l,&sum_gpu](index<1> i) restrict(amp) { + parallel_for_each(extent<1>(1),[=,&l,&sum_gpu](index<1> i) [[hc]] { for (int j = 0; j < 4; j++) { sum_gpu+=l.strings[j][0]; } diff --git a/tests/Unit/HSA/volatile_union.cpp b/tests/Unit/HSA/volatile_union.cpp index 2c518279c81..f71212fac87 100644 --- a/tests/Unit/HSA/volatile_union.cpp +++ b/tests/Unit/HSA/volatile_union.cpp @@ -1,7 +1,7 @@ // RUN: %hc %s -o %t.out && %t.out -#include +#include // added for checking HSA profile #include @@ -49,12 +49,12 @@ __attribute__((amp,cpu)) float foo2(float a) { bool test() { bool ret = true; - using namespace concurrency; + using namespace hc; float table[SIZE] { 0.0f }; // test foo1 - 
parallel_for_each(extent<1>(SIZE), [&table](index<1> idx) restrict(amp) { + parallel_for_each(extent<1>(SIZE), [&table](index<1> idx) [[hc]] { table[idx[0]] = foo1(0.0f); }); @@ -70,7 +70,7 @@ bool test() { } // test foo2 - parallel_for_each(extent<1>(SIZE), [&table](index<1> idx) restrict(amp) { + parallel_for_each(extent<1>(SIZE), [&table](index<1> idx) [[hc]] { table[idx[0]] = foo2(0.0f); }); diff --git a/tests/Unit/Indexing/extent.cpp b/tests/Unit/Indexing/extent.cpp index 56839893604..e496c65a533 100644 --- a/tests/Unit/Indexing/extent.cpp +++ b/tests/Unit/Indexing/extent.cpp @@ -1,8 +1,8 @@ // RUN: %cxxamp %s -o %t.out && %t.out #include -#include +#include #include -using namespace concurrency; +using namespace hc; int main() { std::vector vv(10); @@ -12,7 +12,7 @@ int main() extent<2> e(5, 2); { array_view av(5, 2, vv.data()); - parallel_for_each(av.get_extent(), [=](index<2> idx) restrict(amp) { + parallel_for_each(av.get_extent(), [=](index<2> idx) [[hc]] { av(idx) -= av.get_extent()[1]; }); assert(av.get_extent() == e); diff --git a/tests/Unit/Indexing/index.cpp b/tests/Unit/Indexing/index.cpp index 41854808161..8a65ce5eb72 100644 --- a/tests/Unit/Indexing/index.cpp +++ b/tests/Unit/Indexing/index.cpp @@ -1,7 +1,7 @@ // RUN: %amp_device -c -S -D__KALMAR_ACCELERATOR__ -emit-llvm %s -O -o -|%cppfilt|%FileCheck %s // RUN: %gtest_amp %s -o %t && %t -// Testing if an efficient (i.e. fully inlined version) of Concurrency::index -#include +// Testing if an efficient (i.e. fully inlined version) of hc::index +#include #ifndef __KALMAR_ACCELERATOR__ //Device mode compilation cannot have RTTI #include #endif @@ -9,12 +9,12 @@ // Test code generation; operator[] should be inlined completely // And there shouldn't be any load/stores! -int foo(int k) restrict(amp){ - Concurrency::index<1> i(k); +int foo(int k) [[hc]]{ + hc::index<1> i(k); return i[0]; } //CHECK: define {{.*}} @foo(int) -//CHECK-NOT: call {{.*}}Concurrency::index<1>::operator[] +//CHECK-NOT: call {{.*}}hc::index<1>::operator[] //CHECK-NOT: load //CHECK: } @@ -22,43 +22,43 @@ int foo(int k) restrict(amp){ // Test correctness TEST(ClassIndex, Index1D) { int n0 = N0; - Concurrency::index<1> i(n0); + hc::index<1> i(n0); EXPECT_EQ(n0, i[0]); } TEST(ClassIndex, Def) { - Concurrency::index<1> i(1234); + hc::index<1> i(1234); // Test copy constructor - Concurrency::index<1> j(i); + hc::index<1> j(i); EXPECT_EQ(i[0], j[0]); // Test prefix ++ ++j; EXPECT_EQ(i[0]+1, j[0]); // Test postfix ++ - Concurrency::index<1> k(j++); + hc::index<1> k(j++); EXPECT_EQ(i[0]+1, k[0]); EXPECT_EQ(i[0]+2, j[0]); } TEST(ClassIndex, Add) { - Concurrency::index<2> i(1234, 5678); - Concurrency::index<2> j(4321, 8765); - Concurrency::index<2> k = i + j; + hc::index<2> i(1234, 5678); + hc::index<2> j(4321, 8765); + hc::index<2> k = i + j; EXPECT_EQ(1234+4321, k[0]); EXPECT_EQ(5678+8765, k[1]); } TEST(ClassIndex, AddEqual) { - Concurrency::index<2> i(1234, 5678); - Concurrency::index<2> j(4321, 8765); + hc::index<2> i(1234, 5678); + hc::index<2> j(4321, 8765); i += j; EXPECT_EQ(1234+4321, i[0]); EXPECT_EQ(5678+8765, i[1]); } TEST(ClassIndex, SubEqual) { - Concurrency::index<2> i(5555, 9999); - Concurrency::index<2> j(4321, 8765); + hc::index<2> i(5555, 9999); + hc::index<2> j(4321, 8765); i -= j; EXPECT_EQ(1234, i[0]); EXPECT_EQ(1234, i[1]); diff --git a/tests/Unit/Indexing/tile_index.cpp b/tests/Unit/Indexing/tile_index.cpp index 7853909eaac..aac8ac193f5 100644 --- a/tests/Unit/Indexing/tile_index.cpp +++ b/tests/Unit/Indexing/tile_index.cpp @@ -1,8 +1,8 @@ // 
RUN: %cxxamp %s -o %t.out && %t.out #include -#include +#include #include -using namespace concurrency; +using namespace hc; int test_1d() { std::vector vv(100); for (int i = 0; i<100; i++) @@ -10,10 +10,9 @@ int test_1d() { extent<1> e(100); { - array_view av(e, vv.data()); - parallel_for_each(av.get_extent().tile<5>(), - [=](tiled_index<5> idx) restrict(amp) { - av(idx) = + array_view av(e, vv.data()); + parallel_for_each(av.get_extent().tile(5), [=](tiled_index<1> idx) [[hc]] { + av(idx) = idx.tile[0] + idx.tile_origin[0] * 100; }); @@ -31,16 +30,16 @@ int test_2d() extent<2> e(10, 20); { - array_view av(e, vv.data()); - parallel_for_each(av.get_extent().tile<5,5>(), - [=](tiled_index<5,5> idx) restrict(amp) { - av(idx) = + array_view av(e, vv.data()); + parallel_for_each( + av.get_extent().tile(5, 5), [=](tiled_index<2> idx) [[hc]] { + av(idx) = idx.tile[0] + idx.tile[1] * 10 + idx.tile_origin[0] * 100 + idx.tile_origin[1] * 1000 + - idx.tile_extent[0] * 10000 + - idx.tile_extent[1] * 100000; + idx.tile_dim[0] * 10000 + + idx.tile_dim[1] * 100000; }); assert(av.get_extent() == e); for(unsigned int i = 0; i < av.get_extent()[0]; i++) @@ -57,7 +56,7 @@ int test_2d() int test_tiled_extent_1d(void) { extent<1> e(123); - tiled_extent<10> myTileExtent(e); + tiled_extent<1> myTileExtent(e.tile(10)); auto padded = myTileExtent.pad(); assert(padded[0] == 130); @@ -68,7 +67,7 @@ int test_tiled_extent_1d(void) { int test_tiled_extent_2d(void) { extent<2> e(123, 456); - tiled_extent<10,30> myTileExtent(e); + tiled_extent<2> myTileExtent(e.tile(10, 30)); auto padded = myTileExtent.pad(); assert(padded[0] == 130); assert(padded[1] == 480); @@ -81,7 +80,7 @@ int test_tiled_extent_2d(void) { int test_tiled_extent_3d(void) { extent<3> e(123, 456, 789); - tiled_extent<10, 30, 40> myTileExtent(e); + tiled_extent myTileExtent(e.tile(10, 30, 40)); auto padded = myTileExtent.pad(); assert(padded[0] == 130); assert(padded[1] == 480); diff --git a/tests/Unit/InvalidLambda/empty_lambda2.cpp b/tests/Unit/InvalidLambda/empty_lambda2.cpp index 16e9d095fbc..1b8ac9b7848 100644 --- a/tests/Unit/InvalidLambda/empty_lambda2.cpp +++ b/tests/Unit/InvalidLambda/empty_lambda2.cpp @@ -1,12 +1,12 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include int main() { // This test outlines a subtle issue with how we obtain mangled kernel names // which is tracked in SWDEV-137849. fun is made static to work around it. 
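// For orientation, a minimal sketch of the convention this patch applies
// throughout the test suite: the C++AMP restrict specifiers are rewritten to
// the HC attribute spellings used by hcc, i.e.
//
//   int on_host()   restrict(cpu)      ->  int on_host()   [[cpu]]
//   int on_device() restrict(amp)      ->  int on_device() [[hc]]
//   int on_both()   restrict(cpu,amp)  ->  int on_both()   [[cpu, hc]]
//
// (Illustrative function names only; the mapping itself is the one exercised
// by the conversions below.)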
int gpu_result; - concurrency::array_view gpu_resultsv(1, &gpu_result); + hc::array_view gpu_resultsv(1, &gpu_result); gpu_resultsv.discard_data(); - static auto fun = [&]() restrict(cpu,amp) { return 0; }; - concurrency::parallel_for_each(gpu_resultsv.get_extent(), [=] (concurrency::index<1> idx) restrict (amp) { gpu_resultsv[idx] = fun(); }); + static auto fun = [&]() [[cpu, hc]] { return 0; }; + hc::parallel_for_each(gpu_resultsv.get_extent(), [=] (hc::index<1> idx) [[hc]] { gpu_resultsv[idx] = fun(); }); } diff --git a/tests/Unit/InvalidLambda/qq.cpp b/tests/Unit/InvalidLambda/qq.cpp index 7e1b7f3cc33..5430538f3a2 100644 --- a/tests/Unit/InvalidLambda/qq.cpp +++ b/tests/Unit/InvalidLambda/qq.cpp @@ -1,28 +1,28 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include #include int main(void){ const int vecSize = 100; // Alloc & init input data - Concurrency::extent<1> e(vecSize); - Concurrency::array a(vecSize); - Concurrency::array b(vecSize); - Concurrency::array c(vecSize); + hc::extent<1> e(vecSize); + hc::array a(vecSize); + hc::array b(vecSize); + hc::array c(vecSize); int sum = 0; - Concurrency::array_view ga(a); - Concurrency::array_view gb(b); - Concurrency::array_view gc(c); - for (Concurrency::index<1> i(0); i[0] < vecSize; i++) { + hc::array_view ga(a); + hc::array_view gb(b); + hc::array_view gc(c); + for (hc::index<1> i(0); i[0] < vecSize; i++) { ga[i] = 100.0f * rand() / RAND_MAX; gb[i] = 100.0f * rand() / RAND_MAX; - sum += a[i] + b[i]; + sum += ga[i] + gb[i]; } - Concurrency::parallel_for_each( + hc::parallel_for_each( e, - [=](Concurrency::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { // NULL body. Should compile && running OK }); diff --git a/tests/Unit/Macro/check_hcc_accelerator.cpp b/tests/Unit/Macro/check_hcc_accelerator.cpp index 30bd8b025a4..662e78c4670 100644 --- a/tests/Unit/Macro/check_hcc_accelerator.cpp +++ b/tests/Unit/Macro/check_hcc_accelerator.cpp @@ -1,15 +1,15 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include int main() { int test[1] { 0 }; - using namespace concurrency; + using namespace hc; array_view av(1, test); - parallel_for_each(extent<1>(1), [=](index<1> idx) restrict(amp) { + parallel_for_each(extent<1>(1), [=](index<1> idx) [[hc]] { #ifdef __HCC_ACCELERATOR__ av[idx] = 1; #else diff --git a/tests/Unit/Macro/check_hcc_cpu.cpp b/tests/Unit/Macro/check_hcc_cpu.cpp index 8813d234c66..c315b819144 100644 --- a/tests/Unit/Macro/check_hcc_cpu.cpp +++ b/tests/Unit/Macro/check_hcc_cpu.cpp @@ -1,15 +1,15 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include int main() { int test[1] { 0 }; - using namespace concurrency; + using namespace hc; array_view av(1, test); - parallel_for_each(extent<1>(1), [=](index<1> idx) restrict(amp) { + parallel_for_each(extent<1>(1), [=](index<1> idx) [[hc]] { #ifdef __HCC_CPU__ av[idx] = 0; #else diff --git a/tests/Unit/Overload/Caller-amp-only-Callee-global-cpu-only.cpp b/tests/Unit/Overload/Caller-amp-only-Callee-global-cpu-only.cpp index 2587b557f45..da68e56c800 100644 --- a/tests/Unit/Overload/Caller-amp-only-Callee-global-cpu-only.cpp +++ b/tests/Unit/Overload/Caller-amp-only-Callee-global-cpu-only.cpp @@ -1,7 +1,7 @@ // XFAIL: * // RUN: %cxxamp %s -o %t.out && %t.out -#include -using namespace concurrency; +#include +using namespace hc; void foo() { @@ -9,7 +9,7 @@ void foo() int main() { - parallel_for_each(extent<1>(1), [](index<1>) restrict(amp) + parallel_for_each(extent<1>(1), [](index<1>) [[hc]] { foo(); // Call from AMP to CPU.
Caller: Lambda }); diff --git a/tests/Unit/Overload/Disjoint_restrict.cpp b/tests/Unit/Overload/Disjoint_restrict.cpp index 43f9e7ae3c4..63c62734f85 100644 --- a/tests/Unit/Overload/Disjoint_restrict.cpp +++ b/tests/Unit/Overload/Disjoint_restrict.cpp @@ -1,10 +1,11 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -#include +#include + #include -using namespace concurrency; -int test() restrict(cpu,amp) +using namespace hc; + +int test() [[cpu, hc]] { int data[] = {1}; for (int i = 0; i < 1; i++) @@ -20,11 +21,11 @@ int test() restrict(cpu,amp) struct runall_result { - runall_result() restrict(cpu,amp) + runall_result() [[cpu, hc]] : _exit_code(0) {} - runall_result(int result) restrict(cpu,amp) + runall_result(int result) [[cpu, hc]] : _exit_code(result) { verify_exit_code(); @@ -34,11 +35,11 @@ struct runall_result private: int _exit_code; - void verify_exit_code() restrict(cpu); - void verify_exit_code() restrict(amp) {} + void verify_exit_code() [[cpu]]; + void verify_exit_code() [[hc]] {} }; -void runall_result::verify_exit_code() restrict(cpu) +void runall_result::verify_exit_code() [[cpu]] { if(_exit_code != 0) { @@ -49,10 +50,9 @@ void runall_result::verify_exit_code() restrict(cpu) int main() { runall_result gpu_result; - concurrency::array_view gpu_resultsv(1, &gpu_result); + array_view gpu_resultsv(1, &gpu_result); - concurrency::parallel_for_each(gpu_resultsv.get_extent(), [=](concurrency::index<1> idx) restrict(amp) - { + parallel_for_each(gpu_resultsv.get_extent(), [=](index<1> idx) [[hc]] { gpu_resultsv[idx] = test(); }); -} +} \ No newline at end of file diff --git a/tests/Unit/Overload/Negative/call_amp_function_in_cpu_function_or_lambda_or_pfe.cpp b/tests/Unit/Overload/Negative/call_amp_function_in_cpu_function_or_lambda_or_pfe.cpp index 146d7ac4c7b..b6148cc4274 100644 --- a/tests/Unit/Overload/Negative/call_amp_function_in_cpu_function_or_lambda_or_pfe.cpp +++ b/tests/Unit/Overload/Negative/call_amp_function_in_cpu_function_or_lambda_or_pfe.cpp @@ -4,24 +4,24 @@ // Do not delete or add any line; it is referred to by absolute line number in the // FileCheck lines below ////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; +#include +using namespace hc; -void foo() restrict(amp) +void foo() [[hc]] { } int main() { - auto a_lambda_func = []() restrict(cpu) { + auto a_lambda_func = []() [[cpu]] { foo(); }; // CHECK: call_amp_function_in_cpu_function_or_lambda_or_pfe.cpp:[[@LINE-2]]:8: error: 'foo': no overloaded function has restriction specifiers that are compatible with the ambient context 'main()::(anonymous class)::operator()' // CHECK-NEXT: foo(); // CHECK-NEXT: ^ - parallel_for_each(extent<1>(1), [](index<1>) restrict(cpu) { + parallel_for_each(extent<1>(1), [](index<1>) [[cpu]] { foo(); }); // CHECK: call_amp_function_in_cpu_function_or_lambda_or_pfe.cpp:[[@LINE-2]]:8: error: 'foo': no overloaded function has restriction specifiers that are compatible with the ambient context 'main()::(anonymous class)::operator()' diff --git a/tests/Unit/Overload/Negative/call_amp_function_in_main.cpp b/tests/Unit/Overload/Negative/call_amp_function_in_main.cpp index a823e9f428e..615c7a13191 100644 --- a/tests/Unit/Overload/Negative/call_amp_function_in_main.cpp +++ b/tests/Unit/Overload/Negative/call_amp_function_in_main.cpp @@ -4,10 +4,10 @@ // Do not delete or add any line; it is referred to by absolute line number in the // FileCheck lines below 
////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; +#include +using namespace hc; -int foo() restrict(amp) +int foo() [[hc]] { return 1; } diff --git a/tests/Unit/Overload/Negative/call_amp_linking_error.cpp b/tests/Unit/Overload/Negative/call_amp_linking_error.cpp index 538b3cf82cf..0d52983ce1f 100644 --- a/tests/Unit/Overload/Negative/call_amp_linking_error.cpp +++ b/tests/Unit/Overload/Negative/call_amp_linking_error.cpp @@ -4,15 +4,15 @@ // Do not delete or add any line; it is referred to by absolute line number in the // FileCheck lines below ////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; +#include +using namespace hc; -int f1() restrict(amp) {return 1;} -int f2() restrict(amp) { +int f1() [[hc]] {return 1;} +int f2() [[hc]] { return f1(); } -int CPU_Func() restrict(cpu) +int CPU_Func() [[cpu]] { return f2(); } diff --git a/tests/Unit/Overload/Negative/call_cpu_funtion_in_amp_function_or_lambda_or_pfe.cpp b/tests/Unit/Overload/Negative/call_cpu_funtion_in_amp_function_or_lambda_or_pfe.cpp index 0fdba029a3c..f8658645949 100644 --- a/tests/Unit/Overload/Negative/call_cpu_funtion_in_amp_function_or_lambda_or_pfe.cpp +++ b/tests/Unit/Overload/Negative/call_cpu_funtion_in_amp_function_or_lambda_or_pfe.cpp @@ -4,19 +4,19 @@ // Do not delete or add any line; it is referred to by absolute line number in the // FileCheck lines below ////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; +#include +using namespace hc; void foo() { } -int f1() restrict(cpu) {return 1;} -int f2() restrict(cpu) { +int f1() [[cpu]] {return 1;} +int f2() [[cpu]] { return f1(); } -int AMP_Func() restrict(amp) +int AMP_Func() [[hc]] { return f2(); } @@ -26,14 +26,14 @@ int AMP_Func() restrict(amp) int main() { - auto a_lambda_func = []() restrict(amp) { + auto a_lambda_func = []() [[hc]] { foo(); }; // CHECK: call_cpu_funtion_in_amp_function_or_lambda_or_pfe.cpp:[[@LINE-2]]:8: error: 'foo': no overloaded function has restriction specifiers that are compatible with the ambient context 'main()::(anonymous class)::operator()' // CHECK-NEXT: foo(); // CHECK-NEXT: ^ - parallel_for_each(extent<1>(1), [](index<1>) restrict(amp) + parallel_for_each(extent<1>(1), [](index<1>) [[hc]] { foo(); }); diff --git a/tests/Unit/Overload/Negative/call_distinct_from_dual_context.cpp b/tests/Unit/Overload/Negative/call_distinct_from_dual_context.cpp index fb33f26b7b5..3da02280f6c 100644 --- a/tests/Unit/Overload/Negative/call_distinct_from_dual_context.cpp +++ b/tests/Unit/Overload/Negative/call_distinct_from_dual_context.cpp @@ -4,12 +4,12 @@ // Do not delete or add any line; it is referred to by absolute line number in the // FileCheck lines below ////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; +#include +using namespace hc; -int f1() restrict(cpu) {return 1;} +int f1() [[cpu]] {return 1;} -int AMP_AND_CPU_Func_1() restrict(cpu,amp) +int AMP_AND_CPU_Func_1() [[cpu, hc]] { return f1(); } @@ -22,7 +22,7 @@ int foo() {} int main() { - auto a_lambda_func = []() restrict(cpu,amp) { + auto a_lambda_func = []() [[cpu, hc]] { foo(); }; // CHECK: call_distinct_from_dual_context.cpp:[[@LINE-2]]:8: error: 'foo': no overloaded function has restriction specifiers that are compatible with the ambient context 'main()::(anonymous class)::operator()' @@ -30,7 +30,7 @@ int 
main() // CHECK-NEXT: ^ - parallel_for_each(extent<1>(1), [](index<1>) restrict(cpu,amp) { + parallel_for_each(extent<1>(1), [](index<1>) [[cpu, hc]] { foo(); }); // CHECK: call_distinct_from_dual_context.cpp:[[@LINE-2]]:8: error: 'foo': no overloaded function has restriction specifiers that are compatible with the ambient context 'main()::(anonymous class)::operator()' diff --git a/tests/Unit/Overload/Negative/linking_error.cpp b/tests/Unit/Overload/Negative/linking_error.cpp index a07d0a32e77..91687e22563 100644 --- a/tests/Unit/Overload/Negative/linking_error.cpp +++ b/tests/Unit/Overload/Negative/linking_error.cpp @@ -4,12 +4,12 @@ // Do not delete or add any line; it is referred to by absolute line number in the // FileCheck lines below ////////////////////////////////////////////////////////////////////////////////// -#include -using namespace concurrency; +#include +using namespace hc; -int f2() restrict(amp) {return 2;} +int f2() [[hc]] {return 2;} -int AMP_AND_CPU_Func() restrict(cpu,amp) +int AMP_AND_CPU_Func() [[cpu, hc]] { // Link error: undefined reference to `f2()' // clang-3.3: error: linker command failed with exit code 1 (use -v to see invocation) diff --git a/tests/Unit/Overload/Test_Overload.cpp b/tests/Unit/Overload/Test_Overload.cpp index 44d4f640e80..50727ca6f09 100644 --- a/tests/Unit/Overload/Test_Overload.cpp +++ b/tests/Unit/Overload/Test_Overload.cpp @@ -1,13 +1,14 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -using namespace concurrency; +#include + +using namespace hc; #define TEST_CPU #define TEST_ELIDED #define TEST_GPU #define TEST_BOTH -int callee() restrict(amp) +int callee() [[hc]] { return 1; } @@ -44,7 +45,7 @@ bool Elided_Func() #endif #ifdef TEST_GPU -bool AMP_Func() restrict(amp) +bool AMP_Func() [[hc]] { if (callee() != 1) { @@ -56,7 +57,7 @@ bool AMP_Func() restrict(amp) #endif #ifdef TEST_BOTH -bool BOTH_CPU_AND_AMP() restrict(cpu,amp) +bool BOTH_CPU_AND_AMP() [[cpu, hc]] { #if __KALMAR_ACCELERATOR__ if (callee() != 1) @@ -71,7 +72,7 @@ bool BOTH_CPU_AND_AMP() restrict(cpu,amp) } #endif -int main(int argc, char **argv) +int main() { int flag; #ifdef TEST_CPU @@ -86,12 +87,11 @@ int main(int argc, char **argv) // directly called is not allowed, we use pfe { int result; - concurrency::array_view gpu_resultsv(1, &result); - concurrency::parallel_for_each(gpu_resultsv.get_extent(), [=](concurrency::index<1> idx) restrict(amp) - { + array_view gpu_resultsv(1, &result); + parallel_for_each(gpu_resultsv.get_extent(), [=](index<1> idx) [[hc]] { gpu_resultsv[idx] = AMP_Func(); }); - + if(gpu_resultsv[0] == 0) { printf("AMP_Func Error! exit!\n"); exit(1);} } #endif @@ -99,12 +99,12 @@ int main(int argc, char **argv) #ifdef TEST_BOTH { int result; - concurrency::array_view gpu_resultsv(1, &result); - concurrency::parallel_for_each(gpu_resultsv.get_extent(), [=](concurrency::index<1> idx) restrict(amp,cpu) + array_view gpu_resultsv(1, &result); + parallel_for_each(gpu_resultsv.get_extent(), [=](index<1> idx) [[hc]] { gpu_resultsv[idx] = BOTH_CPU_AND_AMP(); }); - + if(gpu_resultsv[0] == 0) { printf("BOTH_CPU_AND_AMP Error! 
exit!\n"); exit(1);} } #endif diff --git a/tests/Unit/Overload/amp-lambda_or_pfe_in_main.cpp b/tests/Unit/Overload/amp-lambda_or_pfe_in_main.cpp index a3b6b786d45..be1e1202176 100644 --- a/tests/Unit/Overload/amp-lambda_or_pfe_in_main.cpp +++ b/tests/Unit/Overload/amp-lambda_or_pfe_in_main.cpp @@ -1,6 +1,6 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -using namespace concurrency; +#include +using namespace hc; int main() @@ -8,12 +8,12 @@ int main() // This test outlines a subtle issue with how we obtain mangled kernel names // which is tracked in SWDEV-137849. a_lambda_func is moved after the pfe to // work around this and ensure matched mangling. - parallel_for_each(extent<1>(1), [](index<1>) restrict(amp) + parallel_for_each(extent<1>(1), [](index<1>) [[hc]] { - // OK. Since parallel_for_each is implemented as restrict(cpu,amp) inside + // OK. Since parallel_for_each is implemented as [[cpu, hc]] inside }); - auto a_lambda_func = []() restrict(amp) { + auto a_lambda_func = []() [[hc]] { }; return 0; // Should not compile diff --git a/tests/Unit/Overload/amp_lambda_or_pfe_in_a_cpu_or_cpu_elided_function_or_lambda.cpp b/tests/Unit/Overload/amp_lambda_or_pfe_in_a_cpu_or_cpu_elided_function_or_lambda.cpp index 89b4c46529d..27baebea175 100644 --- a/tests/Unit/Overload/amp_lambda_or_pfe_in_a_cpu_or_cpu_elided_function_or_lambda.cpp +++ b/tests/Unit/Overload/amp_lambda_or_pfe_in_a_cpu_or_cpu_elided_function_or_lambda.cpp @@ -1,19 +1,19 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -using namespace concurrency; +#include +using namespace hc; int CPU_Func() { - auto a_lambda = []() restrict(amp) + auto a_lambda = []() [[hc]] { }; return 0; } -int CPU_Func_1() restrict(cpu) +int CPU_Func_1() [[cpu]] { - auto a_lambda = []() restrict(amp) + auto a_lambda = []() [[hc]] { }; @@ -23,7 +23,7 @@ int CPU_Func_1() restrict(cpu) inline int CPU_Func_X() { - parallel_for_each(extent<1>(1), [](index<1>) restrict(amp) + parallel_for_each(extent<1>(1), [](index<1>) [[hc]] { // OK }); @@ -31,9 +31,9 @@ int CPU_Func_X() } inline -int CPU_Func_Y() restrict(cpu) +int CPU_Func_Y() [[cpu]] { - parallel_for_each(extent<1>(1), [](index<1>) restrict(amp) + parallel_for_each(extent<1>(1), [](index<1>) [[hc]] { // OK }); @@ -51,15 +51,15 @@ int main(void) CPU_Func_X(); CPU_Func_Y(); - auto a_lambda = [] () restrict(cpu) { - parallel_for_each(extent<1>(1), [](index<1>) restrict(amp) + auto a_lambda = [] () [[cpu]] { + parallel_for_each(extent<1>(1), [](index<1>) [[hc]] { // OK }); }; - auto a_lambda_1 = [] () restrict(cpu) { - auto a_lambda_AMP = [] () restrict(amp) {}; //OK + auto a_lambda_1 = [] () [[cpu]] { + auto a_lambda_AMP = [] () [[hc]] {}; //OK }; return 0; } diff --git a/tests/Unit/Overload/cpu_caller_distinct_callees.cpp b/tests/Unit/Overload/cpu_caller_distinct_callees.cpp index cfb8262a9b8..66efaa1958c 100644 --- a/tests/Unit/Overload/cpu_caller_distinct_callees.cpp +++ b/tests/Unit/Overload/cpu_caller_distinct_callees.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -using namespace concurrency; +#include +using namespace hc; -int f(int &) restrict(amp) +int f(int &) [[hc]] { return 0; } diff --git a/tests/Unit/Overload/cpu_function_or_lambda_in_main.cpp b/tests/Unit/Overload/cpu_function_or_lambda_in_main.cpp index e08548b15aa..4e27a6f76c0 100644 --- a/tests/Unit/Overload/cpu_function_or_lambda_in_main.cpp +++ b/tests/Unit/Overload/cpu_function_or_lambda_in_main.cpp @@ -1,9 +1,9 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -using namespace concurrency; +#include 
+using namespace hc; -int fooCPU() restrict(cpu) +int fooCPU() [[cpu]] { return 1; } @@ -17,7 +17,7 @@ int main(void) { fooCPU(); foo(); - auto a_lambda = [] () restrict(cpu) {}; + auto a_lambda = [] () [[cpu]] {}; auto another_lambda = [] () {}; return 0; diff --git a/tests/Unit/Overload/cpu_lambda_in_amp_function.cpp b/tests/Unit/Overload/cpu_lambda_in_amp_function.cpp index c3e6d2238a6..d6a8e03dc3f 100644 --- a/tests/Unit/Overload/cpu_lambda_in_amp_function.cpp +++ b/tests/Unit/Overload/cpu_lambda_in_amp_function.cpp @@ -1,12 +1,12 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -using namespace concurrency; +#include +using namespace hc; inline -int fooAMP() restrict(amp) +int fooAMP() [[hc]] { - auto a_lambda = []() restrict(cpu) {}; // OK + auto a_lambda = []() [[cpu]] {}; // OK return 1; } @@ -16,9 +16,9 @@ int main(void) // This test outlines a subtle issue with how we obtain mangled kernel names // which is tracked in SWDEV-137849. fooAMP is made inline to work around this // and ensure matched mangling. - parallel_for_each(extent<1>(1), [](index<1>) restrict(amp) + parallel_for_each(extent<1>(1), [](index<1>) [[hc]] { - auto a_lambda = []() restrict(cpu) {};// OK + auto a_lambda = []() [[cpu]] {};// OK }); return 0; } diff --git a/tests/Unit/Parse/amp_header_test.cpp b/tests/Unit/Parse/amp_header_test.cpp index e33fed92ea5..872143eef57 100644 --- a/tests/Unit/Parse/amp_header_test.cpp +++ b/tests/Unit/Parse/amp_header_test.cpp @@ -1,4 +1,4 @@ // RUN: %cxxamp -c %s -#include +#include diff --git a/tests/Unit/Parse/class_cross_referencing.cpp b/tests/Unit/Parse/class_cross_referencing.cpp index 68fc0cd6b4f..8f19614df41 100644 --- a/tests/Unit/Parse/class_cross_referencing.cpp +++ b/tests/Unit/Parse/class_cross_referencing.cpp @@ -1,19 +1,19 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include class unorm; class norm { float f; public: - norm(const unorm& other) restrict(cpu, amp); + norm(const unorm& other) [[cpu, hc]]; }; class unorm { float f; public: - unorm() restrict(cpu, amp) {} - unorm(const norm& other) restrict(cpu, amp) {} + unorm() [[cpu, hc]] {} + unorm(const norm& other) [[cpu, hc]] {} }; int main(void) diff --git a/tests/Unit/Parse/function_declarator.cpp b/tests/Unit/Parse/function_declarator.cpp index 15b8846d6dc..0fed445e782 100644 --- a/tests/Unit/Parse/function_declarator.cpp +++ b/tests/Unit/Parse/function_declarator.cpp @@ -1,6 +1,6 @@ // RUN: %cxxamp -c %s -int func() restrict(amp) { +int func() [[hc]] { return 0; } diff --git a/tests/Unit/Parse/lambda_attribute.cpp b/tests/Unit/Parse/lambda_attribute.cpp index 611de7e7eb7..9c2448fd461 100644 --- a/tests/Unit/Parse/lambda_attribute.cpp +++ b/tests/Unit/Parse/lambda_attribute.cpp @@ -1,6 +1,6 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include @@ -9,7 +9,7 @@ int main() { bool ret = true; - using namespace concurrency; + using namespace hc; array_view av(SIZE); diff --git a/tests/Unit/Parse/lambda_attribute_hc.cpp b/tests/Unit/Parse/lambda_attribute_hc.cpp index b4c2e697247..c6d132ee01d 100644 --- a/tests/Unit/Parse/lambda_attribute_hc.cpp +++ b/tests/Unit/Parse/lambda_attribute_hc.cpp @@ -1,6 +1,6 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include +#include #include @@ -9,7 +9,7 @@ int main() { bool ret = true; - using namespace concurrency; + using namespace hc; array_view av(SIZE); diff --git a/tests/Unit/Parse/lambda_expr.amp.cpp b/tests/Unit/Parse/lambda_expr.amp.cpp index cae84dd59b0..ba30cb31db6 100644 --- a/tests/Unit/Parse/lambda_expr.amp.cpp +++ 
b/tests/Unit/Parse/lambda_expr.amp.cpp @@ -6,7 +6,7 @@ int main() { int c; // Note that capture-by-reference in amp restricted codes is not allowed [=, &c] () - restrict(cpu) + [[cpu]] { c = a + b; } (); return c; } diff --git a/tests/Unit/Parse/lambda_expr.both.cpp b/tests/Unit/Parse/lambda_expr.both.cpp index cae84dd59b0..ba30cb31db6 100644 --- a/tests/Unit/Parse/lambda_expr.both.cpp +++ b/tests/Unit/Parse/lambda_expr.both.cpp @@ -6,7 +6,7 @@ int main() { int c; // Note that capture-by-reference in amp restricted codes is not allowed [=, &c] () - restrict(cpu) + [[cpu]] { c = a + b; } (); return c; } diff --git a/tests/Unit/Parse/lambda_expr.cpu.cpp b/tests/Unit/Parse/lambda_expr.cpu.cpp index 13d86a6a3b4..6abe07e909d 100644 --- a/tests/Unit/Parse/lambda_expr.cpu.cpp +++ b/tests/Unit/Parse/lambda_expr.cpu.cpp @@ -5,7 +5,7 @@ int main() { int b = 2; int c; [=, &c] () - restrict(cpu) + [[cpu]] { c = a + b; } (); return c; } diff --git a/tests/Unit/Parse/lambda_expr.cpu_only.cpp b/tests/Unit/Parse/lambda_expr.cpu_only.cpp index 13d86a6a3b4..6abe07e909d 100644 --- a/tests/Unit/Parse/lambda_expr.cpu_only.cpp +++ b/tests/Unit/Parse/lambda_expr.cpu_only.cpp @@ -5,7 +5,7 @@ int main() { int b = 2; int c; [=, &c] () - restrict(cpu) + [[cpu]] { c = a + b; } (); return c; } diff --git a/tests/Unit/Parse/lambda_expr.mixed.cpp b/tests/Unit/Parse/lambda_expr.mixed.cpp index 95234d3ea7d..a22a3dee15b 100644 --- a/tests/Unit/Parse/lambda_expr.mixed.cpp +++ b/tests/Unit/Parse/lambda_expr.mixed.cpp @@ -6,7 +6,7 @@ int main() { int c; // capture-by-reference is not allowed in amp-restricted codes [=, &c] () - restrict(cpu) + [[cpu]] { c = a + b; } (); return c; } diff --git a/tests/Unit/Parse/lambda_expr.recursive.cpp b/tests/Unit/Parse/lambda_expr.recursive.cpp index 053b7ba5591..f00906dcba6 100644 --- a/tests/Unit/Parse/lambda_expr.recursive.cpp +++ b/tests/Unit/Parse/lambda_expr.recursive.cpp @@ -6,7 +6,7 @@ int main() { int c; // capture-by-reference is not allowed in amp-restricted kernel lambda [=, &c] () - restrict(cpu) + [[cpu]] { c = a + b; } (); return c; } diff --git a/tests/Unit/Parse/lambda_expr.without.params.cpp b/tests/Unit/Parse/lambda_expr.without.params.cpp index cc6f3ab402c..53988b58a86 100644 --- a/tests/Unit/Parse/lambda_expr.without.params.cpp +++ b/tests/Unit/Parse/lambda_expr.without.params.cpp @@ -1,7 +1,7 @@ // RUN: %cxxamp -c %s -int f1() restrict(amp) { return 1;} -int f_amp() restrict(amp) { +int f1() [[hc]] { return 1;} +int f_amp() [[hc]] { [] { f1(); // OK diff --git a/tests/Unit/Parse/method_declarator.cpp b/tests/Unit/Parse/method_declarator.cpp index 869157901cc..6be7bb00586 100644 --- a/tests/Unit/Parse/method_declarator.cpp +++ b/tests/Unit/Parse/method_declarator.cpp @@ -4,16 +4,16 @@ class AClass { public: AClass(); - AClass(int n) restrict(amp, cpu); // constructor with restrict should be accepted. + AClass(int n) [[cpu, hc]]; // constructor with restrict should be accepted. int method_1() const; // not a problem - int method_2() restrict(amp, cpu); // should accept + int method_2() [[cpu, hc]]; // should accept int method_3() restrict; // not to be confused with C++AMP restrict. 
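// For illustration, a hypothetical out-of-line definition matching the
// attributed declaration of method_2 above would repeat the execution-space
// attributes, in the same way Disjoint_restrict.cpp earlier in this patch
// defines runall_result::verify_exit_code:
//
//   int AClass::method_2() [[cpu, hc]] { return 0; }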
}; -int func() restrict(amp) { +int func() [[hc]] { return 0; } diff --git a/tests/Unit/PlatformAtomics/atomic_int.cpp b/tests/Unit/PlatformAtomics/atomic_int.cpp index 5c03e9031ab..cbc79e7da5c 100644 --- a/tests/Unit/PlatformAtomics/atomic_int.cpp +++ b/tests/Unit/PlatformAtomics/atomic_int.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include // added for checking HSA profile #include @@ -33,10 +33,10 @@ bool test() { } // launch kernel - Concurrency::extent<1> e(vecSize); + hc::extent<1> e(vecSize); parallel_for_each( e, - [=](Concurrency::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { int tid = idx[0]; (ptr_a + tid)->fetch_add(1); diff --git a/tests/Unit/PlatformAtomics/pingpong.cpp b/tests/Unit/PlatformAtomics/pingpong.cpp index 3f2e2a042f9..08f12c1633b 100644 --- a/tests/Unit/PlatformAtomics/pingpong.cpp +++ b/tests/Unit/PlatformAtomics/pingpong.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include // added for checking HSA profile #include @@ -78,10 +78,10 @@ bool test() { }); // launch kernel - Concurrency::extent<1> e(vecSize); + hc::extent<1> e(vecSize); parallel_for_each( e, - [=](Concurrency::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { int tid = idx[0]; int flag; diff --git a/tests/Unit/PlatformAtomics/sync_1way.cpp b/tests/Unit/PlatformAtomics/sync_1way.cpp index 6a71c8ecc5e..22ba4ad2f76 100644 --- a/tests/Unit/PlatformAtomics/sync_1way.cpp +++ b/tests/Unit/PlatformAtomics/sync_1way.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include // added for checking HSA profile #include @@ -52,10 +52,10 @@ bool test() { }); // launch kernel - Concurrency::extent<1> e(vecSize); + hc::extent<1> e(vecSize); parallel_for_each( e, - [=](Concurrency::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { int tid = idx[0]; int counter = 0; diff --git a/tests/Unit/PlatformAtomics/sync_2way.cpp b/tests/Unit/PlatformAtomics/sync_2way.cpp index 9ea80dfd2e5..83f057560af 100644 --- a/tests/Unit/PlatformAtomics/sync_2way.cpp +++ b/tests/Unit/PlatformAtomics/sync_2way.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include // added for checking HSA profile #include @@ -61,10 +61,10 @@ bool test() { }); // launch kernel - Concurrency::extent<1> e(vecSize); + hc::extent<1> e(vecSize); parallel_for_each( e, - [=](Concurrency::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { int tid = idx[0]; int flag; diff --git a/tests/Unit/PlatformAtomics/syscall.cpp b/tests/Unit/PlatformAtomics/syscall.cpp index bc143cc7ca7..399deb9b317 100644 --- a/tests/Unit/PlatformAtomics/syscall.cpp +++ b/tests/Unit/PlatformAtomics/syscall.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include // added for checking HSA profile #include @@ -83,10 +83,10 @@ bool test() { }); // launch kernel - Concurrency::extent<1> e(vecSize); + hc::extent<1> e(vecSize); parallel_for_each( e, - [=](Concurrency::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { int tid = idx[0]; int flag; diff --git a/tests/Unit/RawPointer/array_add.cpp b/tests/Unit/RawPointer/array_add.cpp index a9085812ca6..08b2ca7e803 100644 --- a/tests/Unit/RawPointer/array_add.cpp +++ b/tests/Unit/RawPointer/array_add.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include // added for checking HSA profile #include @@ -31,10 +31,10 @@ bool test() { } // launch kernel - Concurrency::extent<1> e(vecSize); + hc::extent<1> e(vecSize); parallel_for_each( e, - [=](Concurrency::index<1> idx) restrict(amp) { + [=](hc::index<1> idx) [[hc]] { p_c[idx[0]] 
= p_a[idx[0]] + p_b[idx[0]]; diff --git a/tests/Unit/RestrictionSpecifier/Negative/empty_restriction.cpp b/tests/Unit/RestrictionSpecifier/Negative/empty_restriction.cpp index 5b0a658ea88..029b80bd4d8 100644 --- a/tests/Unit/RestrictionSpecifier/Negative/empty_restriction.cpp +++ b/tests/Unit/RestrictionSpecifier/Negative/empty_restriction.cpp @@ -4,7 +4,7 @@ // Do not delete or add any line; it is referred to by absolute line number in the // FileCheck lines below ////////////////////////////////////////////////////////////////////////////////// -#include +#include int foo() restrict() { diff --git a/tests/Unit/RestrictionSpecifier/Negative/id_is_unrecognized.cpp b/tests/Unit/RestrictionSpecifier/Negative/id_is_unrecognized.cpp index 3e31653aaba..6fa1c8cf76a 100644 --- a/tests/Unit/RestrictionSpecifier/Negative/id_is_unrecognized.cpp +++ b/tests/Unit/RestrictionSpecifier/Negative/id_is_unrecognized.cpp @@ -5,7 +5,7 @@ // FileCheck lines below ////////////////////////////////////////////////////////////////////////////////// -#include +#include int f1() restrict(cpu,auto1) { diff --git a/tests/Unit/RestrictionSpecifier/Negative/non-comma_between_ids.cpp b/tests/Unit/RestrictionSpecifier/Negative/non-comma_between_ids.cpp index bc4040926fe..1c3ba5f3313 100644 --- a/tests/Unit/RestrictionSpecifier/Negative/non-comma_between_ids.cpp +++ b/tests/Unit/RestrictionSpecifier/Negative/non-comma_between_ids.cpp @@ -5,7 +5,7 @@ // FileCheck lines below ////////////////////////////////////////////////////////////////////////////////// -#include +#include int foo() restrict(xx:auto1) { diff --git a/tests/Unit/RestrictionSpecifier/Negative/non-id_at_two_ends.cpp b/tests/Unit/RestrictionSpecifier/Negative/non-id_at_two_ends.cpp index 6fe6043d895..b23959a44b6 100644 --- a/tests/Unit/RestrictionSpecifier/Negative/non-id_at_two_ends.cpp +++ b/tests/Unit/RestrictionSpecifier/Negative/non-id_at_two_ends.cpp @@ -5,7 +5,7 @@ // FileCheck lines below ////////////////////////////////////////////////////////////////////////////////// -#include +#include int foo() restrict(!,,,,) { diff --git a/tests/Unit/RestrictionSpecifier/Negative/should_not_parse.cpp b/tests/Unit/RestrictionSpecifier/Negative/should_not_parse.cpp index 818c6220ebc..7fddb497a33 100644 --- a/tests/Unit/RestrictionSpecifier/Negative/should_not_parse.cpp +++ b/tests/Unit/RestrictionSpecifier/Negative/should_not_parse.cpp @@ -4,7 +4,7 @@ // Do not delete or add any line; it is referred to by absolute line number in the // FileCheck lines below ////////////////////////////////////////////////////////////////////////////////// -#include +#include int f1() restrict(amp:,) { @@ -15,7 +15,7 @@ int f1() restrict(amp:,) // CHECK-NEXT: ^ // 'amp' should not be attached to f1() -int f2() restrict(amp) +int f2() [[hc]] { f1(); // expected-error{{'f1': no overload...}} return 0; diff --git a/tests/Unit/RestrictionSpecifier/Negative/space.cpp b/tests/Unit/RestrictionSpecifier/Negative/space.cpp index a48cc91fbc6..d0effa63f6d 100644 --- a/tests/Unit/RestrictionSpecifier/Negative/space.cpp +++ b/tests/Unit/RestrictionSpecifier/Negative/space.cpp @@ -5,7 +5,7 @@ // FileCheck lines below ////////////////////////////////////////////////////////////////////////////////// -#include +#include int f1() restrict(cpu, ,auto1) // expected-error{{'auto1': unrecognized restriction sepcifier}} { diff --git a/tests/Unit/RestrictionSpecifier/OKCase.cpp b/tests/Unit/RestrictionSpecifier/OKCase.cpp index 3376e2beb72..df230c263a4 100644 --- 
a/tests/Unit/RestrictionSpecifier/OKCase.cpp +++ b/tests/Unit/RestrictionSpecifier/OKCase.cpp @@ -1,6 +1,6 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -using namespace concurrency; +#include +using namespace hc; int foo() restrict(,) // OK { @@ -23,7 +23,7 @@ int foo2() restrict(, ,,, ,cpu,,,,) // OK { return 0; } -int fooCPU() restrict(cpu) // OK +int fooCPU() [[cpu]] // OK { foo2(); // OK return 0; @@ -32,7 +32,7 @@ int fooCPU() restrict(cpu) // OK int main(void) { - parallel_for_each(extent<1>(1), [](index<1>) restrict(amp) + parallel_for_each(extent<1>(1), [](index<1>) [[hc]] { fooAMP(); }); diff --git a/tests/Unit/RestrictionSpecifier/Override_Qualifier.cpp b/tests/Unit/RestrictionSpecifier/Override_Qualifier.cpp index a1d76394b34..593414d9e8f 100644 --- a/tests/Unit/RestrictionSpecifier/Override_Qualifier.cpp +++ b/tests/Unit/RestrictionSpecifier/Override_Qualifier.cpp @@ -1,6 +1,6 @@ // RUN: %cxxamp %s -o %t.out && %t.out -#include -using namespace concurrency; +#include +using namespace hc; #define LLVM_OVERRIDE override diff --git a/tests/Unit/SharedLibrary/shared_library2.cpp b/tests/Unit/SharedLibrary/shared_library2.cpp index ce13fc43ba3..c8f23b1fcbf 100644 --- a/tests/Unit/SharedLibrary/shared_library2.cpp +++ b/tests/Unit/SharedLibrary/shared_library2.cpp @@ -78,7 +78,7 @@ int main() { } if (foo_handle && bar_handle) { - for (int i = 0; i < 16; ++i) { + for (int i = 1; i != 17; ++i) { ret &= (foo_handle(i) == i); ret &= (bar_handle(i * 2) == (i * 4)); } diff --git a/tests/Unit/SharedLibrary/shared_library3.cpp b/tests/Unit/SharedLibrary/shared_library3.cpp index 7b37a29e9d5..df506dcc21a 100644 --- a/tests/Unit/SharedLibrary/shared_library3.cpp +++ b/tests/Unit/SharedLibrary/shared_library3.cpp @@ -60,7 +60,7 @@ extern "C" int bar(int); int main() { bool ret = true; - for (int i = 0; i < 16; ++i) { + for (int i = 1; i != 17; ++i) { ret &= (foo(i) == i); ret &= (bar(i * 2) == (i * 4)); } diff --git a/tests/Unit/Template/Specialization_Inheritate_Restrictions.cpp b/tests/Unit/Template/Specialization_Inheritate_Restrictions.cpp index 35879a5b9d2..c37f3ae7105 100644 --- a/tests/Unit/Template/Specialization_Inheritate_Restrictions.cpp +++ b/tests/Unit/Template/Specialization_Inheritate_Restrictions.cpp @@ -1,7 +1,7 @@ // RUN: %cxxamp -c %s -template T tf_c_1(T) restrict(cpu, amp); -void f_cpu_amp() restrict(cpu, amp) +template T tf_c_1(T) [[cpu, hc]]; +void f_cpu_amp() [[cpu, hc]] { - tf_c_1(1.f); // Expect tf_c_1 restrict(cpu,amp) here + tf_c_1(1.f); // Expect tf_c_1 [[cpu, hc]] here } diff --git a/tests/Unit/decltype/TrailingReturn.cpp b/tests/Unit/decltype/TrailingReturn.cpp index c06537853ca..213d1e11dc3 100644 --- a/tests/Unit/decltype/TrailingReturn.cpp +++ b/tests/Unit/decltype/TrailingReturn.cpp @@ -3,20 +3,20 @@ #define TEST(a,b) static_assert(std::is_same::value, "Test failed, type of \"" #a "\" != type of \"" #b "\".") struct cpu_t { - operator bool() restrict(cpu,amp); // Req'd to define in 'if' condition + operator bool() [[cpu, hc]]; // Req'd to define in 'if' condition }; struct amp_t { - operator bool() restrict(cpu,amp); // Req'd to define in 'if' condition + operator bool() [[cpu, hc]]; // Req'd to define in 'if' condition int i; // Req'd to satisfy alignment }; -cpu_t f() restrict(cpu); -amp_t f() restrict(amp); +cpu_t f() [[cpu]]; +amp_t f() [[hc]]; -auto test_trt_2() restrict(amp) -> decltype(f()); // expect: amp_t test_trt_2() restrict(amp) +auto test_trt_2() [[hc]] -> decltype(f()); // expect: amp_t test_trt_2() [[hc]] -void test_trt_2_verify() 
restrict(amp) +void test_trt_2_verify() [[hc]] { amp_t r = test_trt_2(); // verify // Error // since the auto & trailing return type of test_trt_2 is cpu_t From fcf67437866e3ef6cd054c64cef370732e418c1a Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sun, 26 Aug 2018 23:17:00 +0100 Subject: [PATCH 006/134] Add C++AMP deprecation warning. --- include/amp.h | 2158 +---------------------------------- include/amp_math.h | 2 + include/amp_short_vectors.h | 5 +- 3 files changed, 63 insertions(+), 2102 deletions(-) diff --git a/include/amp.h b/include/amp.h index 4cd6c58718d..c9042a00783 100644 --- a/include/amp.h +++ b/include/amp.h @@ -12,7 +12,10 @@ #pragma once +#warning "C++AMP support is deprecated in ROCm 1.9 and will be removed in ROCm 2.0!" + #include "atomics.hpp" +#include "hc.hpp" #include "hc_defines.h" #include "kalmar_exception.h" #include "kalmar_index.h" @@ -30,8 +33,6 @@ namespace Concurrency { class completion_future; class accelerator; class accelerator_view; -template class array_view; -template class array; template class extent; template class tiled_extent; } // namespace Concurrency @@ -44,6 +45,9 @@ namespace concurrency = Concurrency; // type alias namespace Concurrency { +using hc::array; +using hc::array_view; + /** * Represents a unique position in N-dimensional space. */ @@ -238,8 +242,8 @@ class accelerator_view { const Domain&, const Kernel&); - template friend class array; - template friend class array_view; + template friend class hc::array; + template friend class hc::array_view; template friend void parallel_for_each(const Concurrency::extent&, const Kernel&); @@ -743,7 +747,7 @@ class completion_future { template friend completion_future copy_async(const array_view& src, OutputIter destBegin); - template friend class array_view; + template friend class hc::array_view; }; // ------------------------------------------------------------------------ @@ -799,6 +803,10 @@ class extent { extent(const extent& other) restrict(amp,cpu) : base_(other.base_) {} + extent(const hc::extent& other) restrict(cpu, amp) + : extent{reinterpret_cast(other)} + {} + /** @{ */ /** * Constructs an extent with the coordinate values provided by @f$e_{0..2}@f$. @@ -1076,6 +1084,11 @@ class extent { explicit extent(const tiled_extent& other) restrict(amp,cpu) : base_(other.base_) {} + constexpr + operator const hc::extent&() const + { // TODO: temporary, icky. + return *reinterpret_cast* const>(this); + } private: typedef detail::index_impl::type> base; base base_; @@ -2159,1549 +2172,54 @@ void copy(const array &src, OutputIter destBegin); * @tparam T The element type of this array * @tparam N The dimensionality of the array, defaults to 1 if elided. */ -template -class array { - static_assert(!std::is_const::value, "array is not supported"); - static_assert(0 == (sizeof(T) % sizeof(int)), "only value types whose size is a multiple of the size of an integer are allowed in array"); -public: -#if __KALMAR_ACCELERATOR__ == 1 - typedef detail::_data acc_buffer_t; -#else - typedef detail::_data_host acc_buffer_t; -#endif - - /** - * The rank of this array. - */ - static const int rank = N; - - /** - * The element type of this array. - */ - typedef T value_type; - - /** - * There is no default constructor for array. - */ - array() = delete; - - /** - * Copy constructor. Constructs a new array from the supplied argument - * other. The new array is located on the same accelerator_view as the - * source array. A deep copy is performed. 
- * - * @param[in] other An object of type array from which to initialize - * this new array. - */ - array(const array& other) - : array(other.get_extent(), other.get_accelerator_view()) - { Concurrency::copy(other, *this); } - - /** - * Move constructor. Constructs a new array by moving from the - * supplied argument other. - * - * @param[in] other An object of type array from which to initialize - * this new array. - */ - array(array&& other) - : m_device(other.m_device), extent(other.extent) - { other.m_device.reset(); } - - /** - * Constructs a new array with the supplied extent, located on the default - * view of the default accelerator. If any components of the extent are - * non-positive, an exception will be thrown. - * - * @param[in] ext The extent in each dimension of this array. - */ - explicit array(const extent& ext) - : array(ext, accelerator(L"default").get_default_view()) {} - - /** @{ */ - /** - * Equivalent to construction using "array(extent(e0 [, e1 [, e2 ]]))". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. - */ - explicit array(int e0) - : array(Concurrency::extent(e0)) { static_assert(N == 1, "illegal"); } - explicit array(int e0, int e1) - : array(Concurrency::extent(e0, e1)) {} - explicit array(int e0, int e1, int e2) - : array(Concurrency::extent(e0, e1, e2)) {} - - /** @} */ - - /** @{ */ - /** - * Constructs a new array with the supplied extent, located on the default - * accelerator, initialized with the contents of a source container - * specified by a beginning and optional ending iterator. The source data - * is copied by value into this array as if by calling "copy()". - * - * If the number of available container elements is less than - * this->extent.size(), undefined behavior results. - * - * @param[in] ext The extent in each dimension of this array. - * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. - */ - template - array(const Concurrency::extent& ext, InputIter srcBegin) - : array(ext, srcBegin, accelerator(L"default").get_default_view()) {} - template - array(const Concurrency::extent& ext, InputIter srcBegin, InputIter srcEnd) - : array(ext, srcBegin, srcEnd, accelerator(L"default").get_default_view()) {} - - /** @} */ - - /** @{ */ - /** - * Equivalent to construction using - * "array(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. - * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. 
- */ - template - array(int e0, InputIter srcBegin) - : array(Concurrency::extent(e0), srcBegin) {} - template - array(int e0, InputIter srcBegin, InputIter srcEnd) - : array(Concurrency::extent(e0), srcBegin, srcEnd) {} - template - array(int e0, int e1, InputIter srcBegin) - : array(Concurrency::extent(e0, e1), srcBegin) {} - template - array(int e0, int e1, InputIter srcBegin, InputIter srcEnd) - : array(Concurrency::extent(e0, e1), srcBegin, srcEnd) {} - template - array(int e0, int e1, int e2, InputIter srcBegin) - : array(Concurrency::extent(e0, e1, e2), srcBegin) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd) - : array(Concurrency::extent(e0, e1, e2), srcBegin, srcEnd) {} - - /** @} */ - - /** - * Constructs a new array, located on the default view of the default - * accelerator, initialized with the contents of the array_view "src". The - * extent of this array is taken from the extent of the source array_view. - * The "src" is copied by value into this array as if by calling - * "copy(src, *this)". - * - * @param[in] src An array_view object from which to copy the data into - * this array (and also to determine the extent of this - * array). - */ - explicit array(const array_view& src) - : array(src.get_extent(), accelerator(L"default").get_default_view()) - { Concurrency::copy(src, *this); } - - /** - * Constructs a new array with the supplied extent, located on the - * accelerator bound to the accelerator_view "av". - * - * Users can optionally specify the type of CPU access desired for "this" - * array thus requesting creation of an array that is accessible both on - * the specified accelerator_view "av" as well as the CPU (with the - * specified CPU access_type). If a value other than access_type_auto or - * access_type_none is specified for the cpu_access_type parameter and the - * accelerator corresponding to the accelerator_view "av" does not support - * cpu_shared_memory, a runtime_exception is thrown. The cpu_access_type - * parameter has a default value of access_type_auto which leaves it up to - * the implementation to decide what type of allowed CPU access should the - * array be created with. The actual CPU access_type allowed for the - * created array can be queried using the get_cpu_access_type member - * method. - * - * @param[in] ext The extent in each dimension of this array. - * @param[in] av An accelerator_view object which specifies the location of - * this array. - * @param[in] access_type The type of CPU access desired for this array. - */ - array(const extent& ext, accelerator_view av, access_type cpu_access_type = access_type_auto) -#if __KALMAR_ACCELERATOR__ == 1 - : m_device(ext.size()), extent(ext) {} -#else - : m_device(av.pQueue, av.pQueue, check(ext).size(), cpu_access_type), extent(ext) {} -#endif - - /** @{ */ - /** - * Equivalent to construction using - * "array(extent(e0 [, e1 [, e2 ]]), av, cpu_access_type)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. - * @param[in] av An accelerator_view object which specifies the location of - * this array. - * @param[in] access_type The type of CPU access desired for this array. 
- */ - array(int e0, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(Concurrency::extent(e0), av, cpu_access_type) {} - array(int e0, int e1, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(Concurrency::extent(e0, e1), av, cpu_access_type) {} - array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(Concurrency::extent(e0, e1, e2), av, cpu_access_type) {} - - /** @} */ - - /** - * Constructs a new array with the supplied extent, located on the - * accelerator bound to the accelerator_view "av", initialized with the - * contents of the source container specified by a beginning and optional - * ending iterator. The data is copied by value into this array as if by - * calling "copy()". - * - * Users can optionally specify the type of CPU access desired for "this" - * array thus requesting creation of an array that is accessible both on - * the specified accelerator_view "av" as well as the CPU (with the - * specified CPU access_type). If a value other than access_type_auto or - * access_type_none is specified for the cpu_access_type parameter and the - * accelerator corresponding to the accelerator_view "av" does not support - * cpu_shared_memory, a runtime_exception is thrown. The cpu_access_type - * parameter has a default value of access_type_auto which leaves it upto - * the implementation to decide what type of allowed CPU access should the - * array be created with. The actual CPU access_type allowed for the - * created array can be queried using the get_cpu_access_type member - * method. - * - * @param[in] ext The extent in each dimension of this array. - * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] access_type The type of CPU access desired for this array. - */ - template - array(const Concurrency::extent& ext, InputIter srcBegin, accelerator_view av, - access_type cpu_access_type = access_type_auto) - : array(ext, av, cpu_access_type) { Concurrency::copy(srcBegin, *this); } - template - array(const Concurrency::extent& ext, InputIter srcBegin, InputIter srcEnd, - accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(ext, av, cpu_access_type) { - if(ext.size() < std::distance(srcBegin, srcEnd)) - throw runtime_exception("errorMsg_throw", 0); - Concurrency::copy(srcBegin, srcEnd, *this); - } - - /** @} */ - - /** - * Constructs a new array initialized with the contents of the array_view - * "src". The extent of this array is taken from the extent of the source - * array_view. The "src" is copied by value into this array as if by - * calling "copy(src, *this)". The new array is located on the accelerator - * bound to the accelerator_view "av". - * - * Users can optionally specify the type of CPU access desired for "this" - * array thus requesting creation of an array that is accessible both on - * the specified accelerator_view "av" as well as the CPU (with the - * specified CPU access_type). If a value other than access_type_auto or - * access_type_none is specified for the cpu_access_type parameter and the - * accelerator corresponding to the accelerator_view “av†does not support - * cpu_shared_memory, a runtime_exception is thrown. 
The cpu_access_type - * parameter has a default value of access_type_auto which leaves it up to - * the implementation to decide what type of allowed CPU access the - * array should be created with. The actual CPU access_type allowed for the - * created array can be queried using the get_cpu_access_type member - * method. - * - * @param[in] src An array_view object from which to copy the data into - * this array (and also to determine the extent of this array). - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] access_type The type of CPU access desired for this array. - */ - array(const array_view<const T, N>& src, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(src.get_extent(), av, cpu_access_type) { Concurrency::copy(src, *this); } - - /** @{ */ - /** - * Equivalent to construction using - * "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd], av, cpu_access_type)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. - * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] access_type The type of CPU access desired for this array. - */ - template <typename InputIter> - array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(Concurrency::extent<N>(e0), srcBegin, av, cpu_access_type) {} - template <typename InputIter> - array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(Concurrency::extent<N>(e0), srcBegin, srcEnd, av, cpu_access_type) {} - template <typename InputIter> - array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(Concurrency::extent<N>(e0, e1), srcBegin, av, cpu_access_type) {} - template <typename InputIter> - array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(Concurrency::extent<N>(e0, e1), srcBegin, srcEnd, av, cpu_access_type) {} - template <typename InputIter> - array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(Concurrency::extent<N>(e0, e1, e2), srcBegin, av, cpu_access_type) {} - template <typename InputIter> - array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(Concurrency::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, cpu_access_type) {} - - /** @} */ - - /** - * Constructs a staging array with the given extent, which acts as a - * staging area between accelerator views "av" and "associated_av". If "av" - * is a cpu accelerator view, this will construct a staging array which is - * optimized for data transfers between the CPU and "associated_av". - * - * @param[in] ext The extent in each dimension of this array. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] associated_av An accelerator_view object which specifies a - * target device accelerator.
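- * - * A minimal sketch for illustration (editor's example; the names, element - * type, and extent are assumed): - * @code{.cpp} - * accelerator_view cpu_av = accelerator(accelerator::cpu_accelerator).get_default_view(); - * accelerator_view dev_av = accelerator(L"default").get_default_view(); - * array<float, 1> staging(extent<1>(1024), cpu_av, dev_av); // staging array - * @endcode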
- */ - array(const Concurrency::extent& ext, accelerator_view av, accelerator_view associated_av) -#if __KALMAR_ACCELERATOR__ == 1 - : m_device(ext.size()), extent(ext) {} -#else - : m_device(av.pQueue, associated_av.pQueue, check(ext).size(), access_type_auto), extent(ext) {} -#endif - - /** @{ */ - /** - * Equivalent to construction using - * "array(extent(e0 [, e1 [, e2 ]]), av, associated_av)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] associated_av An accelerator_view object which specifies a - * target device accelerator. - */ - array(int e0, accelerator_view av, accelerator_view associated_av) - : array(Concurrency::extent(e0), av, associated_av) {} - array(int e0, int e1, accelerator_view av, accelerator_view associated_av) - : array(Concurrency::extent(e0, e1), av, associated_av) {} - array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av) - : array(Concurrency::extent(e0, e1, e2), av, associated_av) {} - - /** @} */ - - /** @{ */ - /** - * Constructs a staging array with the given extent, which acts as a - * staging area between accelerator_views "av" (which must be the CPU - * accelerator) and "associated_av". The staging array will be initialized - * with the data specified by "src" as if by calling "copy(src, *this)". - * - * @param[in] ext The extent in each dimension of this array. - * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] associated_av An accelerator_view object which specifies a - * target device accelerator. - */ - template - array(const Concurrency::extent& ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(ext, av, associated_av) { Concurrency::copy(srcBegin, *this); } - template - array(const Concurrency::extent& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(ext, av, associated_av) { - if(ext.size() < std::distance(srcBegin, srcEnd)) - throw runtime_exception("errorMsg_throw", 0); - Concurrency::copy(srcBegin, srcEnd, *this); - } - - /** @} */ - - /** - * Constructs a staging array initialized with the array_view given by - * "src", which acts as a staging area between accelerator_views "av" - * (which must be the CPU accelerator) and "associated_av". The extent of - * this array is taken from the extent of the source array_view. The - * staging array will be initialized from "src" as if by calling - * "copy(src, *this)". - * - * @param[in] src An array_view object from which to copy the data into - * this array (and also to determine the extent of this - * array). - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] associated_av An accelerator_view object which specifies a - * target device accelerator. - */ - array(const array_view& src, accelerator_view av, accelerator_view associated_av) - : array(src.get_extent(), av, associated_av) - { Concurrency::copy(src, *this); } - - /** @{ */ - /** - * Equivalent to construction using - * "array(extent(e0 [, e1 [, e2 ]]), src, av, associated_av)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. 
- * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] associated_av An accelerator_view object which specifies a - * target device accelerator. - */ - template - array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(Concurrency::extent(e0), srcBegin, av, associated_av) {} - template - array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(Concurrency::extent(e0), srcBegin, srcEnd, av, associated_av) {} - template - array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(Concurrency::extent(e0, e1), srcBegin, av, associated_av) {} - template - array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(Concurrency::extent(e0, e1), srcBegin, srcEnd, av, associated_av) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(Concurrency::extent(e0, e1, e2), srcBegin, av, associated_av) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(Concurrency::extent(e0, e1, e2), srcBegin, srcEnd, av, associated_av) {} - - /** @} */ - - /** - * Access the extent that defines the shape of this array. - */ - Concurrency::extent get_extent() const restrict(amp,cpu) { return extent; } - - /** - * This property returns the accelerator_view representing the location - * where this array has been allocated. - */ - accelerator_view get_accelerator_view() const { return m_device.get_av(); } - - /** - * This property returns the accelerator_view representing the preferred - * target where this array can be copied. - */ - accelerator_view get_associated_accelerator_view() const { return m_device.get_stage(); } - /** - * This property returns the CPU "access_type" allowed for this array. - */ - access_type get_cpu_access_type() const { return m_device.get_access(); } - - /** - * Assigns the contents of the array "other" to this array, using a deep - * copy. - * - * @param[in] other An object of type array from which to copy into - * this array. - * @return Returns *this. - */ - array& operator=(const array& other) { - if (this != &other) { - array arr(other); - *this = std::move(arr); - } - return *this; - } - - /** - * Moves the contents of the array "other" to this array. - * - * @param[in] other An object of type array from which to move into - * this array. - * @return Returns *this. - */ - array& operator=(array&& other) { - if (this != &other) { - extent = other.extent; - m_device = other.m_device; - other.m_device.reset(); - } - return *this; - } - - /** - * Assigns the contents of the array_view "src", as if by calling - * "copy(src, *this)". - * - * @param[in] src An object of type array_view from which to copy into - * this array. - * @return Returns *this. - */ - array& operator=(const array_view& src) { - array arr(src); - *this = std::move(arr); - return *this; - } - - /** - * Copies the contents of this array to the array given by "dest", as - * if by calling "copy(*this, dest)". - * - * @param[out] dest An object of type array to which to copy data - * from this array. 
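- * - * For illustration only (editor's sketch; the destination extent must be - * at least as large as the source): - * @code{.cpp} - * array<int, 1> src(extent<1>(100)); - * array<int, 1> dst(extent<1>(100)); - * src.copy_to(dst); // as if by copy(src, dst) - * @endcode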
- */ - // FIXME: const is not defined in C++ AMP specification - void copy_to(array& dest) const { -#if __KALMAR_ACCELERATOR__ != 1 - for(int i = 0 ; i < N ; i++) - { - if(dest.extent[i] < this->extent[i] ) - throw runtime_exception("errorMsg_throw", 0); - } -#endif - copy(*this, dest); - } - - /** - * Copies the contents of this array to the array_view given by "dest", as - * if by calling "copy(*this, dest)". - * - * @param[out] dest An object of type array_view to which to copy data - * from this array. - */ - // FIXME: const is not defined in C++ AMP specification - void copy_to(const array_view& dest) const { copy(*this, dest); } - - /** - * Returns a pointer to the raw data underlying this array. - * - * @return A (const) pointer to the first element in the linearized array. - */ - // FIXME: const is not defined in C++ AMP specification - // FIXME: missing const T* data() const - T* data() const restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - if (!m_device.get()) - return nullptr; - m_device.synchronize(true); -#endif - return reinterpret_cast(m_device.get()); - } - - /** - * Implicitly converts an array to a std::vector, as if by - * "copy(*this, vector)". - * - * @return An object of type vector which contains a copy of the data - * contained on the array. - */ - operator std::vector() const { - std::vector vec(extent.size()); - Concurrency::copy(*this, vec.data()); - return std::move(vec); - } - - /** @{ */ - /** - * Returns a reference to the element of this array that is at the location - * in N-dimensional space specified by "idx". Accessing array data on a - * location where it is not resident (e.g. from the CPU when it is resident - * on a GPU) results in an exception (in cpu-restricted context) or - * undefined behavior (in amp-restricted context). - * - * @param[in] idx An object of type index from that specifies the - * location of the element. - */ - T& operator[](const index& idx) restrict(amp,cpu) { -#ifndef __KALMAR_ACCELERATOR__ - if (!m_device.get()) - throw runtime_exception("The array is not accessible on CPU.", 0); - m_device.synchronize(true); -#endif - T *ptr = reinterpret_cast(m_device.get()); - return ptr[detail::amp_helper, Concurrency::extent>::flatten(idx, extent)]; - } - T& operator()(const index& idx) restrict(amp,cpu) { - return (*this)[idx]; - } - - /** @} */ +// ------------------------------------------------------------------------ +// utility classes for array_view +// ------------------------------------------------------------------------ - /** @{ */ - /** - * Returns a const reference to the element of this array that is at the - * location in N-dimensional space specified by "idx". Accessing array data - * on a location where it is not resident (e.g. from the CPU when it is - * resident on a GPU) results in an exception (in cpu-restricted context) - * or undefined behavior (in amp-restricted context). - * - * @param[in] idx An object of type index from that specifies the - * location of the element. 
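- * - * For illustration only (editor's sketch): - * @code{.cpp} - * array<int, 2> a(extent<2>(4, 4)); - * a[index<2>(1, 2)] = 42; // CPU-side access; throws if the data is not - * // resident on the CPU - * @endcode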
- */ - const T& operator[](const index<N>& idx) const restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - if (!m_device.get()) - throw runtime_exception("The array is not accessible on CPU.", 0); - m_device.synchronize(); -#endif - T *ptr = reinterpret_cast<T*>(m_device.get()); - return ptr[detail::amp_helper<N, Concurrency::index<N>, Concurrency::extent<N>>::flatten(idx, extent)]; - } - const T& operator()(const index<N>& idx) const restrict(amp,cpu) { - return (*this)[idx]; - } +template <typename T> +struct __has_data +{ +private: + struct two {char __lx; char __lxx;}; + template <typename C> static char test(decltype(std::declval<C>().data())); + template <typename C> static two test(...); +public: + static const bool value = sizeof(test<T>(0)) == 1; +}; - /** @} */ +template <typename T> +struct __has_size +{ +private: + struct two {char __lx; char __lxx;}; + template <typename C> static char test(decltype(&C::size)); + template <typename C> static two test(...); +public: + static const bool value = sizeof(test<T>(0)) == 1; +}; - /** @{ */ - /** - * Equivalent to - * "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))". - * - * @param[in] i0,i1,i2 The component values that will form the index into - * this array. - */ - T& operator()(int i0, int i1) restrict(amp,cpu) { - return (*this)[index<2>(i0, i1)]; - } - T& operator()(int i0, int i1, int i2) restrict(amp,cpu) { - return (*this)[index<3>(i0, i1, i2)]; - } +template <typename T> +struct __is_container +{ + using _T = typename std::remove_reference<T>::type; + static const bool value = __has_size<_T>::value && __has_data<_T>::value; +}; - /** @} */ +// ------------------------------------------------------------------------ +// array_view +// ------------------------------------------------------------------------ - /** @{ */ - /** - * Equivalent to - * "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const". - * - * @param[in] i0,i1,i2 The component values that will form the index into - * this array. - */ - const T& operator()(int i0, int i1) const restrict(amp,cpu) { - return (*this)[index<2>(i0, i1)]; - } - const T& operator()(int i0, int i1, int i2) const restrict(amp,cpu) { - return (*this)[index<3>(i0, i1, i2)]; - } +/** + * The array_view<T,N> type represents a possibly cached view into the data + * held in an array<T,N>, or a section thereof. It also provides such views + * over native CPU data. It exposes an indexing interface congruent to that of + * array<T,N>. + */ - /** @} */ - - /** @{ */ - /** - * This overload is defined for array<T,N> where @f$N \ge 2@f$. - * This mode of indexing is equivalent to projecting on the - * most-significant dimension. It allows C-style indexing. For example: - * - * @code{.cpp} - * array<float,4> myArray(myExtents, ...); - * myArray[index<4>(5,4,3,2)] = 7; - * assert(myArray[5][4][3][2] == 7); - * @endcode - * - * @param i0 An integer that is the index into the most-significant - * dimension of this array. - * @return Returns an array_view whose dimension is one lower than that of - * this array.
- */ - typename array_projection_helper<T, N>::result_type - operator[] (int i) restrict(amp,cpu) { - return array_projection_helper<T, N>::project(*this, i); - } - typename array_projection_helper<T, N>::result_type - operator()(int i0) restrict(amp,cpu) { - return (*this)[i0]; - } - typename array_projection_helper<T, N>::const_result_type - operator[] (int i) const restrict(amp,cpu) { - return array_projection_helper<T, N>::project(*this, i); - } - typename array_projection_helper<T, N>::const_result_type - operator()(int i0) const restrict(amp,cpu) { - return (*this)[i0]; - } - - /** @} */ - - /** @{ */ - /** - * Returns a subsection of the source array view at the origin specified by - * "idx" and with the extent specified by "ext". - * - * Example: - * @code{.cpp} - * array<float,2> a(extent<2>(200,100)); - * array_view<float,2> v1(a); // v1.extent = <200,100> - * array_view<float,2> v2 = v1.section(index<2>(15,25), extent<2>(40,50)); - * assert(v2(0,0) == v1(15,25)); - * @endcode - * - * @param[in] origin Provides the offset/origin of the resulting section. - * @param[in] ext Provides the extent of the resulting section. - * @return Returns a subsection of the source array at specified origin, - * and with the specified extent. - */ - array_view<T, N> section(const Concurrency::index<N>& origin, const Concurrency::extent<N>& ext) restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - if( !detail::amp_helper<N, Concurrency::index<N>, Concurrency::extent<N>>::contains(origin, ext, this->extent) ) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view<T, N> av(*this); - return av.section(origin, ext); - } - array_view<const T, N> section(const Concurrency::index<N>& origin, const Concurrency::extent<N>& ext) const restrict(amp,cpu) { - array_view<const T, N> av(*this); - return av.section(origin, ext); - } - - /** @} */ - - /** @{ */ - /** - * Equivalent to "section(idx, this->extent - idx)". - */ - array_view<T, N> section(const index<N>& idx) restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - if( !detail::amp_helper<N, Concurrency::index<N>, Concurrency::extent<N>>::contains(idx, this->extent ) ) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view<T, N> av(*this); - return av.section(idx); - } - array_view<const T, N> section(const index<N>& idx) const restrict(amp,cpu) { - array_view<const T, N> av(*this); - return av.section(idx); - } - - /** @} */ - - /** @{ */ - /** - * Equivalent to "section(index<N>(), ext)". - */ - array_view<T, N> section(const extent<N>& ext) restrict(amp,cpu) { - array_view<T, N> av(*this); - return av.section(ext); - } - array_view<const T, N> section(const extent<N>& ext) const restrict(amp,cpu) { - array_view<const T, N> av(*this); - return av.section(ext); - } - - /** @} */ - - /** @{ */ - /** - * Equivalent to - * "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]])) const".
- * - * @param[in] i0,i1,i2 The component values that will form the origin of - * the section - * @param[in] e0,e1,e2 The component values that will form the extent of - * the section - */ - array_view section(int i0, int e0) restrict(amp,cpu) { - static_assert(N == 1, "Rank must be 1"); - return section(Concurrency::index<1>(i0), Concurrency::extent<1>(e0)); - } - array_view section(int i0, int e0) const restrict(amp,cpu) { - static_assert(N == 1, "Rank must be 1"); - return section(Concurrency::index<1>(i0), Concurrency::extent<1>(e0)); - } - array_view section(int i0, int i1, int e0, int e1) const restrict(amp,cpu) { - static_assert(N == 2, "Rank must be 2"); - return section(Concurrency::index<2>(i0, i1), Concurrency::extent<2>(e0, e1)); - } - array_view section(int i0, int i1, int e0, int e1) restrict(amp,cpu) { - static_assert(N == 2, "Rank must be 2"); - return section(Concurrency::index<2>(i0, i1), Concurrency::extent<2>(e0, e1)); - } - array_view section(int i0, int i1, int i2, int e0, int e1, int e2) restrict(amp,cpu) { - static_assert(N == 3, "Rank must be 3"); - return section(Concurrency::index<3>(i0, i1, i2), Concurrency::extent<3>(e0, e1, e2)); - } - array_view section(int i0, int i1, int i2, int e0, int e1, int e2) const restrict(amp,cpu) { - static_assert(N == 3, "Rank must be 3"); - return section(Concurrency::index<3>(i0, i1, i2), Concurrency::extent<3>(e0, e1, e2)); - } - - /** @} */ - - /** @{ */ - /** - * Sometimes it is desirable to view the data of an N-dimensional array as - * a linear array, possibly with a (unsafe) reinterpretation of the element - * type. This can be achieved through the reinterpret_as member function. - * Example: - * - * @code{.cpp} - * struct RGB { float r; float g; float b; }; - * array a = ...; - * array_view v = a.reinterpret_as(); - * assert(v.extent == 3*a.extent); - * @endcode - * - * The size of the reinterpreted ElementType must evenly divide into the - * total size of this array. - * - * @return Returns an array_view from this array with the element type - * reinterpreted from T to ElementType, and the rank reduced from N - * to 1. - */ - template - array_view reinterpret_as() restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - static_assert( ! (std::is_pointer::value ),"can't use pointer in the kernel"); - static_assert( ! (std::is_same::value ),"can't use short in the kernel"); - if( (extent.size() * sizeof(T)) % sizeof(ElementType)) - throw runtime_exception("errorMsg_throw", 0); -#endif - int size = extent.size() * sizeof(T) / sizeof(ElementType); - using buffer_type = typename array_view::acc_buffer_t; - array_view av(buffer_type(m_device), Concurrency::extent<1>(size), 0); - return av; - } - template - array_view reinterpret_as() const restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - static_assert( ! (std::is_pointer::value ),"can't use pointer in the kernel"); - static_assert( ! (std::is_same::value ),"can't use short in the kernel"); -#endif - int size = extent.size() * sizeof(T) / sizeof(ElementType); - using buffer_type = typename array_view::acc_buffer_t; - array_view av(buffer_type(m_device), Concurrency::extent<1>(size), 0); - return av; - } - - /** @} */ - - /** @{ */ - /** - * An array of higher rank can be reshaped into an array of lower rank, or - * vice versa, using the view_as member function. Example: - * - * @code{.cpp} - * array a(100); - * array_view av = a.view_as(extent<2>(2,50)); - * @endcode - * - * @return Returns an array_view from this array with the rank changed - * to K from N. 
- */ - template array_view - view_as(const Concurrency::extent& viewExtent) restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - if( viewExtent.size() > extent.size()) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view av(m_device, viewExtent, 0); - return av; - } - template array_view - view_as(const Concurrency::extent& viewExtent) const restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - if( viewExtent.size() > extent.size()) - throw runtime_exception("errorMsg_throw", 0); -#endif - const array_view av(m_device, viewExtent, 0); - return av; - } - - /** @} */ - - ~array() {} - - // FIXME: functions below are not defined in C++ AMP specification - const acc_buffer_t& internal() const restrict(amp,cpu) { return m_device; } - int get_offset() const restrict(amp,cpu) { return 0; } - Concurrency::index get_index_base() const restrict(amp,cpu) { return Concurrency::index(); } -private: - template friend struct projection_helper; - template friend struct array_projection_helper; - acc_buffer_t m_device; - Concurrency::extent extent; - - template friend - void copy(const array&, const array_view&); - template friend - void copy(const array_view&, array&); -}; - -// ------------------------------------------------------------------------ -// utility classes for array_view -// ------------------------------------------------------------------------ - -template -struct __has_data -{ -private: - struct two {char __lx; char __lxx;}; - template static char test(decltype(std::declval().data())); - template static two test(...); -public: - static const bool value = sizeof(test(0)) == 1; -}; - -template -struct __has_size -{ -private: - struct two {char __lx; char __lxx;}; - template static char test(decltype(&C::size)); - template static two test(...); -public: - static const bool value = sizeof(test(0)) == 1; -}; - -template -struct __is_container -{ - using _T = typename std::remove_reference::type; - static const bool value = __has_size<_T>::value && __has_data<_T>::value; -}; - -// ------------------------------------------------------------------------ -// array_view -// ------------------------------------------------------------------------ - -/** - * The array_view type represents a possibly cached view into the data - * held in an array, or a section thereof. It also provides such views - * over native CPU data. It exposes an indexing interface congruent to that of - * array. - */ -template -class array_view -{ - static_assert(0 == (sizeof(T) % sizeof(int)), "only value types whose size is a multiple of the size of an integer are allowed in array views"); -public: - typedef typename std::remove_const::type nc_T; -#if __KALMAR_ACCELERATOR__ == 1 - typedef detail::_data acc_buffer_t; -#else - typedef detail::_data_host acc_buffer_t; -#endif - - /** - * The rank of this array. - */ - static const int rank = N; - - /** - * The element type of this array. - */ - typedef T value_type; - - /** - * There is no default constructor for array_view. - */ - array_view() = delete; - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" array. The extent of the array_view is that of the src array, and - * the origin of the array view is at zero. - * - * @param[in] src An array which contains the data that this array_view is - * bound to. 
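- * - * For illustration only (editor's sketch): - * @code{.cpp} - * array<float, 1> a(extent<1>(256)); - * array_view<float, 1> av(a); // av.get_extent() == a.get_extent() - * @endcode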
- */ - array_view(array& src) restrict(amp,cpu) - : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {} - - // FIXME: following interfaces were not implemented yet - // template - // explicit array_view::array_view(Container& src); - // template - // explicit array_view::array_view(value_type (&src) [Size]) restrict(amp,cpu); - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" container. The extent of the array_view is that given by the - * "extent" argument, and the origin of the array view is at zero. - * - * @param[in] src A template argument that must resolve to a linear - * container that supports .data() and .size() members (such - * as std::vector or std::array) - * @param[in] extent The extent of this array_view. - */ - template ::value>::type> - array_view(const Concurrency::extent& extent, Container& src) - : array_view(extent, src.data()) - { static_assert( std::is_same::value, "container element type and array view element type must match"); } - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" container. The extent of the array_view is that given by the - * "extent" argument, and the origin of the array view is at zero. - * - * @param[in] src A pointer to the source data this array_view will bind - * to. If the number of elements pointed to is less than the - * size of extent, the behavior is undefined. - * @param[in] ext The extent of this array_view. - */ - array_view(const Concurrency::extent& ext, value_type* src) restrict(amp,cpu) -#if __KALMAR_ACCELERATOR__ == 1 - : cache((T *)(src)), extent(ext), extent_base(ext), offset(0) {} -#else - : cache(ext.size(), (T *)(src)), extent(ext), extent_base(ext), offset(0) {} -#endif - - /** - * Constructs an array_view which is not bound to a data source. The extent - * of the array_view is that given by the "extent" argument, and the origin - * of the array view is at zero. An array_view thus constructed represents - * uninitialized data and the underlying allocations are created lazily as - * the array_view is accessed on different locations (on an - * accelerator_view or on the CPU). - * - * @param[in] ext The extent of this array_view. - */ - explicit array_view(const Concurrency::extent& ext) - : cache(ext.size()), extent(ext), extent_base(ext), offset(0) {} - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - * @param[in] src A template argument that must resolve to a contiguousi - * container that supports .data() and .size() members (such - * as std::vector or std::array) - */ - template ::value>::type> - array_view(int e0, Container& src) - : array_view(Concurrency::extent(e0), src) {} - template ::value>::type> - array_view(int e0, int e1, Container& src) - : array_view(Concurrency::extent(e0, e1), src) {} - template ::value>::type> - array_view(int e0, int e1, int e2, Container& src) - : array_view(Concurrency::extent(e0, e1, e2), src) {} - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - * @param[in] src A pointer to the source data this array_view will bind - * to. If the number of elements pointed to is less than - * the size of extent, the behavior is undefined. 
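- * - * For illustration only (editor's sketch showing both the container and - * the pointer overloads): - * @code{.cpp} - * std::vector<int> v(100); - * array_view<int, 1> av1(extent<1>(100), v); // container overload - * array_view<int, 1> av2(extent<1>(100), v.data()); // pointer overload - * @endcode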
- */ - array_view(int e0, value_type *src) restrict(amp,cpu) - : array_view(Concurrency::extent(e0), src) {} - array_view(int e0, int e1, value_type *src) restrict(amp,cpu) - : array_view(Concurrency::extent(e0, e1), src) {} - array_view(int e0, int e1, int e2, value_type *src) restrict(amp,cpu) - : array_view(Concurrency::extent(e0, e1, e2), src) {} - - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]))". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - */ - explicit array_view(int e0) restrict(cpu) : array_view(Concurrency::extent(e0)) {} - explicit array_view(int e0, int e1) restrict(cpu) - : array_view(Concurrency::extent(e0, e1)) {} - explicit array_view(int e0, int e1, int e2) restrict(cpu) - : array_view(Concurrency::extent(e0, e1, e2)) {} - - /** - * Copy constructor. Constructs an array_view from the supplied argument - * other. A shallow copy is performed. - * - * @param[in] other An object of type array_view or - * array_view from which to initialize this - * new array_view. - */ - array_view(const array_view& other) restrict(amp,cpu) - : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {} - - /** - * Access the extent that defines the shape of this array_view. - */ - extent get_extent() const restrict(amp,cpu) { return extent; } - - /** - * Access the accelerator_view where the data source of the array_view is - * located. - * - * When the data source of the array_view is native CPU memory, the method - * returns accelerator(accelerator::cpu_accelerator).default_view. When the - * data source underlying the array_view is an array, the method returns - * the accelerator_view where the source array is located. - */ - accelerator_view get_source_accelerator_view() const { return cache.get_av(); } - - /** - * Assigns the contents of the array_view "other" to this array_view, using - * a shallow copy. Both array_views will refer to the same data. - * - * @param[in] other An object of type array_view from which to copy - * into this array. - * @return Returns *this. - */ - array_view& operator=(const array_view& other) restrict(amp,cpu) { - if (this != &other) { - cache = other.cache; - extent = other.extent; - index_base = other.index_base; - extent_base = other.extent_base; - offset = other.offset; - } - return *this; - } - - /** - * Copies the data referred to by this array_view to the array given by - * "dest", as if by calling "copy(*this, dest)" - * - * @param[in] dest An object of type array to which to copy data from - * this array. - */ - void copy_to(array& dest) const { -#if __KALMAR_ACCELERATOR__ != 1 - for(int i= 0 ;i< N;i++) { - if (dest.get_extent()[i] < this->extent[i]) - throw runtime_exception("errorMsg_throw", 0); - } -#endif - copy(*this, dest); - } - - /** - * Copies the contents of this array_view to the array_view given by - * "dest", as if by calling "copy(*this, dest)" - * - * @param[in] dest An object of type array_view to which to copy data - * from this array. - */ - void copy_to(const array_view& dest) const { copy(*this, dest); } - - /** - * Returns a pointer to the first data element underlying this array_view. - * This is only available on array_views of rank 1. - * - * When the data source of the array_view is native CPU memory, the pointer - * returned by data() is valid for the lifetime of the data source. 
- * - * When the data source underlying the array_view is an array, or the array - * view is created without a data source, the pointer returned by data() in - * CPU context is ephemeral and is invalidated when the original data - * source or any of its views are accessed on an accelerator_view through a - * parallel_for_each or a copy operation. - * - * @return A pointer to the first element in the linearized array. - */ - T* data() const restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - cache.get_cpu_access(true); -#endif - static_assert(N == 1, "data() is only permissible on array views of rank 1"); - return reinterpret_cast<T*>(cache.get() + offset + index_base[0]); - } - - /** - * Calling this member function informs the array_view that its bound - * memory has been modified outside the array_view interface. This will - * render all cached information stale. - */ - void refresh() const { cache.refresh(); } - - /** - * Calling this member function synchronizes any modifications made to the - * data underlying "this" array_view to its source data container. For - * example, for an array_view on system memory, if the data underlying the - * view are modified on a remote accelerator_view through a - * parallel_for_each invocation, calling synchronize ensures that the - * modifications are synchronized to the source data and will be visible - * through the system memory pointer which the array_view was created over. - * - * For writable array_view objects, callers of this function can - * optionally specify the type of access desired on the source data - * container through the "type" parameter. For example, specifying an - * "access_type_read" (which is also the default value of the parameter) - * indicates that the data has been synchronized to its source location - * only for reading. On the other hand, specifying an access_type of - * "access_type_read_write" synchronizes the data to its source location - * both for reading and writing; i.e. any modifications to the source data - * directly through the source data container are legal after synchronizing - * the array_view with write access and before subsequently accessing the - * array_view on another remote location. - * - * It is advisable to be precise about the access_type specified in the - * synchronize call; i.e. if only write access is required, specifying - * access_type_write may yield better performance than calling synchronize - * with "access_type_read_write" since the latter may require any - * modifications made to the data on remote locations to be synchronized to - * the source location, which is unnecessary if the contents are intended - * to be overwritten without reading. - * - * @param[in] type An argument of type "access_type" which specifies the - * type of access on the data source that the array_view is - * synchronized for. - */ - // FIXME: type parameter is not implemented - void synchronize() const { cache.get_cpu_access(); } - - /** - * An asynchronous version of synchronize, which returns a completion - * future object. When the future is ready, the synchronization operation - * is complete. - * - * @return An object of type completion_future that can be used to - * determine the status of the asynchronous operation or can be - * used to chain other operations to be executed after the - * completion of the asynchronous operation.
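- * - * For illustration only (editor's sketch; "av" is an existing writable - * array_view): - * @code{.cpp} - * av.synchronize(); // blocking - * completion_future cf = av.synchronize_async(); // non-blocking - * cf.wait(); // modifications now visible in the source container - * @endcode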
- */ - // FIXME: type parameter is not implemented - completion_future synchronize_async() const { - std::future<void> fut = std::async([&]() mutable { synchronize(); }); - return completion_future(fut.share()); - } - - /** - * Calling this member function synchronizes any modifications made to the - * data underlying "this" array_view to the specified accelerator_view - * "av". For example, for an array_view on system memory, if the data - * underlying the view is modified on the CPU, and synchronize_to is called - * on "this" array_view, then the array_view contents are cached on the - * specified accelerator_view location. - * - * For writable array_view objects, callers of this function can - * optionally specify the type of access desired on the specified target - * accelerator_view "av", through the "type" parameter. For example, - * specifying an "access_type_read" (which is also the default value of the - * parameter) indicates that the data has been synchronized to "av" only - * for reading. On the other hand, specifying an access_type of - * "access_type_read_write" synchronizes the data to "av" both for reading - * and writing; i.e. any modifications to the data on "av" are legal after - * synchronizing the array_view with write access and before subsequently - * accessing the array_view on a location other than "av". - * - * It is advisable to be precise about the access_type specified in the - * synchronize call; i.e. if only write access is required, specifying - * access_type_write may yield better performance than calling synchronize - * with "access_type_read_write" since the latter may require any - * modifications made to the data on remote locations to be synchronized to - * "av", which is unnecessary if the contents are intended to be - * immediately overwritten without reading. - * - * @param[in] av The target accelerator_view that "this" array_view is - * synchronized for access on. - * @param[in] type An argument of type "access_type" which specifies the - * type of access on the data source that the array_view is - * synchronized for. - */ - // FIXME: type parameter is not implemented - void synchronize_to(const accelerator_view& av) const { -#if __KALMAR_ACCELERATOR__ != 1 - cache.sync_to(av.pQueue); -#endif - } - - /** - * An asynchronous version of synchronize_to, which returns a completion - * future object. When the future is ready, the synchronization operation - * is complete. - * - * @param[in] av The target accelerator_view that "this" array_view is - * synchronized for access on. - * @param[in] type An argument of type "access_type" which specifies the - * type of access on the data source that the array_view is - * synchronized for. - * @return An object of type completion_future that can be used to - * determine the status of the asynchronous operation or can be - * used to chain other operations to be executed after the - * completion of the asynchronous operation. - */ - // FIXME: this method is not implemented yet - completion_future synchronize_to_async(const accelerator_view& av) const; - - /** - * Indicates to the runtime that it may discard the current logical - * contents of this array_view. This is an optimization hint to the runtime - * used to avoid copying the current contents of the view to a target - * accelerator_view, and its use is recommended if the existing content is - * not needed.
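- * - * For illustration only (editor's sketch; "av" is an existing - * array_view<int, 1> whose old contents are no longer needed): - * @code{.cpp} - * av.discard_data(); // skip copying stale contents to the accelerator - * parallel_for_each(av.get_extent(), [=](index<1> i) restrict(amp) { - * av[i] = 0; // every element is overwritten - * }); - * @endcode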
- */ - void discard_data() const { -#if __KALMAR_ACCELERATOR__ != 1 - cache.discard(); -#endif - } - - /** @{ */ - /** - * Returns a reference to the element of this array_view that is at the - * location in N-dimensional space specified by "idx". - * - * @param[in] idx An object of type index that specifies the location of - * the element. - */ - T& operator[] (const index& idx) const restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - cache.get_cpu_access(true); -#endif - T *ptr = reinterpret_cast(cache.get() + offset); - return ptr[detail::amp_helper, Concurrency::extent>::flatten(idx + index_base, extent_base)]; - } - - T& operator() (const index& idx) const restrict(amp,cpu) { - return (*this)[idx]; - } - - /** @} */ - - /** - * Returns a reference to the element of this array_view that is at the - * location in N-dimensional space specified by "idx". - * - * Unlike the other indexing operators for accessing the array_view on the - * CPU, this method does not implicitly synchronize this array_view's - * contents to the CPU. After accessing the array_view on a remote location - * or performing a copy operation involving this array_view, users are - * responsible to explicitly synchronize the array_view to the CPU before - * calling this method. Failure to do so results in undefined behavior. - */ - // FIXME: this method is not implemented - T& get_ref(const index& idx) const restrict(amp,cpu); - - /** @{ */ - /** - * Equivalent to - * "array_view::operator()(index(i0 [, i1 [, i2 ]]))". - * - * @param[in] i0,i1,i2 The component values that will form the index into - * this array. - */ - T& operator() (int i0, int i1) const restrict(amp,cpu) { - static_assert(N == 2, "T& array_view::operator()(int,int) is only permissible on array_view"); - return (*this)[index<2>(i0, i1)]; - } - - T& operator() (int i0, int i1, int i2) const restrict(amp,cpu) { - static_assert(N == 3, "T& array_view::operator()(int,int, int) is only permissible on array_view"); - return (*this)[index<3>(i0, i1, i2)]; - } - - /** @} */ - - /** @{ */ - /** - * This overload is defined for array_view where @f$N \ge 2@f$. - * - * This mode of indexing is equivalent to projecting on the - * most-significant dimension. It allows C-style indexing. For example: - * - * @code{.cpp} - * array myArray(myExtents, ...); - * - * myArray[index<4>(5,4,3,2)] = 7; - * assert(myArray[5][4][3][2] == 7); - * @endcode - * - * @param[in] i0 An integer that is the index into the most-significant - * dimension of this array. - * @return Returns an array_view whose dimension is one lower than that of - * this array_view. - */ - typename projection_helper::result_type - operator[] (int i) const restrict(amp,cpu) { - return projection_helper::project(*this, i); - } - - typename projection_helper::result_type - operator() (int i0) const restrict(amp,cpu) { return (*this)[i0]; } - - /** @} */ - - /** - * Returns a subsection of the source array view at the origin specified by - * "idx" and with the extent specified by "ext". - * - * Example: - * - * @code{.cpp} - * array a(extent<2>(200,100)); - * array_view v1(a); // v1.extent = <200,100> - * array_view v2 = v1.section(index<2>(15,25), extent<2>(40,50)); - * assert(v2(0,0) == v1(15,25)); - * @endcode - * - * @param[in] idx Provides the offset/origin of the resulting section. - * @param[in] ext Provides the extent of the resulting section. - * @return Returns a subsection of the source array at specified origin, - * and with the specified extent. 
- */ - array_view<T, N> section(const Concurrency::index<N>& idx, - const Concurrency::extent<N>& ext) const restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - if ( !detail::amp_helper<N, Concurrency::index<N>, Concurrency::extent<N>>::contains(idx, ext, this->extent ) ) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view<T, N> av(cache, ext, extent_base, idx + index_base, offset); - return av; - } - - /** - * Equivalent to "section(idx, this->extent - idx)". - */ - array_view<T, N> section(const Concurrency::index<N>& idx) const restrict(amp,cpu) { - Concurrency::extent<N> ext(extent); - detail::amp_helper<N, Concurrency::index<N>, Concurrency::extent<N>>::minus(idx, ext); - return section(idx, ext); - } - - /** - * Equivalent to "section(index<N>(), ext)". - */ - array_view<T, N> section(const Concurrency::extent<N>& ext) const restrict(amp,cpu) { - Concurrency::index<N> idx; - return section(idx, ext); - } - - /** @{ */ - /** - * Equivalent to - * "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))". - * - * @param[in] i0,i1,i2 The component values that will form the origin of - * the section - * @param[in] e0,e1,e2 The component values that will form the extent of - * the section - */ - array_view<T, N> section(int i0, int e0) const restrict(amp,cpu) { - static_assert(N == 1, "Rank must be 1"); - return section(Concurrency::index<1>(i0), Concurrency::extent<1>(e0)); - } - - array_view<T, N> section(int i0, int i1, int e0, int e1) const restrict(amp,cpu) { - static_assert(N == 2, "Rank must be 2"); - return section(Concurrency::index<2>(i0, i1), Concurrency::extent<2>(e0, e1)); - } - - array_view<T, N> section(int i0, int i1, int i2, int e0, int e1, int e2) const restrict(amp,cpu) { - static_assert(N == 3, "Rank must be 3"); - return section(Concurrency::index<3>(i0, i1, i2), Concurrency::extent<3>(e0, e1, e2)); - } - - /** @} */ - - /** - * This member function is similar to "array<T,N>::reinterpret_as", - * although it only supports array_views of rank 1 (only those guarantee - * that all elements are laid out contiguously). - * - * The size of the reinterpreted ElementType must evenly divide into the - * total size of this array_view. - * - * @return Returns an array_view from this array_view with the element - * type reinterpreted from T to ElementType. - */ - template <typename ElementType> - array_view<ElementType, 1> reinterpret_as() const restrict(amp,cpu) { - static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1"); -#if __KALMAR_ACCELERATOR__ != 1 - static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel"); - static_assert( ! (std::is_same<ElementType, short>::value ),"can't use short in the kernel"); - if ( (extent.size() * sizeof(T)) % sizeof(ElementType)) - throw runtime_exception("errorMsg_throw", 0); -#endif - int size = extent.size() * sizeof(T) / sizeof(ElementType); - using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t; - array_view<ElementType, 1> av(buffer_type(cache), - Concurrency::extent<1>(size), - (offset + index_base[0])* sizeof(T) / sizeof(ElementType)); - return av; - } - - /** - * This member function is similar to "array<T,N>::view_as", although it - * only supports array_views of rank 1 (only those guarantee that all - * elements are laid out contiguously). - * - * @return Returns an array_view from this array_view with the rank - * changed to K from 1.
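- * - * For illustration only (editor's sketch; "p" is an assumed int* with at - * least 100 elements): - * @code{.cpp} - * array_view<int, 1> flat(extent<1>(100), p); - * array_view<int, 2> grid = flat.view_as(extent<2>(10, 10)); - * @endcode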
- */ - template - array_view view_as(Concurrency::extent viewExtent) const restrict(amp,cpu) { - static_assert(N == 1, "view_as is only permissible on array views of rank 1"); -#if __KALMAR_ACCELERATOR__ != 1 - if ( viewExtent.size() > extent.size()) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view av(cache, viewExtent, offset + index_base[0]); - return av; - } - - ~array_view() restrict(amp,cpu) {} - - // FIXME: functions below are not defined in C++ AMP specification - template - T& operator[] (const tiled_index& idx) const restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - cache.get_cpu_access(true); -#endif - T *ptr = reinterpret_cast(cache.get() + offset); - return ptr[detail::amp_helper, Concurrency::extent>::flatten(idx.global + index_base, extent_base)]; - } - - const acc_buffer_t& internal() const restrict(amp,cpu) { return cache; } - - int get_offset() const restrict(amp,cpu) { return offset; } - - Concurrency::index get_index_base() const restrict(amp,cpu) { return index_base; } - -private: - template friend struct projection_helper; - template friend struct array_projection_helper; - template friend class array; - template friend class array_view; - - template friend - bool is_flat(const array_view&) noexcept; - template friend - void copy(const array&, const array_view&); - template friend - void copy(InputIter, InputIter, const array_view&); - template friend - void copy(const array_view&, array&); - template friend - void copy(const array_view&, OutputIter); - template friend - void copy(const array_view& src, const array_view& dest); - - // used by view_as and reinterpret_as - array_view(const acc_buffer_t& cache, const Concurrency::extent& ext, - int offset) restrict(amp,cpu) - : cache(cache), extent(ext), extent_base(ext), offset(offset) {} - - // used by section and projection - array_view(const acc_buffer_t& cache, const Concurrency::extent& ext_now, - const Concurrency::extent& ext_b, - const Concurrency::index& idx_b, int off) restrict(amp,cpu) - : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), offset(off) {} - - acc_buffer_t cache; - Concurrency::extent extent; - Concurrency::extent extent_base; - Concurrency::index index_base; - int offset; -}; - -// ------------------------------------------------------------------------ -// array_view -// ------------------------------------------------------------------------ +// ------------------------------------------------------------------------ +// array_view +// ------------------------------------------------------------------------ /** * The partial specialization array_view represents a view over @@ -3710,562 +2228,6 @@ class array_view * need only be copied to the target accelerator if it isn't already there. It * will not be copied out. */ -template -class array_view -{ -public: - typedef typename std::remove_const::type nc_T; - -#if __KALMAR_ACCELERATOR__ == 1 - typedef detail::_data acc_buffer_t; -#else - typedef detail::_data_host acc_buffer_t; -#endif - - /** - * The rank of this array. - */ - static const int rank = N; - - /** - * The element type of this array. - */ - typedef const T value_type; - - /** - * There is no default constructor for array_view. - */ - array_view() = delete; - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" array. The extent of the array_view is that of the src array, and - * the origin of the array view is at zero. 
- * - * @param[in] src An array which contains the data that this array_view is - * bound to. - */ - array_view(const array& src) restrict(amp,cpu) - : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {} - - // FIXME: following interfaces were not implemented yet - // template - // explicit array_view::array_view(const Container& src); - // template - // explicit array_view::array_view(const value_type (&src) [Size]) restrict(amp,cpu); - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" container. The extent of the array_view is that given by the - * "extent" argument, and the origin of the array view is at zero. - * - * @param[in] src A template argument that must resolve to a linear - * container that supports .data() and .size() members (such - * as std::vector or std::array) - * @param[in] extent The extent of this array_view. - */ - template ::value>::type> - array_view(const extent& extent, const Container& src) - : array_view(extent, src.data()) - { static_assert( std::is_same::type>::type, T>::value, "container element type and array view element type must match"); } - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" container. The extent of the array_view is that given by the - * "extent" argument, and the origin of the array view is at zero. - * - * @param[in] src A pointer to the source data this array_view will bind - * to. If the number of elements pointed to is less than the - * size of extent, the behavior is undefined. - * @param[in] ext The extent of this array_view. - */ - array_view(const extent& ext, const value_type* src) restrict(amp,cpu) -#if __KALMAR_ACCELERATOR__ == 1 - : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {} -#else - : cache(ext.size(), src), extent(ext), extent_base(ext), offset(0) {} -#endif - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - * @param[in] src A template argument that must resolve to a contiguousi - * container that supports .data() and .size() members (such - * as std::vector or std::array) - */ - template ::value>::type> - array_view(int e0, Container& src) : array_view(Concurrency::extent<1>(e0), src) {} - template ::value>::type> - array_view(int e0, int e1, Container& src) - : array_view(Concurrency::extent(e0, e1), src) {} - template ::value>::type> - array_view(int e0, int e1, int e2, Container& src) - : array_view(Concurrency::extent(e0, e1, e2), src) {} - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - * @param[in] src A pointer to the source data this array_view will bind - * to. If the number of elements pointed to is less than - * the size of extent, the behavior is undefined. - */ - array_view(int e0, const value_type *src) restrict(amp,cpu) - : array_view(Concurrency::extent<1>(e0), src) {} - array_view(int e0, int e1, const value_type *src) restrict(amp,cpu) - : array_view(Concurrency::extent<2>(e0, e1), src) {} - array_view(int e0, int e1, int e2, const value_type *src) restrict(amp,cpu) - : array_view(Concurrency::extent<3>(e0, e1, e2), src) {} - - /** - * Copy constructor. Constructs an array_view from the supplied argument - * other. A shallow copy is performed. 
- * - * @param[in] other An object of type array_view or - * array_view from which to initialize this - * new array_view. - */ - array_view(const array_view& other) restrict(amp,cpu) - : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {} - - /** - * Copy constructor. Constructs an array_view from the supplied argument - * other. A shallow copy is performed. - * - * @param[in] other An object of type array_view from which to - * initialize this new array_view. - */ - array_view(const array_view& other) restrict(amp,cpu) - : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {} - - /** - * Access the extent that defines the shape of this array_view. - */ - extent get_extent() const restrict(amp,cpu) { return extent; } - - /** - * Access the accelerator_view where the data source of the array_view is - * located. - * - * When the data source of the array_view is native CPU memory, the method - * returns accelerator(accelerator::cpu_accelerator).default_view. When the - * data source underlying the array_view is an array, the method returns - * the accelerator_view where the source array is located. - */ - accelerator_view get_source_accelerator_view() const { return cache.get_av(); } - - /** @{ */ - /** - * Assigns the contents of the array_view "other" to this array_view, using - * a shallow copy. Both array_views will refer to the same data. - * - * @param[in] other An object of type array_view from which to copy - * into this array. - * @return Returns *this. - */ - array_view& operator=(const array_view& other) restrict(amp,cpu) { - cache = other.cache; - extent = other.extent; - index_base = other.index_base; - extent_base = other.extent_base; - offset = other.offset; - return *this; - } - - array_view& operator=(const array_view& other) restrict(amp,cpu) { - if (this != &other) { - cache = other.cache; - extent = other.extent; - index_base = other.index_base; - extent_base = other.extent_base; - offset = other.offset; - } - return *this; - } - - /** @} */ - - /** - * Copies the data referred to by this array_view to the array given by - * "dest", as if by calling "copy(*this, dest)" - * - * @param[in] dest An object of type array to which to copy data from - * this array. - */ - void copy_to(array& dest) const { copy(*this, dest); } - - /** - * Copies the contents of this array_view to the array_view given by - * "dest", as if by calling "copy(*this, dest)" - * - * @param[in] dest An object of type array_view to which to copy data - * from this array. - */ - void copy_to(const array_view& dest) const { copy(*this, dest); } - - /** - * Returns a pointer to the first data element underlying this array_view. - * This is only available on array_views of rank 1. - * - * When the data source of the array_view is native CPU memory, the pointer - * returned by data() is valid for the lifetime of the data source. - * - * When the data source underlying the array_view is an array, or the array - * view is created without a data source, the pointer returned by data() in - * CPU context is ephemeral and is invalidated when the original data - * source or any of its views are accessed on an accelerator_view through a - * parallel_for_each or a copy operation. - * - * @return A const pointer to the first element in the linearized array. 
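- * - * For illustration only (editor's sketch; "src" is an assumed const - * float* with at least 64 elements): - * @code{.cpp} - * array_view<const float, 1> av(extent<1>(64), src); - * const float* p = av.data(); // rank-1 views only - * @endcode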
- */ - const T* data() const restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - cache.get_cpu_access(); -#endif - static_assert(N == 1, "data() is only permissible on array views of rank 1"); - return reinterpret_cast<const T*>(cache.get() + offset + index_base[0]); - } - - /** - * Calling this member function informs the array_view that its bound - * memory has been modified outside the array_view interface. This will - * render all cached information stale. - */ - void refresh() const { cache.refresh(); } - - /** - * Calling this member function synchronizes any modifications made to the - * data underlying "this" array_view to its source data container. For - * example, for an array_view on system memory, if the data underlying the - * view are modified on a remote accelerator_view through a - * parallel_for_each invocation, calling synchronize ensures that the - * modifications are synchronized to the source data and will be visible - * through the system memory pointer which the array_view was created over. - * - * For writable array_view objects, callers of this function can - * optionally specify the type of access desired on the source data - * container through the "type" parameter. For example, specifying an - * "access_type_read" (which is also the default value of the parameter) - * indicates that the data has been synchronized to its source location - * only for reading. On the other hand, specifying an access_type of - * "access_type_read_write" synchronizes the data to its source location - * both for reading and writing; i.e. any modifications to the source data - * directly through the source data container are legal after synchronizing - * the array_view with write access and before subsequently accessing the - * array_view on another remote location. - * - * It is advisable to be precise about the access_type specified in the - * synchronize call; i.e. if only write access is required, specifying - * access_type_write may yield better performance than calling synchronize - * with "access_type_read_write" since the latter may require any - * modifications made to the data on remote locations to be synchronized to - * the source location, which is unnecessary if the contents are intended - * to be overwritten without reading. - */ - void synchronize() const { cache.get_cpu_access(); } - - /** - * An asynchronous version of synchronize, which returns a completion - * future object. When the future is ready, the synchronization operation - * is complete. - * - * @return An object of type completion_future that can be used to - * determine the status of the asynchronous operation or can be - * used to chain other operations to be executed after the - * completion of the asynchronous operation. - */ - completion_future synchronize_async() const { - std::future<void> fut = std::async([&]() mutable { synchronize(); }); - return completion_future(fut.share()); - } - - /** - * Calling this member function synchronizes any modifications made to the - * data underlying "this" array_view to the specified accelerator_view - * "av". For example, for an array_view on system memory, if the data - * underlying the view is modified on the CPU, and synchronize_to is called - * on "this" array_view, then the array_view contents are cached on the - * specified accelerator_view location. - * - * @param[in] av The target accelerator_view that "this" array_view is - * synchronized for access on.
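- * - * For illustration only (editor's sketch; "cav" is an existing - * array_view<const float, 1>): - * @code{.cpp} - * accelerator_view dev_av = accelerator(L"default").get_default_view(); - * cav.synchronize_to(dev_av); // pre-cache the contents on dev_av - * @endcode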
- */ - void synchronize_to(const accelerator_view& av) const { -#if __KALMAR_ACCELERATOR__ != 1 - cache.sync_to(av.pQueue); -#endif - } - - /** - * An asynchronous version of synchronize_to, which returns a completion - * future object. When the future is ready, the synchronization operation - * is complete. - * - * @param[in] av The target accelerator_view that "this" array_view is - * synchronized for access on. - * @param[in] type An argument of type "access_type" which specifies the - * type of access on the data source that the array_view is - * synchronized for. - * @return An object of type completion_future that can be used to - * determine the status of the asynchronous operation or can be - * used to chain other operations to be executed after the - * completion of the asynchronous operation. - */ - // FIXME: this method is not implemented yet - completion_future synchronize_to_async(const accelerator_view& av) const; - - /** @{ */ - /** - * Returns a const reference to the element of this array_view that is at - * the location in N-dimensional space specified by "idx". - * - * @param[in] idx An object of type index that specifies the location of - * the element. - */ - const T& operator[] (const index& idx) const restrict(amp,cpu) { -#if __KALMAR_ACCELERATOR__ != 1 - cache.get_cpu_access(); -#endif - const T *ptr = reinterpret_cast(cache.get() + offset); - return ptr[detail::amp_helper, Concurrency::extent>::flatten(idx + index_base, extent_base)]; - } - const T& operator() (const index& idx) const restrict(amp,cpu) { - return (*this)[idx]; - } - - /** @} */ - - /** - * Returns a reference to the element of this array_view that is at the - * location in N-dimensional space specified by "idx". - * - * Unlike the other indexing operators for accessing the array_view on the - * CPU, this method does not implicitly synchronize this array_view's - * contents to the CPU. After accessing the array_view on a remote location - * or performing a copy operation involving this array_view, users are - * responsible to explicitly synchronize the array_view to the CPU before - * calling this method. Failure to do so results in undefined behavior. - */ - // FIXME: this method is not implemented - const T& get_ref(const index& idx) const restrict(amp,cpu); - - /** @{ */ - /** - * Equivalent to - * "array_view::operator()(index(i0 [, i1 [, i2 ]]))". - * - * @param[in] i0,i1,i2 The component values that will form the index into - * this array. - */ - const T& operator() (int i0) const restrict(amp,cpu) { - static_assert(N == 1, "const T& array_view::operator()(int) is only permissible on array_view"); - return (*this)[index<1>(i0)]; - } - - const T& operator() (int i0, int i1) const restrict(amp,cpu) { - static_assert(N == 2, "const T& array_view::operator()(int,int) is only permissible on array_view"); - return (*this)[index<2>(i0, i1)]; - } - const T& operator() (int i0, int i1, int i2) const restrict(amp,cpu) { - static_assert(N == 3, "const T& array_view::operator()(int,int, int) is only permissible on array_view"); - return (*this)[index<3>(i0, i1, i2)]; - } - - /** @} */ - - /** @{ */ - /** - * This overload is defined for array_view where @f$N \ge 2@f$. - * - * This mode of indexing is equivalent to projecting on the - * most-significant dimension. It allows C-style indexing. 
For example: - * - * @code{.cpp} - * array myArray(myExtents, ...); - * - * myArray[index<4>(5,4,3,2)] = 7; - * assert(myArray[5][4][3][2] == 7); - * @endcode - * - * @param[in] i0 An integer that is the index into the most-significant - * dimension of this array. - * @return Returns an array_view whose dimension is one lower than that of - * this array_view. - */ - typename projection_helper::const_result_type - operator[] (int i) const restrict(amp,cpu) { - return projection_helper::project(*this, i); - } - - // FIXME: typename projection_helper::const_result_type - // operator() (int i0) const restrict(cmp,cpu); - // is not implemented - - /** @} */ - - /** - * Returns a subsection of the source array view at the origin specified by - * "idx" and with the extent specified by "ext". - * - * Example: - * - * @code{.cpp} - * array a(extent<2>(200,100)); - * array_view v1(a); // v1.extent = <200,100> - * array_view v2 = v1.section(index<2>(15,25), extent<2>(40,50)); - * assert(v2(0,0) == v1(15,25)); - * @endcode - * - * @param[in] idx Provides the offset/origin of the resulting section. - * @param[in] ext Provides the extent of the resulting section. - * @return Returns a subsection of the source array at specified origin, - * and with the specified extent. - */ - array_view section(const Concurrency::index& idx, - const Concurrency::extent& ext) const restrict(amp,cpu) { - array_view av(cache, ext, extent_base, idx + index_base, offset); - return av; - } - - /** - * Equivalent to "section(idx, this->extent – idx)". - */ - array_view section(const Concurrency::index& idx) const restrict(amp,cpu) { - Concurrency::extent ext(extent); - detail::amp_helper, Concurrency::extent>::minus(idx, ext); - return section(idx, ext); - } - - /** - * Equivalent to "section(index(), ext)". - */ - array_view section(const Concurrency::extent& ext) const restrict(amp,cpu) { - Concurrency::index idx; - return section(idx, ext); - } - - /** @{ */ - /** - * Equivalent to - * "section(index(i0 [, i1 [, i2 ]]), extent(e0 [, e1 [, e2 ]]))". - * - * @param[in] i0,i1,i2 The component values that will form the origin of - * the section - * @param[in] e0,e1,e2 The component values that will form the extent of - * the section - */ - array_view section(int i0, int e0) const restrict(amp,cpu) { - static_assert(N == 1, "Rank must be 1"); - return section(Concurrency::index<1>(i0), Concurrency::extent<1>(e0)); - } - - array_view section(int i0, int i1, int e0, int e1) const restrict(amp,cpu) { - static_assert(N == 2, "Rank must be 2"); - return section(Concurrency::index<2>(i0, i1), Concurrency::extent<2>(e0, e1)); - } - - array_view section(int i0, int i1, int i2, int e0, int e1, int e2) const restrict(amp,cpu) { - static_assert(N == 3, "Rank must be 3"); - return section(Concurrency::index<3>(i0, i1, i2), Concurrency::extent<3>(e0, e1, e2)); - } - - /** @} */ - - /** - * This member function is similar to "array::reinterpret_as", - * although it only supports array_views of rank 1 (only those guarantee - * that all elements are laid out contiguously). - * - * The size of the reinterpreted ElementType must evenly divide into the - * total size of this array_view. - * - * @return Returns an array_view from this array_view with the element - * type reinterpreted from T to ElementType. - */ - template - array_view reinterpret_as() const restrict(amp,cpu) { - static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1"); -#if __KALMAR_ACCELERATOR__ != 1 - static_assert( ! 
(std::is_pointer::value ),"can't use pointer in the kernel");
-        static_assert( ! (std::is_same::value ),"can't use short in the kernel");
-#endif
-        int size = extent.size() * sizeof(T) / sizeof(ElementType);
-        using buffer_type = typename array_view::acc_buffer_t;
-        array_view av(buffer_type(cache),
-                      Concurrency::extent<1>(size),
-                      (offset + index_base[0])* sizeof(T) / sizeof(ElementType));
-        return av;
-    }
-
-    /**
-     * This member function is similar to "array::view_as", although it
-     * only supports array_views of rank 1 (only those guarantee that all
-     * elements are laid out contiguously).
-     *
-     * @return Returns an array_view from this array_view with the rank
-     *         changed to K from 1.
-     */
-    template 
-        array_view view_as(Concurrency::extent viewExtent) const restrict(amp,cpu) {
-        static_assert(N == 1, "view_as is only permissible on array views of rank 1");
-#if __KALMAR_ACCELERATOR__ != 1
-        if ( viewExtent.size() > extent.size())
-            throw runtime_exception("errorMsg_throw", 0);
-#endif
-        array_view av(cache, viewExtent, offset + index_base[0]);
-        return av;
-    }
-
-    ~array_view() restrict(amp,cpu) {}
-
-    // FIXME: functions below are not defined in C++ AMP specification
-    const acc_buffer_t& internal() const restrict(amp,cpu) { return cache; }
-
-    int get_offset() const restrict(amp,cpu) { return offset; }
-
-    Concurrency::index get_index_base() const restrict(amp,cpu) { return index_base; }
-
-private:
-    template  friend struct projection_helper;
-    template  friend struct array_projection_helper;
-    template  friend class array;
-    template  friend class array_view;
-
-    template  friend
-        bool is_flat(const array_view&) noexcept;
-    template  friend
-        void copy(const array&, const array_view&);
-    template  friend
-        void copy(InputIter, InputIter, const array_view&);
-    template  friend
-        void copy(const array_view&, array&);
-    template  friend
-        void copy(const array_view&, OutputIter);
-    template  friend
-        void copy(const array_view& src, const array_view& dest);
-
-    // used by view_as and reinterpret_as
-    array_view(const acc_buffer_t& cache, const Concurrency::extent& ext,
-               int offset) restrict(amp,cpu)
-        : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
-
-    // used by section and projection
-    array_view(const acc_buffer_t& cache, const Concurrency::extent& ext_now,
-               const Concurrency::extent& ext_b,
-               const Concurrency::index& idx_b, int off) restrict(amp,cpu)
-        : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), offset(off) {}
-
-    acc_buffer_t cache;
-    Concurrency::extent extent;
-    Concurrency::extent extent_base;
-    Concurrency::index index_base;
-    int offset;
-};

 // ------------------------------------------------------------------------
 // global functions for extent
 // ------------------------------------------------------------------------
@@ -5111,7 +3073,7 @@
 template 
 inline
 void validate_tile_dims() {
-    static_assert(dim > 0, "The number of threads in a tile must be positive.");
+    static_assert(dim >= 0, "The number of threads in a tile must be non-negative.");
     static_assert(
         dim <= 1024, "The maximum number of threads in a tile is 1024.");

diff --git a/include/amp_math.h b/include/amp_math.h
index d487a0072e8..248983a142f 100644
--- a/include/amp_math.h
+++ b/include/amp_math.h
@@ -7,6 +7,8 @@

 #pragma once

+#warning "C++AMP support is deprecated in ROCm 1.9 and will be removed in ROCm 2.0!"
+ #include "kalmar_math.h" namespace Concurrency { diff --git a/include/amp_short_vectors.h b/include/amp_short_vectors.h index 18ef85d8fe6..6c44ae6e4af 100644 --- a/include/amp_short_vectors.h +++ b/include/amp_short_vectors.h @@ -1,7 +1,6 @@ #pragma once -#ifndef _AMP_SHORT_VECTORS_H -#define _AMP_SHORT_VECTORS_H +#warning "C++AMP support is deprecated in ROCm 1.9 and will be removed in ROCm 2.0!" #include #include @@ -24,5 +23,3 @@ namespace graphics } // namespace graphics } // namespace Concurrency - -#endif // _AMP_SHORT_VECTORS_H From e79ebfe0494c85b5431ed778da72a87ef008dacc Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sun, 26 Aug 2018 23:18:16 +0100 Subject: [PATCH 007/134] Use the right triple. --- lib/clamp-device.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/clamp-device.in b/lib/clamp-device.in index 160cf57e28f..00bdac6fb1e 100755 --- a/lib/clamp-device.in +++ b/lib/clamp-device.in @@ -195,7 +195,7 @@ fi # Optimization notes: # -disable-simplify-libcalls: prevents transforming loops into library calls such as memset, memcopy on GPU -$OPT -mtriple amdgcn--amdhsa-amdgiz -mcpu=$AMDGPU_TARGET -amdgpu-internalize-symbols -disable-simplify-libcalls $KMOPTOPT -verify $2.selected.bc -o $2.opt.bc +$OPT -mtriple amdgcn-amd-amdhsa -mcpu=$AMDGPU_TARGET -amdgpu-internalize-symbols -disable-simplify-libcalls $KMOPTOPT -verify $2.selected.bc -o $2.opt.bc # error handling for opt RETVAL=$? @@ -218,9 +218,9 @@ if [ $KMDUMPLLVM == "1" ]; then fi if [ $KMTHINLTO == "1" ]; then - $LLC $KMOPTLLC -mtriple amdgcn--amdhsa-amdgiz -mcpu=$AMDGPU_TARGET -filetype=obj -o $2 $2.opt.bc + $LLC $KMOPTLLC -mtriple amdgcn-amd-amdhsa -mcpu=$AMDGPU_TARGET -filetype=obj -o $2 $2.opt.bc else - $LLC $KMOPTLLC -mtriple amdgcn--amdhsa-amdgiz -mcpu=$AMDGPU_TARGET -filetype=obj -o $2.isabin $2.opt.bc + $LLC $KMOPTLLC -mtriple amdgcn-amd-amdhsa -mcpu=$AMDGPU_TARGET -filetype=obj -o $2.isabin $2.opt.bc fi # error handling for llc @@ -236,7 +236,7 @@ if [ $KMDUMPISA == "1" ]; then else cp $2.isabin ./dump-$AMDGPU_TARGET.isabin fi - $LLC $KMOPTLLC -mtriple amdgcn--amdhsa-amdgiz -mcpu=$AMDGPU_TARGET -filetype=asm -o $2.isa $2.opt.bc + $LLC $KMOPTLLC -mtriple amdgcn-amd-amdhsa -mcpu=$AMDGPU_TARGET -filetype=asm -o $2.isa $2.opt.bc mv $2.isa ${KMDUMPDIR}/dump-$AMDGPU_TARGET.isa fi From 1034251eb309ff4c7c83b661f11b5ab361928b13 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sun, 26 Aug 2018 23:20:37 +0100 Subject: [PATCH 008/134] Pure virtual interfaces are a thing, so we might as well try using them. 
--- include/kalmar_runtime.h | 266 +++++++++++++++++++++++++++++++++------ 1 file changed, 227 insertions(+), 39 deletions(-) diff --git a/include/kalmar_runtime.h b/include/kalmar_runtime.h index 9393d84cc80..0ec363fe96f 100644 --- a/include/kalmar_runtime.h +++ b/include/kalmar_runtime.h @@ -255,7 +255,7 @@ class HCCQueue /// push device pointer to kernel argument list virtual void Push(void *kernel, int idx, void* device, bool modify) = 0; - virtual uint32_t GetGroupSegmentSize(void *kernel) { return 0; } + virtual uint32_t GetGroupSegmentSize(void*) = 0; HCCDevice* getDev() const { return pDev; } queuing_mode get_mode() const { return mode; } @@ -292,25 +292,54 @@ class HCCQueue virtual std::shared_ptr EnqueueMarker(memory_scope) { return nullptr; } /// enqueue marker with prior dependency - virtual std::shared_ptr EnqueueMarkerWithDependency(int count, std::shared_ptr *depOps, memory_scope scope) { return nullptr; } + virtual + std::shared_ptr EnqueueMarkerWithDependency( + int count, std::shared_ptr *depOps, memory_scope scope) = 0; - virtual std::shared_ptr detectStreamDeps(hcCommandKind commandKind, HCCAsyncOp *newCopyOp) { return nullptr; }; + virtual + std::shared_ptr detectStreamDeps( + hcCommandKind commandKind, HCCAsyncOp *newCopyOp) = 0; /// copy src to dst asynchronously - virtual std::shared_ptr EnqueueAsyncCopy(const void* src, void* dst, size_t size_bytes) { return nullptr; } - virtual std::shared_ptr EnqueueAsyncCopyExt(const void* src, void* dst, size_t size_bytes, - hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, - const detail::HCCDevice *copyDevice) { return nullptr; }; + virtual + std::shared_ptr EnqueueAsyncCopy( + const void* src, void* dst, size_t size_bytes) = 0; + virtual + std::shared_ptr EnqueueAsyncCopyExt( + const void* src, + void* dst, + size_t size_bytes, + hcCommandKind copyDir, + const hc::AmPointerInfo& srcInfo, + const hc::AmPointerInfo& dstInfo, + const detail::HCCDevice *copyDevice) = 0; // Copy src to dst synchronously - virtual void copy(const void *src, void *dst, size_t size_bytes) { } + virtual + void copy(const void *src, void *dst, size_t size_bytes) = 0; /// copy src to dst, with caller providing extended information about the pointers. //// TODO - remove me, this form is deprecated. 
- virtual void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, bool forceUnpinnedCopy) { }; - virtual void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, - const detail::HCCDevice *copyDev, bool forceUnpinnedCopy) { }; + virtual + void copy_ext( + const void* src, + void* dst, + size_t size_bytes, + hcCommandKind copyDir, + const hc::AmPointerInfo& srcInfo, + const hc::AmPointerInfo& dstInfo, + bool forceUnpinnedCopy) = 0; + virtual + void copy_ext( + const void* src, + void* dst, + size_t size_bytes, + hcCommandKind copyDir, + const hc::AmPointerInfo& srcInfo, + const hc::AmPointerInfo& dstInfo, + const detail::HCCDevice* copyDev, + bool forceUnpinnedCopy) = 0; /// cleanup internal resource /// this function is usually called by dtor of the implementation classes @@ -318,14 +347,19 @@ class HCCQueue /// resource clean up sequence virtual void dispose() {} - virtual void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, - const void * args, size_t argsize, - hc::completion_future *cf, const char *kernel_name) { }; + virtual + void dispatch_hsa_kernel( + const hsa_kernel_dispatch_packet_t* aql, + void* args, + size_t argsize, + hc::completion_future* cf, + const char* kernel_name) = 0; /// set CU affinity of this queue. /// the setting is permanent until the queue is destroyed or another setting /// is called. - virtual bool set_cu_mask(const std::vector& cu_mask) { return false; }; + virtual + bool set_cu_mask(const std::vector&) = 0; uint64_t assign_op_seq_num() { return ++opSeqNums; }; @@ -397,7 +431,8 @@ class HCCDevice virtual void release(void* ptr, struct rw_info* key) = 0; /// build program - virtual void BuildProgram(void* size, void* source) {} + virtual + void BuildProgram(void* size, void* source) = 0; /// create kernel virtual @@ -408,14 +443,18 @@ class HCCDevice std::size_t callable_size = 0u) = 0; /// check if a given kernel is compatible with the device - virtual bool IsCompatibleKernel(void* size, void* source) { return true; } + virtual + bool IsCompatibleKernel(void* size, void* source) = 0; /// check the dimension information is correct - virtual bool check(size_t* size, size_t dim_ext) { return true; } + virtual + bool check(size_t* size, size_t dim_ext) = 0; /// create HCCQueue from current device - virtual std::shared_ptr createQueue(execute_order order = execute_in_order) = 0; - virtual ~HCCDevice() {} + virtual + std::shared_ptr createQueue( + execute_order order = execute_in_order) = 0; + virtual ~HCCDevice() = default; std::shared_ptr get_default_queue() { #if !TLS_QUEUE @@ -439,13 +478,30 @@ class HCCDevice virtual size_t GetMaxTileStaticSize() { return 0; } /// get all queues associated with this device - virtual std::vector< std::shared_ptr > get_all_queues() { return std::vector< std::shared_ptr >(); } + virtual + std::vector> get_all_queues() + { + return std::vector< std::shared_ptr >(); + } - virtual void memcpySymbol(const char* symbolName, void* hostptr, size_t count, size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {} + virtual + void memcpySymbol( + const char* symbolName, + void* hostptr, + size_t count, + size_t offset = 0, + hcCommandKind kind = hcMemcpyHostToDevice) = 0; - virtual void memcpySymbol(void* symbolAddr, void* hostptr, size_t count, size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {} + virtual + void 
memcpySymbol( + void* symbolAddr, + void* hostptr, + size_t count, + size_t offset = 0, + hcCommandKind kind = hcMemcpyHostToDevice) = 0; - virtual void* getSymbolAddress(const char* symbolName) { return nullptr; } + virtual + void* getSymbolAddress(const char* symbolName) = 0; /// get underlying native agent handle virtual void* getHSAAgent() { return nullptr; } @@ -453,8 +509,10 @@ class HCCDevice /// get the profile of the agent virtual hcAgentProfile getProfile() { return hcAgentProfileNone; } - /// check if @p other can access to this device's device memory, return true if so, false otherwise - virtual bool is_peer(const HCCDevice* other) {return false;} + /// check if @p other can access to this device's device memory, return true + /// if so, false otherwise + virtual + bool is_peer(const HCCDevice* other) = 0; /// get device's compute unit count virtual unsigned int get_compute_unit_count() {return 0;} @@ -476,22 +534,115 @@ class CPUQueue final : public HCCQueue memmove(dst, (char*)device + offset, count); } - void write(void* device, const void* src, size_t count, size_t offset, bool blocking) override { + void write( + void* device, + const void* src, + size_t count, + size_t offset, + bool) override + { if (src != device) memmove((char*)device + offset, src, count); } - void copy(void* src, void* dst, size_t count, size_t src_offset, size_t dst_offset, bool blocking) override { + void copy( + void* src, + void* dst, + size_t count, + size_t src_offset, + size_t dst_offset, + bool) override { if (src != dst) memmove((char*)dst + dst_offset, (char*)src + src_offset, count); } + void* map(void* device, size_t, size_t offset, bool) override + { + return (char*)device + offset; + } + + void unmap(void*, void*, size_t, size_t, bool) override {} + + void Push(void*, int, void*, bool) override {} + + void wait(hcWaitMode = hcWaitModeBlocked) override {} + + void copy(const void*, void*, size_t) override + { + throw std::runtime_error{"Unsupported."}; + } + void copy_ext( + const void*, + void*, + size_t, + hcCommandKind, + const hc::AmPointerInfo&, + const hc::AmPointerInfo&, + bool) override + { + throw std::runtime_error{"Unsupported."}; + } + void copy_ext( + const void*, + void*, + size_t, + hcCommandKind, + const hc::AmPointerInfo&, + const hc::AmPointerInfo&, + const detail::HCCDevice*, + bool) override + { + throw std::runtime_error{"Unsupported."}; + } [[noreturn]] void* CreateKernel( const char*, HCCQueue*, const void*, std::size_t) override { throw std::runtime_error{"Unsupported."}; } + [[noreturn]] + std::shared_ptr detectStreamDeps(hcCommandKind, HCCAsyncOp*) override + { + throw std::runtime_error{"Unsupported."}; + } + void dispatch_hsa_kernel( + const hsa_kernel_dispatch_packet_t*, + void*, + size_t, + hc::completion_future*, + const char*) override + { + throw std::runtime_error{"Unimplemented."}; + } + [[noreturn]] + std::shared_ptr EnqueueAsyncCopy( + const void*, void*, std::size_t) override + { + throw std::runtime_error{"Unsupported."}; + } + [[noreturn]] + std::shared_ptr EnqueueAsyncCopyExt( + const void*, + void*, + size_t, + hcCommandKind, + const hc::AmPointerInfo&, + const hc::AmPointerInfo&, + const detail::HCCDevice*) override + { + throw std::runtime_error{"Unsupported."}; + } + [[noreturn]] + std::shared_ptr EnqueueMarkerWithDependency( + int, std::shared_ptr*, memory_scope) override + { + throw std::runtime_error{"Unsupported."}; + } + [[noreturn]] + std::uint32_t GetGroupSegmentSize(void*) override + { + throw std::runtime_error{"Unsupported."}; + } 
void LaunchKernel( void*, std::size_t, @@ -528,16 +679,11 @@ class CPUQueue final : public HCCQueue { throw std::runtime_error{"Unimplemented."}; } - - void* map(void* device, size_t count, size_t offset, bool modify) override { - return (char*)device + offset; + [[noreturn]] + bool set_cu_mask(const std::vector&) override + { + throw std::runtime_error{"Unimplemented."}; } - - void unmap(void* device, void* addr, size_t count, size_t offset, bool modify) override {} - - void Push(void *kernel, int idx, void* device, bool modify) override {} - - void wait(hcWaitMode = hcWaitModeBlocked) override {} }; /// cpu accelerator @@ -554,18 +700,60 @@ class CPUDevice final : public HCCDevice uint32_t get_version() const override { return 0; } std::shared_ptr createQueue( - execute_order order = execute_in_order) override + execute_order = execute_in_order) override { return std::shared_ptr(new CPUQueue(this)); } void* create(size_t count, struct rw_info* /* not used */ ) override { return kalmar_aligned_alloc(0x1000, count); } void release(void* ptr, struct rw_info* /* not used */) override { kalmar_aligned_free(ptr); } + + void BuildProgram(void*, void*) override + { + throw std::runtime_error{"Unsupported."}; + } + [[noreturn]] + bool check(std::size_t*, std::size_t) override + { + throw std::runtime_error{"Unsupported."}; + } [[noreturn]] void* CreateKernel( const char*, HCCQueue*, std::unique_ptr, - std::size_t = 0u) + std::size_t = 0u) override + { + throw std::runtime_error{"Unsupported."}; + } + [[noreturn]] + void* getSymbolAddress(const char*) override + { + throw std::runtime_error{"Unsupported."}; + } + [[noreturn]] + bool IsCompatibleKernel(void*, void*) override + { + throw std::runtime_error{"Unsupported."}; + } + bool is_peer(const HCCDevice*) override + { + return true; // CPU is peer to all agents. + } + void memcpySymbol( + const char*, + void*, + size_t, + size_t = 0, + hcCommandKind = hcMemcpyHostToDevice) override + { + throw std::runtime_error{"Unsupported."}; + } + void memcpySymbol( + void*, + void*, + size_t, + size_t = 0, + hcCommandKind = hcMemcpyHostToDevice) override { throw std::runtime_error{"Unsupported."}; } From 0377a998cb4881748b209fa01970b6ab99342da1 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sun, 26 Aug 2018 23:21:41 +0100 Subject: [PATCH 009/134] Linearise kernel section retrieval. --- hc2/headers/types/program_state.hpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/hc2/headers/types/program_state.hpp b/hc2/headers/types/program_state.hpp index ed3faffb000..4dc0a3ac004 100644 --- a/hc2/headers/types/program_state.hpp +++ b/hc2/headers/types/program_state.hpp @@ -127,18 +127,20 @@ namespace hc2 static int copy_kernel_sections_(dl_phdr_info* x, size_t, void* kernels) { - static constexpr const char kernel[] = ".kernel"; - auto out = static_cast(kernels); ELFIO::elfio tmp; - if (tmp.load(x->dlpi_name)) { - for (auto&& y : tmp.sections) { - if (y->get_name() == kernel) { - out->emplace_back( - y->get_data(), y->get_data() + y->get_size()); - } - } + + if (!tmp.load(x->dlpi_name)) return 0; + + for (auto&& y : tmp.sections) { + static constexpr const char kernel[] = ".kernel"; + + if (y->get_name() != kernel) continue; + + out->emplace_back(y->get_data(), y->get_data() + y->get_size()); + + return 0; } return 0; From fb91003cdc4818ae988a72357992f87dd111698d Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 27 Aug 2018 03:11:50 +0100 Subject: [PATCH 010/134] Add functional array_view. 
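
The view caches its source data in HSA system memory, tracks pending writers,
and synchronises back to the source on request or when the last copy is
destroyed. The sort of host code this is meant to enable looks roughly as
follows (an illustrative sketch against the new interface, not a test from
this series):

    #include <hc.hpp>

    #include <vector>

    int main()
    {
        std::vector<int> in(1024, 1);
        std::vector<int> out(1024, 0);

        hc::array_view<const int, 1> av_in{hc::extent<1>{1024}, in.data()};
        hc::array_view<int, 1> av_out{hc::extent<1>{1024}, out.data()};

        // Writes through av_out are tracked and flushed lazily.
        hc::parallel_for_each(av_out.get_extent(), [=](hc::index<1> i) [[hc]] {
            av_out[i] = av_in[i] + 41;
        });

        av_out.synchronize(); // Copy the cached data back into out.

        return out[0] == 42 ? 0 : 1;
    }
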
Not fully optimised, or fully thread-safe. --- include/hc.hpp | 1882 ++++++++++++++------------------------- include/kalmar_launch.h | 23 +- 2 files changed, 697 insertions(+), 1208 deletions(-) diff --git a/include/hc.hpp b/include/hc.hpp index 27f573791af..da1c58b69aa 100644 --- a/include/hc.hpp +++ b/include/hc.hpp @@ -24,17 +24,17 @@ #include "hcc_features.hpp" -//#include -//#include - -#include "/opt/rocm/include/hsa/hsa.h" -#include "/opt/rocm/include/hsa/hsa_ext_amd.h" +// #include +// #include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -74,10 +74,10 @@ using namespace detail::CLAMP; class accelerator; class accelerator_view; class completion_future; -template class extent; -template class tiled_extent; -template class array_view; -template class array; +template class extent; +template class tiled_extent; +template class array_view; +template class array; @@ -138,6 +138,7 @@ inline uint64_t get_tick_frequency() { */ class accelerator_view { public: + accelerator_view() = delete; /** * Copy-constructs an accelerator_view object. This function does a shallow * copy with the newly created accelerator_view object pointing to the same @@ -145,8 +146,7 @@ class accelerator_view { * * @param[in] other The accelerator_view object to be copied. */ - accelerator_view(const accelerator_view& other) : - pQueue(other.pQueue) {} + accelerator_view(const accelerator_view&) = default; /** * Assigns an accelerator_view object to "this" accelerator_view object and @@ -157,10 +157,7 @@ class accelerator_view { * @param[in] other The accelerator_view object to be assigned from. * @return A reference to "this" accelerator_view object. */ - accelerator_view& operator=(const accelerator_view& other) { - pQueue = other.pQueue; - return *this; - } + accelerator_view& operator=(const accelerator_view&) = default; /** * Returns the queuing mode that this accelerator_view was created with. @@ -612,9 +609,12 @@ class accelerator_view { * - Dispatch the command into the queue and flush it to the GPU. * - Kernargs and signals are automatically reclaimed by the HCC runtime. */ - void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, - const void * args, size_t argsize, - hc::completion_future *cf=nullptr, const char *kernel_name = nullptr) + void dispatch_hsa_kernel( + const hsa_kernel_dispatch_packet_t* aql, + void* args, + size_t argsize, + hc::completion_future* cf = nullptr, + const char* kernel_name = nullptr) { pQueue->dispatch_hsa_kernel(aql, args, argsize, cf, kernel_name); } @@ -646,8 +646,8 @@ class accelerator_view { std::shared_ptr pQueue; friend class accelerator; - template friend class array; - template friend class array_view; + template friend class array; + template friend class array_view; template friend @@ -688,12 +688,6 @@ class accelerator_view { friend completion_future parallel_for_each( const accelerator_view&, const tiled_extent&, const Kernel&); - - accelerator_view() __CPU__ __HC__ { -#if __HCC_ACCELERATOR__ != 1 - throw runtime_exception{"errorMsg_throw", 0}; -#endif - } }; // ------------------------------------------------------------------------ @@ -708,6 +702,8 @@ class accelerator_view { class accelerator { public: + inline static constexpr const wchar_t cpu_accelerator[]{L"cpu"}; + /** * Constructs a new accelerator object that represents the default * accelerator. 
This is equivalent to calling the constructor @@ -1140,7 +1136,8 @@ class completion_future { * object which does not refer to any asynchronous operation. Default * constructed completion_future objects have valid() == false */ - completion_future() : __amp_future(), __thread_then(nullptr), __asyncOp(nullptr) {}; + completion_future() + : __amp_future(), __thread_then(nullptr), __asyncOp(nullptr) {}; /** * Copy constructor. Constructs a new completion_future object that refers @@ -1149,8 +1146,7 @@ class completion_future { * @param[in] other An object of type completion_future from which to * initialize this. */ - completion_future(const completion_future& other) - : __amp_future(other.__amp_future), __thread_then(other.__thread_then), __asyncOp(other.__asyncOp) {} + completion_future(const completion_future&) = default; /** * Move constructor. Move constructs a new completion_future object that @@ -1161,8 +1157,7 @@ class completion_future { * @param[in] other An object of type completion_future which the new * completion_future */ - completion_future(completion_future&& other) - : __amp_future(std::move(other.__amp_future)), __thread_then(other.__thread_then), __asyncOp(other.__asyncOp) {} + completion_future(completion_future&&) = default; /** * Copy assignment. Copy assigns the contents of other to this. This method @@ -1172,14 +1167,7 @@ class completion_future { * @param[in] other An object of type completion_future which is copy * assigned to this. */ - completion_future& operator=(const completion_future& _Other) { - if (this != &_Other) { - __amp_future = _Other.__amp_future; - __thread_then = _Other.__thread_then; - __asyncOp = _Other.__asyncOp; - } - return (*this); - } + completion_future& operator=(const completion_future&) = default; /** * Move assignment. Move assigns the contents of other to this. This method @@ -1190,14 +1178,7 @@ class completion_future { * @param[in] other An object of type completion_future which is move * assigned to this. */ - completion_future& operator=(completion_future&& _Other) { - if (this != &_Other) { - __amp_future = std::move(_Other.__amp_future); - __thread_then = _Other.__thread_then; - __asyncOp = _Other.__asyncOp; - } - return (*this); - } + completion_future& operator=(completion_future&&) = default; /** * This method is functionally identical to std::shared_future::get. @@ -1389,9 +1370,11 @@ class completion_future { std::thread* __thread_then = nullptr; std::shared_ptr __asyncOp; - completion_future(std::shared_ptr event) : __amp_future(*(event->getFuture())), __asyncOp(event) {} + completion_future(std::shared_ptr event) + : __amp_future{*(event->getFuture())}, __asyncOp{std::move(event)} + {} - completion_future(const std::shared_future &__future) + completion_future(const std::shared_future& __future) : __amp_future(__future), __thread_then(nullptr), __asyncOp(nullptr) {} friend class detail::HSAQueue; @@ -1436,7 +1419,7 @@ class completion_future { completion_future copy_async(const array_view& src, OutputIter destBegin); // array_view - template friend class array_view; + template friend class array_view; // accelerator_view friend class accelerator_view; @@ -1772,21 +1755,25 @@ class extent { * * @param[in] idx The right-hand index to be added or subtracted. 
*/ - extent operator+(const index& idx) __CPU__ __HC__ { + extent operator+(const index& idx) const [[cpu, hc]] + { extent __r = *this; __r += idx; return __r; } - extent operator-(const index& idx) __CPU__ __HC__ { + extent operator-(const index& idx) const [[cpu, hc]] + { extent __r = *this; __r -= idx; return __r; } - extent& operator+=(const index& idx) __CPU__ __HC__ { + extent& operator+=(const index& idx) [[cpu, hc]] + { base_.operator+=(idx.base_); return *this; } - extent& operator-=(const index& idx) __CPU__ __HC__ { + extent& operator-=(const index& idx) [[cpu, hc]] + { base_.operator-=(idx.base_); return *this; } @@ -3782,82 +3769,6 @@ struct __is_container }; -// ------------------------------------------------------------------------ -// utility helper classes for array -// ------------------------------------------------------------------------ - -template -struct array_projection_helper -{ - // array, where N>1 - // array_view operator[](int i0) __CPU__ __HC__; - // array_view operator[](int i0) const __CPU__ __HC__; - static_assert(N > 1, "projection_helper is only supported on array with a rank of 2 or higher"); - typedef array_view result_type; - typedef array_view const_result_type; - static result_type project(array& now, int stride) __CPU__ __HC__ { -#if __HCC_ACCELERATOR__ != 1 - if( stride < 0) - throw runtime_exception{"errorMsg_throw", 0}; -#endif - int comp[N - 1], i; - for (i = N - 1; i > 0; --i) - comp[i - 1] = now.extent[i]; - extent ext(comp); - int offset = ext.size() * stride; -#if __HCC_ACCELERATOR__ != 1 - if( offset >= now.extent.size()) - throw runtime_exception{"errorMsg_throw", 0}; -#endif - return result_type(now.m_device, ext, ext, index(), offset); - } - static const_result_type project(const array& now, int stride) __CPU__ __HC__ { - int comp[N - 1], i; - for (i = N - 1; i > 0; --i) - comp[i - 1] = now.extent[i]; - extent ext(comp); - int offset = ext.size() * stride; - return const_result_type(now.m_device, ext, ext, index(), offset); - } -}; - -template -struct array_projection_helper -{ - // array - // T& operator[](int i0) __CPU__ __HC__; - // const T& operator[](int i0) const __CPU__ __HC__; - typedef T& result_type; - typedef const T& const_result_type; - static result_type project(array& now, int i) __CPU__ __HC__ { -#if __HCC_ACCELERATOR__ != 1 - now.m_device.synchronize(true); -#endif - T *ptr = reinterpret_cast(now.m_device.get() + i); - return *ptr; - } - static const_result_type project(const array& now, int i) __CPU__ __HC__ { -#if __HCC_ACCELERATOR__ != 1 - now.m_device.synchronize(); -#endif - const T *ptr = reinterpret_cast(now.m_device.get() + i); - return *ptr; - } -}; - -template -const extent& check(const extent& ext) -{ -#if __HCC_ACCELERATOR__ != 1 - for (int i = 0; i < N; i++) - { - if(ext[i] <=0) - throw runtime_exception{"errorMsg_throw", 0}; - } -#endif - return ext; -} - // ------------------------------------------------------------------------ // forward declarations of copy routines used by array / array_view // ------------------------------------------------------------------------ @@ -4196,7 +4107,7 @@ class array : private array_base { const hc::extent& ext, accelerator_view av, access_type cpu_access_type = access_type_auto) - : + try : owner_{std::move(av)}, associate_{owner_}, extent_{ext}, @@ -4204,6 +4115,11 @@ class array : private array_base { data_{allocate_(), Deleter{}}, this_idx_{lock_this_()} {} + catch (const std::exception& ex) { + if (ext.size() != 0) throw ex; + + throw std::domain_error{"Tried to 
construct zero-sized array."}; + } /** @{ */ /** @@ -4213,30 +4129,34 @@ class array : private array_base { : array{ hc::extent{e0}, + static_cast(accelerator_pointer), accelerator::get_auto_selection_view(), - static_cast(accelerator_pointer)} + access_type_none} {} array(int e0, int e1, void* accelerator_pointer) : array{ hc::extent{e0, e1}, + static_cast(accelerator_pointer), accelerator::get_auto_selection_view(), - static_cast(accelerator_pointer)} + access_type_none} {} array(int e0, int e1, int e2, void* accelerator_pointer) : array{ hc::extent{e0, e1, e2}, + static_cast(accelerator_pointer), accelerator::get_auto_selection_view(), - static_cast(accelerator_pointer)} + access_type_none} {} array(const hc::extent& ext, void* accelerator_pointer) : array{ ext, + static_cast(accelerator_pointer), accelerator::get_auto_selection_view(), - static_cast(accelerator_pointer)} + access_type_none} {} /** @} */ @@ -4253,15 +4173,16 @@ class array : private array_base { const extent& ext, accelerator_view av, void* accelerator_pointer, - access_type cpu_access_type = access_type_auto) + access_type cpu_access_type = access_type_none) : - owner_{av}, - associate_{owner_}, - extent_{ext}, - cpu_access_{cpu_access_type}, - data_{static_cast(accelerator_pointer), Deleter{}}, - this_idx_{lock_this_()} - {} + array{ + ext, + static_cast(accelerator_pointer), + std::move(av), + cpu_access_type} + { + // TODO: handle access types other than none. + } /** @{ */ /** @@ -4489,7 +4410,7 @@ class array : private array_base { const hc::extent& ext, accelerator_view av, accelerator_view associated_av) - : + try : owner_{std::move(av)}, associate_{std::move(associated_av)}, extent_{ext}, @@ -4497,6 +4418,11 @@ class array : private array_base { data_{allocate_(), Deleter{}}, this_idx_{lock_this_()} {} + catch (const std::exception& ex) { + if (ext.size() != 0) throw ex; + + throw std::domain_error{"Tried to construct zero-sized array."}; + } /** @{ */ /** @@ -4732,7 +4658,7 @@ class array : private array_base { * @return Returns *this. */ array& operator=(array&& other) - { // TODO: potentially inefficient. + { // TODO: fix infinite recursion, this is temporary bad, explosive juju. array tmp{std::move(other)}; std::swap(*this, tmp); @@ -5275,12 +5201,44 @@ class array : private array_base { struct array_view_base { inline static constexpr std::size_t max_array_view_cnt_{65536}; - inline static std::mutex mutex_; // TODO: use shared_mutex if C++17 feasible - inline static std::unordered_map> cache_{}; + inline static std::array< // TODO: this is a placeholder, and most dubious. + std::pair< + std::atomic, + std::pair>>>, + max_array_view_cnt_> writers_{}; + inline static std::mutex mutex_{}; // TODO: use shared_mutex if C++17 feasible. 
+ inline static std::unordered_map< + const void*, std::shared_ptr> cache_{}; + inline thread_local static std::vector captured_{}; static - const std::shared_ptr& cache_for_(void* ptr, std::size_t byte_cnt) + const std::shared_ptr& cache_for_sourceless_( + void* ptr, std::size_t byte_cnt) + { + static const accelerator acc{}; + + auto s = hsa_memory_allocate( + *static_cast(acc.get_hsa_am_system_region()), + byte_cnt, + &ptr); + + if (s != HSA_STATUS_SUCCESS) { + throw std::runtime_error{ + "Failed cache allocation for sourceless array_view."}; + } + + std::lock_guard lck{mutex_}; + + return cache_.emplace( + std::piecewise_construct, std::make_tuple(ptr), + std::make_tuple(ptr, hsa_memory_free)).first->second; + } + + const std::shared_ptr& cache_for_( + const void* ptr, std::size_t byte_cnt) { + if (ptr == this) return cache_for_sourceless_(this, byte_cnt); + std::lock_guard lck{mutex_}; const auto it = cache_.find(ptr); @@ -5304,6 +5262,17 @@ struct array_view_base { std::make_tuple(ptr), std::make_tuple(tmp, hsa_memory_free)).first->second; } + + static + std::size_t writers_for_() + { + for (decltype(writers_.size()) i = 0u; i != writers_.size(); ++i) { + if (writers_[i].first++ == 0) return i; + else --writers_[i].first; + } + + throw std::runtime_error{"Failed to associate writers for array_view."}; + } }; template @@ -5315,11 +5284,26 @@ class array_view : private array_view_base { std::is_trivially_destructible{}, "Only trivially destructible types are supported."); - std::shared_ptr data_; - accelerator_view owner_; + using ValT_ = typename std::remove_const::type; + + // TODO: compress data layout to make array_view more pointer like in cost. + #if !defined(__HCC_ACCELERATOR__) // TODO: temporary, assess shared_ptr use. + std::shared_ptr data_; + #else + struct { + typename std::aligned_storage< + sizeof(std::shared_ptr), + alignof(std::shared_ptr)>::type pad_; + + void* get() const [[cpu, hc]] { return nullptr; } + } data_; + #endif + const accelerator* owner_; hc::extent extent_; T* base_ptr_; - void* source_; + typename std::conditional< + std::is_const{}, const void*, void*>::type source_; + std::size_t writers_for_this_; template friend class array; template friend class array_view; @@ -5339,6 +5323,27 @@ class array_view : private array_view_base { template friend void copy(const array_view&, const array_view&); + + T* updated_data_() const [[cpu]] + { + if (writers_for_this_ == max_array_view_cnt_) return base_ptr_; + if (writers_[writers_for_this_].second.second.empty()) return base_ptr_; + + std::lock_guard lck{ + writers_[writers_for_this_].second.first}; + + for (auto&& x : writers_[writers_for_this_].second.second) { + if (!x.valid()) continue; + x.wait(); + } + writers_[writers_for_this_].second.second.clear(); + + return base_ptr_; + } + T* updated_data_() const [[hc]] + { + return base_ptr_; + } public: /** * The rank of this array. @@ -5363,11 +5368,23 @@ class array_view : private array_view_base { * @param[in] src An array which contains the data that this array_view is * bound to. */ - array_view(hc::array& src) [[cpu, hc]] + array_view(hc::array& src) [[cpu]] : array_view{src.get_extent(), src.data()} { // TODO: refactor to pass owner directly to delegated to ctor. - owner_ = src.get_accelerator_view(); + static const auto accs = accelerator::get_all(); + + for (auto&& acc : accs) { + if (acc != src.get_accelerator_view().get_accelerator()) continue; + + owner_ = &acc; + break; + } + + copy(src, base_ptr_); // TODO: could directly re-use the array storage. 
} + array_view(hc::array& src) [[hc]] + : array_view{src.get_extent(), src.data()} + {} template< typename Container, @@ -5400,7 +5417,7 @@ class array_view : private array_view_base { : array_view{extent, src.data()} { static_assert( - std::is_same::value, + std::is_same::value, "container element type and array view element type must match"); } @@ -5415,15 +5432,38 @@ class array_view : private array_view_base { * @param[in] ext The extent of this array_view. */ array_view(const hc::extent& ext, value_type* src) [[cpu]] - : + try : data_{cache_for_(src, ext.size() * sizeof(T))}, - owner_{accelerator{L"cpu"}.get_default_view()}, + owner_{nullptr}, extent_{ext}, base_ptr_{static_cast(data_.get())}, - source_{src} - {} + source_{(src == reinterpret_cast(this)) ? base_ptr_ : src}, + writers_for_this_{ + std::is_const{} ? max_array_view_cnt_ : writers_for_()} + { + if (source_ == base_ptr_) return; + + auto s = hsa_memory_copy( + const_cast(base_ptr_), // + source_, + extent_.size() * sizeof(T)); + + if (s == HSA_STATUS_SUCCESS) return; + + throw std::runtime_error{"Failed to copy source data into array_view."}; + } + catch (const std::exception& ex) { + if (ext.size() != 0) throw ex; + + throw std::domain_error{"Tried to construct zero-sized array_view."}; + } array_view(const hc::extent& ext, value_type* src) [[hc]] - : data_{nullptr, [](void*){}}, extent_{ext}, base_ptr_{src} + : + owner_{nullptr}, + extent_{ext}, + base_ptr_{src}, + source_{nullptr}, + writers_for_this_{max_array_view_cnt_} {} /** @@ -5523,8 +5563,73 @@ class array_view : private array_view_base { * array_view from which to initialize this * new array_view. */ - array_view(const array_view& other) [[cpu, hc]] = default; + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + array_view(const array_view& other) [[cpu]] + : + data_{other.data_}, + owner_{other.owner_}, + extent_{other.extent_}, + base_ptr_{other.base_ptr_}, + source_{other.source_}, + writers_for_this_{other.writers_for_this_} + { // N.B.: this is coupled with make_registered_kernel, and relies on it + // copying the user provided Callable. + ++writers_[writers_for_this_].first; + captured_.push_back(writers_for_this_); + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + array_view(const array_view& other) [[cpu]] + : + data_{other.data_}, + owner_{other.owner_}, + extent_{other.extent_}, + base_ptr_{other.base_ptr_}, + source_{other.source_}, + writers_for_this_{other.writers_for_this_} + { + ++writers_[writers_for_this_].first; + } + + array_view(const array_view& other) [[hc]] + : + owner_{nullptr}, + extent_{other.extent_}, + base_ptr_{other.base_ptr_}, + writers_for_this_{max_array_view_cnt_} + {} + template< + typename U, + typename V = T, + typename std::enable_if< + !std::is_const{} && std::is_const{}>::type* = nullptr> + array_view(const array_view& other) [[cpu]] + : + data_{other.data_}, + owner_{other.owner_}, + extent_{other.extent_}, + base_ptr_{other.base_ptr_}, + source_{other.source_}, + writers_for_this_{other.writers_for_this_} + { + ++writers_[writers_for_this_].first; + } + template< + typename U, + typename V = T, + typename std::enable_if< + !std::is_const{} && std::is_const{}>::type* = nullptr> + array_view(const array_view& other) [[hc]] + : + owner_{nullptr}, + extent_{other.extent_}, + base_ptr_{other.base_ptr_}, + writers_for_this_{max_array_view_cnt_} + {} /** * Move constructor. Constructs an array_view from the supplied argument * other. 
@@ -5536,13 +5641,15 @@ class array_view : private array_view_base { array_view(array_view&& other) [[cpu, hc]] : data_{std::move(other.data_)}, - owner_{std::move(other.owner_)}, + owner_{other.owner_}, extent_{std::move(other.extent_)}, base_ptr_{other.base_ptr_}, - source_{other.source_} + source_{other.source_}, + writers_for_this_{other.writers_for_this_} { other.base_ptr_ = nullptr; other.source_ = nullptr; + other.writers_for_this_ = max_array_view_cnt_; } /** @@ -5564,7 +5671,10 @@ class array_view : private array_view_base { */ accelerator_view get_source_accelerator_view() const { - return owner_; + static const auto cpu_av{ + accelerator{accelerator::cpu_accelerator}.get_default_view()}; + + return owner_ ? owner_->get_default_view() : cpu_av; } /** @@ -5575,7 +5685,15 @@ class array_view : private array_view_base { * into this array. * @return Returns *this. */ - array_view& operator=(const array_view& other) [[cpu, hc]] = default; + array_view& operator=(const array_view& other) [[cpu, hc]] + { + using std::swap; + + array_view tmp{other}; + swap(*this, tmp); + + return *this; + } /** * Moves the contents of the array_view "other" to this array_view, leaving @@ -5585,13 +5703,30 @@ class array_view : private array_view_base { * into this array. * @return Returns *this. */ - array_view& operator=(array_view&& other) - { + array_view& operator=(array_view&& other) [[cpu]] + { // TODO: redo. + using std::swap; + + swap(data_, other.data_); + swap(owner_, other.owner_); + swap(extent_, other.extent_); + swap(base_ptr_, other.base_ptr_); + swap(source_, other.source_); + swap(writers_for_this_, other.writers_for_this_); + + return *this; + } + array_view& operator=(array_view&& other) [[hc]] + { // TODO: redo. using std::swap; - swap(*this, other); + + swap(owner_, other.owner_); + swap(extent_, other.extent_); + swap(base_ptr_, other.base_ptr_); return *this; } + /** * Copies the data referred to by this array_view to the array given by * "dest", as if by calling "copy(*this, dest)" @@ -5631,7 +5766,14 @@ class array_view : private array_view_base { * * @return A pointer to the first element in the linearised array. */ - T* data() const [[cpu, hc]] + T* data() const [[cpu]] + { + static_assert( + N == 1, "data() is only permissible on array views of rank 1"); + + return updated_data_(); + } + T* data() const [[hc]] { static_assert( N == 1, "data() is only permissible on array views of rank 1"); @@ -5645,9 +5787,9 @@ class array_view : private array_view_base { * @return A (const) pointer to the first element in the array_view on the * device memory. */ - T* accelerator_pointer() const [[cpu, hc]] // TODO: this should also be removed. + T* accelerator_pointer() const [[cpu, hc]] // TODO: this should be removed. 
{ - return data(); + return base_ptr_; } /** @@ -5657,15 +5799,16 @@ class array_view : private array_view_base { */ void refresh() const { - static const auto cpu_av = accelerator{L"cpu"}.get_default_view(); + static const accelerator cpu{accelerator::cpu_accelerator}; - if (owner_ == cpu_av) return; + if (owner_ && *owner_ == cpu) return; + if (base_ptr_ == source_) return; auto s = hsa_memory_copy( - base_ptr_, source_, extent_.size() * sizeof(T)); - if (s != HSA_STATUS_SUCCESS) { - throw std::runtime_error{"Failed to refresh cache for array_view."}; - } + const_cast(base_ptr_), source_, extent_.size() * sizeof(T)); + if (s == HSA_STATUS_SUCCESS) return; + + throw std::runtime_error{"Failed to refresh cache for array_view."}; } /** @@ -5701,13 +5844,25 @@ class array_view : private array_view_base { * type of access on the data source that the array_view is * synchronized for. */ + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> void synchronize(access_type type = access_type_read) const { - static const auto cpu_av = accelerator{L"cpu"}.get_default_view(); - - if (owner_ == cpu_av) return; if (type == access_type_none || type == access_type_write) return; + { + std::lock_guard lck{ + writers_[writers_for_this_].second.first}; + + for (auto&& x : writers_[writers_for_this_].second.second) { + if (x.valid()) x.wait(); + } + writers_[writers_for_this_].second.second.clear(); + } + + if (source_ == base_ptr_) return; + auto s = hsa_memory_copy( source_, base_ptr_, extent_.size() * sizeof(T)); @@ -5715,6 +5870,13 @@ class array_view : private array_view_base { throw std::runtime_error{"Failed to synchronise array_view."}; } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + void synchronize(access_type = access_type_read) const + { + return; + } /** * An asynchronous version of synchronize, which returns a completion @@ -5770,8 +5932,10 @@ class array_view : private array_view_base { */ void synchronize_to( const accelerator_view& av, access_type type = access_type_read) const - { - if (av != owner_) synchronize(type); + { // TODO: assess optimisation opportunities. + if (owner_ && av.get_accelerator() == *owner_) return; + + synchronize(type); } /** @@ -5793,8 +5957,9 @@ class array_view : private array_view_base { const accelerator_view& av, access_type type = access_type_read) const { if (type == access_type_none || type == access_type_write) return {}; + if (owner_ && av.get_accelerator() == *owner_) return {}; - if (av != owner_) return synchronize_async(type); + return synchronize_async(type); } /** @@ -5817,16 +5982,12 @@ class array_view : private array_view_base { * @param[in] idx An object of type index that specifies the location of * the element. */ - T& operator[](const index& idx) const [[cpu]] - { - return data()[detail::amp_helper, hc::extent>:: - flatten(idx, extent_)]; - } - T& operator[](const index& idx) const [[hc]] + T& operator[](const index& idx) const [[cpu, hc]] { - return data()[detail::amp_helper, hc::extent>:: + return updated_data_()[detail::amp_helper, hc::extent>:: flatten(idx, extent_)]; } + template::type* = nullptr> T& operator[](int i0) const [[cpu]][[hc]] { @@ -5921,9 +6082,8 @@ class array_view : private array_view_base { hc::extent ext; for (auto i = 1; i != N; ++i) ext[i - 1] = extent_[i]; - array_view tmp{ext, static_cast(source_)}; // TODO: this is incorrect. + array_view tmp{ext, static_cast(base_ptr_)}; // TODO: this is incorrect. 
tmp.base_ptr_ += i0 * ext.size(); - tmp.source_ += i0 * ext.size(); return tmp; } @@ -5954,15 +6114,34 @@ class array_view : private array_view_base { * and with the specified extent. */ array_view section( - const index& idx, const hc::extent& ext) const [[cpu]] + const index& origin, const hc::extent& ext) const [[cpu]] { - // if (!detail::amp_helper, hc::extent>::contains(idx, ext, extent_)) - // throw runtime_exception{"errorMsg_throw", 0}; + if (extent_.size() < (ext + origin).size()) { + throw runtime_exception{"errorMsg_throw", 0}; + } - // array_view av(cache, ext, extent_base, idx + index_base, offset); + const auto dx = detail::amp_helper, hc::extent>:: + flatten(origin, extent_); - // return av; - return *this; + array_view tmp{*this}; + tmp.extent_ = ext; + tmp.base_ptr_ += dx; + tmp.source_ = static_cast(tmp.source_) + dx; + + return tmp; + } + array_view section( + const index& origin, const hc::extent& ext) const [[hc]] + { + const auto dx = detail::amp_helper, hc::extent>:: + flatten(origin, extent_); + + array_view tmp{*this}; + tmp.extent_ = ext; + tmp.base_ptr_ += dx; + tmp.source_ = static_cast(tmp.source_) + dx; + + return tmp; } /** @@ -6089,7 +6268,13 @@ class array_view : private array_view_base { ~array_view() [[cpu]][[hc]] { #if __HCC_ACCELERATOR__ != 1 - synchronize(access_type_read_write); + if (!data_) return; + + if (writers_for_this_ != max_array_view_cnt_) { + if (--writers_[writers_for_this_].first == 0) { + synchronize(access_type_read_write); + } + } std::lock_guard lck{mutex_}; @@ -6099,842 +6284,61 @@ class array_view : private array_view_base { }; // ------------------------------------------------------------------------ -// array_view (read-only) +// copy // ------------------------------------------------------------------------ /** - * The partial specialization array_view represents a view over - * elements of type const T with rank N. The elements are readonly. At the - * boundary of a call site (such as parallel_for_each), this form of array_view - * need only be copied to the target accelerator if it isn't already there. It - * will not be copied out. + * The contents of "src" are copied into "dest". The source and destination may + * reside on different accelerators. If the extents of "src" and "dest" don't + * match, a runtime exception is thrown. + * + * @param[in] src An object of type array to be copied from. + * @param[out] dest An object of type array to be copied to. */ -template -class array_view +template +inline +void copy(const array& src, array& dest) { -public: - typedef typename std::remove_const::type nc_T; - -#if __HCC_ACCELERATOR__ == 1 - typedef detail::_data acc_buffer_t; -#else - typedef detail::_data_host acc_buffer_t; -#endif - - /** - * The rank of this array. - */ - static const int rank = N; + if (src.get_extent() != dest.get_extent()) { + throw std::logic_error{"Tried to copy arrays of mismatched extents."}; + } - /** - * The element type of this array. - */ - typedef const T value_type; + src.get_accelerator_view().wait(); // TODO: overly conservative, temporary. - /** - * There is no default constructor for array_view. - */ - array_view() = delete; + auto s = hsa_memory_copy( + dest.data(), src.data(), src.get_extent().size() * sizeof(T)); - /** - * Constructs an array_view which is bound to the data contained in the - * "src" array. The extent of the array_view is that of the src array, and - * the origin of the array view is at zero. 
- * - * @param[in] src An array which contains the data that this array_view is - * bound to. - */ - array_view(const array& src) __CPU__ __HC__ - : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {} + if (s == HSA_STATUS_SUCCESS) return; - // FIXME: following interfaces were not implemented yet - // template - // explicit array_view::array_view(const Container& src); - // template - // explicit array_view::array_view(const value_type (&src) [Size]) __CPU__ __HC__; - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" container. The extent of the array_view is that given by the - * "extent" argument, and the origin of the array view is at zero. - * - * @param[in] src A template argument that must resolve to a linear - * container that supports .data() and .size() members (such - * as std::vector or std::array) - * @param[in] extent The extent of this array_view. - */ - template ::value>::type> - array_view(const extent& extent, const Container& src) - : array_view(extent, src.data()) - { static_assert( std::is_same::type>::type, T>::value, "container element type and array view element type must match"); } - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" container. The extent of the array_view is that given by the - * "extent" argument, and the origin of the array view is at zero. - * - * @param[in] src A pointer to the source data this array_view will bind - * to. If the number of elements pointed to is less than the - * size of extent, the behavior is undefined. - * @param[in] ext The extent of this array_view. - */ - array_view(const hc::extent& ext, const value_type* src) __CPU__ __HC__ -#if __HCC_ACCELERATOR__ == 1 - : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {} -#else - : cache(ext.size(), src), extent(ext), extent_base(ext), offset(0) {} -#endif - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - * @param[in] src A template argument that must resolve to a contiguous - * container that supports .data() and .size() members (such - * as std::vector or std::array) - */ - template ::value>::type> - array_view(int e0, Container& src) : array_view(hc::extent<1>(e0), src) {} - template ::value>::type> - array_view(int e0, int e1, Container& src) - : array_view(hc::extent(e0, e1), src) {} - template ::value>::type> - array_view(int e0, int e1, int e2, Container& src) - : array_view(hc::extent(e0, e1, e2), src) {} - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - * @param[in] src A pointer to the source data this array_view will bind - * to. If the number of elements pointed to is less than - * the size of extent, the behavior is undefined. - */ - array_view(int e0, const value_type *src) __CPU__ __HC__ - : array_view(hc::extent<1>(e0), src) {} - array_view(int e0, int e1, const value_type *src) __CPU__ __HC__ - : array_view(hc::extent<2>(e0, e1), src) {} - array_view(int e0, int e1, int e2, const value_type *src) __CPU__ __HC__ - : array_view(hc::extent<3>(e0, e1, e2), src) {} - - /** - * Copy constructor. Constructs an array_view from the supplied argument - * other. A shallow copy is performed. 
-     */
-    template <typename Container,
-              class = typename std::enable_if<detail::__is_container<Container>::value>::type>
-        array_view(const extent<N>& extent, const Container& src)
-            : array_view(extent, src.data())
-    { static_assert( std::is_same<typename std::remove_const<typename std::remove_reference<decltype(*src.data())>::type>::type, T>::value, "container element type and array view element type must match"); }
-
-    /**
-     * Constructs an array_view which is bound to the data pointed to by "src".
-     * The extent of the array_view is that given by the "ext" argument, and
-     * the origin of the array view is at zero.
-     *
-     * @param[in] src A pointer to the source data this array_view will bind
-     *                to. If the number of elements pointed to is less than the
-     *                size of extent, the behavior is undefined.
-     * @param[in] ext The extent of this array_view.
-     */
-    array_view(const hc::extent<N>& ext, const value_type* src) __CPU__ __HC__
-#if __HCC_ACCELERATOR__ == 1
-        : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {}
-#else
-        : cache(ext.size(), src), extent(ext), extent_base(ext), offset(0) {}
-#endif
-
-    /**
-     * Equivalent to construction using
-     * "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array_view.
-     * @param[in] src A template argument that must resolve to a contiguous
-     *                container that supports .data() and .size() members (such
-     *                as std::vector or std::array)
-     */
-    template <typename Container,
-              class = typename std::enable_if<detail::__is_container<Container>::value>::type>
-    array_view(int e0, Container& src) : array_view(hc::extent<1>(e0), src) {}
-    template <typename Container,
-              class = typename std::enable_if<detail::__is_container<Container>::value>::type>
-    array_view(int e0, int e1, Container& src)
-        : array_view(hc::extent<2>(e0, e1), src) {}
-    template <typename Container,
-              class = typename std::enable_if<detail::__is_container<Container>::value>::type>
-    array_view(int e0, int e1, int e2, Container& src)
-        : array_view(hc::extent<3>(e0, e1, e2), src) {}
-
-    /**
-     * Equivalent to construction using
-     * "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array_view.
-     * @param[in] src A pointer to the source data this array_view will bind
-     *                to. If the number of elements pointed to is less than
-     *                the size of extent, the behavior is undefined.
-     */
-    array_view(int e0, const value_type *src) __CPU__ __HC__
-        : array_view(hc::extent<1>(e0), src) {}
-    array_view(int e0, int e1, const value_type *src) __CPU__ __HC__
-        : array_view(hc::extent<2>(e0, e1), src) {}
-    array_view(int e0, int e1, int e2, const value_type *src) __CPU__ __HC__
-        : array_view(hc::extent<3>(e0, e1, e2), src) {}
-
-    /**
-     * Copy constructor. Constructs an array_view from the supplied argument
-     * other. A shallow copy is performed.
-     *
-     * @param[in] other An object of type array_view<T, N> or
-     *                  array_view<const T, N> from which to initialize this
-     *                  new array_view.
-     */
-    array_view(const array_view<nc_T, N>& other) __CPU__ __HC__
-        : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
-
-    /**
-     * Copy constructor. Constructs an array_view from the supplied argument
-     * other. A shallow copy is performed.
-     *
-     * @param[in] other An object of type array_view<const T, N> from which to
-     *                  initialize this new array_view.
-     */
-    array_view(const array_view& other) __CPU__ __HC__
-        : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
-
-    /**
-     * Access the extent that defines the shape of this array_view.
-     */
-    hc::extent<N> get_extent() const __CPU__ __HC__ { return extent; }
-
-    /**
-     * Access the accelerator_view where the data source of the array_view is
-     * located.
-     *
-     * When the data source of the array_view is native CPU memory, the method
-     * returns accelerator(accelerator::cpu_accelerator).default_view. When the
-     * data source underlying the array_view is an array, the method returns
-     * the accelerator_view where the source array is located.
-     */
-    accelerator_view get_source_accelerator_view() const { return cache.get_av(); }
-
-    /** @{ */
-    /**
-     * Assigns the contents of the array_view "other" to this array_view, using
-     * a shallow copy. Both array_views will refer to the same data.
-     *
-     * @param[in] other An object of type array_view<T, N> from which to copy
-     *                  into this array_view.
-     * @return Returns *this.
-     */
-    array_view& operator=(const array_view<nc_T, N>& other) __CPU__ __HC__ {
-        cache = other.cache;
-        extent = other.extent;
-        index_base = other.index_base;
-        extent_base = other.extent_base;
-        offset = other.offset;
-        return *this;
-    }
-
-    array_view& operator=(const array_view& other) __CPU__ __HC__ {
-        if (this != &other) {
-            cache = other.cache;
-            extent = other.extent;
-            index_base = other.index_base;
-            extent_base = other.extent_base;
-            offset = other.offset;
-        }
-        return *this;
-    }
-
-    /** @} */
-
-    /**
-     * Copies the data referred to by this array_view to the array given by
-     * "dest", as if by calling "copy(*this, dest)"
-     *
-     * @param[in] dest An object of type array<T, N> to which to copy data from
-     *                 this array_view.
-     */
-    void copy_to(array<T, N>& dest) const { copy(*this, dest); }
-
-    /**
-     * Copies the contents of this array_view to the array_view given by
-     * "dest", as if by calling "copy(*this, dest)"
-     *
-     * @param[in] dest An object of type array_view<T, N> to which to copy data
-     *                 from this array_view.
-     */
-    void copy_to(const array_view<T, N>& dest) const { copy(*this, dest); }
-
-    /**
-     * Returns a pointer to the first data element underlying this array_view.
-     * This is only available on array_views of rank 1.
-     *
-     * When the data source of the array_view is native CPU memory, the pointer
-     * returned by data() is valid for the lifetime of the data source.
-     *
-     * When the data source underlying the array_view is an array, or the array
-     * view is created without a data source, the pointer returned by data() in
-     * CPU context is ephemeral and is invalidated when the original data
-     * source or any of its views are accessed on an accelerator_view through a
-     * parallel_for_each or a copy operation.
-     *
-     * @return A const pointer to the first element in the linearised array.
-     */
-    const T* data() const __CPU__ __HC__ {
-#if __HCC_ACCELERATOR__ != 1
-        cache.get_cpu_access();
-#endif
-        static_assert(N == 1, "data() is only permissible on array views of rank 1");
-        return reinterpret_cast<const T*>(cache.get() + offset + index_base[0]);
-    }
-
-    /**
-     * Returns a pointer to the device memory underlying this array_view.
-     *
-     * @return A (const) pointer to the first element in the array_view on the
-     *         device memory.
-     */
-    T* accelerator_pointer() const __CPU__ __HC__ {
-        return reinterpret_cast<T*>(cache.get_device_pointer() + offset + index_base[0]);
-    }
-
-    /**
-     * Calling this member function informs the array_view that its bound
-     * memory has been modified outside the array_view interface. This will
-     * render all cached information stale.
-     */
-    void refresh() const { cache.refresh(); }
-
-    /**
-     * Calling this member function synchronizes any modifications made to the
-     * data underlying "this" array_view to its source data container. For
-     * example, for an array_view on system memory, if the data underlying the
-     * view are modified on a remote accelerator_view through a
-     * parallel_for_each invocation, calling synchronize ensures that the
-     * modifications are synchronized to the source data and will be visible
-     * through the system memory pointer which the array_view was created over.
-     *
-     * For writable array_view objects, callers of this function can
-     * optionally specify the type of access desired on the source data
-     * container through the "type" parameter. For example, specifying
-     * "access_type_read" (which is also the default value of the parameter)
-     * indicates that the data has been synchronized to its source location
-     * only for reading. On the other hand, specifying an access_type of
-     * "access_type_read_write" synchronizes the data to its source location
-     * both for reading and writing; i.e. any modifications to the source data
-     * directly through the source data container are legal after synchronizing
-     * the array_view with write access and before subsequently accessing the
-     * array_view on another remote location.
-     *
-     * It is advisable to be precise about the access_type specified in the
-     * synchronize call; i.e. if only write access is required, specifying
-     * access_type_write may yield better performance than calling synchronize
-     * with "access_type_read_write", since the latter may require any
-     * modifications made to the data on remote locations to be synchronized to
-     * the source location, which is unnecessary if the contents are intended
-     * to be overwritten without reading.
-     */
-    void synchronize() const { cache.get_cpu_access(); }
-
-    /**
-     * An asynchronous version of synchronize, which returns a completion
-     * future object. When the future is ready, the synchronization operation
-     * is complete.
-     *
-     * @return An object of type completion_future that can be used to
-     *         determine the status of the asynchronous operation or can be
-     *         used to chain other operations to be executed after the
-     *         completion of the asynchronous operation.
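-     *
-     * A minimal sketch (illustrative; "av" is an existing array_view and
-     * other_host_work() is a placeholder):
-     * @code{.cpp}
-     * completion_future cf = av.synchronize_async();
-     * other_host_work(); // overlaps with the synchronization
-     * cf.wait();         // host data is coherent from here on
-     * @endcode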
-     */
-    completion_future synchronize_async() const {
-        std::future<void> fut = std::async([&]() mutable { synchronize(); });
-        return completion_future(fut.share());
-    }
-
-    /**
-     * Calling this member function synchronizes any modifications made to the
-     * data underlying "this" array_view to the specified accelerator_view
-     * "av". For example, for an array_view on system memory, if the data
-     * underlying the view is modified on the CPU, and synchronize_to is called
-     * on "this" array_view, then the array_view contents are cached on the
-     * specified accelerator_view location.
-     *
-     * @param[in] av The target accelerator_view that "this" array_view is
-     *               synchronized for access on.
-     */
-    void synchronize_to(const accelerator_view& av) const [[cpu]]
-    {
-        cache.sync_to(av.pQueue);
-    }
-
-    /**
-     * An asynchronous version of synchronize_to, which returns a completion
-     * future object. When the future is ready, the synchronization operation
-     * is complete.
-     *
-     * @param[in] av The target accelerator_view that "this" array_view is
-     *               synchronized for access on.
-     * @param[in] type An argument of type "access_type" which specifies the
-     *                 type of access on the data source that the array_view is
-     *                 synchronized for.
-     * @return An object of type completion_future that can be used to
-     *         determine the status of the asynchronous operation or can be
-     *         used to chain other operations to be executed after the
-     *         completion of the asynchronous operation.
-     */
-    // FIXME: this method is not implemented yet
-    completion_future synchronize_to_async(const accelerator_view& av) const;
-
-    /** @{ */
-    /**
-     * Returns a const reference to the element of this array_view that is at
-     * the location in N-dimensional space specified by "idx".
-     *
-     * @param[in] idx An object of type index<N> that specifies the location of
-     *                the element.
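-     *
-     * A minimal sketch (illustrative; "data" points to at least 200 floats):
-     * @code{.cpp}
-     * array_view<const float, 2> av(extent<2>(20, 10), data);
-     * float x = av[index<2>(3, 4)]; // element at row 3, column 4
-     * @endcode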
-     */
-    const T& operator[](const index<N>& idx) const __CPU__ __HC__ {
-#if __HCC_ACCELERATOR__ != 1
-        cache.get_cpu_access();
-#endif
-        const T *ptr = reinterpret_cast<const T*>(cache.get() + offset);
-        return ptr[detail::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx + index_base, extent_base)];
-    }
-    const T& operator()(const index<N>& idx) const __CPU__ __HC__ {
-        return (*this)[idx];
-    }
-
-    /** @} */
-
-    /**
-     * Returns a reference to the element of this array_view that is at the
-     * location in N-dimensional space specified by "idx".
-     *
-     * Unlike the other indexing operators for accessing the array_view on the
-     * CPU, this method does not implicitly synchronize this array_view's
-     * contents to the CPU. After accessing the array_view on a remote location
-     * or performing a copy operation involving this array_view, users are
-     * responsible for explicitly synchronizing the array_view to the CPU
-     * before calling this method. Failure to do so results in undefined
-     * behavior.
-     */
-    // FIXME: this method is not implemented
-    const T& get_ref(const index<N>& idx) const __CPU__ __HC__;
-
-    /** @{ */
-    /**
-     * Equivalent to
-     * "array_view<const T, N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
-     *
-     * @param[in] i0,i1,i2 The component values that will form the index into
-     *                     this array.
-     */
-    const T& operator()(int i0) const __CPU__ __HC__ {
-        static_assert(N == 1, "const T& array_view<T, N>::operator()(int) is only permissible on array_view<T, 1>");
-        return (*this)[index<1>(i0)];
-    }
-
-    const T& operator()(int i0, int i1) const __CPU__ __HC__ {
-        static_assert(N == 2, "const T& array_view<T, N>::operator()(int, int) is only permissible on array_view<T, 2>");
-        return (*this)[index<2>(i0, i1)];
-    }
-    const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
-        static_assert(N == 3, "const T& array_view<T, N>::operator()(int, int, int) is only permissible on array_view<T, 3>");
-        return (*this)[index<3>(i0, i1, i2)];
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * This overload is defined for array_view<T, N> where @f$N \ge 2@f$.
-     *
-     * This mode of indexing is equivalent to projecting on the
-     * most-significant dimension. It allows C-style indexing. For example:
-     *
-     * @code{.cpp}
-     * array<float, 4> myArray(myExtents, ...);
-     *
-     * myArray[index<4>(5,4,3,2)] = 7;
-     * assert(myArray[5][4][3][2] == 7);
-     * @endcode
-     *
-     * @param[in] i0 An integer that is the index into the most-significant
-     *               dimension of this array.
-     * @return Returns an array_view whose dimension is one lower than that of
-     *         this array_view.
-     */
-    typename projection_helper<const T, N>::const_result_type
-        operator[] (int i) const __CPU__ __HC__ {
-        return projection_helper<const T, N>::project(*this, i);
-    }
-
-    // FIXME: typename projection_helper<const T, N>::const_result_type
-    //        operator() (int i0) const __CPU__ __HC__
-    //        is not implemented
-
-    /** @} */
-
-    /**
-     * Returns a subsection of the source array view at the origin specified by
-     * "idx" and with the extent specified by "ext".
-     *
-     * Example:
-     *
-     * @code{.cpp}
-     * array<float, 2> a(extent<2>(200,100));
-     * array_view<float, 2> v1(a);   // v1.extent = <200,100>
-     * array_view<float, 2> v2 = v1.section(index<2>(15,25), extent<2>(40,50));
-     * assert(v2(0,0) == v1(15,25));
-     * @endcode
-     *
-     * @param[in] idx Provides the offset/origin of the resulting section.
-     * @param[in] ext Provides the extent of the resulting section.
-     * @return Returns a subsection of the source array at specified origin,
-     *         and with the specified extent.
-     */
-    array_view<const T, N> section(const index<N>& idx,
-                     const hc::extent<N>& ext) const __CPU__ __HC__ {
-        array_view<const T, N> av(cache, ext, extent_base, idx + index_base, offset);
-        return av;
-    }
-
-    /**
-     * Equivalent to "section(idx, this->extent - idx)".
-     */
-    array_view<const T, N> section(const index<N>& idx) const __CPU__ __HC__ {
-        hc::extent<N> ext(extent);
-        detail::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
-        return section(idx, ext);
-    }
-
-    /**
-     * Equivalent to "section(index<N>(), ext)".
-     */
-    array_view<const T, N> section(const hc::extent<N>& ext) const __CPU__ __HC__ {
-        index<N> idx;
-        return section(idx, ext);
-    }
-
-    /** @{ */
-    /**
-     * Equivalent to
-     * "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
-     *
-     * @param[in] i0,i1,i2 The component values that will form the origin of
-     *                     the section
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     the section
-     */
-    array_view<const T, 1> section(int i0, int e0) const __CPU__ __HC__ {
-        static_assert(N == 1, "Rank must be 1");
-        return section(index<1>(i0), hc::extent<1>(e0));
-    }
-
-    array_view<const T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
-        static_assert(N == 2, "Rank must be 2");
-        return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
-    }
-
-    array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
-        static_assert(N == 3, "Rank must be 3");
-        return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
-    }
-
-    /** @} */
-
-    /**
-     * This member function is similar to "array<T, N>::reinterpret_as",
-     * although it only supports array_views of rank 1 (only those guarantee
-     * that all elements are laid out contiguously).
-     *
-     * The size of the reinterpreted ElementType must evenly divide into the
-     * total size of this array_view.
-     *
-     * @return Returns an array_view from this array_view with the element
-     *         type reinterpreted from T to ElementType.
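-     *
-     * Worked example (illustrative): a 100-element array_view<const float, 1>
-     * reinterpreted as unsigned char yields a 400-element view, since
-     * 100 * sizeof(float) / sizeof(unsigned char) == 400:
-     * @code{.cpp}
-     * array_view<const float, 1> fv(100, data);
-     * auto bv = fv.reinterpret_as<unsigned char>(); // bv.get_extent()[0] == 400
-     * @endcode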
-     */
-    template <typename ElementType>
-        array_view<const ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
-        static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
-#if __HCC_ACCELERATOR__ != 1
-        static_assert( ! (std::is_pointer<ElementType>::value ), "can't use pointer in the kernel");
-        static_assert( ! (std::is_same<ElementType, short>::value ), "can't use short in the kernel");
-#endif
-        int size = extent.size() * sizeof(T) / sizeof(ElementType);
-        using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
-        array_view<const ElementType, 1> av(buffer_type(cache),
-                    hc::extent<1>(size),
-                    (offset + index_base[0]) * sizeof(T) / sizeof(ElementType));
-        return av;
-    }
-
-    /**
-     * This member function is similar to "array<T, N>::view_as", although it
-     * only supports array_views of rank 1 (only those guarantee that all
-     * elements are laid out contiguously).
-     *
-     * @return Returns an array_view from this array_view with the rank
-     *         changed to K from 1.
-     */
-    template <int K>
-        array_view<const T, K> view_as(hc::extent<K> viewExtent) const __CPU__ __HC__ {
-        static_assert(N == 1, "view_as is only permissible on array views of rank 1");
-#if __HCC_ACCELERATOR__ != 1
-        if ( viewExtent.size() > extent.size())
-            throw runtime_exception{"errorMsg_throw", 0};
-#endif
-        array_view<const T, K> av(cache, viewExtent, offset + index_base[0]);
-        return av;
-    }
-
-    ~array_view() __CPU__ __HC__ = default;
-
-    // FIXME: the following functions may be considered to move to private
-    const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; }
-
-    int get_offset() const __CPU__ __HC__ { return offset; }
-
-    index<N> get_index_base() const __CPU__ __HC__ { return index_base; }
-
-private:
-    template <typename Q, int K> friend struct projection_helper;
-    template <typename Q, int K> friend struct array_projection_helper;
-    template <typename Q, int K> friend class array;
-    template <typename Q, int K> friend class array_view;
-
-    template <typename Q, int K>
-    friend
-    bool is_flat(const array_view<Q, K>&) noexcept;
-    template <typename Q, int K>
-    friend
-    void copy(const array<Q, K>&, const array_view<Q, K>&);
-    template <typename InputIter, typename Q, int K>
-    friend
-    void copy(InputIter, InputIter, const array_view<Q, K>&);
-    template <typename Q, int K>
-    friend
-    void copy(const array_view<Q, K>&, array<Q, K>&);
-    template <typename OutputIter, typename Q, int K>
-    friend
-    void copy(const array_view<Q, K>&, OutputIter);
-    template <typename Q, int K>
-    friend
-    void copy(const array_view<Q, K>&, const array_view<Q, K>&);
-
-    // used by view_as and reinterpret_as
-    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext,
-               int offset) __CPU__ __HC__
-        : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
-
-    // used by section and projection
-    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext_now,
-               const hc::extent<N>& ext_b,
-               const index<N>& idx_b, int off) __CPU__ __HC__
-        : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b),
-          offset(off) {}
-
-    acc_buffer_t cache;
-    hc::extent<N> extent;
-    hc::extent<N> extent_base;
-    index<N> index_base;
-    int offset;
-};
-
-// ------------------------------------------------------------------------
-// utility functions for copy
-// ------------------------------------------------------------------------
-
-template <typename T, int N>
-static inline bool is_flat(const array_view<T, N>& av) noexcept {
-    return av.extent == av.extent_base && av.index_base == index<N>();
-}
-
-template <typename T>
-static inline bool is_flat(const array_view<T, 1>&) noexcept { return true; }
-
-template <typename InputIter, typename T, int N, int dim>
-struct copy_input
-{
-    void operator()(InputIter& It, T* ptr, const extent<N>& ext,
-                    const extent<N>& base, const index<N>& idx)
-    {
-        size_t stride = 1;
-        for (int i = dim; i < N; i++)
-            stride *= base[i];
-        ptr += stride * idx[dim - 1];
-        for (int i = 0; i < ext[dim - 1]; i++) {
-            copy_input<InputIter, T, N, dim + 1>()(It, ptr, ext, base, idx);
-            ptr += stride;
-        }
-    }
-};
-
-template <typename InputIter, typename T, int N>
-struct copy_input<InputIter, T, N, N>
-{
-    void operator()(InputIter& It, T* ptr, const extent<N>& ext,
-                    const extent<N>&, const index<N>& idx)
-    {
-        InputIter end = It;
-        std::advance(end, ext[N - 1]);
-        std::copy(It, end, ptr + idx[N - 1]);
-        It = end;
-    }
-};
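-
-// Worked example (illustrative): copy_input peels one rank per recursion step.
-// For a 2D section with base extent {4, 8}, origin {1, 2} and section extent
-// {2, 4}, the dim == 1 step advances ptr by idx[0] * base[1] == 8 elements,
-// and the dim == N specialization then copies ext[1] == 4 contiguous elements
-// per row, starting at column idx[1] == 2.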
-
-template <typename OutputIter, typename T, int N, int dim>
-struct copy_output
-{
-    void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
-                    const extent<N>& base, const index<N>& idx)
-    {
-        size_t stride = 1;
-        for (int i = dim; i < N; i++)
-            stride *= base[i];
-        ptr += stride * idx[dim - 1];
-        for (int i = 0; i < ext[dim - 1]; i++) {
-            copy_output<OutputIter, T, N, dim + 1>()(ptr, It, ext, base, idx);
-            ptr += stride;
-        }
-    }
-};
-
-template <typename OutputIter, typename T, int N>
-struct copy_output<OutputIter, T, N, N>
-{
-    void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
-                    const extent<N>&, const index<N>& idx)
-    {
-        ptr += idx[N - 1];
-        It = std::copy(ptr, ptr + ext[N - 1], It);
-    }
-};
-
-template <typename T, int N, int dim>
-struct copy_bidir
-{
-    void operator()(const T* src, T* dst, const extent<N>& ext,
-                    const extent<N>& base1, const index<N>& idx1,
-                    const extent<N>& base2, const index<N>& idx2)
-    {
-        size_t stride1 = 1;
-        for (int i = dim; i < N; i++)
-            stride1 *= base1[i];
-        src += stride1 * idx1[dim - 1];
-
-        size_t stride2 = 1;
-        for (int i = dim; i < N; i++)
-            stride2 *= base2[i];
-        dst += stride2 * idx2[dim - 1];
-
-        for (int i = 0; i < ext[dim - 1]; i++) {
-            copy_bidir<T, N, dim + 1>()(src, dst, ext, base1, idx1, base2, idx2);
-            src += stride1;
-            dst += stride2;
-        }
-    }
-};
-
-template <typename T, int N>
-struct copy_bidir<T, N, N>
-{
-    void operator()(const T* src, T* dst, const extent<N>& ext,
-                    const extent<N>&, const index<N>& idx1,
-                    const extent<N>&, const index<N>& idx2)
-    {
-        src += idx1[N - 1];
-        dst += idx2[N - 1];
-        std::copy(src, src + ext[N - 1], dst);
-    }
-};
-
-template
-struct do_copy
-{
-    template