From d5b89dcb7f1bae5590782ae1ece56e9a0d1d5a4c Mon Sep 17 00:00:00 2001
From: Per Inge Mathisen
Date: Thu, 5 Dec 2024 16:28:24 +0100
Subject: [PATCH] The recently added no-multithread build options are now runtime options

lava-replay and lava-capture.py now both take a --no-multithread option.
The capture library checks the two new environment variables
LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT and
LAVATUBE_DISABLE_MULTITHREADED_COMPRESS.

This makes it easier to test these code paths, and allows users to try
them if they run into out-of-memory situations.
---
 CMakeLists.txt          |  6 +++++
 README.md               |  4 ++++
 scripts/lava-capture.py |  4 ++++
 src/filereader.h        | 38 ++++++++++++++++++++------------
 src/filewriter.cpp      | 36 +++++++++++++-----------------
 src/filewriter.h        | 49 ++++++++++++++++++++++++++++-------------
 src/replay.cpp          |  5 +++++
 src/util.cpp            |  3 +++
 src/util.h              |  3 +++
 src/write.cpp           |  2 ++
 10 files changed, 100 insertions(+), 50 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c624a2..c80abfb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -268,6 +268,7 @@ add_test(NAME trace_test_1_0_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-rep
 add_test(NAME trace_test_1_1_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -D tracing_1_2_1.vk)
 add_test(NAME trace_test_1_2_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -H 0 tracing_1_2_1.vk)
 add_test(NAME trace_test_2_1_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -B tracing_1_2_0.vk)
+add_test(NAME trace_test_2_2_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -nm tracing_1_2_0.vk)
 
 add_executable(tracing2 tests/tracing2.cpp ${VULKAN_TESTS_SRC})
 target_include_directories(tracing2 ${COMMON_INCLUDE})
@@ -283,6 +284,10 @@ add_test(NAME trace_test_2_chunksize COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing
 set_tests_properties(trace_test_2_chunksize PROPERTIES ENVIRONMENT "LAVATUBE_CHUNK_SIZE=32767")
 add_test(NAME trace_test_2_virtqueue COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
 set_tests_properties(trace_test_2_virtqueue PROPERTIES ENVIRONMENT "LAVATUBE_VIRTUAL_QUEUES=1")
+add_test(NAME trace_test_2_nompwrite COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
+set_tests_properties(trace_test_2_nompwrite PROPERTIES ENVIRONMENT "LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT=1")
+add_test(NAME trace_test_2_nompcompress COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
+set_tests_properties(trace_test_2_nompcompress PROPERTIES ENVIRONMENT "LAVATUBE_DISABLE_MULTITHREADED_COMPRESS=1")
 
 add_executable(tracing3 tests/tracing3.cpp ${VULKAN_TESTS_SRC})
 target_include_directories(tracing3 ${COMMON_INCLUDE})
@@ -495,6 +500,7 @@ set_tests_properties(layer_test_general_fencedelay PROPERTIES ENVIRONMENT "VK_LA
 add_test(NAME script_test_general COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_general.vk --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_general -f 0 -V 2)
 add_test(NAME script_test_general_fencedelay COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_general_fencedelay.vk --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d --delayfence 2 ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_general -f 1)
 add_test(NAME script_test_copying_1 COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_copying_1.vk --dedicated-buffer --gpu 0 --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_copying_1 -V 2)
+add_test(NAME script_test_copying_nomp COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_copying_nomp.vk --no-multithread --automate --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_copying_1 -V 3)
 #layer_test(general_vulkan13 general -V 3) # crashes on replay on privatedata
 layer_test(copying_1 copying_1)
 layer_test(copying_1_q1 copying_1 -q 1)
diff --git a/README.md b/README.md
index b7d70ea..3615ed4 100644
--- a/README.md
+++ b/README.md
@@ -163,6 +163,10 @@ one graphics queue family containing two queues. If the host system does not sup
 two queues, work for the second queue will be passed to the first queue. All other
 queue families and queues will be hidden.
 
+Lavatube uses separate threads, each with its own queue, for compression and for
+writeout to disk, which may cause you to run out of memory. To disable this, set the
+environment variables LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT and
+LAVATUBE_DISABLE_MULTITHREADED_COMPRESS.
 
 Further reading
 ===============
diff --git a/scripts/lava-capture.py b/scripts/lava-capture.py
index 13b7bf1..5bfa496 100755
--- a/scripts/lava-capture.py
+++ b/scripts/lava-capture.py
@@ -22,6 +22,7 @@ def args():
     parser.add_argument('--delayfence', dest='delayfence', metavar='', help='Delay successful fence waits the given number of times')
     parser.add_argument('--gpu', dest='gpu', metavar='', help='Use the specified GPU for tracing')
     parser.add_argument('--automate', dest='automate', action='store_true', help='Try to automate the run as much as possible if app supports CBS')
+    parser.add_argument('--no-multithread', dest='nomp', action='store_true', help='Turn off multi-threaded compression and disk writeout (saves memory)')
     parser.add_argument('programAndArgs', metavar=' []', nargs=argparse.REMAINDER, help='Application to capture and any program arguments')
     return parser
 
@@ -82,6 +83,9 @@ def PrintEnvVar(envVar):
 if args.log: os.environ['LAVATUBE_DEBUG_FILE'] = args.log
 if args.layer: os.environ['VK_LAYER_PATH'] = args.layer
 else: os.environ['VK_LAYER_PATH'] = '/opt/lavatube'
+if args.nomp:
+    os.environ['LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT'] = '1'
+    os.environ['LAVATUBE_DISABLE_MULTITHREADED_COMPRESS'] = '1'
 if args.dir is not None: os.chdir(args.dir)
 if not args.programAndArgs:
diff --git a/src/filereader.h b/src/filereader.h
index 8d30fa7..dad23ab 100644
--- a/src/filereader.h
+++ b/src/filereader.h
@@ -24,18 +24,16 @@ class file_reader
 	void new_chunk()
 	{
-#ifdef MULTITHREADED_READ
 		bool caught_decompressor = false; // if we caught up with the decompressor and had to wait
-#endif
+
 		// There should not be anything 'left over' in the chunk by now
 		assert(chunk.size() - uidx == 0);
 
 		// Grab a new chunk to process
 		uidx = 0xffff; // make sure it is a non-zero value to indicate we have work left to do
 		while (uidx != 0)
 		{
-#ifdef MULTITHREADED_READ
 			chunk_mutex.lock();
-#endif
+
 			if (uncompressed_chunks.size())
 			{
 				chunk.release();
@@ -51,21 +49,23 @@
 			{
 				assert(!done_decompressing); // if this triggers, it means we tried to read more data than there is
 			}
-#ifdef MULTITHREADED_READ
 			chunk_mutex.unlock();
-#endif
+
 			if (uidx != 0)
 			{
-#ifdef MULTITHREADED_READ
-				usleep(10000); // wait for more data
-				if (!caught_decompressor)
+				if (multithreaded_read)
+				{
+					if (!caught_decompressor)
+					{
+						caught_decompressor = true;
+						times_caught_decompressor++; // only count the unique times this happened, not each iteration of the wait loop
+					}
+					usleep(10000); // wait for more data
+				}
+				else
 				{
-					caught_decompressor = true;
-					times_caught_decompressor++; // only count the unique times this happened, not each iteration of the wait loop
+					if (!decompress_chunk()) break; // generate new chunk
 				}
-#else
-				if (!decompress_chunk()) break; // generate new chunk
-#endif
 			}
 		}
 	}
@@ -213,9 +213,19 @@ class file_reader
 		return false;
 	}
 
+	void disable_multithreaded_read() // we can only disable on the fly, enable makes less sense
+	{
+		chunk_mutex.lock();
+		done_decompressing = true;
+		decompressor_thread.join();
+		multithreaded_read = false;
+		chunk_mutex.unlock();
+	}
+
 private:
 	void decompressor(); // runs in separate thread, moves chunks from file to uncompressed chunks
 
+	bool multithreaded_read = true;
 	unsigned tid = -1;
 	lava::mutex chunk_mutex;
 	FILE* fp = nullptr;
diff --git a/src/filewriter.cpp b/src/filewriter.cpp
index 35f9603..19fbd7d 100644
--- a/src/filewriter.cpp
+++ b/src/filewriter.cpp
@@ -101,16 +101,13 @@ void file_writer::finalize()
 	printf("Filewriter finalizing thread %u: %lu total bytes, %lu in last chunk, %d uncompressed chunks, and %d compressed chunks to be written out\n", mTid,
 	       (unsigned long)uncompressed_bytes, (unsigned long)uidx, (int)uncompressed_chunks.size(), (int)compressed_chunks.size());
 	chunk.shrink(uidx);
-#ifndef MULTITHREADED_COMPRESS
-	chunk = compress_chunk(chunk);
-#ifdef MULTITHREADED_WRITE
-	compressed_chunks.push_front(chunk);
-#else
-	write_chunk(chunk);
-#endif
-#else
-	uncompressed_chunks.push_front(chunk);
-#endif
+	if (!multithreaded_compress)
+	{
+		chunk = compress_chunk(chunk);
+		if (multithreaded_write) compressed_chunks.push_front(chunk);
+		else write_chunk(chunk);
+	}
+	else uncompressed_chunks.push_front(chunk);
 	chunk = buffer(uncompressed_chunk_size); // ready to go again
 	chunk_mutex.unlock();
 	// wrap up work in work lists
@@ -162,7 +159,6 @@ void file_writer::serializer()
 {
 	// lock, steal compressed buffer, unlock, store to disk, sleep, repeat
 	set_thread_name("serializer");
-#ifdef MULTITHREADED_WRITE
 	while (1)
 	{
 		buffer active;
@@ -191,7 +187,6 @@ void file_writer::serializer()
 			usleep(2000);
 		}
 	}
-#endif
 }
 
 buffer file_writer::compress_chunk(buffer& uncompressed)
@@ -218,7 +213,6 @@ void file_writer::compressor()
 {
 	// lock, grab pointer to uncompressed, make new compressed, unlock, compress, sleep, repeat
 	set_thread_name("compressor");
-#ifdef MULTITHREADED_COMPRESS
 	while (1)
 	{
 		buffer uncompressed;
@@ -241,13 +235,14 @@ void file_writer::compressor()
 		if (uncompressed.size() > 0)
 		{
 			buffer compressed = compress_chunk(uncompressed);
-#ifdef MULTITHREADED_WRITE
-			chunk_mutex.lock();
-			compressed_chunks.push_front(compressed);
-			chunk_mutex.unlock();
-#else
-			write_chunk(compressed);
-#endif
+
+			if (multithreaded_write)
+			{
+				chunk_mutex.lock();
+				compressed_chunks.push_front(compressed);
+				chunk_mutex.unlock();
+			}
+			else write_chunk(compressed);
 		}
 		// if not done and no work done, wait a bit
 		else if (!done_feeding)
@@ -254,6 +249,5 @@ void file_writer::compressor()
 		{
 			usleep(2000);
 		}
 	}
-#endif
 }
diff --git a/src/filewriter.h b/src/filewriter.h
index ccfc090..836c106 100644
--- a/src/filewriter.h
+++ b/src/filewriter.h
@@ -11,9 +11,6 @@
 #include "lavamutex.h"
 #include "util.h"
 
-#define MULTITHREADED_COMPRESS
-#define MULTITHREADED_WRITE
-
 class file_writer
 {
 	file_writer(const file_writer&) = delete;
@@ -23,19 +20,21 @@ class file_writer
 	{
 		// shrink existing chunk to actually used size
 		chunk.shrink(uidx);
+
 		// move chunk into list of chunks to compress
-#ifdef MULTITHREADED_COMPRESS
-		chunk_mutex.lock();
-		uncompressed_chunks.push_front(chunk);
-		chunk_mutex.unlock();
-#else
-		buffer compressed = compress_chunk(chunk);
-#ifdef MULTITHREADED_WRITE
-		compressed_chunks.push_front(compressed);
-#else
-		write_chunk(compressed);
-#endif
-#endif
+		if (multithreaded_compress)
+		{
+			chunk_mutex.lock();
+			uncompressed_chunks.push_front(chunk);
+			chunk_mutex.unlock();
+		}
+		else
+		{
+			buffer compressed = compress_chunk(chunk);
+			if (multithreaded_write) compressed_chunks.push_front(compressed);
+			else write_chunk(compressed);
+		}
+
 		// create a new chunk for writing into (we could employ a free list here as a possible optimization)
 		if (size > uncompressed_chunk_size) // make sure our new chunk is big enough
 		{
@@ -141,6 +140,24 @@ class file_writer
 
 	void change_default_chunk_size(size_t size) { assert(uidx < size); uncompressed_chunk_size = size; chunk.shrink(size); }
 
+	void disable_multithreaded_compress()
+	{
+		chunk_mutex.lock();
+		done_compressing.exchange(false);
+		if (compressor_thread.joinable()) compressor_thread.join();
+		multithreaded_compress = false;
+		chunk_mutex.unlock();
+	}
+
+	void disable_multithreaded_writeout()
+	{
+		chunk_mutex.lock();
+		done_feeding.exchange(false);
+		if (serializer_thread.joinable()) serializer_thread.join();
+		multithreaded_write = false;
+		chunk_mutex.unlock();
+	}
+
 protected:
 	uint64_t uncompressed_bytes = 0; // total amount of uncompressed bytes written so far
 	uint64_t checkpoint_bytes = 0; // bytes at freeze checkpoint
@@ -156,6 +173,8 @@ class file_writer
 	buffer compress_chunk(buffer& uncompressed); // returns compressed buffer
 	void write_chunk(buffer& active);
 
+	bool multithreaded_compress = true;
+	bool multithreaded_write = true;
 	lava::mutex chunk_mutex;
 	FILE* fp = nullptr;
 	size_t uncompressed_chunk_size = 1024 * 1024 * 64; // use 64mb chunks by default
diff --git a/src/replay.cpp b/src/replay.cpp
index 82178d4..adffd47 100644
--- a/src/replay.cpp
+++ b/src/replay.cpp
@@ -41,6 +41,7 @@ static void usage()
 	printf("-A/--allocator type   Use custom memory allocator callbacks [none, debug]\n");
 	printf("-N/--no-anisotropy    Disable any use of sampler anisotropy\n");
 	printf("-B/--blackhole        Do not actually submit any work to the GPU. May be useful for CPU measurements.\n");
+	printf("-nm/--no-multithread  Do not run decompression and file reads in a separate thread. May save some CPU load and memory.\n");
 	exit(-1);
 }
 
@@ -231,6 +232,10 @@ int main(int argc, char **argv)
 	{
 		p__blackhole = 1;
 	}
+	else if (match(argv[i], "-nm", "--no-multithread", remaining))
+	{
+		p__disable_multithread_read = 1;
+	}
 	else if (match(argv[i], "-w", "--wsi", remaining))
 	{
 		std::string wsi = get_str(argv[++i], remaining);
diff --git a/src/util.cpp b/src/util.cpp
index 99566c4..cd60efa 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -95,6 +95,9 @@ uint_fast8_t p__no_anisotropy = get_env_bool("LAVATUBE_NO_ANISOTROPY", 0);
 uint_fast8_t p__delay_fence_success_frames = get_env_int("LAVATUBE_DELAY_FENCE_SUCCESS_FRAMES", 0); // off by default
 int p__chunksize = get_env_int("LAVATUBE_CHUNK_SIZE", 64 * 1024 * 1024);
 uint_fast8_t p__external_memory = get_env_bool("LAVATUBE_EXTERNAL_MEMORY", 0);
+uint_fast8_t p__disable_multithread_writeout = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT", 0);
+uint_fast8_t p__disable_multithread_compress = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_COMPRESS", 0);
+uint_fast8_t p__disable_multithread_read = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_READ", 0);
 
 const char* errorString(const VkResult errorCode)
 {
diff --git a/src/util.h b/src/util.h
index fb22d8a..4e5f458 100644
--- a/src/util.h
+++ b/src/util.h
@@ -62,6 +62,9 @@ extern uint_fast8_t p__delay_fence_success_frames;
 extern FILE* p__debug_destination;
 extern int p__chunksize;
 extern uint_fast8_t p__external_memory;
+extern uint_fast8_t p__disable_multithread_writeout;
+extern uint_fast8_t p__disable_multithread_compress;
+extern uint_fast8_t p__disable_multithread_read;
 
 /// Logging to be enable as needed by source recompilation
 #define NEVER(_format, ...)
diff --git a/src/write.cpp b/src/write.cpp
index c617966..d6eb37f 100644
--- a/src/write.cpp
+++ b/src/write.cpp
@@ -52,6 +52,8 @@ lava_file_writer::lava_file_writer(uint16_t _tid, lava_writer* _parent) : parent
 {
 	mTid = _tid;
 	get_thread_name(thread_name);
+	if (p__disable_multithread_compress) disable_multithreaded_compress();
+	if (p__disable_multithread_writeout) disable_multithreaded_writeout();
 }
 
 void lava_file_writer::set(const std::string& path)
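
Usage sketch for the new runtime options (the application name and trace file below
are placeholders, and the capture layer must be discoverable as usual, for example via
--layer-path or VK_LAYER_PATH):

  # capture with single-threaded compression and disk writeout, via the script
  scripts/lava-capture.py -o mytrace.vk --no-multithread ./my_vulkan_app

  # or set the environment variables directly for a run with the layer enabled
  LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT=1 LAVATUBE_DISABLE_MULTITHREADED_COMPRESS=1 ./my_vulkan_app

  # replay without the separate decompression/read thread
  lava-replay --no-multithread mytrace.vk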