The recently added no-multithread build options are now runtime options

lava-replay and lava-capture.py now both take a --no-multithread option. The capture library checks the two new environment variables LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT and LAVATUBE_DISABLE_MULTITHREADED_COMPRESS. This makes the single-threaded code paths easier to test, and allows users to try them if they run into out-of-memory situations.
ARM-software · Dec 5, 2024 · d5b89dc · d5b89dc
1 parent bba92bd
commit d5b89dc
Show file tree

Hide file tree

Showing 10 changed files with 100 additions and 50 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -268,6 +268,7 @@ add_test(NAME trace_test_1_0_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-rep
 add_test(NAME trace_test_1_1_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -D tracing_1_2_1.vk)
 add_test(NAME trace_test_1_2_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -H 0 tracing_1_2_1.vk)
 add_test(NAME trace_test_2_1_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -B tracing_1_2_0.vk)
+add_test(NAME trace_test_2_2_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -nm tracing_1_2_0.vk)
 
 add_executable(tracing2 tests/tracing2.cpp ${VULKAN_TESTS_SRC})
 target_include_directories(tracing2 ${COMMON_INCLUDE})
@@ -283,6 +284,10 @@ add_test(NAME trace_test_2_chunksize COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing
 set_tests_properties(trace_test_2_chunksize PROPERTIES ENVIRONMENT "LAVATUBE_CHUNK_SIZE=32767")
 add_test(NAME trace_test_2_virtqueue COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
 set_tests_properties(trace_test_2_virtqueue PROPERTIES ENVIRONMENT "LAVATUBE_VIRTUAL_QUEUES=1")
+add_test(NAME trace_test_2_nompwrite COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
+set_tests_properties(trace_test_2_nompwrite PROPERTIES ENVIRONMENT "LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT=1")
+add_test(NAME trace_test_2_nompcompress COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
+set_tests_properties(trace_test_2_nompcompress PROPERTIES ENVIRONMENT "LAVATUBE_DISABLE_MULTITHREADED_COMPRESS=1")
 
 add_executable(tracing3 tests/tracing3.cpp ${VULKAN_TESTS_SRC})
 target_include_directories(tracing3 ${COMMON_INCLUDE})
@@ -495,6 +500,7 @@ set_tests_properties(layer_test_general_fencedelay PROPERTIES ENVIRONMENT "VK_LA
 add_test(NAME script_test_general COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_general.vk --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_general -f 0 -V 2)
 add_test(NAME script_test_general_fencedelay COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_general_fencedelay.vk --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d --delayfence 2 ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_general -f 1)
 add_test(NAME script_test_copying_1 COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_copying_1.vk --dedicated-buffer --gpu 0 --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_copying_1 -V 2)
+add_test(NAME script_test_copying_nomp COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_copying_nomp.vk --no-multithread --automate --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_copying_1 -V 3)
 #layer_test(general_vulkan13 general -V 3) # crashes on replay on privatedata
 layer_test(copying_1 copying_1)
 layer_test(copying_1_q1 copying_1 -q 1)

diff --git a/README.md b/README.md
@@ -163,6 +163,10 @@ one graphics queue family containing two queues. If the host system does not sup
 two queues, work for the second queue will be passed to the first queue. All other
 queue families and queues will be hidden.
 
+Lavatube uses separate threads, each with its own queue, for compression and for
+writeout to disk, which may cause you to run out of memory. To disable them, set
+the environment variables LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT and
+LAVATUBE_DISABLE_MULTITHREADED_COMPRESS (for example to 1).
 
 Further reading
 ===============

diff --git a/scripts/lava-capture.py b/scripts/lava-capture.py
@@ -22,6 +22,7 @@ def args():
 	parser.add_argument('--delayfence', dest='delayfence', metavar='<times>', help='Delay successful fence waits the given number of times')
 	parser.add_argument('--gpu', dest='gpu', metavar='<gpu>', help='Use the specified GPU for tracing')
 	parser.add_argument('--automate', dest='automate', action='store_true', help='Try to automate the run as much as possible if app supports CBS')
+	parser.add_argument('--no-multithread', dest='nomp', action='store_true', help='Turn off multi-threaded compression and disk writeout (saves memory)')
 	parser.add_argument('programAndArgs', metavar='<program> [<program args>]', nargs=argparse.REMAINDER, help='Application to capture and any program arguments')
 	return parser
 
@@ -82,6 +83,9 @@ def PrintEnvVar(envVar):
 	if args.log: os.environ['LAVATUBE_DEBUG_FILE'] = args.log
 	if args.layer: os.environ['VK_LAYER_PATH'] = args.layer
 	else: os.environ['VK_LAYER_PATH'] = '/opt/lavatube'
+	if args.nomp:
+		os.environ['LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT'] = '1'
+		os.environ['LAVATUBE_DISABLE_MULTITHREADED_COMPRESS'] = '1'
 	if args.dir is not None:
 		os.chdir(args.dir)
 	if not args.programAndArgs:

diff --git a/src/filereader.h b/src/filereader.h
@@ -24,18 +24,16 @@ class file_reader
 
 	void new_chunk()
 	{
-#ifdef MULTITHREADED_READ
 		bool caught_decompressor = false; // if we caught up with the decompressor and had to wait
-#endif
+
 		// There should not be anything 'left over' in the chunk by now
 		assert(chunk.size() - uidx == 0);
 		// Grab a new chunk to process
 		uidx = 0xffff; // make sure it is a non-zero value to indicate we have work left to do
 		while (uidx != 0)
 		{
-#ifdef MULTITHREADED_READ
 			chunk_mutex.lock();
-#endif
+
 			if (uncompressed_chunks.size())
 			{
 				chunk.release();
@@ -51,21 +49,23 @@ class file_reader
 			{
 				assert(!done_decompressing); // if this triggers, it means we tried to read more data than there is
 			}
-#ifdef MULTITHREADED_READ
 			chunk_mutex.unlock();
-#endif
+
 			if (uidx != 0)
 			{
-#ifdef MULTITHREADED_READ
-				usleep(10000); // wait for more data
-				if (!caught_decompressor)
+				if (multithreaded_read)
+				{
+					if (!caught_decompressor)
+					{
+						caught_decompressor = true;
+						times_caught_decompressor++; // only count the unique times this happened, not each iteration of the wait loop
+					}
+					usleep(10000); // wait for more data
+				}
+				else
 				{
-					caught_decompressor = true;
-					times_caught_decompressor++; // only count the unique times this happened, not each iteration of the wait loop
+					if (!decompress_chunk()) break; // generate new chunk
 				}
-#else
-				if (!decompress_chunk()) break; // generate new chunk
-#endif
 			}
 		}
 	}
@@ -213,9 +213,19 @@ class file_reader
 		return false;
 	}
 
+	void disable_multithreaded_read() // we can only disable on the fly, enable makes less sense
+	{
+		chunk_mutex.lock();
+		done_decompressing = true; // NOTE(review): presumably the stop request for decompressor() - confirm against its loop
+		chunk_mutex.unlock(); // release before joining: chunk_mutex guards uncompressed_chunks, which the decompressor thread fills, so holding it here could deadlock
+		if (decompressor_thread.joinable()) decompressor_thread.join(); // skipped if the thread was never started or was already joined
+		multithreaded_read = false; // from here on new_chunk() calls decompress_chunk() itself instead of sleeping
+	}
+
 private:
 	void decompressor(); // runs in separate thread, moves chunks from file to uncompressed chunks
 
+	bool multithreaded_read = true;
 	unsigned tid = -1;
 	lava::mutex chunk_mutex;
 	FILE* fp = nullptr;

diff --git a/src/filewriter.cpp b/src/filewriter.cpp
@@ -101,16 +101,13 @@ void file_writer::finalize()
 	printf("Filewriter finalizing thread %u: %lu total bytes, %lu in last chunk, %d uncompressed chunks, and %d compressed chunks to be written out\n",
 	       mTid, (unsigned long)uncompressed_bytes, (unsigned long)uidx, (int)uncompressed_chunks.size(), (int)compressed_chunks.size());
 	chunk.shrink(uidx);
-#ifndef MULTITHREADED_COMPRESS
-	chunk = compress_chunk(chunk);
-#ifdef MULTITHREADED_WRITE
-	compressed_chunks.push_front(chunk);
-#else
-	write_chunk(chunk);
-#endif
-#else
-	uncompressed_chunks.push_front(chunk);
-#endif
+	if (!multithreaded_compress)
+	{
+		chunk = compress_chunk(chunk);
+		if (multithreaded_write) compressed_chunks.push_front(chunk);
+		else write_chunk(chunk);
+	}
+	else uncompressed_chunks.push_front(chunk);
 	chunk = buffer(uncompressed_chunk_size); // ready to go again
 	chunk_mutex.unlock();
 	// wrap up work in work lists
@@ -162,7 +159,6 @@ void file_writer::serializer()
 {
 	// lock, steal compressed buffer, unlock, store to disk, sleep, repeat
 	set_thread_name("serializer");
-#ifdef MULTITHREADED_WRITE
 	while (1)
 	{
 		buffer active;
@@ -191,7 +187,6 @@ void file_writer::serializer()
 			usleep(2000);
 		}
 	}
-#endif
 }
 
 buffer file_writer::compress_chunk(buffer& uncompressed)
@@ -218,7 +213,6 @@ void file_writer::compressor()
 {
 	// lock, grab pointer to uncompressed, make new compressed, unlock, compress, sleep, repeat
 	set_thread_name("compressor");
-#ifdef MULTITHREADED_COMPRESS
 	while (1)
 	{
 		buffer uncompressed;
@@ -241,19 +235,19 @@ void file_writer::compressor()
 		if (uncompressed.size() > 0)
 		{
 			buffer compressed = compress_chunk(uncompressed);
-#ifdef MULTITHREADED_WRITE
-			chunk_mutex.lock();
-			compressed_chunks.push_front(compressed);
-			chunk_mutex.unlock();
-#else
-			write_chunk(compressed);
-#endif
+
+			if (multithreaded_write)
+			{
+				chunk_mutex.lock();
+				compressed_chunks.push_front(compressed);
+				chunk_mutex.unlock();
+			}
+			else write_chunk(compressed);
 		}
 		// if not done and no work done, wait a bit
 		else if (!done_feeding)
 		{
 			usleep(2000);
 		}
 	}
-#endif
 }
diff --git a/src/filewriter.h b/src/filewriter.h
@@ -11,9 +11,6 @@
 #include "lavamutex.h"
 #include "util.h"
 
-#define MULTITHREADED_COMPRESS
-#define MULTITHREADED_WRITE
-
 class file_writer
 {
 	file_writer(const file_writer&) = delete;
@@ -23,19 +20,21 @@ class file_writer
 	{
 		// shrink existing chunk to actually used size
 		chunk.shrink(uidx);
+
 		// move chunk into list of chunks to compress
-#ifdef MULTITHREADED_COMPRESS
-		chunk_mutex.lock();
-		uncompressed_chunks.push_front(chunk);
-		chunk_mutex.unlock();
-#else
-		buffer compressed = compress_chunk(chunk);
-#ifdef MULTITHREADED_WRITE
-		compressed_chunks.push_front(compressed);
-#else
-		write_chunk(compressed);
-#endif
-#endif
+		if (multithreaded_compress)
+		{
+			chunk_mutex.lock();
+			uncompressed_chunks.push_front(chunk);
+			chunk_mutex.unlock();
+		}
+		else
+		{
+			buffer compressed = compress_chunk(chunk);
+			if (multithreaded_write) compressed_chunks.push_front(compressed);
+			else write_chunk(compressed);
+		}
+
 		// create a new chunk for writing into (we could employ a free list here as a possible optimization)
 		if (size > uncompressed_chunk_size) // make sure our new chunk is big enough
 		{
@@ -141,6 +140,24 @@ class file_writer
 
 	void change_default_chunk_size(size_t size) { assert(uidx < size); uncompressed_chunk_size = size; chunk.shrink(size); }
 
+	void disable_multithreaded_compress() // stop using the compressor thread; chunks get compressed on the calling thread instead
+	{
+		chunk_mutex.lock();
+		done_compressing.exchange(false); // NOTE(review): storing false does not look like a stop request - confirm which flag makes compressor() exit
+		chunk_mutex.unlock(); // release before joining: compressor() locks chunk_mutex itself, so holding it across join() could deadlock
+		if (compressor_thread.joinable()) compressor_thread.join(); // only waits when the thread is actually running
+		multithreaded_compress = false; // selects the inline compress_chunk() path
+	}
+
+	void disable_multithreaded_writeout() // stop using the serializer thread; compressed chunks get written via write_chunk() directly
+	{
+		chunk_mutex.lock();
+		done_feeding.exchange(false); // NOTE(review): storing false does not look like a stop request - confirm which flag makes serializer() exit
+		chunk_mutex.unlock(); // release before joining: serializer() presumably locks chunk_mutex to take from compressed_chunks, so holding it across join() could deadlock
+		if (serializer_thread.joinable()) serializer_thread.join(); // only waits when the thread is actually running
+		multithreaded_write = false; // selects write_chunk() over queueing into compressed_chunks
+	}
+
 protected:
 	uint64_t uncompressed_bytes = 0; // total amount of uncompressed bytes written so far
 	uint64_t checkpoint_bytes = 0; // bytes at freeze checkpoint
@@ -156,6 +173,8 @@ class file_writer
 	buffer compress_chunk(buffer& uncompressed); // returns compressed buffer
 	void write_chunk(buffer& active);
 
+	bool multithreaded_compress = true;
+	bool multithreaded_write = true;
 	lava::mutex chunk_mutex;
 	FILE* fp = nullptr;
 	size_t uncompressed_chunk_size = 1024 * 1024 * 64; // use 64mb chunks by default

diff --git a/src/replay.cpp b/src/replay.cpp
@@ -41,6 +41,7 @@ static void usage()
 	printf("-A/--allocator type    Use custom memory allocator callbacks [none, debug]\n");
 	printf("-N/--no-anisotropy     Disable any use of sampler anisotropy\n");
 	printf("-B/--blackhole         Do not actually submit any work to the GPU. May be useful for CPU measurements.\n");
+	printf("-nm/--no-multithread   Do not do decompression and file read in a separate thread. May save some CPU load and memory.\n");
 	exit(-1);
 }
 
@@ -231,6 +232,10 @@ int main(int argc, char **argv)
 		{
 			p__blackhole = 1;
 		}
+		else if (match(argv[i], "-nm", "--no-multithread", remaining))
+		{
+			p__disable_multithread_read = 1;
+		}
 		else if (match(argv[i], "-w", "--wsi", remaining))
 		{
 			std::string wsi = get_str(argv[++i], remaining);

diff --git a/src/util.cpp b/src/util.cpp
@@ -95,6 +95,9 @@ uint_fast8_t p__no_anisotropy = get_env_bool("LAVATUBE_NO_ANISOTROPY", 0);
 uint_fast8_t p__delay_fence_success_frames = get_env_int("LAVATUBE_DELAY_FENCE_SUCCESS_FRAMES", 0); // off by default
 int p__chunksize = get_env_int("LAVATUBE_CHUNK_SIZE", 64 * 1024 * 1024);
 uint_fast8_t p__external_memory = get_env_bool("LAVATUBE_EXTERNAL_MEMORY", 0);
+uint_fast8_t p__disable_multithread_writeout = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT", 0); // checked by lava_file_writer's constructor; presumably off by default
+uint_fast8_t p__disable_multithread_compress = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_COMPRESS", 0); // checked by lava_file_writer's constructor; presumably off by default
+uint_fast8_t p__disable_multithread_read = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_READ", 0); // also set by lava-replay -nm/--no-multithread; presumably off by default
 
 const char* errorString(const VkResult errorCode)
 {

diff --git a/src/util.h b/src/util.h
@@ -62,6 +62,9 @@ extern uint_fast8_t p__delay_fence_success_frames;
 extern FILE* p__debug_destination;
 extern int p__chunksize;
 extern uint_fast8_t p__external_memory;
+extern uint_fast8_t p__disable_multithread_writeout;
+extern uint_fast8_t p__disable_multithread_compress;
+extern uint_fast8_t p__disable_multithread_read;
 
 /// Logging to be enable as needed by source recompilation
 #define NEVER(_format, ...)

diff --git a/src/write.cpp b/src/write.cpp
@@ -52,6 +52,8 @@ lava_file_writer::lava_file_writer(uint16_t _tid, lava_writer* _parent) : parent
 {
 	mTid = _tid;
 	get_thread_name(thread_name);
+	if (p__disable_multithread_compress) disable_multithreaded_compress();
+	if (p__disable_multithread_writeout) disable_multithreaded_writeout();
 }
 
 void lava_file_writer::set(const std::string& path)