From d5b89dcb7f1bae5590782ae1ece56e9a0d1d5a4c Mon Sep 17 00:00:00 2001
From: Per Inge Mathisen
Date: Thu, 5 Dec 2024 16:28:24 +0100
Subject: [PATCH] The recently added no-multithread build options are now runtime options

lava-replay and lava-capture.py now both take a --no-multithread option.
The capture library checks the two new environment variables
LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT and
LAVATUBE_DISABLE_MULTITHREADED_COMPRESS.

This makes it easier to test these code paths, and allows users to try
them if they run into out-of-memory situations.
---
 CMakeLists.txt          |  6 +++++
 README.md               |  4 ++++
 scripts/lava-capture.py |  4 ++++
 src/filereader.h        | 38 ++++++++++++++++++++------------
 src/filewriter.cpp      | 36 +++++++++++++-----------------
 src/filewriter.h        | 49 ++++++++++++++++++++++++++++-------------
 src/replay.cpp          |  5 +++++
 src/util.cpp            |  3 +++
 src/util.h              |  3 +++
 src/write.cpp           |  2 ++
 10 files changed, 100 insertions(+), 50 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c624a2..c80abfb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -268,6 +268,7 @@ add_test(NAME trace_test_1_0_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-rep
 add_test(NAME trace_test_1_1_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -D tracing_1_2_1.vk)
 add_test(NAME trace_test_1_2_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -H 0 tracing_1_2_1.vk)
 add_test(NAME trace_test_2_1_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -B tracing_1_2_0.vk)
+add_test(NAME trace_test_2_2_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -nm tracing_1_2_0.vk)
 
 add_executable(tracing2 tests/tracing2.cpp ${VULKAN_TESTS_SRC})
 target_include_directories(tracing2 ${COMMON_INCLUDE})
@@ -283,6 +284,10 @@ add_test(NAME trace_test_2_chunksize COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing
 set_tests_properties(trace_test_2_chunksize PROPERTIES ENVIRONMENT "LAVATUBE_CHUNK_SIZE=32767")
 add_test(NAME trace_test_2_virtqueue COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
 set_tests_properties(trace_test_2_virtqueue PROPERTIES ENVIRONMENT "LAVATUBE_VIRTUAL_QUEUES=1")
+add_test(NAME trace_test_2_nompwrite COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
+set_tests_properties(trace_test_2_nompwrite PROPERTIES ENVIRONMENT "LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT=1")
+add_test(NAME trace_test_2_nompcompress COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
+set_tests_properties(trace_test_2_nompcompress PROPERTIES ENVIRONMENT "LAVATUBE_DISABLE_MULTITHREADED_COMPRESS=1")
 
 add_executable(tracing3 tests/tracing3.cpp ${VULKAN_TESTS_SRC})
 target_include_directories(tracing3 ${COMMON_INCLUDE})
@@ -495,6 +500,7 @@ set_tests_properties(layer_test_general_fencedelay PROPERTIES ENVIRONMENT "VK_LA
 add_test(NAME script_test_general COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_general.vk --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_general -f 0 -V 2)
 add_test(NAME script_test_general_fencedelay COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_general_fencedelay.vk --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d --delayfence 2 ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_general -f 1)
 add_test(NAME script_test_copying_1 COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_copying_1.vk --dedicated-buffer --gpu 0 --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_copying_1 -V 2)
+add_test(NAME script_test_copying_nomp COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_copying_nomp.vk --no-multithread --automate --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_copying_1 -V 3)
 #layer_test(general_vulkan13 general -V 3) # crashes on replay on privatedata
 layer_test(copying_1 copying_1)
 layer_test(copying_1_q1 copying_1 -q 1)
diff --git a/README.md b/README.md
index b7d70ea..3615ed4 100644
--- a/README.md
+++ b/README.md
@@ -163,6 +163,10 @@ one graphics queue family containing two queues. If the host system does not sup
 two queues, work for the second queue will be passed to the first queue. All other
 queue families and queues will be hidden.
 
+Lavatube uses separate threads, each with its own queue, for compression and for
+writeout to disk, which may cause you to run out of memory. To disable this, set the
+environment variables LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT and
+LAVATUBE_DISABLE_MULTITHREADED_COMPRESS.
 
 Further reading
 ===============
diff --git a/scripts/lava-capture.py b/scripts/lava-capture.py
index 13b7bf1..5bfa496 100755
--- a/scripts/lava-capture.py
+++ b/scripts/lava-capture.py
@@ -22,6 +22,7 @@ def args():
     parser.add_argument('--delayfence', dest='delayfence', metavar='', help='Delay successful fence waits the given number of times')
     parser.add_argument('--gpu', dest='gpu', metavar='', help='Use the specified GPU for tracing')
     parser.add_argument('--automate', dest='automate', action='store_true', help='Try to automate the run as much as possible if app supports CBS')
+    parser.add_argument('--no-multithread', dest='nomp', action='store_true', help='Turn off multi-threaded compression and disk writeout (saves memory)')
     parser.add_argument('programAndArgs', metavar=' []', nargs=argparse.REMAINDER, help='Application to capture and any program arguments')
     return parser
 
@@ -82,6 +83,9 @@ def PrintEnvVar(envVar):
 if args.log: os.environ['LAVATUBE_DEBUG_FILE'] = args.log
 if args.layer: os.environ['VK_LAYER_PATH'] = args.layer
 else: os.environ['VK_LAYER_PATH'] = '/opt/lavatube'
+if args.nomp:
+    os.environ['LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT'] = '1'
+    os.environ['LAVATUBE_DISABLE_MULTITHREADED_COMPRESS'] = '1'
 if args.dir is not None: os.chdir(args.dir)
 if not args.programAndArgs:
diff --git a/src/filereader.h b/src/filereader.h
index 8d30fa7..dad23ab 100644
--- a/src/filereader.h
+++ b/src/filereader.h
@@ -24,18 +24,16 @@ class file_reader
 	void new_chunk()
 	{
-#ifdef MULTITHREADED_READ
 		bool caught_decompressor = false; // if we caught up with the decompressor and had to wait
-#endif
+
 		// There should not be anything 'left over' in the chunk by now
 		assert(chunk.size() - uidx == 0);
 
 		// Grab a new chunk to process
 		uidx = 0xffff; // make sure it is a non-zero value to indicate we have work left to do
 		while (uidx != 0)
 		{
-#ifdef MULTITHREADED_READ
 			chunk_mutex.lock();
-#endif
+
 			if (uncompressed_chunks.size())
 			{
 				chunk.release();
@@ -51,21 +49,23 @@
 			{
 				assert(!done_decompressing); // if this triggers, it means we tried to read more data than there is
 			}
-#ifdef MULTITHREADED_READ
 			chunk_mutex.unlock();
-#endif
+
 			if (uidx != 0)
 			{
-#ifdef MULTITHREADED_READ
-				usleep(10000); // wait for more data
-				if (!caught_decompressor)
+				if (multithreaded_read)
+				{
+					if (!caught_decompressor)
+					{
+						caught_decompressor = true;
+						times_caught_decompressor++; // only count the unique times this happened, not each iteration of the wait loop
+					}
+					usleep(10000); // wait for more data
+				}
+				else
 				{
-					caught_decompressor = true;
-					times_caught_decompressor++; // only count the unique times this happened, not each iteration of the wait loop
+					if (!decompress_chunk()) break; // generate new chunk
 				}
-#else
-				if (!decompress_chunk()) break; // generate new chunk
-#endif
 			}
 		}
 	}
@@ -213,9 +213,19 @@ class file_reader
 		return false;
 	}
 
+	void disable_multithreaded_read() // we can only disable on the fly, enable makes less sense
+	{
+		chunk_mutex.lock();
+		done_decompressing = true;
+		decompressor_thread.join();
+		multithreaded_read = false;
+		chunk_mutex.unlock();
+	}
+
 private:
 	void decompressor(); // runs in separate thread, moves chunks from file to uncompressed chunks
 
+	bool multithreaded_read = true;
 	unsigned tid = -1;
 	lava::mutex chunk_mutex;
 	FILE* fp = nullptr;
diff --git a/src/filewriter.cpp b/src/filewriter.cpp
index 35f9603..19fbd7d 100644
--- a/src/filewriter.cpp
+++ b/src/filewriter.cpp
@@ -101,16 +101,13 @@ void file_writer::finalize()
 	printf("Filewriter finalizing thread %u: %lu total bytes, %lu in last chunk, %d uncompressed chunks, and %d compressed chunks to be written out\n", mTid,
 	       (unsigned long)uncompressed_bytes, (unsigned long)uidx, (int)uncompressed_chunks.size(), (int)compressed_chunks.size());
 	chunk.shrink(uidx);
-#ifndef MULTITHREADED_COMPRESS
-	chunk = compress_chunk(chunk);
-#ifdef MULTITHREADED_WRITE
-	compressed_chunks.push_front(chunk);
-#else
-	write_chunk(chunk);
-#endif
-#else
-	uncompressed_chunks.push_front(chunk);
-#endif
+	if (!multithreaded_compress)
+	{
+		chunk = compress_chunk(chunk);
+		if (multithreaded_write) compressed_chunks.push_front(chunk);
+		else write_chunk(chunk);
+	}
+	else uncompressed_chunks.push_front(chunk);
 	chunk = buffer(uncompressed_chunk_size); // ready to go again
 	chunk_mutex.unlock();
 	// wrap up work in work lists
@@ -162,7 +159,6 @@ void file_writer::serializer()
 {
 	// lock, steal compressed buffer, unlock, store to disk, sleep, repeat
 	set_thread_name("serializer");
-#ifdef MULTITHREADED_WRITE
 	while (1)
 	{
 		buffer active;
@@ -191,7 +187,6 @@ void file_writer::serializer()
 			usleep(2000);
 		}
 	}
-#endif
 }
 
 buffer file_writer::compress_chunk(buffer& uncompressed)
@@ -218,7 +213,6 @@ void file_writer::compressor()
 {
 	// lock, grab pointer to uncompressed, make new compressed, unlock, compress, sleep, repeat
 	set_thread_name("compressor");
-#ifdef MULTITHREADED_COMPRESS
 	while (1)
 	{
 		buffer uncompressed;
@@ -241,13 +235,14 @@ void file_writer::compressor()
 		if (uncompressed.size() > 0)
 		{
 			buffer compressed = compress_chunk(uncompressed);
-#ifdef MULTITHREADED_WRITE
-			chunk_mutex.lock();
-			compressed_chunks.push_front(compressed);
-			chunk_mutex.unlock();
-#else
-			write_chunk(compressed);
-#endif
+
+			if (multithreaded_write)
+			{
+				chunk_mutex.lock();
+				compressed_chunks.push_front(compressed);
+				chunk_mutex.unlock();
+			}
+			else write_chunk(compressed);
 		}
 		// if not done and no work done, wait a bit
 		else if (!done_feeding)
@@ -254,6 +249,5 @@ void file_writer::compressor()
 		{
 			usleep(2000);
 		}
 	}
-#endif
 }
diff --git a/src/filewriter.h b/src/filewriter.h
index ccfc090..836c106 100644
--- a/src/filewriter.h
+++ b/src/filewriter.h
@@ -11,9 +11,6 @@
 #include "lavamutex.h"
 #include "util.h"
 
-#define MULTITHREADED_COMPRESS
-#define MULTITHREADED_WRITE
-
 class file_writer
 {
 	file_writer(const file_writer&) = delete;
@@ -23,19 +20,21 @@ class file_writer
 	{
 		// shrink existing chunk to actually used size
 		chunk.shrink(uidx);
+
 		// move chunk into list of chunks to compress
-#ifdef MULTITHREADED_COMPRESS
-		chunk_mutex.lock();
-		uncompressed_chunks.push_front(chunk);
-		chunk_mutex.unlock();
-#else
-		buffer compressed = compress_chunk(chunk);
-#ifdef MULTITHREADED_WRITE
-		compressed_chunks.push_front(compressed);
-#else
-		write_chunk(compressed);
-#endif
-#endif
+		if (multithreaded_compress)
+		{
+			chunk_mutex.lock();
+			uncompressed_chunks.push_front(chunk);
+			chunk_mutex.unlock();
+		}
+		else
+		{
+			buffer compressed = compress_chunk(chunk);
+			if (multithreaded_write) compressed_chunks.push_front(compressed);
+			else write_chunk(compressed);
+		}
+
 		// create a new chunk for writing into (we could employ a free list here as a possible optimization)
 		if (size > uncompressed_chunk_size) // make sure our new chunk is big enough
 		{
@@ -141,6 +140,24 @@ class file_writer
 
 	void change_default_chunk_size(size_t size) { assert(uidx < size); uncompressed_chunk_size = size; chunk.shrink(size); }
 
+	void disable_multithreaded_compress()
+	{
+		chunk_mutex.lock();
+		done_compressing.exchange(false);
+		if (compressor_thread.joinable()) compressor_thread.join();
+		multithreaded_compress = false;
+		chunk_mutex.unlock();
+	}
+
+	void disable_multithreaded_writeout()
+	{
+		chunk_mutex.lock();
+		done_feeding.exchange(false);
+		if (serializer_thread.joinable()) serializer_thread.join();
+		multithreaded_write = false;
+		chunk_mutex.unlock();
+	}
+
 protected:
 	uint64_t uncompressed_bytes = 0; // total amount of uncompressed bytes written so far
 	uint64_t checkpoint_bytes = 0; // bytes at freeze checkpoint
@@ -156,6 +173,8 @@ class file_writer
 	buffer compress_chunk(buffer& uncompressed); // returns compressed buffer
 	void write_chunk(buffer& active);
 
+	bool multithreaded_compress = true;
+	bool multithreaded_write = true;
 	lava::mutex chunk_mutex;
 	FILE* fp = nullptr;
 	size_t uncompressed_chunk_size = 1024 * 1024 * 64; // use 64mb chunks by default
diff --git a/src/replay.cpp b/src/replay.cpp
index 82178d4..adffd47 100644
--- a/src/replay.cpp
+++ b/src/replay.cpp
@@ -41,6 +41,7 @@ static void usage()
 	printf("-A/--allocator type   Use custom memory allocator callbacks [none, debug]\n");
 	printf("-N/--no-anisotropy    Disable any use of sampler anisotropy\n");
 	printf("-B/--blackhole        Do not actually submit any work to the GPU. May be useful for CPU measurements.\n");
+	printf("-nm/--no-multithread  Do not run decompression and file reads in a separate thread. May save some CPU load and memory.\n");
 	exit(-1);
 }
 
@@ -231,6 +232,10 @@ int main(int argc, char **argv)
 	{
 		p__blackhole = 1;
 	}
+	else if (match(argv[i], "-nm", "--no-multithread", remaining))
+	{
+		p__disable_multithread_read = 1;
+	}
 	else if (match(argv[i], "-w", "--wsi", remaining))
 	{
 		std::string wsi = get_str(argv[++i], remaining);
diff --git a/src/util.cpp b/src/util.cpp
index 99566c4..cd60efa 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -95,6 +95,9 @@ uint_fast8_t p__no_anisotropy = get_env_bool("LAVATUBE_NO_ANISOTROPY", 0);
 uint_fast8_t p__delay_fence_success_frames = get_env_int("LAVATUBE_DELAY_FENCE_SUCCESS_FRAMES", 0); // off by default
 int p__chunksize = get_env_int("LAVATUBE_CHUNK_SIZE", 64 * 1024 * 1024);
 uint_fast8_t p__external_memory = get_env_bool("LAVATUBE_EXTERNAL_MEMORY", 0);
+uint_fast8_t p__disable_multithread_writeout = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT", 0);
+uint_fast8_t p__disable_multithread_compress = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_COMPRESS", 0);
+uint_fast8_t p__disable_multithread_read = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_READ", 0);
 
 const char* errorString(const VkResult errorCode)
 {
diff --git a/src/util.h b/src/util.h
index fb22d8a..4e5f458 100644
--- a/src/util.h
+++ b/src/util.h
@@ -62,6 +62,9 @@ extern uint_fast8_t p__delay_fence_success_frames;
 extern FILE* p__debug_destination;
 extern int p__chunksize;
 extern uint_fast8_t p__external_memory;
+extern uint_fast8_t p__disable_multithread_writeout;
+extern uint_fast8_t p__disable_multithread_compress;
+extern uint_fast8_t p__disable_multithread_read;
 
 /// Logging to be enable as needed by source recompilation
 #define NEVER(_format, ...)
diff --git a/src/write.cpp b/src/write.cpp
index c617966..d6eb37f 100644
--- a/src/write.cpp
+++ b/src/write.cpp
@@ -52,6 +52,8 @@ lava_file_writer::lava_file_writer(uint16_t _tid, lava_writer* _parent) : parent
 {
 	mTid = _tid;
 	get_thread_name(thread_name);
+	if (p__disable_multithread_compress) disable_multithreaded_compress();
+	if (p__disable_multithread_writeout) disable_multithreaded_writeout();
 }
 
 void lava_file_writer::set(const std::string& path)
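
Usage sketch for the new runtime options (the application name and trace file below
are placeholders, and the capture layer must be discoverable as usual, for example via
--layer-path or VK_LAYER_PATH):

  # capture with single-threaded compression and disk writeout, via the script
  scripts/lava-capture.py -o mytrace.vk --no-multithread ./my_vulkan_app

  # or set the environment variables directly for a run with the layer enabled
  LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT=1 LAVATUBE_DISABLE_MULTITHREADED_COMPRESS=1 ./my_vulkan_app

  # replay without the separate decompression/read thread
  lava-replay --no-multithread mytrace.vk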