Skip to content

Commit

Permalink
The recently added no multithread build options are now runtime options
Browse files Browse the repository at this point in the history
lava-replay and lava-capture.py now both take a --no-multithread option.
The capture library checks the two new environment variables
LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT and
LAVATUBE_DISABLE_MULTITHREADED_COMPRESS.

This makes it easier to test them, and allows users to try them if they
run out of memory.
  • Loading branch information
per-mathisen-arm committed Dec 5, 2024
1 parent bba92bd commit d5b89dc
Show file tree
Hide file tree
Showing 10 changed files with 100 additions and 50 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ add_test(NAME trace_test_1_0_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-rep
add_test(NAME trace_test_1_1_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -D tracing_1_2_1.vk)
add_test(NAME trace_test_1_2_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -H 0 tracing_1_2_1.vk)
add_test(NAME trace_test_2_1_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -B tracing_1_2_0.vk)
add_test(NAME trace_test_2_2_replay COMMAND ${CMAKE_CURRENT_BINARY_DIR}/lava-replay -nm tracing_1_2_0.vk)

add_executable(tracing2 tests/tracing2.cpp ${VULKAN_TESTS_SRC})
target_include_directories(tracing2 ${COMMON_INCLUDE})
Expand All @@ -283,6 +284,10 @@ add_test(NAME trace_test_2_chunksize COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing
set_tests_properties(trace_test_2_chunksize PROPERTIES ENVIRONMENT "LAVATUBE_CHUNK_SIZE=32767")
add_test(NAME trace_test_2_virtqueue COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
set_tests_properties(trace_test_2_virtqueue PROPERTIES ENVIRONMENT "LAVATUBE_VIRTUAL_QUEUES=1")
add_test(NAME trace_test_2_nompwrite COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
set_tests_properties(trace_test_2_nompwrite PROPERTIES ENVIRONMENT "LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT=1")
add_test(NAME trace_test_2_nompcompress COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tracing2)
set_tests_properties(trace_test_2_nompcompress PROPERTIES ENVIRONMENT "LAVATUBE_DISABLE_MULTITHREADED_COMPRESS=1")

add_executable(tracing3 tests/tracing3.cpp ${VULKAN_TESTS_SRC})
target_include_directories(tracing3 ${COMMON_INCLUDE})
Expand Down Expand Up @@ -495,6 +500,7 @@ set_tests_properties(layer_test_general_fencedelay PROPERTIES ENVIRONMENT "VK_LA
add_test(NAME script_test_general COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_general.vk --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_general -f 0 -V 2)
add_test(NAME script_test_general_fencedelay COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_general_fencedelay.vk --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d --delayfence 2 ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_general -f 1)
add_test(NAME script_test_copying_1 COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_copying_1.vk --dedicated-buffer --gpu 0 --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_copying_1 -V 2)
add_test(NAME script_test_copying_nomp COMMAND ${CMAKE_SOURCE_DIR}/scripts/lava-capture.py -o script_test_copying_nomp.vk --no-multithread --automate --layer-path ${CMAKE_CURRENT_BINARY_DIR}/implicit_layer.d ${CMAKE_CURRENT_BINARY_DIR}/tracetooltests/vulkan_copying_1 -V 3)
#layer_test(general_vulkan13 general -V 3) # crashes on replay on privatedata
layer_test(copying_1 copying_1)
layer_test(copying_1_q1 copying_1 -q 1)
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ one graphics queue family containing two queues. If the host system does not sup
two queues, work for the second queue will be passed to the first queue. All other
queue families and queues will be hidden.

Lavatube uses separate threads, each with its own queue, for compression and for
writeout to disk. The data held in these queues may cause you to run out of memory.
To disable the threads, set the environment variables
LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT and LAVATUBE_DISABLE_MULTITHREADED_COMPRESS
to 1. Passing --no-multithread to lava-capture.py sets both for you.

Further reading
===============
Expand Down
4 changes: 4 additions & 0 deletions scripts/lava-capture.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def args():
parser.add_argument('--delayfence', dest='delayfence', metavar='<times>', help='Delay successful fence waits the given number of times')
parser.add_argument('--gpu', dest='gpu', metavar='<gpu>', help='Use the specified GPU for tracing')
parser.add_argument('--automate', dest='automate', action='store_true', help='Try to automate the run as much as possible if app supports CBS')
parser.add_argument('--no-multithread', dest='nomp', action='store_true', help='Turn off multi-threaded compression and disk writeout (saves memory)')
parser.add_argument('programAndArgs', metavar='<program> [<program args>]', nargs=argparse.REMAINDER, help='Application to capture and any program arguments')
return parser

Expand Down Expand Up @@ -82,6 +83,9 @@ def PrintEnvVar(envVar):
if args.log: os.environ['LAVATUBE_DEBUG_FILE'] = args.log
if args.layer: os.environ['VK_LAYER_PATH'] = args.layer
else: os.environ['VK_LAYER_PATH'] = '/opt/lavatube'
if args.nomp:
os.environ['LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT'] = '1'
os.environ['LAVATUBE_DISABLE_MULTITHREADED_COMPRESS'] = '1'
if args.dir is not None:
os.chdir(args.dir)
if not args.programAndArgs:
Expand Down
38 changes: 24 additions & 14 deletions src/filereader.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,16 @@ class file_reader

void new_chunk()
{
#ifdef MULTITHREADED_READ
bool caught_decompressor = false; // if we caught up with the decompressor and had to wait
#endif

// There should not be anything 'left over' in the chunk by now
assert(chunk.size() - uidx == 0);
// Grab a new chunk to process
uidx = 0xffff; // make sure it is a non-zero value to indicate we have work left to do
while (uidx != 0)
{
#ifdef MULTITHREADED_READ
chunk_mutex.lock();
#endif

if (uncompressed_chunks.size())
{
chunk.release();
Expand All @@ -51,21 +49,23 @@ class file_reader
{
assert(!done_decompressing); // if this triggers, it means we tried to read more data than there is
}
#ifdef MULTITHREADED_READ
chunk_mutex.unlock();
#endif

if (uidx != 0)
{
#ifdef MULTITHREADED_READ
usleep(10000); // wait for more data
if (!caught_decompressor)
if (multithreaded_read)
{
if (!caught_decompressor)
{
caught_decompressor = true;
times_caught_decompressor++; // only count the unique times this happened, not each iteration of the wait loop
}
usleep(10000); // wait for more data
}
else
{
caught_decompressor = true;
times_caught_decompressor++; // only count the unique times this happened, not each iteration of the wait loop
if (!decompress_chunk()) break; // generate new chunk
}
#else
if (!decompress_chunk()) break; // generate new chunk
#endif
}
}
}
Expand Down Expand Up @@ -213,9 +213,19 @@ class file_reader
return false;
}

/// Stop the background decompressor thread and make new_chunk() decompress on
/// the calling thread instead. We can only disable on the fly, enable makes less sense.
void disable_multithreaded_read()
{
	// Ask the decompressor to stop.
	chunk_mutex.lock();
	done_decompressing = true;
	chunk_mutex.unlock();

	// Join WITHOUT holding chunk_mutex: the decompressor presumably takes the same
	// mutex to hand over uncompressed chunks, so joining under the lock could
	// deadlock. Only join a thread that is actually running (joining a thread that
	// was never started or was already joined is an error for std::thread).
	// NOTE(review): done_decompressing stays true afterwards while new_chunk()
	// asserts !done_decompressing when the queue is empty -- confirm this is intended.
	if (decompressor_thread.joinable()) decompressor_thread.join();

	chunk_mutex.lock();
	multithreaded_read = false;
	chunk_mutex.unlock();
}

private:
void decompressor(); // runs in separate thread, moves chunks from file to uncompressed chunks

bool multithreaded_read = true;
unsigned tid = -1;
lava::mutex chunk_mutex;
FILE* fp = nullptr;
Expand Down
36 changes: 15 additions & 21 deletions src/filewriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,16 +101,13 @@ void file_writer::finalize()
printf("Filewriter finalizing thread %u: %lu total bytes, %lu in last chunk, %d uncompressed chunks, and %d compressed chunks to be written out\n",
mTid, (unsigned long)uncompressed_bytes, (unsigned long)uidx, (int)uncompressed_chunks.size(), (int)compressed_chunks.size());
chunk.shrink(uidx);
#ifndef MULTITHREADED_COMPRESS
chunk = compress_chunk(chunk);
#ifdef MULTITHREADED_WRITE
compressed_chunks.push_front(chunk);
#else
write_chunk(chunk);
#endif
#else
uncompressed_chunks.push_front(chunk);
#endif
if (!multithreaded_compress)
{
chunk = compress_chunk(chunk);
if (multithreaded_write) compressed_chunks.push_front(chunk);
else write_chunk(chunk);
}
else uncompressed_chunks.push_front(chunk);
chunk = buffer(uncompressed_chunk_size); // ready to go again
chunk_mutex.unlock();
// wrap up work in work lists
Expand Down Expand Up @@ -162,7 +159,6 @@ void file_writer::serializer()
{
// lock, steal compressed buffer, unlock, store to disk, sleep, repeat
set_thread_name("serializer");
#ifdef MULTITHREADED_WRITE
while (1)
{
buffer active;
Expand Down Expand Up @@ -191,7 +187,6 @@ void file_writer::serializer()
usleep(2000);
}
}
#endif
}

buffer file_writer::compress_chunk(buffer& uncompressed)
Expand All @@ -218,7 +213,6 @@ void file_writer::compressor()
{
// lock, grab pointer to uncompressed, make new compressed, unlock, compress, sleep, repeat
set_thread_name("compressor");
#ifdef MULTITHREADED_COMPRESS
while (1)
{
buffer uncompressed;
Expand All @@ -241,19 +235,19 @@ void file_writer::compressor()
if (uncompressed.size() > 0)
{
buffer compressed = compress_chunk(uncompressed);
#ifdef MULTITHREADED_WRITE
chunk_mutex.lock();
compressed_chunks.push_front(compressed);
chunk_mutex.unlock();
#else
write_chunk(compressed);
#endif

if (multithreaded_write)
{
chunk_mutex.lock();
compressed_chunks.push_front(compressed);
chunk_mutex.unlock();
}
else write_chunk(compressed);
}
// if not done and no work done, wait a bit
else if (!done_feeding)
{
usleep(2000);
}
}
#endif
}
49 changes: 34 additions & 15 deletions src/filewriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@
#include "lavamutex.h"
#include "util.h"

#define MULTITHREADED_COMPRESS
#define MULTITHREADED_WRITE

class file_writer
{
file_writer(const file_writer&) = delete;
Expand All @@ -23,19 +20,21 @@ class file_writer
{
// shrink existing chunk to actually used size
chunk.shrink(uidx);

// move chunk into list of chunks to compress
#ifdef MULTITHREADED_COMPRESS
chunk_mutex.lock();
uncompressed_chunks.push_front(chunk);
chunk_mutex.unlock();
#else
buffer compressed = compress_chunk(chunk);
#ifdef MULTITHREADED_WRITE
compressed_chunks.push_front(compressed);
#else
write_chunk(compressed);
#endif
#endif
if (multithreaded_compress)
{
chunk_mutex.lock();
uncompressed_chunks.push_front(chunk);
chunk_mutex.unlock();
}
else
{
buffer compressed = compress_chunk(chunk);
if (multithreaded_write) compressed_chunks.push_front(compressed);
else write_chunk(compressed);
}

// create a new chunk for writing into (we could employ a free list here as a possible optimization)
if (size > uncompressed_chunk_size) // make sure our new chunk is big enough
{
Expand Down Expand Up @@ -141,6 +140,24 @@ class file_writer

/// Set the uncompressed chunk size used from now on and apply it to the current chunk.
/// The new size must be larger than the number of bytes already used in the
/// current chunk (uidx); this is only checked by an assert.
void change_default_chunk_size(size_t size)
{
	assert(uidx < size);
	uncompressed_chunk_size = size;
	chunk.shrink(size);
}

/// Switch to compressing chunks on the calling thread instead of handing them
/// to the compressor thread.
void disable_multithreaded_compress()
{
	chunk_mutex.lock();
	done_compressing.exchange(false);
	chunk_mutex.unlock();

	// Join WITHOUT holding chunk_mutex: compressor() locks chunk_mutex while it
	// works, so joining under the lock would deadlock against a running thread.
	// NOTE(review): nothing here asks a running compressor to exit (its idle loop
	// keys off done_feeding) -- presumably this is only called before the thread
	// is started; confirm.
	if (compressor_thread.joinable()) compressor_thread.join();

	chunk_mutex.lock();
	multithreaded_compress = false;
	chunk_mutex.unlock();
}

/// Switch to writing compressed chunks to disk directly instead of queueing
/// them for the serializer thread.
void disable_multithreaded_writeout()
{
	chunk_mutex.lock();
	done_feeding.exchange(false);
	chunk_mutex.unlock();

	// Join WITHOUT holding chunk_mutex: serializer() locks chunk_mutex to steal
	// compressed buffers, so joining under the lock would deadlock against a
	// running thread.
	// NOTE(review): nothing here asks a running serializer to exit -- presumably
	// this is only called before the thread is started; confirm.
	if (serializer_thread.joinable()) serializer_thread.join();

	chunk_mutex.lock();
	multithreaded_write = false;
	chunk_mutex.unlock();
}

protected:
uint64_t uncompressed_bytes = 0; // total amount of uncompressed bytes written so far
uint64_t checkpoint_bytes = 0; // bytes at freeze checkpoint
Expand All @@ -156,6 +173,8 @@ class file_writer
buffer compress_chunk(buffer& uncompressed); // returns compressed buffer
void write_chunk(buffer& active);

bool multithreaded_compress = true;
bool multithreaded_write = true;
lava::mutex chunk_mutex;
FILE* fp = nullptr;
size_t uncompressed_chunk_size = 1024 * 1024 * 64; // use 64mb chunks by default
Expand Down
5 changes: 5 additions & 0 deletions src/replay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ static void usage()
printf("-A/--allocator type Use custom memory allocator callbacks [none, debug]\n");
printf("-N/--no-anisotropy Disable any use of sampler anisotropy\n");
printf("-B/--blackhole Do not actually submit any work to the GPU. May be useful for CPU measurements.\n");
printf("-nm/--no-multithread Do not do decompression and file read in a separate thread. May save some CPU load and memory.\n");
exit(-1);
}

Expand Down Expand Up @@ -231,6 +232,10 @@ int main(int argc, char **argv)
{
p__blackhole = 1;
}
else if (match(argv[i], "-nm", "--no-multithread", remaining))
{
p__disable_multithread_read = 1;
}
else if (match(argv[i], "-w", "--wsi", remaining))
{
std::string wsi = get_str(argv[++i], remaining);
Expand Down
3 changes: 3 additions & 0 deletions src/util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ uint_fast8_t p__no_anisotropy = get_env_bool("LAVATUBE_NO_ANISOTROPY", 0);
uint_fast8_t p__delay_fence_success_frames = get_env_int("LAVATUBE_DELAY_FENCE_SUCCESS_FRAMES", 0); // off by default
int p__chunksize = get_env_int("LAVATUBE_CHUNK_SIZE", 64 * 1024 * 1024);
uint_fast8_t p__external_memory = get_env_bool("LAVATUBE_EXTERNAL_MEMORY", 0);
// Runtime switches for turning off the worker threads. Each one is read from the
// named environment variable, with 0 passed as the second argument (presumably the
// default, i.e. multithreading stays on unless requested otherwise).
// The first two are checked when a lava_file_writer is constructed;
// p__disable_multithread_read is also set by lava-replay's -nm/--no-multithread.
uint_fast8_t p__disable_multithread_writeout = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_WRITEOUT", 0);
uint_fast8_t p__disable_multithread_compress = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_COMPRESS", 0);
uint_fast8_t p__disable_multithread_read = get_env_bool("LAVATUBE_DISABLE_MULTITHREADED_READ", 0);

const char* errorString(const VkResult errorCode)
{
Expand Down
3 changes: 3 additions & 0 deletions src/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ extern uint_fast8_t p__delay_fence_success_frames;
extern FILE* p__debug_destination;
extern int p__chunksize;
extern uint_fast8_t p__external_memory;
extern uint_fast8_t p__disable_multithread_writeout;
extern uint_fast8_t p__disable_multithread_compress;
extern uint_fast8_t p__disable_multithread_read;

/// Logging to be enable as needed by source recompilation
#define NEVER(_format, ...)
Expand Down
2 changes: 2 additions & 0 deletions src/write.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ lava_file_writer::lava_file_writer(uint16_t _tid, lava_writer* _parent) : parent
{
mTid = _tid;
get_thread_name(thread_name);
if (p__disable_multithread_compress) disable_multithreaded_compress();
if (p__disable_multithread_writeout) disable_multithreaded_writeout();
}

void lava_file_writer::set(const std::string& path)
Expand Down

0 comments on commit d5b89dc

Please sign in to comment.