Commit ee82cf1 — Add CommandBlock, use device VBO
1 parent 23f4b50

11 files changed: +411 −15 lines

guide/src/SUMMARY.md (3 additions, 1 deletion)

@@ -39,4 +39,6 @@
 - [Memory Allocation](memory/README.md)
   - [Vulkan Memory Allocator](memory/vma.md)
   - [Buffers](memory/buffers.md)
-  - [Host Vertex Buffer](memory/host_vertex_buffer.md)
+  - [Vertex Buffer](memory/vertex_buffer.md)
+  - [Command Block](memory/command_block.md)
+  - [Device Buffers](memory/device_buffers.md)

guide/src/memory/command_block.md (new file: 84 additions)

# Command Block

Long-lived vertex buffers perform better when backed by Device memory, especially for 3D meshes. Data is transferred to device buffers in two steps:

1. Allocate a host buffer and copy the data to its mapped memory
1. Allocate a device buffer, record a Buffer Copy operation, and submit it

The second step requires a command buffer and queue submission (_and_ waiting for the submitted work to complete). Encapsulate this behavior in a class; it will also be used later for creating images:

```cpp
class CommandBlock {
  public:
    explicit CommandBlock(vk::Device device, vk::Queue queue,
                          vk::CommandPool command_pool);

    [[nodiscard]] auto command_buffer() const -> vk::CommandBuffer {
        return *m_command_buffer;
    }

    void submit_and_wait();

  private:
    vk::Device m_device{};
    vk::Queue m_queue{};
    vk::UniqueCommandBuffer m_command_buffer{};
};
```

The constructor takes an existing command pool created for such ad-hoc allocations, and the queue for later submission. This way a Command Block can be passed around after creation and used by other code.

```cpp
CommandBlock::CommandBlock(vk::Device const device, vk::Queue const queue,
                           vk::CommandPool const command_pool)
    : m_device(device), m_queue(queue) {
    // allocate a UniqueCommandBuffer which will free the underlying command
    // buffer from its owning pool on destruction.
    auto allocate_info = vk::CommandBufferAllocateInfo{};
    allocate_info.setCommandPool(command_pool)
        .setCommandBufferCount(1)
        .setLevel(vk::CommandBufferLevel::ePrimary);
    // all the current VulkanHPP functions for UniqueCommandBuffer allocation
    // return vectors.
    auto command_buffers = m_device.allocateCommandBuffersUnique(allocate_info);
    m_command_buffer = std::move(command_buffers.front());

    // start recording commands before returning.
    auto begin_info = vk::CommandBufferBeginInfo{};
    begin_info.setFlags(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
    m_command_buffer->begin(begin_info);
}
```

`submit_and_wait()` resets the unique command buffer at the end, freeing it back to its command pool:

```cpp
void CommandBlock::submit_and_wait() {
    if (!m_command_buffer) { return; }

    // end recording and submit.
    m_command_buffer->end();
    auto submit_info = vk::SubmitInfo2KHR{};
    auto const command_buffer_info =
        vk::CommandBufferSubmitInfo{*m_command_buffer};
    submit_info.setCommandBufferInfos(command_buffer_info);
    auto fence = m_device.createFenceUnique({});
    m_queue.submit2(submit_info, *fence);

    // wait for the submit fence to be signaled.
    static constexpr auto timeout_v =
        static_cast<std::uint64_t>(std::chrono::nanoseconds(30s).count());
    auto const result = m_device.waitForFences(*fence, vk::True, timeout_v);
    if (result != vk::Result::eSuccess) {
        std::println(stderr, "Failed to submit Command Buffer");
    }
    // free the command buffer.
    m_command_buffer.reset();
}
```

## Multithreading considerations

Instead of blocking the main thread on every Command Block's `submit_and_wait()`, you might wonder whether command block usage could be multithreaded. It can, with some extra work: each thread requires its own command pool. Just using one owned (unique) pool per Command Block (with no need to free the buffer) is a good starting point. All queue operations need to be synchronized, i.e. wrapped in a critical section protected by a mutex; this includes Swapchain acquire/present calls as well as Queue submissions. A `class Queue` value type that stores a copy of the `vk::Queue` and a pointer/reference to its `std::mutex`, and wraps the submit call, can be passed to Command Blocks. Just this much enables asynchronous asset loading and the like: each loading thread uses its own command pool, and all queue submissions become critical sections. `VmaAllocator` is internally synchronized (this can be disabled at build time), so performing allocations through the same allocator on multiple threads is safe.

For multi-threaded rendering, use a Secondary command buffer per thread to record rendering commands, then accumulate and execute them in the main (Primary) command buffer currently in `RenderSync`. This is not particularly helpful unless you have thousands of expensive draw calls or dozens of render passes; recording even a hundred draws will likely be faster on a single thread.

guide/src/memory/device_buffers.md (new file: 139 additions)

# Device Buffers

This guide only uses device buffers for vertex buffers, with both vertex and index data strung together in a single VBO. The create function can thus take the data and perform the buffer copy operation before returning; in essence the return value is a "GPU const" buffer. To allow passing separate spans for vertices and indices (instead of forcing allocation of a contiguous bytestream and copying the data into it), the create function takes a slightly awkward span of spans:

```cpp
// disparate byte spans.
using ByteSpans = std::span<std::span<std::byte const> const>;

// returns a Device Buffer with each byte span sequentially written.
[[nodiscard]] auto create_device_buffer(VmaAllocator allocator,
                                        vk::BufferUsageFlags usage,
                                        CommandBlock command_block,
                                        ByteSpans const& byte_spans) -> Buffer;
```

Implement `create_device_buffer()`:

```cpp
auto vma::create_device_buffer(VmaAllocator allocator,
                               vk::BufferUsageFlags usage,
                               CommandBlock command_block,
                               ByteSpans const& byte_spans) -> Buffer {
    auto const total_size = std::accumulate(
        byte_spans.begin(), byte_spans.end(), 0uz,
        [](std::size_t const n, std::span<std::byte const> bytes) {
            return n + bytes.size();
        });

    // create staging Host Buffer with TransferSrc usage.
    auto staging_buffer = create_host_buffer(
        allocator, vk::BufferUsageFlagBits::eTransferSrc, total_size);

    // create the Device Buffer, ensuring TransferDst usage.
    usage |= vk::BufferUsageFlagBits::eTransferDst;
    auto allocation_ci = VmaAllocationCreateInfo{};
    allocation_ci.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
    allocation_ci.flags =
        VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
    auto ret = create_buffer(allocator, allocation_ci, usage, total_size);

    // can't do anything if either buffer creation failed.
    if (!staging_buffer.get().buffer || !ret.get().buffer) { return {}; }

    // copy byte spans into the staging buffer.
    auto dst = staging_buffer.get().mapped_span();
    for (auto const bytes : byte_spans) {
        std::memcpy(dst.data(), bytes.data(), bytes.size());
        dst = dst.subspan(bytes.size());
    }

    // record the buffer copy operation.
    auto buffer_copy = vk::BufferCopy2{};
    buffer_copy.setSize(total_size);
    auto copy_buffer_info = vk::CopyBufferInfo2{};
    copy_buffer_info.setSrcBuffer(staging_buffer.get().buffer)
        .setDstBuffer(ret.get().buffer)
        .setRegions(buffer_copy);
    command_block.command_buffer().copyBuffer2(copy_buffer_info);

    // submit and wait.
    // waiting here is necessary to keep the staging buffer alive while the GPU
    // accesses it through the recorded commands.
    // this is also why the function takes ownership of the passed CommandBlock
    // instead of just referencing it / taking a vk::CommandBuffer.
    command_block.submit_and_wait();

    return ret;
}
```

Add a command block pool to `App`, and a helper function to create command blocks:

```cpp
void App::create_cmd_block_pool() {
    auto command_pool_ci = vk::CommandPoolCreateInfo{};
    command_pool_ci
        .setQueueFamilyIndex(m_gpu.queue_family)
        // this flag indicates that the allocated Command Buffers will be
        // short-lived.
        .setFlags(vk::CommandPoolCreateFlagBits::eTransient);
    m_cmd_block_pool = m_device->createCommandPoolUnique(command_pool_ci);
}

auto App::create_command_block() const -> CommandBlock {
    return CommandBlock{*m_device, m_queue, *m_cmd_block_pool};
}
```

Update `create_vertex_buffer()` to create a quad with indices:

```cpp
template <typename T>
[[nodiscard]] constexpr auto to_byte_array(T const& t) {
    return std::bit_cast<std::array<std::byte, sizeof(T)>>(t);
}

// ...

void App::create_vertex_buffer() {
    // vertices of a quad.
    static constexpr auto vertices_v = std::array{
        Vertex{.position = {-0.5f, -0.5f}, .color = {1.0f, 0.0f, 0.0f}},
        Vertex{.position = {0.5f, -0.5f}, .color = {0.0f, 1.0f, 0.0f}},
        Vertex{.position = {0.5f, 0.5f}, .color = {0.0f, 0.0f, 1.0f}},
        Vertex{.position = {-0.5f, 0.5f}, .color = {1.0f, 1.0f, 0.0f}},
    };
    static constexpr auto indices_v = std::array{
        0u, 1u, 2u, 2u, 3u, 0u,
    };
    static constexpr auto vertices_bytes_v = to_byte_array(vertices_v);
    static constexpr auto indices_bytes_v = to_byte_array(indices_v);
    static constexpr auto total_bytes_v =
        std::array<std::span<std::byte const>, 2>{
            vertices_bytes_v,
            indices_bytes_v,
        };
    // we want to write total_bytes_v to a Device VertexBuffer | IndexBuffer.
    m_vbo = vma::create_device_buffer(m_allocator.get(),
                                      vk::BufferUsageFlagBits::eVertexBuffer |
                                          vk::BufferUsageFlagBits::eIndexBuffer,
                                      create_command_block(), total_bytes_v);
}
```

Update `draw()`:

```cpp
void App::draw(vk::CommandBuffer const command_buffer) const {
    m_shader->bind(command_buffer, m_framebuffer_size);
    // single VBO at binding 0 at no offset.
    command_buffer.bindVertexBuffers(0, m_vbo.get().buffer, vk::DeviceSize{});
    // u32 indices after an offset of 4 vertices.
    command_buffer.bindIndexBuffer(m_vbo.get().buffer, 4 * sizeof(Vertex),
                                   vk::IndexType::eUint32);
    // m_vbo has 6 indices.
    command_buffer.drawIndexed(6, 1, 0, 0, 0);
}
```
![VBO Quad](./vbo_quad.png)
File renamed without changes.

guide/src/memory/host_vertex_buffer.md renamed to guide/src/memory/vertex_buffer.md (3 additions, 3 deletions)

@@ -1,6 +1,6 @@
-# Host Vertex Buffer
+# Vertex Buffer

-The goal here is to move the hard-coded vertices in the shader to application code. For the time being we will use an ad-hoc Host type `vma::Buffer` and focus more on the rest of the infrastructure like vertex attributes.
+The goal here is to move the hard-coded vertices in the shader to application code. For the time being we will use an ad-hoc Host `vma::Buffer` and focus more on the rest of the infrastructure like vertex attributes.

 First add a new header, `vertex.hpp`:

@@ -97,6 +97,6 @@ command_buffer.bindVertexBuffers(0, m_vbo->get_raw().buffer,
 command_buffer.draw(3, 1, 0, 0);
 ```

-You should see the same triangle as before. But now we can use whatever set of vertices we like! The Primitive Topology is Triangle List by default, so every three vertices in the array is drawn as a triangle, e.g. for 9 vertices: `[[0, 1, 2], [3, 4, 5], [6, 7, 8]]`, where each inner `[]` represents a triangle comprised of the vertices at those indices.
+You should see the same triangle as before. But now we can use whatever set of vertices we like! The Primitive Topology is Triangle List by default, so every three vertices in the array is drawn as a triangle, e.g. for 9 vertices: `[[0, 1, 2], [3, 4, 5], [6, 7, 8]]`, where each inner `[]` represents a triangle comprised of the vertices at those indices. Try playing around with customized vertices and topologies, and use RenderDoc to debug unexpected output.

 Host Vertex Buffers are useful for primitives that are temporary and/or frequently changing, such as UI objects. A 2D framework can use such VBOs exclusively: a simple approach would be a pool of buffers per virtual frame, where for each draw a buffer is obtained from the current virtual frame's pool and vertices are copied in.

src/app.cpp (44 additions, 11 deletions)

@@ -1,5 +1,6 @@
 #include <app.hpp>
 #include <vertex.hpp>
+#include <bit>
 #include <cassert>
 #include <chrono>
 #include <fstream>

@@ -12,6 +13,11 @@ namespace lvk {
 using namespace std::chrono_literals;

 namespace {
+template <typename T>
+[[nodiscard]] constexpr auto to_byte_array(T const& t) {
+    return std::bit_cast<std::array<std::byte, sizeof(T)>>(t);
+}
+
 [[nodiscard]] auto locate_assets_dir() -> fs::path {
     // look for '<path>/assets/', starting from the working
     // directory and walking up the parent directory tree.

@@ -83,6 +89,7 @@ void App::run() {
     create_render_sync();
     create_imgui();
     create_shader();
+    create_cmd_block_pool();

     create_vertex_buffer();

@@ -254,26 +261,49 @@ void App::create_shader() {
     m_shader.emplace(shader_ci);
 }

+void App::create_cmd_block_pool() {
+    auto command_pool_ci = vk::CommandPoolCreateInfo{};
+    command_pool_ci
+        .setQueueFamilyIndex(m_gpu.queue_family)
+        // this flag indicates that the allocated Command Buffers will be
+        // short-lived.
+        .setFlags(vk::CommandPoolCreateFlagBits::eTransient);
+    m_cmd_block_pool = m_device->createCommandPoolUnique(command_pool_ci);
+}
+
 void App::create_vertex_buffer() {
-    // vertices previously hard-coded in the vertex shader.
+    // vertices of a quad.
     static constexpr auto vertices_v = std::array{
         Vertex{.position = {-0.5f, -0.5f}, .color = {1.0f, 0.0f, 0.0f}},
         Vertex{.position = {0.5f, -0.5f}, .color = {0.0f, 1.0f, 0.0f}},
-        Vertex{.position = {0.0f, 0.5f}, .color = {0.0f, 0.0f, 1.0f}},
+        Vertex{.position = {0.5f, 0.5f}, .color = {0.0f, 0.0f, 1.0f}},
+        Vertex{.position = {-0.5f, 0.5f}, .color = {1.0f, 1.0f, 0.0f}},
     };
-    // we want to write vertices_v to a Host VertexBuffer.
-    m_vbo = vma::create_host_buffer(m_allocator.get(),
-                                    vk::BufferUsageFlagBits::eVertexBuffer,
-                                    sizeof(vertices_v));
-
-    // host buffers have a memory-mapped pointer available to memcpy data to.
-    std::memcpy(m_vbo.get().mapped, vertices_v.data(), sizeof(vertices_v));
+    static constexpr auto indices_v = std::array{
+        0u, 1u, 2u, 2u, 3u, 0u,
+    };
+    static constexpr auto vertices_bytes_v = to_byte_array(vertices_v);
+    static constexpr auto indices_bytes_v = to_byte_array(indices_v);
+    static constexpr auto total_bytes_v =
+        std::array<std::span<std::byte const>, 2>{
+            vertices_bytes_v,
+            indices_bytes_v,
+        };
+    // we want to write total_bytes_v to a Device VertexBuffer | IndexBuffer.
+    m_vbo = vma::create_device_buffer(m_allocator.get(),
+                                      vk::BufferUsageFlagBits::eVertexBuffer |
+                                          vk::BufferUsageFlagBits::eIndexBuffer,
+                                      create_command_block(), total_bytes_v);
 }

 auto App::asset_path(std::string_view const uri) const -> fs::path {
     return m_assets_dir / uri;
 }

+auto App::create_command_block() const -> CommandBlock {
+    return CommandBlock{*m_device, m_queue, *m_cmd_block_pool};
+}
+
 void App::main_loop() {
     while (glfwWindowShouldClose(m_window.get()) == GLFW_FALSE) {
         glfwPollEvents();

@@ -450,7 +480,10 @@ void App::draw(vk::CommandBuffer const command_buffer) const {
     m_shader->bind(command_buffer, m_framebuffer_size);
     // single VBO at binding 0 at no offset.
     command_buffer.bindVertexBuffers(0, m_vbo.get().buffer, vk::DeviceSize{});
-    // m_vbo has 3 vertices.
-    command_buffer.draw(3, 1, 0, 0);
+    // u32 indices after offset of 4 vertices.
+    command_buffer.bindIndexBuffer(m_vbo.get().buffer, 4 * sizeof(Vertex),
+                                   vk::IndexType::eUint32);
+    // m_vbo has 6 indices.
+    command_buffer.drawIndexed(6, 1, 0, 0, 0);
 }
 } // namespace lvk

src/app.hpp (5 additions, 0 deletions)

@@ -1,4 +1,5 @@
 #pragma once
+#include <command_block.hpp>
 #include <dear_imgui.hpp>
 #include <gpu.hpp>
 #include <resource_buffering.hpp>

@@ -38,9 +39,11 @@ class App {
     void create_imgui();
     void create_allocator();
     void create_shader();
+    void create_cmd_block_pool();
     void create_vertex_buffer();

     [[nodiscard]] auto asset_path(std::string_view uri) const -> fs::path;
+    [[nodiscard]] auto create_command_block() const -> CommandBlock;

     void main_loop();

@@ -70,6 +73,8 @@ class App {
     std::optional<Swapchain> m_swapchain{};
     // command pool for all render Command Buffers.
     vk::UniqueCommandPool m_render_cmd_pool{};
+    // command pool for all Command Blocks.
+    vk::UniqueCommandPool m_cmd_block_pool{};
     // Sync and Command Buffer for virtual frames.
     Buffered<RenderSync> m_render_sync{};
     // Current virtual frame index.
