diff --git a/cyfra-e2e-test/src/test/scala/io/computenode/cyfra/vulkan/SequenceExecutorTest.scala b/cyfra-e2e-test/src/test/scala/io/computenode/cyfra/vulkan/SequenceExecutorTest.scala index 31927a59..8657d927 100644 --- a/cyfra-e2e-test/src/test/scala/io/computenode/cyfra/vulkan/SequenceExecutorTest.scala +++ b/cyfra-e2e-test/src/test/scala/io/computenode/cyfra/vulkan/SequenceExecutorTest.scala @@ -1,12 +1,14 @@ package io.computenode.cyfra.vulkan - import io.computenode.cyfra.vulkan.compute.{Binding, ComputePipeline, InputBufferSize, LayoutInfo, LayoutSet, Shader} import io.computenode.cyfra.vulkan.executor.BufferAction.{LoadFrom, LoadTo} import io.computenode.cyfra.vulkan.executor.SequenceExecutor import io.computenode.cyfra.vulkan.executor.SequenceExecutor.{ComputationSequence, Compute, Dependency, LayoutLocation} +import io.computenode.cyfra.vulkan.memory.Buffer import munit.FunSuite import org.lwjgl.BufferUtils +import org.lwjgl.vulkan.VK10.* +import org.lwjgl.util.vma.Vma.* class SequenceExecutorTest extends FunSuite: private val vulkanContext = VulkanContext(true) @@ -24,10 +26,31 @@ class SequenceExecutorTest extends FunSuite: ) val sequenceExecutor = new SequenceExecutor(sequence, vulkanContext) val input = 0 until 1024 - val buffer = BufferUtils.createByteBuffer(input.length * 4) - input.foreach(buffer.putInt) - buffer.flip() - val res = sequenceExecutor.execute(Seq(buffer), input.length) - val output = input.map(_ => res.head.getInt) + + val inputBuffer = new Buffer( + input.length * 4, // 4 bytes per int + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + VMA_MEMORY_USAGE_CPU_ONLY, + vulkanContext.allocator + ) + + val mappedBuffer = inputBuffer.map() + input.foreach(mappedBuffer.putInt) + inputBuffer.unmap() + + val res = sequenceExecutor.execute(Seq(inputBuffer), input.length) + + val outputMappedBuffer = res.head.map() + val output = (0 until input.length).map(_ => outputMappedBuffer.getInt) + res.head.unmap() assertEquals(input.map(_ + 20000).toList, output.toList) + + // Clean up + inputBuffer.destroy() + res.foreach(_.destroy()) + sequenceExecutor.destroy() + copy1.destroy() + copy2.destroy() + shader.destroy() \ No newline at end of file diff --git a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/animation/AnimatedFunctionRenderer.scala b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/animation/AnimatedFunctionRenderer.scala index 514558ec..9c820b29 100644 --- a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/animation/AnimatedFunctionRenderer.scala +++ b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/animation/AnimatedFunctionRenderer.scala @@ -32,9 +32,10 @@ class AnimatedFunctionRenderer(params: AnimatedFunctionRenderer.Parameters) exte protected override def renderFrame(scene: AnimatedFunction, time: Float32, fn: RenderFn): Array[fRGBA] = val mem = Array.fill(params.width * params.height)((0.5f, 0.5f, 0.5f, 0.5f)) - UniformContext.withUniform(AnimationIteration(time)): + val uniformStruct = AnimationIteration(time) + UniformContext.withUniform(uniformStruct): val fmem = Vec4FloatMem(mem) - fmem.map(fn).asInstanceOf[Vec4FloatMem].toArray + fmem.map(uniformStruct, fn).asInstanceOf[Vec4FloatMem].toArray protected override def renderFunction(scene: AnimatedFunction): RenderFn = GFunction.from2D(params.width, { diff --git a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/ImageRtRenderer.scala b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/ImageRtRenderer.scala index 2057a54b..7ddd54a2 100644 --- a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/ImageRtRenderer.scala +++ b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/ImageRtRenderer.scala @@ -34,10 +34,11 @@ class ImageRtRenderer(params: ImageRtRenderer.Parameters) extends RtRenderer(par private def render(scene: Scene, fn: GFunction[RaytracingIteration, Vec4[Float32], Vec4[Float32]]): LazyList[Array[fRGBA]] = val initialMem = Array.fill(params.width * params.height)((0.5f, 0.5f, 0.5f, 0.5f)) LazyList.iterate((initialMem, 0), params.iterations + 1) { case (mem, render) => - UniformContext.withUniform(RaytracingIteration(render)): + val uniformStruct = RaytracingIteration(render) + UniformContext.withUniform(uniformStruct): val fmem = Vec4FloatMem(mem) val result = timed(s"Rendered iteration $render")( - fmem.map(fn).asInstanceOf[Vec4FloatMem].toArray + fmem.map(uniformStruct, fn).asInstanceOf[Vec4FloatMem].toArray ) (result, render + 1) }.drop(1).map(_._1) diff --git a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/animation/AnimationRtRenderer.scala b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/animation/AnimationRtRenderer.scala index 12e94f4d..b2de521b 100644 --- a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/animation/AnimationRtRenderer.scala +++ b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/animation/AnimationRtRenderer.scala @@ -29,9 +29,10 @@ class AnimationRtRenderer(params: AnimationRtRenderer.Parameters) extends RtRend ): Array[fRGBA] = val initialMem = Array.fill(params.width * params.height)((0.5f, 0.5f, 0.5f, 0.5f)) List.iterate((initialMem, 0), params.iterations + 1) { case (mem, render) => - UniformContext.withUniform(RaytracingIteration(render, time)): + val uniformStruct = RaytracingIteration(render, time) + UniformContext.withUniform(uniformStruct): val fmem = Vec4FloatMem(mem) - val result = fmem.map(fn).asInstanceOf[Vec4FloatMem].toArray + val result = fmem.map(uniformStruct, fn).asInstanceOf[Vec4FloatMem].toArray (result, render + 1) }.map(_._1).last diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala index 3a80afca..b9491ba2 100644 --- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala +++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala @@ -11,85 +11,140 @@ import SequenceExecutor.* import io.computenode.cyfra.runtime.mem.GMem.totalStride import io.computenode.cyfra.spirv.SpirvTypes.typeStride import io.computenode.cyfra.spirv.compilers.DSLCompiler -import io.computenode.cyfra.spirv.compilers.ExpressionCompiler.{UniformStructRef, WorkerIndex} +import io.computenode.cyfra.spirv.compilers.ExpressionCompiler +import io.computenode.cyfra.dsl.Expression.E import mem.{FloatMem, GMem, Vec4FloatMem} import org.lwjgl.system.{Configuration, MemoryUtil} import izumi.reflect.Tag - -import java.io.FileOutputStream +import io.computenode.cyfra.vulkan.memory.Buffer +import org.lwjgl.vulkan.VK10.* +import org.lwjgl.util.vma.Vma.* import java.nio.ByteBuffer +import java.io.{FileOutputStream, IOException} import java.nio.channels.FileChannel -import java.util.concurrent.Executors +import scala.collection.mutable +import scala.collection.mutable.ListBuffer import scala.concurrent.{ExecutionContext, ExecutionContextExecutor} -class GContext: - - Configuration.STACK_SIZE.set(1024) // fix lwjgl stack size +class GContext(debug: Boolean = false): + val vkContext = VulkanContext(debug) + private val pipelineCache = mutable.Map[Any, ComputePipeline]() - val vkContext = new VulkanContext(enableValidationLayers = true) + private def createPipeline[G <: GStruct[G] : GStructSchema, H <: Value : Tag : FromExpr, R <: Value : Tag : FromExpr]( + function: GFunction[G, H, R] + ): ComputePipeline = { + val uniformStructSchemaImpl = summon[GStructSchema[G]] + val tagGImpl: Tag[G] = uniformStructSchemaImpl.structTag - implicit val ec: ExecutionContextExecutor = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(16)) - - def compile[ - G <: GStruct[G] : Tag : GStructSchema, - H <: Value : Tag : FromExpr, - R <: Value : Tag : FromExpr - ](function: GFunction[G, H, R]): ComputePipeline = { - val uniformStructSchema = summon[GStructSchema[G]] - val uniformStruct = uniformStructSchema.fromTree(UniformStructRef) + val uniformStruct = uniformStructSchemaImpl.fromTree( + ExpressionCompiler.UniformStructRef[G](using tagGImpl).asInstanceOf[E[G]] + ) val tree = function - .fn + .fn .apply( uniformStruct, - WorkerIndex, + ExpressionCompiler.WorkerIndex, GArray[H](0) ) - val shaderCode = DSLCompiler.compile(tree, function.arrayInputs, function.arrayOutputs, uniformStructSchema) + val shaderCode = DSLCompiler.compile(tree, function.arrayInputs, function.arrayOutputs, uniformStructSchemaImpl) dumpSpvToFile(shaderCode, "program.spv") // TODO remove before release - val inOut = 0 to 1 map (Binding(_, InputBufferSize(typeStride(summon[Tag[H]])))) - val uniform = Option.when(uniformStructSchema.fields.nonEmpty)(Binding(2, UniformSize(totalStride(uniformStructSchema)))) - val layoutInfo = LayoutInfo(Seq(LayoutSet(0, inOut ++ uniform))) + + val inputBinding = Binding(0, InputBufferSize(typeStride(summon[Tag[H]]))) + val outputBinding = Binding(1, InputBufferSize(typeStride(summon[Tag[R]]))) + + val uniformBindingOpt = Option.when(uniformStructSchemaImpl.fields.nonEmpty)( + Binding(2, UniformSize(GMem.totalStride(uniformStructSchemaImpl))) + ) + + val bindings = Seq(inputBinding, outputBinding) ++ uniformBindingOpt.toSeq + val layoutInfo = LayoutInfo(Seq(LayoutSet(0, bindings))) + val shader = new Shader(shaderCode, new org.joml.Vector3i(256, 1, 1), layoutInfo, "main", vkContext.device) new ComputePipeline(shader, vkContext) } private def dumpSpvToFile(code: ByteBuffer, path: String): Unit = - val fc: FileChannel = new FileOutputStream("program.spv").getChannel - fc.write(code) - fc.close() - code.rewind() + try { + val fc: FileChannel = new FileOutputStream(path).getChannel + fc.write(code) + fc.close() + } catch { + case e: IOException => e.printStackTrace() + } finally { + code.rewind() + } def execute[ G <: GStruct[G] : Tag : GStructSchema, - H <: Value, - R <: Value - ](mem: GMem[H], fn: GFunction[?, H, R])(using uniformContext: UniformContext[_]): GMem[R] = - val isUniformEmpty = uniformContext.uniform.schema.fields.isEmpty - val actions = Map( - LayoutLocation(0, 0) -> BufferAction.LoadTo, - LayoutLocation(0, 1) -> BufferAction.LoadFrom - ) ++ ( - if isUniformEmpty then Map.empty - else Map(LayoutLocation(0, 2) -> BufferAction.LoadTo) - ) - val sequence = ComputationSequence(Seq(Compute(fn.pipeline, actions)), Seq.empty) - val executor = new SequenceExecutor(sequence, vkContext) + H <: Value : Tag : FromExpr, + R <: Value : FromExpr : Tag + ](mem: GMem[H], uniformStruct: G, fn: GFunction[G, H, R]): GMem[R] = { + val pipeline = pipelineCache.getOrElseUpdate(fn.fn, createPipeline(fn)) + + val sourceBuffersForExecutor = ListBuffer[Buffer]() + val bufferActions = mutable.Map[LayoutLocation, BufferAction]() + + bufferActions.put(LayoutLocation(0, 0), BufferAction.LoadTo) + sourceBuffersForExecutor.addOne(mem.vulkanBuffer) + + bufferActions.put(LayoutLocation(0, 1), BufferAction.LoadFrom) + + var uniformStagingBufferOpt: Option[Buffer] = None + val uniformStructSchema = summon[GStructSchema[G]] + if (uniformStructSchema.fields.nonEmpty) { + val uniformCPUByteBuffer = GMem.serializeUniform(uniformStruct) + val uniformStagingVkBuffer = new Buffer( + uniformCPUByteBuffer.remaining(), // Changed from .toLong to direct Int, or .toInt if remaining() can exceed Int + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + VMA_MEMORY_USAGE_CPU_ONLY, + vkContext.allocator + ) + uniformStagingVkBuffer.map { mappedUniform => + mappedUniform.put(uniformCPUByteBuffer) + } + + uniformStagingBufferOpt = Some(uniformStagingVkBuffer) + bufferActions.put(LayoutLocation(0, 2), BufferAction.LoadTo) + sourceBuffersForExecutor.addOne(uniformStagingVkBuffer) + } + + val computeStep = Compute(pipeline, bufferActions.toMap) + val sequence = ComputationSequence(Seq(computeStep), dependencies = Nil) + val sequenceExecutor = new SequenceExecutor(sequence, vkContext) + + val outputVulkanBuffers = sequenceExecutor.execute(sourceBuffersForExecutor.toSeq, mem.size) - val data = mem.toReadOnlyBuffer - val inData = - if isUniformEmpty then Seq(data) - else Seq(data, GMem.serializeUniform(uniformContext.uniform)) - val out = executor.execute(inData, mem.size) - executor.destroy() - - val outTags = fn.arrayOutputs - assert(outTags.size == 1) - - outTags.head match - case t if t == Tag[Float32] => - new FloatMem(mem.size, out.head).asInstanceOf[GMem[R]] - case t if t == Tag[Vec4[Float32]] => - new Vec4FloatMem(mem.size, out.head).asInstanceOf[GMem[R]] - case _ => assert(false, "Supported output types are Float32 and Vec4[Float32]") + uniformStagingBufferOpt.foreach(_.destroy()) + + if (outputVulkanBuffers.isEmpty) { + throw new IllegalStateException("SequenceExecutor did not return an output buffer.") + } + val resultVulkanBuffer = outputVulkanBuffers.head + + val tagR = summon[Tag[R]] + val resultMem = + if (tagR.tag =:= Tag[Float32].tag) { + new FloatMem(mem.size, resultVulkanBuffer).asInstanceOf[GMem[R]] + } else if (tagR.tag =:= Tag[Vec4[Float32]].tag) { + new Vec4FloatMem(mem.size, resultVulkanBuffer).asInstanceOf[GMem[R]] + } else { + resultVulkanBuffer.destroy() + throw new UnsupportedOperationException(s"Cannot create GMem for result type ${tagR.tag}. Output buffer has been destroyed.") + } + resultMem + } + + def execute[H <: Value : Tag : FromExpr, R <: Value : FromExpr : Tag]( + mem: GMem[H], + fn: GFunction[GStruct.Empty, H, R] + ): GMem[R] = + execute[GStruct.Empty, H, R](mem, GStruct.Empty(), fn) + + def cleanup(): Unit = { + pipelineCache.values.foreach(_.destroy()) + pipelineCache.clear() + vkContext.destroy() + } diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GFunction.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GFunction.scala index 48c2d5b5..59cca842 100644 --- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GFunction.scala +++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GFunction.scala @@ -1,38 +1,39 @@ package io.computenode.cyfra.runtime -import io.computenode.cyfra.dsl.{*, given} +import io.computenode.cyfra.dsl.{*, given} import io.computenode.cyfra.dsl.Value.Int32 -import io.computenode.cyfra.vulkan.compute.ComputePipeline +import io.computenode.cyfra.dsl.Expression.E import izumi.reflect.Tag case class GFunction[ - G <: GStruct[G] : GStructSchema : Tag, - H <: Value : Tag : FromExpr, + G <: GStruct[G] : GStructSchema : Tag, + H <: Value : Tag : FromExpr, R <: Value : Tag : FromExpr -](fn: (G, Int32, GArray[H]) => R)(implicit context: GContext){ +]( + val fn: (G, Int32, GArray[H]) => R +) { def arrayInputs: List[Tag[_]] = List(summon[Tag[H]]) def arrayOutputs: List[Tag[_]] = List(summon[Tag[R]]) - val pipeline: ComputePipeline = context.compile(this) } object GFunction: def apply[ H <: Value : Tag : FromExpr, R <: Value : Tag : FromExpr - ](fn: H => R)(using context: GContext): GFunction[GStruct.Empty, H, R] = + ](userSimpleFn: H => R): GFunction[GStruct.Empty, H, R] = new GFunction[GStruct.Empty, H, R]( - (_, index: Int32, gArray: GArray[H]) => fn(gArray.at(index)) + (_: GStruct.Empty, workerIdx: Int32, gArray: GArray[H]) => userSimpleFn(gArray.at(workerIdx)) ) def from2D[ G <: GStruct[G] : GStructSchema : Tag, H <: Value : Tag : FromExpr, R <: Value : Tag : FromExpr - ](width: Int, fn: (G, (Int32, Int32), GArray2D[H]) => R)(using context: GContext): GFunction[G, H, R] = - GFunction[G, H, R]( - (g: G, index: Int32, a: GArray[H]) => + ](width: Int, userFn2D: (G, (Int32, Int32), GArray2D[H]) => R): GFunction[G, H, R] = + new GFunction[G, H, R]( + (g: G, index: Int32, garray: GArray[H]) => val x: Int32 = index mod width val y: Int32 = index / width - val arr = GArray2D(width, a) - fn(g, (x, y), arr) + val arr2d = GArray2D(width, garray) + userFn2D(g, (x, y), arr2d) ) diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala index f8919e12..36414925 100644 --- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala +++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala @@ -1,29 +1,85 @@ package io.computenode.cyfra.runtime.mem import io.computenode.cyfra.dsl.Value.Float32 +import io.computenode.cyfra.vulkan.memory.Buffer +import io.computenode.cyfra.runtime.GContext +import org.lwjgl.vulkan.VK10.* +import org.lwjgl.util.vma.Vma.* import java.nio.ByteBuffer -import org.lwjgl.system.MemoryUtil -class FloatMem(val size: Int, protected val data: ByteBuffer) extends RamGMem[Float32, Float]: - def toArray: Array[Float] = - val res = data.asFloatBuffer() - val result = new Array[Float](size) - res.get(result) +class FloatMem(val size: Int, val vulkanBuffer: Buffer) extends RamGMem[Float32, Float]: + def toArray(using context: GContext): Array[Float] = + val allocator = context.vkContext.allocator + val commandPool = context.vkContext.commandPool + val bufferSize = size.toLong * FloatMem.FloatSize + + val stagingBuffer = new Buffer( + bufferSize.toInt, + VK_BUFFER_USAGE_TRANSFER_DST_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + VMA_MEMORY_USAGE_GPU_TO_CPU, + allocator + ) + + Buffer.copyBuffer(vulkanBuffer, stagingBuffer, bufferSize, commandPool).block().close() + + val result = stagingBuffer.map { byteBuffer => + val floatBuffer = byteBuffer.asFloatBuffer() + val arr = new Array[Float](size) + floatBuffer.get(arr) + arr + } + + stagingBuffer.destroy() result + def cleanup(): Unit = + vulkanBuffer.destroy() object FloatMem { val FloatSize = 4 - def apply(floats: Array[Float]): FloatMem = + def apply(floats: Array[Float])(using context: GContext): FloatMem = val size = floats.length - val data = ByteBuffer.allocateDirect(size * FloatSize) - data.asFloatBuffer().put(floats) - data.rewind() - new FloatMem(size, data) - - def apply(size: Int): FloatMem = - val data = ByteBuffer.allocateDirect(size * FloatSize) - new FloatMem(size, data) + val bufferSize = size.toLong * FloatSize + val allocator = context.vkContext.allocator + val commandPool = context.vkContext.commandPool + + val stagingBuffer = new Buffer( + bufferSize.toInt, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + VMA_MEMORY_USAGE_CPU_ONLY, + allocator + ) + + stagingBuffer.map { byteBuffer => + byteBuffer.asFloatBuffer().put(floats) + } + + val deviceBuffer = new Buffer( + bufferSize.toInt, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + 0, + VMA_MEMORY_USAGE_GPU_ONLY, + allocator + ) + + Buffer.copyBuffer(stagingBuffer, deviceBuffer, bufferSize, commandPool).block().close() + stagingBuffer.destroy() + + new FloatMem(size, deviceBuffer) + + def apply(size: Int)(using context: GContext): FloatMem = + val bufferSize = size.toLong * FloatSize + val allocator = context.vkContext.allocator + val deviceBuffer = new Buffer( + bufferSize.toInt, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + 0, + VMA_MEMORY_USAGE_GPU_ONLY, + allocator + ) + new FloatMem(size, deviceBuffer) } diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala index eb04de4c..1f2c750c 100644 --- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala +++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala @@ -9,18 +9,25 @@ import io.computenode.cyfra.spirv.SpirvTypes.typeStride import io.computenode.cyfra.runtime.{GFunction, GContext} import izumi.reflect.Tag -import org.lwjgl.system.MemoryUtil - import java.nio.ByteBuffer +import io.computenode.cyfra.vulkan.memory.Buffer -trait GMem[H <: Value]: +trait GMem[H <: Value : Tag : FromExpr]: def size: Int - def toReadOnlyBuffer: ByteBuffer + def vulkanBuffer: Buffer + def map[ G <: GStruct[G] : Tag : GStructSchema, R <: Value : FromExpr : Tag - ](fn: GFunction[G, H, R])(using context: GContext): GMem[R] = - context.execute(this, fn) + ](uniformStruct: G, fn: GFunction[G, H, R])(using context: GContext): GMem[R] = + context.execute(this, uniformStruct, fn) + + def map[R <: Value : FromExpr : Tag] + (fn: GFunction[GStruct.Empty, H, R])(using context: GContext): GMem[R] = + context.execute(this, fn) + + def cleanup(): Unit +end GMem object GMem: type fRGBA = (Float, Float, Float, Float) diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/RamGMem.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/RamGMem.scala index 43e45f30..3aebacd1 100644 --- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/RamGMem.scala +++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/RamGMem.scala @@ -1,9 +1,10 @@ package io.computenode.cyfra.runtime.mem import io.computenode.cyfra.dsl.Value +import io.computenode.cyfra.vulkan.memory.Buffer import java.nio.ByteBuffer -trait RamGMem[T <: Value, R] extends GMem[T]: - protected val data: ByteBuffer - def toReadOnlyBuffer: ByteBuffer = data.asReadOnlyBuffer() +trait RamGMem[T <: Value, R] extends GMem[T] { + +} \ No newline at end of file diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala index eaa84e4c..5c93c70e 100644 --- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala +++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala @@ -2,36 +2,92 @@ package io.computenode.cyfra.runtime.mem import io.computenode.cyfra.dsl.Value.{Float32, Vec4} import io.computenode.cyfra.runtime.mem.GMem.fRGBA +import io.computenode.cyfra.vulkan.memory.Buffer +import io.computenode.cyfra.runtime.GContext +import org.lwjgl.vulkan.VK10.* +import org.lwjgl.util.vma.Vma.* -import org.lwjgl.system.MemoryUtil import java.nio.ByteBuffer -class Vec4FloatMem(val size: Int, protected val data: ByteBuffer) extends RamGMem[Vec4[Float32], fRGBA]: - def toArray: Array[fRGBA] = { - val res = data.asFloatBuffer() - val result = new Array[fRGBA](size) - for (i <- 0 until size) - result(i) = (res.get(), res.get(), res.get(), res.get()) +class Vec4FloatMem(val size: Int, val vulkanBuffer: Buffer) extends RamGMem[Vec4[Float32], fRGBA]: + def toArray(using context: GContext): Array[fRGBA] = { + val allocator = context.vkContext.allocator + val commandPool = context.vkContext.commandPool + val bufferSize = size.toLong * Vec4FloatMem.Vec4FloatSize + + val stagingBuffer = new Buffer( + bufferSize.toInt, + VK_BUFFER_USAGE_TRANSFER_DST_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + VMA_MEMORY_USAGE_GPU_TO_CPU, + allocator + ) + + Buffer.copyBuffer(vulkanBuffer, stagingBuffer, bufferSize, commandPool).block().close() + + val result = stagingBuffer.map { byteBuffer => + val floatBuffer = byteBuffer.asFloatBuffer() + val arr = new Array[fRGBA](size) + for (i <- 0 until size) + arr(i) = (floatBuffer.get(), floatBuffer.get(), floatBuffer.get(), floatBuffer.get()) + arr + } + stagingBuffer.destroy() result } + def cleanup(): Unit = + vulkanBuffer.destroy() object Vec4FloatMem: val Vec4FloatSize = 16 - def apply(vecs: Array[fRGBA]): Vec4FloatMem = { + def apply(vecs: Array[fRGBA])(using context: GContext): Vec4FloatMem = { val size = vecs.length - val data = ByteBuffer.allocateDirect(size * Vec4FloatSize) - vecs.foreach { case (x, y, z, a) => - data.putFloat(x) - data.putFloat(y) - data.putFloat(z) - data.putFloat(a) + val bufferSize = size.toLong * Vec4FloatSize + val allocator = context.vkContext.allocator + val commandPool = context.vkContext.commandPool + + val stagingBuffer = new Buffer( + bufferSize.toInt, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + VMA_MEMORY_USAGE_CPU_ONLY, + allocator + ) + + stagingBuffer.map { byteBuffer => + val floatBuffer = byteBuffer.asFloatBuffer() + vecs.foreach { case (x, y, z, a) => + floatBuffer.put(x) + floatBuffer.put(y) + floatBuffer.put(z) + floatBuffer.put(a) + } } - data.rewind() - new Vec4FloatMem(size, data) + + val deviceBuffer = new Buffer( + bufferSize.toInt, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + 0, + VMA_MEMORY_USAGE_GPU_ONLY, + allocator + ) + + Buffer.copyBuffer(stagingBuffer, deviceBuffer, bufferSize, commandPool).block().close() + stagingBuffer.destroy() + + new Vec4FloatMem(size, deviceBuffer) } - def apply(size: Int): Vec4FloatMem = - val data = ByteBuffer.allocateDirect(size * Vec4FloatSize) - new Vec4FloatMem(size, data) + def apply(size: Int)(using context: GContext): Vec4FloatMem = + val bufferSize = size.toLong * Vec4FloatSize + val allocator = context.vkContext.allocator + val deviceBuffer = new Buffer( + bufferSize.toInt, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + 0, + VMA_MEMORY_USAGE_GPU_ONLY, + allocator + ) + new Vec4FloatMem(size, deviceBuffer) diff --git a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/AbstractExecutor.scala b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/AbstractExecutor.scala index f5e8a368..37e1dacc 100644 --- a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/AbstractExecutor.scala +++ b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/AbstractExecutor.scala @@ -7,11 +7,10 @@ import io.computenode.cyfra.vulkan.core.Device import io.computenode.cyfra.vulkan.memory.{Allocator, Buffer, DescriptorPool, DescriptorSet} import org.lwjgl.BufferUtils import org.lwjgl.util.vma.Vma.VMA_MEMORY_USAGE_UNKNOWN +import org.lwjgl.util.vma.Vma.VMA_MEMORY_USAGE_GPU_TO_CPU import org.lwjgl.vulkan.* import org.lwjgl.vulkan.VK10.* -import java.nio.ByteBuffer - private[cyfra] abstract class AbstractExecutor(dataLength: Int, val bufferActions: Seq[BufferAction], context: VulkanContext) { protected val device: Device = context.device protected val queue: Queue = context.computeQueue @@ -37,18 +36,11 @@ private[cyfra] abstract class AbstractExecutor(dataLength: Int, val bufferAction commandBuffer } - def execute(input: Seq[ByteBuffer]): Seq[ByteBuffer] = { - val stagingBuffer = new Buffer( - getBiggestTransportData * dataLength, - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, - VMA_MEMORY_USAGE_UNKNOWN, - allocator - ) + def execute(input: Seq[Buffer]): Seq[Buffer] = { for (i <- bufferActions.indices if bufferActions(i) == BufferAction.LoadTo) do { - val buffer = input(i) - Buffer.copyBuffer(buffer, stagingBuffer, buffer.remaining()) - Buffer.copyBuffer(stagingBuffer, buffers(i), buffer.remaining(), commandPool).block().destroy() + val inputHostBuffer = input(i) + val gpuDeviceBuffer = buffers(i) + Buffer.copyBuffer(inputHostBuffer, gpuDeviceBuffer, inputHostBuffer.size, commandPool).block().destroy() } pushStack { stack => @@ -64,14 +56,17 @@ private[cyfra] abstract class AbstractExecutor(dataLength: Int, val bufferAction } val output = for (i <- bufferActions.indices if bufferActions(i) == BufferAction.LoadFrom) yield { - val fence = Buffer.copyBuffer(buffers(i), stagingBuffer, buffers(i).size, commandPool) - val outBuffer = BufferUtils.createByteBuffer(buffers(i).size) - fence.block().destroy() - Buffer.copyBuffer(stagingBuffer, outBuffer, outBuffer.remaining()) - outBuffer - + val gpuDeviceBuffer = buffers(i) + val outputHostBuffer = new Buffer( + gpuDeviceBuffer.size, + VK_BUFFER_USAGE_TRANSFER_DST_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + VMA_MEMORY_USAGE_GPU_TO_CPU, + allocator + ) + Buffer.copyBuffer(gpuDeviceBuffer, outputHostBuffer, gpuDeviceBuffer.size, commandPool).block().destroy() + outputHostBuffer } - stagingBuffer.destroy() output } diff --git a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/MapExecutor.scala b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/MapExecutor.scala deleted file mode 100644 index aedc82a4..00000000 --- a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/MapExecutor.scala +++ /dev/null @@ -1,64 +0,0 @@ -package io.computenode.cyfra.vulkan.executor - -import io.computenode.cyfra.vulkan.compute.* -import io.computenode.cyfra.vulkan.VulkanContext -import io.computenode.cyfra.vulkan.compute.{Binding, ComputePipeline, InputBufferSize, Shader, UniformSize} -import io.computenode.cyfra.vulkan.memory.{Buffer, DescriptorSet} -import io.computenode.cyfra.vulkan.util.Util.{check, pushStack} -import org.lwjgl.system.MemoryStack -import org.lwjgl.system.MemoryStack.stackPush -import org.lwjgl.util.vma.Vma.* -import org.lwjgl.vulkan.* -import org.lwjgl.vulkan.VK10.* - -import scala.collection.mutable -import scala.util.Using - -/** @author - * MarconZet Created 15.04.2020 - */ -private[cyfra] class MapExecutor(dataLength: Int, bufferActions: Seq[BufferAction], computePipeline: ComputePipeline, context: VulkanContext) - extends AbstractExecutor(dataLength, bufferActions, context) { - private lazy val shader: Shader = computePipeline.computeShader - - protected def getBiggestTransportData: Int = shader.layoutInfo.sets - .flatMap(_.bindings) - .collect { case Binding(_, InputBufferSize(n)) => - n - } - .max - - protected def setupBuffers(): (Seq[DescriptorSet], Seq[Buffer]) = pushStack { stack => - val bindings = shader.layoutInfo.sets.flatMap(_.bindings) - val buffers = bindings.zipWithIndex.map { case (binding, i) => - val bufferSize = binding.size match { - case InputBufferSize(n) => n * dataLength - case UniformSize(n) => n - } - new Buffer(bufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | bufferActions(i).action, 0, VMA_MEMORY_USAGE_GPU_ONLY, allocator) - } - - val bufferDeque = mutable.ArrayDeque.from(buffers) - val descriptorSetLayouts = computePipeline.descriptorSetLayouts - val descriptorSets = for (i <- descriptorSetLayouts.indices) yield { - val descriptorSet = new DescriptorSet(device, descriptorSetLayouts(i)._1, descriptorSetLayouts(i)._2.bindings, descriptorPool) - val size = descriptorSetLayouts(i)._2.bindings.size - descriptorSet.update(bufferDeque.take(size).toSeq) - bufferDeque.drop(size) - descriptorSet - } - (descriptorSets, buffers) - } - - protected def recordCommandBuffer(commandBuffer: VkCommandBuffer): Unit = - pushStack { stack => - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipeline.get) - - val pDescriptorSets = stack.longs(descriptorSets.map(_.get): _*) - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipeline.pipelineLayout, 0, pDescriptorSets, null) - - val workgroup = shader.workgroupDimensions - vkCmdDispatch(commandBuffer, dataLength / workgroup.x(), 1 / workgroup.y(), 1 / workgroup.z()) - } - -} diff --git a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/SequenceExecutor.scala b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/SequenceExecutor.scala index 8945893b..7c85ac52 100644 --- a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/SequenceExecutor.scala +++ b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/SequenceExecutor.scala @@ -149,7 +149,7 @@ private[cyfra] class SequenceExecutor(computeSequence: ComputationSequence, cont setToBuffers } - def execute(inputs: Seq[ByteBuffer], dataLength: Int): Seq[ByteBuffer] = pushStack { stack => + def execute(inputs: Seq[Buffer], dataLength: Int): Seq[Buffer] = pushStack { stack => timed("Vulkan full execute"): val setToBuffers = createBuffers(dataLength) @@ -160,17 +160,9 @@ private[cyfra] class SequenceExecutor(computeSequence: ComputationSequence, cont } }.flatten - val stagingBuffer = new Buffer( - inputs.map(_.remaining()).max, - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, - VMA_MEMORY_USAGE_UNKNOWN, - allocator - ) - - buffersWithAction(BufferAction.LoadTo).zipWithIndex.foreach { case (buffer, i) => - Buffer.copyBuffer(inputs(i), stagingBuffer, buffer.size) - Buffer.copyBuffer(stagingBuffer, buffer, buffer.size, commandPool).block().destroy() + buffersWithAction(BufferAction.LoadTo).zipWithIndex.foreach { case (gpuDeviceBuffer, i) => + val inputHostBuffer = inputs(i) + Buffer.copyBuffer(inputHostBuffer, gpuDeviceBuffer, inputHostBuffer.size, commandPool).block().destroy() } val fence = new Fence(device) @@ -185,14 +177,18 @@ private[cyfra] class SequenceExecutor(computeSequence: ComputationSequence, cont check(vkQueueSubmit(queue.get, submitInfo, fence.get), "Failed to submit command buffer to queue") fence.block().destroy() - val output = buffersWithAction(BufferAction.LoadFrom).map { buffer => - Buffer.copyBuffer(buffer, stagingBuffer, buffer.size, commandPool).block().destroy() - val out = BufferUtils.createByteBuffer(buffer.size) - Buffer.copyBuffer(stagingBuffer, out, buffer.size) - out + val output = buffersWithAction(BufferAction.LoadFrom).map { gpuDeviceBuffer => + val outputHostBuffer = new Buffer( + gpuDeviceBuffer.size, + VK_BUFFER_USAGE_TRANSFER_DST_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + VMA_MEMORY_USAGE_GPU_TO_CPU, + allocator + ) + Buffer.copyBuffer(gpuDeviceBuffer, outputHostBuffer, gpuDeviceBuffer.size, commandPool).block().destroy() + outputHostBuffer } - stagingBuffer.destroy() commandPool.freeCommandBuffer(commandBuffer) setToBuffers.keys.foreach(_.update(Seq.empty)) setToBuffers.flatMap(_._2).foreach(_.destroy()) diff --git a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala index 91c27ec1..d7f7ea41 100644 --- a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala +++ b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala @@ -41,12 +41,34 @@ private[cyfra] class Buffer(val size: Int, val usage: Int, flags: Int, memUsage: (pBuffer.get(), pAllocation.get()) } + def map[R](f: ByteBuffer => R): R = { + var dataPtr: Long = NULL + try { + dataPtr = pushStack { stack => + val pData = stack.callocPointer(1) + check(vmaMapMemory(allocator.get, allocation, pData), s"Failed to map buffer memory for buffer handle $handle allocation $allocation") + val ptr = pData.get(0) + if (ptr == NULL) { + throw new VulkanAssertionError(s"vmaMapMemory returned NULL for buffer handle $handle, allocation $allocation", -1) + } + ptr + } + val byteBuffer = memByteBuffer(dataPtr, this.size) + f(byteBuffer) + } finally { + if (dataPtr != NULL) { + vmaUnmapMemory(allocator.get, allocation) + } + } + } + def get(dst: Array[Byte]): Unit = { val len = Math.min(dst.length, size) - val byteBuffer = memCalloc(len) - Buffer.copyBuffer(this, byteBuffer, len) - byteBuffer.get(dst) - memFree(byteBuffer) + this.map { mappedBuffer => + val bufferSlice = mappedBuffer.slice() + bufferSlice.limit(len) + bufferSlice.get(dst, 0, len) + } } protected def close(): Unit = @@ -54,23 +76,20 @@ private[cyfra] class Buffer(val size: Int, val usage: Int, flags: Int, memUsage: } object Buffer { - def copyBuffer(src: ByteBuffer, dst: Buffer, bytes: Long): Unit = - pushStack { stack => - val pData = stack.callocPointer(1) - check(vmaMapMemory(dst.allocator.get, dst.allocation, pData), "Failed to map destination buffer memory") - val data = pData.get() - memCopy(memAddress(src), data, bytes) + def copyBuffer(src: ByteBuffer, dst: Buffer, bytes: Long): Unit = { + dst.map { dstMappedBuffer => + val srcSlice = src.slice() + srcSlice.limit(bytes.toInt) + dstMappedBuffer.put(srcSlice) vmaFlushAllocation(dst.allocator.get, dst.allocation, 0, bytes) - vmaUnmapMemory(dst.allocator.get, dst.allocation) } + } def copyBuffer(src: Buffer, dst: ByteBuffer, bytes: Long): Unit = - pushStack { stack => - val pData = stack.callocPointer(1) - check(vmaMapMemory(src.allocator.get, src.allocation, pData), "Failed to map destination buffer memory") - val data = pData.get() - memCopy(data, memAddress(dst), bytes) - vmaUnmapMemory(src.allocator.get, src.allocation) + src.map { srcMappedBuffer => + val srcSlice = srcMappedBuffer.slice() + srcSlice.limit(bytes.toInt) + dst.put(srcSlice) } def copyBuffer(src: Buffer, dst: Buffer, bytes: Long, commandPool: CommandPool): Fence =