Use concurrent sharing mode for image if compute and graphics queue family indices differ

Use semaphores to sync compute and graphics work
Fixes #667
This commit is contained in:
Sascha Willems 2020-08-29 12:19:05 +02:00
parent 5d594acb02
commit b9f4b1adf1

View file

@ -36,6 +36,7 @@ public:
VkDescriptorSet descriptorSetPostCompute; // Image display shader bindings after compute shader image manipulation VkDescriptorSet descriptorSetPostCompute; // Image display shader bindings after compute shader image manipulation
VkPipeline pipeline; // Image display pipeline VkPipeline pipeline; // Image display pipeline
VkPipelineLayout pipelineLayout; // Layout of the graphics pipeline VkPipelineLayout pipelineLayout; // Layout of the graphics pipeline
VkSemaphore semaphore; // Execution dependency between compute & graphic submission
} graphics; } graphics;
// Resources for the compute part of the example // Resources for the compute part of the example
@ -43,13 +44,12 @@ public:
VkQueue queue; // Separate queue for compute commands (queue family may differ from the one used for graphics) VkQueue queue; // Separate queue for compute commands (queue family may differ from the one used for graphics)
VkCommandPool commandPool; // Use a separate command pool (queue family may differ from the one used for graphics) VkCommandPool commandPool; // Use a separate command pool (queue family may differ from the one used for graphics)
VkCommandBuffer commandBuffer; // Command buffer storing the dispatch commands and barriers VkCommandBuffer commandBuffer; // Command buffer storing the dispatch commands and barriers
VkFence fence; // Synchronization fence to avoid rewriting compute CB if still in use VkSemaphore semaphore; // Execution dependency between compute & graphic submission
VkDescriptorSetLayout descriptorSetLayout; // Compute shader binding layout VkDescriptorSetLayout descriptorSetLayout; // Compute shader binding layout
VkDescriptorSet descriptorSet; // Compute shader bindings VkDescriptorSet descriptorSet; // Compute shader bindings
VkPipelineLayout pipelineLayout; // Layout of the compute pipeline VkPipelineLayout pipelineLayout; // Layout of the compute pipeline
std::vector<VkPipeline> pipelines; // Compute pipelines for image filters std::vector<VkPipeline> pipelines; // Compute pipelines for image filters
int32_t pipelineIndex = 0; // Current image filtering compute pipeline index int32_t pipelineIndex = 0; // Current image filtering compute pipeline index
uint32_t queueFamilyIndex; // Family index of the graphics queue, used for barriers
} compute; } compute;
vks::Buffer vertexBuffer; vks::Buffer vertexBuffer;
@ -83,6 +83,7 @@ public:
vkDestroyPipeline(device, graphics.pipeline, nullptr); vkDestroyPipeline(device, graphics.pipeline, nullptr);
vkDestroyPipelineLayout(device, graphics.pipelineLayout, nullptr); vkDestroyPipelineLayout(device, graphics.pipelineLayout, nullptr);
vkDestroyDescriptorSetLayout(device, graphics.descriptorSetLayout, nullptr); vkDestroyDescriptorSetLayout(device, graphics.descriptorSetLayout, nullptr);
vkDestroySemaphore(device, graphics.semaphore, nullptr);
// Compute // Compute
for (auto& pipeline : compute.pipelines) for (auto& pipeline : compute.pipelines)
@ -91,7 +92,7 @@ public:
} }
vkDestroyPipelineLayout(device, compute.pipelineLayout, nullptr); vkDestroyPipelineLayout(device, compute.pipelineLayout, nullptr);
vkDestroyDescriptorSetLayout(device, compute.descriptorSetLayout, nullptr); vkDestroyDescriptorSetLayout(device, compute.descriptorSetLayout, nullptr);
vkDestroyFence(device, compute.fence, nullptr); vkDestroySemaphore(device, compute.semaphore, nullptr);
vkDestroyCommandPool(device, compute.commandPool, nullptr); vkDestroyCommandPool(device, compute.commandPool, nullptr);
vertexBuffer.destroy(); vertexBuffer.destroy();
@ -127,8 +128,18 @@ public:
// Image will be sampled in the fragment shader and used as storage target in the compute shader // Image will be sampled in the fragment shader and used as storage target in the compute shader
imageCreateInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT; imageCreateInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT;
imageCreateInfo.flags = 0; imageCreateInfo.flags = 0;
// Sharing mode exclusive means that ownership of the image does not need to be explicitly transferred between the compute and graphics queue // If compute and graphics queue family indices differ, we create an image that can be shared between them
imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; // This can result in worse performance than exclusive sharing mode, but save some synchronization to keep the sample simple
std::vector<uint32_t> queueFamilyIndices;
if (vulkanDevice->queueFamilyIndices.graphics != vulkanDevice->queueFamilyIndices.compute) {
queueFamilyIndices = {
vulkanDevice->queueFamilyIndices.graphics,
vulkanDevice->queueFamilyIndices.compute
};
imageCreateInfo.sharingMode = VK_SHARING_MODE_CONCURRENT;
imageCreateInfo.queueFamilyIndexCount = 2;
imageCreateInfo.pQueueFamilyIndices = queueFamilyIndices.data();
}
VkMemoryAllocateInfo memAllocInfo = vks::initializers::memoryAllocateInfo(); VkMemoryAllocateInfo memAllocInfo = vks::initializers::memoryAllocateInfo();
VkMemoryRequirements memReqs; VkMemoryRequirements memReqs;
@ -224,6 +235,8 @@ public:
imageMemoryBarrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }; imageMemoryBarrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 };
imageMemoryBarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; imageMemoryBarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
imageMemoryBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; imageMemoryBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
imageMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
imageMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
vkCmdPipelineBarrier( vkCmdPipelineBarrier(
drawCmdBuffers[i], drawCmdBuffers[i],
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
@ -472,51 +485,17 @@ public:
VK_CHECK_RESULT(vkCreateGraphicsPipelines(device, pipelineCache, 1, &pipelineCreateInfo, nullptr, &graphics.pipeline)); VK_CHECK_RESULT(vkCreateGraphicsPipelines(device, pipelineCache, 1, &pipelineCreateInfo, nullptr, &graphics.pipeline));
} }
// Find and create a compute capable device queue void prepareGraphics()
void getComputeQueue()
{ {
uint32_t queueFamilyCount; // Semaphore for compute & graphics sync
vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyCount, NULL); VkSemaphoreCreateInfo semaphoreCreateInfo = vks::initializers::semaphoreCreateInfo();
assert(queueFamilyCount >= 1); VK_CHECK_RESULT(vkCreateSemaphore(device, &semaphoreCreateInfo, nullptr, &graphics.semaphore));
std::vector<VkQueueFamilyProperties> queueFamilyProperties;
queueFamilyProperties.resize(queueFamilyCount);
vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyCount, queueFamilyProperties.data());
// Some devices have dedicated compute queues, so we first try to find a queue that supports compute and not graphics
bool computeQueueFound = false;
for (uint32_t i = 0; i < static_cast<uint32_t>(queueFamilyProperties.size()); i++)
{
if ((queueFamilyProperties[i].queueFlags & VK_QUEUE_COMPUTE_BIT) && ((queueFamilyProperties[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) == 0))
{
compute.queueFamilyIndex = i;
computeQueueFound = true;
break;
}
}
// If there is no dedicated compute queue, just find the first queue family that supports compute
if (!computeQueueFound)
{
for (uint32_t i = 0; i < static_cast<uint32_t>(queueFamilyProperties.size()); i++)
{
if (queueFamilyProperties[i].queueFlags & VK_QUEUE_COMPUTE_BIT)
{
compute.queueFamilyIndex = i;
computeQueueFound = true;
break;
}
}
}
// Compute is mandatory in Vulkan, so there must be at least one queue family that supports compute
assert(computeQueueFound);
// Get a compute queue from the device
vkGetDeviceQueue(device, compute.queueFamilyIndex, 0, &compute.queue);
} }
void prepareCompute() void prepareCompute()
{ {
getComputeQueue(); // Get a compute queue from the device
vkGetDeviceQueue(device, vulkanDevice->queueFamilyIndices.compute, 0, &compute.queue);
// Create compute pipeline // Create compute pipeline
// Compute pipelines are created separate from graphics pipelines even if they use the same queue // Compute pipelines are created separate from graphics pipelines even if they use the same queue
@ -563,7 +542,7 @@ public:
// Separate command pool as queue family for compute may be different than graphics // Separate command pool as queue family for compute may be different than graphics
VkCommandPoolCreateInfo cmdPoolInfo = {}; VkCommandPoolCreateInfo cmdPoolInfo = {};
cmdPoolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; cmdPoolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
cmdPoolInfo.queueFamilyIndex = compute.queueFamilyIndex; cmdPoolInfo.queueFamilyIndex = vulkanDevice->queueFamilyIndices.compute;
cmdPoolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; cmdPoolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
VK_CHECK_RESULT(vkCreateCommandPool(device, &cmdPoolInfo, nullptr, &compute.commandPool)); VK_CHECK_RESULT(vkCreateCommandPool(device, &cmdPoolInfo, nullptr, &compute.commandPool));
@ -576,9 +555,16 @@ public:
VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &cmdBufAllocateInfo, &compute.commandBuffer)); VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &cmdBufAllocateInfo, &compute.commandBuffer));
// Fence for compute CB sync // Semaphore for compute & graphics sync
VkFenceCreateInfo fenceCreateInfo = vks::initializers::fenceCreateInfo(VK_FENCE_CREATE_SIGNALED_BIT); VkSemaphoreCreateInfo semaphoreCreateInfo = vks::initializers::semaphoreCreateInfo();
VK_CHECK_RESULT(vkCreateFence(device, &fenceCreateInfo, nullptr, &compute.fence)); VK_CHECK_RESULT(vkCreateSemaphore(device, &semaphoreCreateInfo, nullptr, &compute.semaphore));
// Signal the semaphore
VkSubmitInfo submitInfo = vks::initializers::submitInfo();
submitInfo.signalSemaphoreCount = 1;
submitInfo.pSignalSemaphores = &compute.semaphore;
VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE));
VK_CHECK_RESULT(vkQueueWaitIdle(queue));
// Build a single command buffer containing the compute dispatch commands // Build a single command buffer containing the compute dispatch commands
buildComputeCommandBuffer(); buildComputeCommandBuffer();
@ -611,22 +597,35 @@ public:
{ {
VulkanExampleBase::prepareFrame(); VulkanExampleBase::prepareFrame();
VkPipelineStageFlags graphicsWaitStageMasks[] = { VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT };
VkSemaphore graphicsWaitSemaphores[] = { compute.semaphore, semaphores.presentComplete };
VkSemaphore graphicsSignalSemaphores[] = { graphics.semaphore, semaphores.renderComplete };
// Submit graphics commands
submitInfo.commandBufferCount = 1; submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &drawCmdBuffers[currentBuffer]; submitInfo.pCommandBuffers = &drawCmdBuffers[currentBuffer];
submitInfo.waitSemaphoreCount = 2;
submitInfo.pWaitSemaphores = graphicsWaitSemaphores;
submitInfo.pWaitDstStageMask = graphicsWaitStageMasks;
submitInfo.signalSemaphoreCount = 2;
submitInfo.pSignalSemaphores = graphicsSignalSemaphores;
VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE)); VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE));
VulkanExampleBase::submitFrame(); VulkanExampleBase::submitFrame();
// Submit compute commands // Wait for rendering finished
// Use a fence to ensure that the compute command buffer has finished executing before using it again VkPipelineStageFlags waitStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
vkWaitForFences(device, 1, &compute.fence, VK_TRUE, UINT64_MAX);
vkResetFences(device, 1, &compute.fence);
// Submit compute commands
VkSubmitInfo computeSubmitInfo = vks::initializers::submitInfo(); VkSubmitInfo computeSubmitInfo = vks::initializers::submitInfo();
computeSubmitInfo.commandBufferCount = 1; computeSubmitInfo.commandBufferCount = 1;
computeSubmitInfo.pCommandBuffers = &compute.commandBuffer; computeSubmitInfo.pCommandBuffers = &compute.commandBuffer;
computeSubmitInfo.waitSemaphoreCount = 1;
VK_CHECK_RESULT(vkQueueSubmit(compute.queue, 1, &computeSubmitInfo, compute.fence)); computeSubmitInfo.pWaitSemaphores = &graphics.semaphore;
computeSubmitInfo.pWaitDstStageMask = &waitStageMask;
computeSubmitInfo.signalSemaphoreCount = 1;
computeSubmitInfo.pSignalSemaphores = &compute.semaphore;
VK_CHECK_RESULT(vkQueueSubmit(compute.queue, 1, &computeSubmitInfo, VK_NULL_HANDLE));
} }
void prepare() void prepare()
@ -641,6 +640,7 @@ public:
preparePipelines(); preparePipelines();
setupDescriptorPool(); setupDescriptorPool();
setupDescriptorSet(); setupDescriptorSet();
prepareGraphics();
prepareCompute(); prepareCompute();
buildCommandBuffers(); buildCommandBuffers();
prepared = true; prepared = true;
@ -651,11 +651,9 @@ public:
if (!prepared) if (!prepared)
return; return;
draw(); draw();
} if (camera.updated) {
updateUniformBuffers();
virtual void viewChanged() }
{
updateUniformBuffers();
} }
virtual void OnUpdateUIOverlay(vks::UIOverlay *overlay) virtual void OnUpdateUIOverlay(vks::UIOverlay *overlay)