diff --git a/drivers/d3d12/rendering_device_driver_d3d12.cpp b/drivers/d3d12/rendering_device_driver_d3d12.cpp index 0ef88e7d528..479afbba93f 100644 --- a/drivers/d3d12/rendering_device_driver_d3d12.cpp +++ b/drivers/d3d12/rendering_device_driver_d3d12.cpp @@ -2003,6 +2003,8 @@ static D3D12_BARRIER_LAYOUT _rd_texture_layout_to_d3d12_barrier_layout(RDD::Text switch (p_texture_layout) { case RDD::TEXTURE_LAYOUT_UNDEFINED: return D3D12_BARRIER_LAYOUT_UNDEFINED; + case RDD::TEXTURE_LAYOUT_GENERAL: + return D3D12_BARRIER_LAYOUT_COMMON; case RDD::TEXTURE_LAYOUT_STORAGE_OPTIMAL: return D3D12_BARRIER_LAYOUT_UNORDERED_ACCESS; case RDD::TEXTURE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: @@ -6175,6 +6177,8 @@ uint64_t RenderingDeviceDriverD3D12::api_trait_get(ApiTrait p_trait) { return false; case API_TRAIT_CLEARS_WITH_COPY_ENGINE: return false; + case API_TRAIT_USE_GENERAL_IN_COPY_QUEUES: + return true; default: return RenderingDeviceDriver::api_trait_get(p_trait); } diff --git a/drivers/vulkan/rendering_device_driver_vulkan.cpp b/drivers/vulkan/rendering_device_driver_vulkan.cpp index d20f396281e..154095552bf 100644 --- a/drivers/vulkan/rendering_device_driver_vulkan.cpp +++ b/drivers/vulkan/rendering_device_driver_vulkan.cpp @@ -266,6 +266,7 @@ static const VkFormat RD_TO_VK_FORMAT[RDD::DATA_FORMAT_MAX] = { static VkImageLayout RD_TO_VK_LAYOUT[RDD::TEXTURE_LAYOUT_MAX] = { VK_IMAGE_LAYOUT_UNDEFINED, // TEXTURE_LAYOUT_UNDEFINED + VK_IMAGE_LAYOUT_GENERAL, // TEXTURE_LAYOUT_GENERAL VK_IMAGE_LAYOUT_GENERAL, // TEXTURE_LAYOUT_STORAGE_OPTIMAL VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, // TEXTURE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, // TEXTURE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL diff --git a/servers/rendering/rendering_device.cpp b/servers/rendering/rendering_device.cpp index aee23ab476e..942d5631e4d 100644 --- a/servers/rendering/rendering_device.cpp +++ b/servers/rendering/rendering_device.cpp @@ -1243,6 +1243,7 @@ Error RenderingDevice::_texture_initialize(RID p_texture, uint32_t p_layer, cons TransferWorker *transfer_worker = nullptr; const uint8_t *read_ptr = p_data.ptr(); uint8_t *write_ptr = nullptr; + const RDD::TextureLayout copy_dst_layout = driver->api_trait_get(RDD::API_TRAIT_USE_GENERAL_IN_COPY_QUEUES) ? RDD::TEXTURE_LAYOUT_GENERAL : RDD::TEXTURE_LAYOUT_COPY_DST_OPTIMAL; for (uint32_t pass = 0; pass < 2; pass++) { const bool copy_pass = (pass == 1); if (copy_pass) { @@ -1267,7 +1268,7 @@ Error RenderingDevice::_texture_initialize(RID p_texture, uint32_t p_layer, cons tb.texture = texture->driver_id; tb.dst_access = RDD::BARRIER_ACCESS_COPY_WRITE_BIT; tb.prev_layout = RDD::TEXTURE_LAYOUT_UNDEFINED; - tb.next_layout = RDD::TEXTURE_LAYOUT_COPY_DST_OPTIMAL; + tb.next_layout = copy_dst_layout; tb.subresources.aspect = texture->barrier_aspect_flags; tb.subresources.mipmap_count = texture->mipmaps; tb.subresources.base_layer = p_layer; @@ -1313,7 +1314,7 @@ Error RenderingDevice::_texture_initialize(RID p_texture, uint32_t p_layer, cons copy_region.texture_subresources.layer_count = 1; copy_region.texture_offset = Vector3i(0, 0, z); copy_region.texture_region_size = Vector3i(logic_width, logic_height, 1); - driver->command_copy_buffer_to_texture(transfer_worker->command_buffer, transfer_worker->staging_buffer, texture->driver_id, RDD::TEXTURE_LAYOUT_COPY_DST_OPTIMAL, copy_region); + driver->command_copy_buffer_to_texture(transfer_worker->command_buffer, transfer_worker->staging_buffer, texture->driver_id, copy_dst_layout, copy_region); } staging_local_offset += to_allocate; @@ -1332,14 +1333,13 @@ Error RenderingDevice::_texture_initialize(RID p_texture, uint32_t p_layer, cons RDD::TextureBarrier tb; tb.texture = texture->driver_id; tb.src_access = RDD::BARRIER_ACCESS_COPY_WRITE_BIT; - tb.prev_layout = RDD::TEXTURE_LAYOUT_COPY_DST_OPTIMAL; + tb.prev_layout = copy_dst_layout; tb.next_layout = RDD::TEXTURE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; tb.subresources.aspect = texture->barrier_aspect_flags; tb.subresources.mipmap_count = texture->mipmaps; tb.subresources.base_layer = p_layer; tb.subresources.layer_count = 1; - - driver->command_pipeline_barrier(transfer_worker->command_buffer, RDD::PIPELINE_STAGE_COPY_BIT, RDD::PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, {}, {}, tb); + transfer_worker->texture_barriers.push_back(tb); } _release_transfer_worker(transfer_worker); @@ -5152,6 +5152,21 @@ void RenderingDevice::_wait_for_transfer_worker(TransferWorker *p_transfer_worke MutexLock lock(p_transfer_worker->operations_mutex); p_transfer_worker->operations_processed = p_transfer_worker->operations_submitted; } + + if (!p_transfer_worker->texture_barriers.is_empty()) { + MutexLock transfer_worker_lock(transfer_worker_pool_mutex); + _flush_barriers_for_transfer_worker(p_transfer_worker); + } +} + +void RenderingDevice::_flush_barriers_for_transfer_worker(TransferWorker *p_transfer_worker) { + if (!p_transfer_worker->texture_barriers.is_empty()) { + for (uint32_t i = 0; i < p_transfer_worker->texture_barriers.size(); i++) { + transfer_worker_pool_texture_barriers.push_back(p_transfer_worker->texture_barriers[i]); + } + + p_transfer_worker->texture_barriers.clear(); + } } void RenderingDevice::_check_transfer_worker_operation(uint32_t p_transfer_worker_index, uint64_t p_transfer_worker_operation) { @@ -5193,11 +5208,11 @@ void RenderingDevice::_check_transfer_worker_index_array(IndexArray *p_index_arr } } -void RenderingDevice::_submit_transfer_workers(bool p_operations_used_by_draw) { +void RenderingDevice::_submit_transfer_workers(RDD::CommandBufferID p_draw_command_buffer) { MutexLock transfer_worker_lock(transfer_worker_pool_mutex); for (uint32_t i = 0; i < transfer_worker_pool.size(); i++) { TransferWorker *worker = transfer_worker_pool[i]; - if (p_operations_used_by_draw) { + if (p_draw_command_buffer) { MutexLock lock(worker->operations_mutex); if (worker->operations_processed >= transfer_worker_operation_used_by_draw[worker->index]) { // The operation used by the draw has already been processed, we don't need to wait on the worker. @@ -5208,12 +5223,21 @@ void RenderingDevice::_submit_transfer_workers(bool p_operations_used_by_draw) { { MutexLock lock(worker->thread_mutex); if (worker->recording) { - VectorView semaphores = p_operations_used_by_draw ? frames[frame].transfer_worker_semaphores[i] : VectorView(); + VectorView semaphores = p_draw_command_buffer ? frames[frame].transfer_worker_semaphores[i] : VectorView(); _end_transfer_worker(worker); _submit_transfer_worker(worker, semaphores); } + + if (p_draw_command_buffer) { + _flush_barriers_for_transfer_worker(worker); + } } } + + if (p_draw_command_buffer && !transfer_worker_pool_texture_barriers.is_empty()) { + driver->command_pipeline_barrier(p_draw_command_buffer, RDD::PIPELINE_STAGE_COPY_BIT, RDD::PIPELINE_STAGE_ALL_COMMANDS_BIT, {}, {}, transfer_worker_pool_texture_barriers); + transfer_worker_pool_texture_barriers.clear(); + } } void RenderingDevice::_wait_for_transfer_workers() { @@ -5807,10 +5831,10 @@ void RenderingDevice::_end_frame() { ERR_PRINT("Found open compute list at the end of the frame, this should never happen (further compute will likely not work)."); } - _submit_transfer_workers(true); - // The command buffer must be copied into a stack variable as the driver workarounds can change the command buffer in use. RDD::CommandBufferID command_buffer = frames[frame].command_buffer; + _submit_transfer_workers(command_buffer); + draw_graph.end(RENDER_GRAPH_REORDER, RENDER_GRAPH_FULL_BARRIERS, command_buffer, frames[frame].command_buffer_pool); driver->command_buffer_end(command_buffer); driver->end_segment(); @@ -6387,7 +6411,7 @@ void RenderingDevice::finalize() { } // Wait for transfer workers to finish. - _submit_transfer_workers(false); + _submit_transfer_workers(); _wait_for_transfer_workers(); // Delete everything the graph has created. diff --git a/servers/rendering/rendering_device.h b/servers/rendering/rendering_device.h index b61418b1fc4..71ffbfbd885 100644 --- a/servers/rendering/rendering_device.h +++ b/servers/rendering/rendering_device.h @@ -1267,6 +1267,7 @@ private: RDD::CommandBufferID command_buffer; RDD::CommandPoolID command_pool; RDD::FenceID command_fence; + LocalVector texture_barriers; bool recording = false; bool submitted = false; BinaryMutex thread_mutex; @@ -1280,6 +1281,7 @@ private: uint32_t transfer_worker_pool_max_size = 1; LocalVector transfer_worker_operation_used_by_draw; LocalVector transfer_worker_pool_available_list; + LocalVector transfer_worker_pool_texture_barriers; BinaryMutex transfer_worker_pool_mutex; ConditionVariable transfer_worker_pool_condition; @@ -1288,12 +1290,13 @@ private: void _end_transfer_worker(TransferWorker *p_transfer_worker); void _submit_transfer_worker(TransferWorker *p_transfer_worker, VectorView p_signal_semaphores = VectorView()); void _wait_for_transfer_worker(TransferWorker *p_transfer_worker); + void _flush_barriers_for_transfer_worker(TransferWorker *p_transfer_worker); void _check_transfer_worker_operation(uint32_t p_transfer_worker_index, uint64_t p_transfer_worker_operation); void _check_transfer_worker_buffer(Buffer *p_buffer); void _check_transfer_worker_texture(Texture *p_texture); void _check_transfer_worker_vertex_array(VertexArray *p_vertex_array); void _check_transfer_worker_index_array(IndexArray *p_index_array); - void _submit_transfer_workers(bool p_operations_used_by_draw); + void _submit_transfer_workers(RDD::CommandBufferID p_draw_command_buffer = RDD::CommandBufferID()); void _wait_for_transfer_workers(); void _free_transfer_workers(); diff --git a/servers/rendering/rendering_device_driver.cpp b/servers/rendering/rendering_device_driver.cpp index 3b8e3efeb8e..9ff7b83215e 100644 --- a/servers/rendering/rendering_device_driver.cpp +++ b/servers/rendering/rendering_device_driver.cpp @@ -374,6 +374,8 @@ uint64_t RenderingDeviceDriver::api_trait_get(ApiTrait p_trait) { return 1; case API_TRAIT_CLEARS_WITH_COPY_ENGINE: return true; + case API_TRAIT_USE_GENERAL_IN_COPY_QUEUES: + return false; default: ERR_FAIL_V(0); } diff --git a/servers/rendering/rendering_device_driver.h b/servers/rendering/rendering_device_driver.h index 91da67c8d72..637d52c0605 100644 --- a/servers/rendering/rendering_device_driver.h +++ b/servers/rendering/rendering_device_driver.h @@ -220,6 +220,7 @@ public: enum TextureLayout { TEXTURE_LAYOUT_UNDEFINED, + TEXTURE_LAYOUT_GENERAL, TEXTURE_LAYOUT_STORAGE_OPTIMAL, TEXTURE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, TEXTURE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, @@ -750,6 +751,7 @@ public: API_TRAIT_TEXTURE_DATA_ROW_PITCH_STEP, API_TRAIT_SECONDARY_VIEWPORT_SCISSOR, API_TRAIT_CLEARS_WITH_COPY_ENGINE, + API_TRAIT_USE_GENERAL_IN_COPY_QUEUES, }; enum ShaderChangeInvalidation {