Merge pull request #98425 from darksylinc/matias-breadcrumbs-race-fix

Fix race conditions in breadcrumbs
This commit is contained in:
Thaddeus Crews 2024-10-25 13:04:12 -05:00
commit edad871a2d
No known key found for this signature in database
GPG Key ID: 62181B86FE9E5D84
9 changed files with 235 additions and 58 deletions

View File

@ -267,6 +267,12 @@ bool Engine::is_extra_gpu_memory_tracking_enabled() const {
return extra_gpu_memory_tracking;
}
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
bool Engine::is_accurate_breadcrumbs_enabled() const {
return accurate_breadcrumbs;
}
#endif
void Engine::set_print_to_stdout(bool p_enabled) {
CoreGlobals::print_line_enabled = p_enabled;
}

View File

@ -73,6 +73,9 @@ private:
bool use_validation_layers = false;
bool generate_spirv_debug_info = false;
bool extra_gpu_memory_tracking = false;
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
bool accurate_breadcrumbs = false;
#endif
int32_t gpu_idx = -1;
uint64_t _process_frames = 0;
@ -186,6 +189,9 @@ public:
bool is_validation_layers_enabled() const;
bool is_generate_spirv_debug_info_enabled() const;
bool is_extra_gpu_memory_tracking_enabled() const;
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
bool is_accurate_breadcrumbs_enabled() const;
#endif
int32_t get_gpu_index() const;
void increment_frames_drawn();

View File

@ -107,6 +107,28 @@ void _err_print_error(const char *p_function, const char *p_file, int p_line, co
_global_unlock();
}
// For printing errors when we may crash at any point, so we must flush ASAP a lot of lines
// but we don't want to make it noisy by printing lots of file & line info (because it's already
// been printing by a preceding _err_print_error).
void _err_print_error_asap(const String &p_error, ErrorHandlerType p_type) {
if (OS::get_singleton()) {
OS::get_singleton()->printerr("ERROR: %s\n", p_error.utf8().get_data());
} else {
// Fallback if errors happen before OS init or after it's destroyed.
const char *err_details = p_error.utf8().get_data();
fprintf(stderr, "ERROR: %s\n", err_details);
}
_global_lock();
ErrorHandlerList *l = error_handler_list;
while (l) {
l->errfunc(l->userdata, "", "", 0, p_error.utf8().get_data(), "", false, p_type);
l = l->next;
}
_global_unlock();
}
// Errors with message. (All combinations of p_error and p_message as String or char*.)
void _err_print_error(const char *p_function, const char *p_file, int p_line, const String &p_error, const char *p_message, bool p_editor_notify, ErrorHandlerType p_type) {
_err_print_error(p_function, p_file, p_line, p_error.utf8().get_data(), p_message, p_editor_notify, p_type);

View File

@ -68,6 +68,7 @@ void _err_print_error(const char *p_function, const char *p_file, int p_line, co
void _err_print_error(const char *p_function, const char *p_file, int p_line, const String &p_error, const char *p_message, bool p_editor_notify = false, ErrorHandlerType p_type = ERR_HANDLER_ERROR);
void _err_print_error(const char *p_function, const char *p_file, int p_line, const char *p_error, const String &p_message, bool p_editor_notify = false, ErrorHandlerType p_type = ERR_HANDLER_ERROR);
void _err_print_error(const char *p_function, const char *p_file, int p_line, const String &p_error, const String &p_message, bool p_editor_notify = false, ErrorHandlerType p_type = ERR_HANDLER_ERROR);
void _err_print_error_asap(const String &p_error, ErrorHandlerType p_type = ERR_HANDLER_ERROR);
void _err_print_index_error(const char *p_function, const char *p_file, int p_line, int64_t p_index, int64_t p_size, const char *p_index_str, const char *p_size_str, const char *p_message = "", bool p_editor_notify = false, bool fatal = false);
void _err_print_index_error(const char *p_function, const char *p_file, int p_line, int64_t p_index, int64_t p_size, const char *p_index_str, const char *p_size_str, const String &p_message, bool p_editor_notify = false, bool fatal = false);
void _err_flush_stdout();

View File

@ -43,6 +43,10 @@
/**** GENERIC ****/
/*****************/
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
static const uint32_t BREADCRUMB_BUFFER_ENTRIES = 512u;
#endif
static const VkFormat RD_TO_VK_FORMAT[RDD::DATA_FORMAT_MAX] = {
VK_FORMAT_R4G4_UNORM_PACK8,
VK_FORMAT_R4G4B4A4_UNORM_PACK16,
@ -1370,7 +1374,10 @@ Error RenderingDeviceDriverVulkan::initialize(uint32_t p_device_index, uint32_t
ERR_FAIL_COND_V(err != OK, err);
max_descriptor_sets_per_pool = GLOBAL_GET("rendering/rendering_device/vulkan/max_descriptors_per_pool");
breadcrumb_buffer = buffer_create(sizeof(uint32_t), BufferUsageBits::BUFFER_USAGE_TRANSFER_TO_BIT, MemoryAllocationType::MEMORY_ALLOCATION_TYPE_CPU);
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
breadcrumb_buffer = buffer_create(2u * sizeof(uint32_t) * BREADCRUMB_BUFFER_ENTRIES, BufferUsageBits::BUFFER_USAGE_TRANSFER_TO_BIT, MemoryAllocationType::MEMORY_ALLOCATION_TYPE_CPU);
#endif
return OK;
}
@ -5004,10 +5011,65 @@ void RenderingDeviceDriverVulkan::command_end_label(CommandBufferID p_cmd_buffer
/**** DEBUG *****/
/****************/
void RenderingDeviceDriverVulkan::command_insert_breadcrumb(CommandBufferID p_cmd_buffer, uint32_t p_data) {
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
if (p_data == BreadcrumbMarker::NONE) {
return;
}
vkCmdFillBuffer((VkCommandBuffer)p_cmd_buffer.id, ((BufferInfo *)breadcrumb_buffer.id)->vk_buffer, 0, sizeof(uint32_t), p_data);
if (Engine::get_singleton()->is_accurate_breadcrumbs_enabled()) {
// Force a full barrier so commands are not executed in parallel.
// This will mean that the last breadcrumb to see was actually the
// last (group of) command to be executed (hence, the one causing the crash).
VkMemoryBarrier memoryBarrier;
memoryBarrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
memoryBarrier.pNext = nullptr;
memoryBarrier.srcAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
VK_ACCESS_INDEX_READ_BIT |
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
VK_ACCESS_UNIFORM_READ_BIT |
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
VK_ACCESS_SHADER_READ_BIT |
VK_ACCESS_SHADER_WRITE_BIT |
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_TRANSFER_READ_BIT |
VK_ACCESS_TRANSFER_WRITE_BIT |
VK_ACCESS_HOST_READ_BIT |
VK_ACCESS_HOST_WRITE_BIT;
memoryBarrier.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
VK_ACCESS_INDEX_READ_BIT |
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
VK_ACCESS_UNIFORM_READ_BIT |
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
VK_ACCESS_SHADER_READ_BIT |
VK_ACCESS_SHADER_WRITE_BIT |
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_TRANSFER_READ_BIT |
VK_ACCESS_TRANSFER_WRITE_BIT |
VK_ACCESS_HOST_READ_BIT |
VK_ACCESS_HOST_WRITE_BIT;
vkCmdPipelineBarrier(
(VkCommandBuffer)p_cmd_buffer.id,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0, 1u, &memoryBarrier, 0u, nullptr, 0u, nullptr);
}
// We write to a circular buffer. If you're getting barrier sync errors here,
// increase the value of BREADCRUMB_BUFFER_ENTRIES.
vkCmdFillBuffer((VkCommandBuffer)p_cmd_buffer.id, ((BufferInfo *)breadcrumb_buffer.id)->vk_buffer, breadcrumb_offset, sizeof(uint32_t), breadcrumb_id++);
vkCmdFillBuffer((VkCommandBuffer)p_cmd_buffer.id, ((BufferInfo *)breadcrumb_buffer.id)->vk_buffer, breadcrumb_offset + sizeof(uint32_t), sizeof(uint32_t), p_data);
breadcrumb_offset += sizeof(uint32_t) * 2u;
if (breadcrumb_offset >= BREADCRUMB_BUFFER_ENTRIES * sizeof(uint32_t) * 2u) {
breadcrumb_offset = 0u;
}
#endif
}
void RenderingDeviceDriverVulkan::on_device_lost() const {
@ -5089,64 +5151,121 @@ void RenderingDeviceDriverVulkan::on_device_lost() const {
void RenderingDeviceDriverVulkan::print_lost_device_info() {
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
void *breadcrumb_ptr;
vmaFlushAllocation(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, 0, sizeof(uint32_t));
vmaInvalidateAllocation(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, 0, sizeof(uint32_t));
vmaMapMemory(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, &breadcrumb_ptr);
uint32_t last_breadcrumb = *(uint32_t *)breadcrumb_ptr;
vmaUnmapMemory(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle);
uint32_t phase = last_breadcrumb & uint32_t(~((1 << 16) - 1));
uint32_t user_data = last_breadcrumb & ((1 << 16) - 1);
String error_msg = "Last known breadcrumb: ";
switch (phase) {
case BreadcrumbMarker::ALPHA_PASS:
error_msg += "ALPHA_PASS";
break;
case BreadcrumbMarker::BLIT_PASS:
error_msg += "BLIT_PASS";
break;
case BreadcrumbMarker::DEBUG_PASS:
error_msg += "DEBUG_PASS";
break;
case BreadcrumbMarker::LIGHTMAPPER_PASS:
error_msg += "LIGHTMAPPER_PASS";
break;
case BreadcrumbMarker::OPAQUE_PASS:
error_msg += "OPAQUE_PASS";
break;
case BreadcrumbMarker::POST_PROCESSING_PASS:
error_msg += "POST_PROCESSING_PASS";
break;
case BreadcrumbMarker::REFLECTION_PROBES:
error_msg += "REFLECTION_PROBES";
break;
case BreadcrumbMarker::SHADOW_PASS_CUBE:
error_msg += "SHADOW_PASS_CUBE";
break;
case BreadcrumbMarker::SHADOW_PASS_DIRECTIONAL:
error_msg += "SHADOW_PASS_DIRECTIONAL";
break;
case BreadcrumbMarker::SKY_PASS:
error_msg += "SKY_PASS";
break;
case BreadcrumbMarker::TRANSPARENT_PASS:
error_msg += "TRANSPARENT_PASS";
break;
case BreadcrumbMarker::UI_PASS:
error_msg += "UI_PASS";
break;
default:
error_msg += "UNKNOWN_BREADCRUMB(" + itos((uint32_t)phase) + ')';
break;
{
String error_msg = "Printing last known breadcrumbs in reverse order (last executed first).";
if (!Engine::get_singleton()->is_accurate_breadcrumbs_enabled()) {
error_msg += "\nSome of them might be inaccurate. Try running with --accurate-breadcrumbs for precise information.";
}
_err_print_error(FUNCTION_STR, __FILE__, __LINE__, error_msg);
}
if (user_data != 0) {
error_msg += " | User data: " + itos(user_data);
uint8_t *breadcrumb_ptr = nullptr;
VkResult map_result = VK_SUCCESS;
vmaFlushAllocation(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, 0, BREADCRUMB_BUFFER_ENTRIES * sizeof(uint32_t) * 2u);
vmaInvalidateAllocation(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, 0, BREADCRUMB_BUFFER_ENTRIES * sizeof(uint32_t) * 2u);
{
void *ptr = nullptr;
map_result = vmaMapMemory(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, &ptr);
breadcrumb_ptr = reinterpret_cast<uint8_t *>(ptr);
}
_err_print_error(FUNCTION_STR, __FILE__, __LINE__, error_msg);
if (breadcrumb_ptr && map_result == VK_SUCCESS) {
uint32_t last_breadcrumb_offset = 0;
{
_err_print_error_asap("Searching last breadcrumb. We've sent up to ID: " + itos(breadcrumb_id - 1u));
// Scan the whole buffer to find the offset with the highest ID.
// That means that was the last one to be written.
//
// We use "breadcrumb_id - id" to account for wraparound.
// e.g. breadcrumb_id = 2 and id = 4294967294; then 2 - 4294967294 = 4.
// The one with the smallest difference is the closest to breadcrumb_id, which means it's
// the last written command.
uint32_t biggest_id = 0u;
uint32_t smallest_id_diff = std::numeric_limits<uint32_t>::max();
const uint32_t *breadcrumb_ptr32 = reinterpret_cast<const uint32_t *>(breadcrumb_ptr);
for (size_t i = 0u; i < BREADCRUMB_BUFFER_ENTRIES; ++i) {
const uint32_t id = breadcrumb_ptr32[i * 2u];
const uint32_t id_diff = breadcrumb_id - id;
if (id_diff < smallest_id_diff) {
biggest_id = i;
smallest_id_diff = id_diff;
}
}
_err_print_error_asap("Last breadcrumb ID found: " + itos(breadcrumb_ptr32[biggest_id * 2u]));
last_breadcrumb_offset = biggest_id * sizeof(uint32_t) * 2u;
}
const size_t entries_to_print = 8u; // Note: The value is arbitrary.
for (size_t i = 0u; i < entries_to_print; ++i) {
const uint32_t last_breadcrumb = *reinterpret_cast<uint32_t *>(breadcrumb_ptr + last_breadcrumb_offset + sizeof(uint32_t));
const uint32_t phase = last_breadcrumb & uint32_t(~((1 << 16) - 1));
const uint32_t user_data = last_breadcrumb & ((1 << 16) - 1);
String error_msg = "Last known breadcrumb: ";
switch (phase) {
case BreadcrumbMarker::ALPHA_PASS:
error_msg += "ALPHA_PASS";
break;
case BreadcrumbMarker::BLIT_PASS:
error_msg += "BLIT_PASS";
break;
case BreadcrumbMarker::DEBUG_PASS:
error_msg += "DEBUG_PASS";
break;
case BreadcrumbMarker::LIGHTMAPPER_PASS:
error_msg += "LIGHTMAPPER_PASS";
break;
case BreadcrumbMarker::OPAQUE_PASS:
error_msg += "OPAQUE_PASS";
break;
case BreadcrumbMarker::POST_PROCESSING_PASS:
error_msg += "POST_PROCESSING_PASS";
break;
case BreadcrumbMarker::REFLECTION_PROBES:
error_msg += "REFLECTION_PROBES";
break;
case BreadcrumbMarker::SHADOW_PASS_CUBE:
error_msg += "SHADOW_PASS_CUBE";
break;
case BreadcrumbMarker::SHADOW_PASS_DIRECTIONAL:
error_msg += "SHADOW_PASS_DIRECTIONAL";
break;
case BreadcrumbMarker::SKY_PASS:
error_msg += "SKY_PASS";
break;
case BreadcrumbMarker::TRANSPARENT_PASS:
error_msg += "TRANSPARENT_PASS";
break;
case BreadcrumbMarker::UI_PASS:
error_msg += "UI_PASS";
break;
default:
error_msg += "UNKNOWN_BREADCRUMB(" + itos((uint32_t)phase) + ')';
break;
}
if (user_data != 0) {
error_msg += " | User data: " + itos(user_data);
}
_err_print_error_asap(error_msg);
if (last_breadcrumb_offset == 0u) {
// Decrement last_breadcrumb_idx, wrapping underflow.
last_breadcrumb_offset = BREADCRUMB_BUFFER_ENTRIES * sizeof(uint32_t) * 2u;
}
last_breadcrumb_offset -= sizeof(uint32_t) * 2u;
}
vmaUnmapMemory(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle);
breadcrumb_ptr = nullptr;
} else {
_err_print_error(FUNCTION_STR, __FILE__, __LINE__, "Couldn't map breadcrumb buffer. VkResult = " + itos(map_result));
}
#endif
on_device_lost();
}
@ -5417,7 +5536,9 @@ RenderingDeviceDriverVulkan::RenderingDeviceDriverVulkan(RenderingContextDriverV
}
RenderingDeviceDriverVulkan::~RenderingDeviceDriverVulkan() {
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
buffer_free(breadcrumb_buffer);
#endif
while (small_allocs_pools.size()) {
HashMap<uint32_t, VmaPool>::Iterator E = small_allocs_pools.begin();

View File

@ -172,7 +172,12 @@ private:
VmaPool _find_or_create_small_allocs_pool(uint32_t p_mem_type_index);
private:
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
// It's a circular buffer.
BufferID breadcrumb_buffer;
uint32_t breadcrumb_offset = 0u;
uint32_t breadcrumb_id = 0u;
#endif
public:
/*****************/

View File

@ -630,6 +630,7 @@ void Main::print_help(const char *p_binary) {
print_help_option("--generate-spirv-debug-info", "Generate SPIR-V debug information. This allows source-level shader debugging with RenderDoc.\n");
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
print_help_option("--extra-gpu-memory-tracking", "Enables additional memory tracking (see class reference for `RenderingDevice.get_driver_and_device_memory_report()` and linked methods). Currently only implemented for Vulkan. Enabling this feature may cause crashes on some systems due to buggy drivers or bugs in the Vulkan Loader. See https://github.com/godotengine/godot/issues/95967\n");
print_help_option("--accurate-breadcrumbs", "Force barriers between breadcrumbs. Useful for narrowing down a command causing GPU resets. Currently only implemented for Vulkan.\n");
#endif
print_help_option("--remote-debug <uri>", "Remote debug (<protocol>://<host/IP>[:<port>], e.g. tcp://127.0.0.1:6007).\n");
print_help_option("--single-threaded-scene", "Force scene tree to run in single-threaded mode. Sub-thread groups are disabled and run on the main thread.\n");
@ -1236,8 +1237,12 @@ Error Main::setup(const char *execpath, int argc, char *argv[], bool p_second_ph
#endif
} else if (arg == "--generate-spirv-debug-info") {
Engine::singleton->generate_spirv_debug_info = true;
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
} else if (arg == "--extra-gpu-memory-tracking") {
Engine::singleton->extra_gpu_memory_tracking = true;
} else if (arg == "--accurate-breadcrumbs") {
Engine::singleton->accurate_breadcrumbs = true;
#endif
} else if (arg == "--tablet-driver") {
if (N) {
tablet_driver = N->get();

View File

@ -34,7 +34,6 @@
#define FORCE_FULL_ACCESS_BITS 0
#define PRINT_RESOURCE_TRACKER_TOTAL 0
#define PRINT_COMMAND_RECORDING 0
#define INSERT_BREADCRUMBS 1
RenderingDeviceGraph::RenderingDeviceGraph() {
driver_honors_barriers = false;
@ -833,7 +832,7 @@ void RenderingDeviceGraph::_run_render_commands(int32_t p_level, const RecordedC
const RecordedDrawListCommand *draw_list_command = reinterpret_cast<const RecordedDrawListCommand *>(command);
const VectorView clear_values(draw_list_command->clear_values(), draw_list_command->clear_values_count);
#if INSERT_BREADCRUMBS
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
driver->command_insert_breadcrumb(r_command_buffer, draw_list_command->breadcrumb);
#endif
driver->command_begin_render_pass(r_command_buffer, draw_list_command->render_pass, draw_list_command->framebuffer, draw_list_command->command_buffer_type, draw_list_command->region, clear_values);
@ -1416,7 +1415,9 @@ void RenderingDeviceGraph::add_buffer_update(RDD::BufferID p_dst, ResourceTracke
void RenderingDeviceGraph::add_compute_list_begin(RDD::BreadcrumbMarker p_phase, uint32_t p_breadcrumb_data) {
compute_instruction_list.clear();
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
compute_instruction_list.breadcrumb = p_breadcrumb_data | (p_phase & ((1 << 16) - 1));
#endif
compute_instruction_list.index++;
}
@ -1512,7 +1513,9 @@ void RenderingDeviceGraph::add_draw_list_begin(RDD::RenderPassID p_render_pass,
draw_instruction_list.render_pass = p_render_pass;
draw_instruction_list.framebuffer = p_framebuffer;
draw_instruction_list.region = p_region;
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
draw_instruction_list.breadcrumb = p_breadcrumb;
#endif
draw_instruction_list.clear_values.resize(p_clear_values.size());
for (uint32_t i = 0; i < p_clear_values.size(); i++) {
draw_instruction_list.clear_values[i] = p_clear_values[i];
@ -1723,7 +1726,9 @@ void RenderingDeviceGraph::add_draw_list_end() {
command->framebuffer = draw_instruction_list.framebuffer;
command->command_buffer_type = command_buffer_type;
command->region = draw_instruction_list.region;
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
command->breadcrumb = draw_instruction_list.breadcrumb;
#endif
command->clear_values_count = draw_instruction_list.clear_values.size();
RDD::RenderPassClearValue *clear_values = command->clear_values();

View File

@ -221,14 +221,18 @@ private:
};
struct ComputeInstructionList : InstructionList {
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
uint32_t breadcrumb;
#endif
};
struct DrawInstructionList : InstructionList {
RDD::RenderPassID render_pass;
RDD::FramebufferID framebuffer;
Rect2i region;
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
uint32_t breadcrumb;
#endif
LocalVector<RDD::RenderPassClearValue> clear_values;
};
@ -317,7 +321,9 @@ private:
RDD::FramebufferID framebuffer;
RDD::CommandBufferType command_buffer_type;
Rect2i region;
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
uint32_t breadcrumb = 0;
#endif
uint32_t clear_values_count = 0;
_FORCE_INLINE_ RDD::RenderPassClearValue *clear_values() {