From 2ef1ef63a5404d47c8c01e874f316d1a8cde26c8 Mon Sep 17 00:00:00 2001 From: Stuart Carnie Date: Sun, 25 Aug 2024 08:04:02 +1000 Subject: [PATCH] Metal: Improve startup times by using concurrent shader compilation APIs --- drivers/metal/metal_objects.h | 84 ++++++- drivers/metal/metal_objects.mm | 208 +++++++++++++++++- drivers/metal/metal_utils.h | 20 ++ drivers/metal/rendering_device_driver_metal.h | 17 +- .../metal/rendering_device_driver_metal.mm | 104 +++++++-- 5 files changed, 403 insertions(+), 30 deletions(-) diff --git a/drivers/metal/metal_objects.h b/drivers/metal/metal_objects.h index 70f86f2facd..11b96f8373f 100644 --- a/drivers/metal/metal_objects.h +++ b/drivers/metal/metal_objects.h @@ -57,10 +57,12 @@ #import "servers/rendering/rendering_device_driver.h" +#import #import #import #import #import +#import #import #import #import @@ -497,6 +499,76 @@ struct API_AVAILABLE(macos(11.0), ios(14.0)) UniformSet { HashMap> encoders; }; +struct ShaderCacheEntry; + +enum class ShaderLoadStrategy { + DEFAULT, + LAZY, +}; + +/** + * A Metal shader library. + */ +@interface MDLibrary : NSObject +- (id)library; +- (NSError *)error; +- (void)setLabel:(NSString *)label; + ++ (instancetype)newLibraryWithCacheEntry:(ShaderCacheEntry *)entry + device:(id)device + source:(NSString *)source + options:(MTLCompileOptions *)options + strategy:(ShaderLoadStrategy)strategy; +@end + +struct SHA256Digest { + unsigned char data[CC_SHA256_DIGEST_LENGTH]; + + uint32_t hash() const { + uint32_t c = crc32(0, data, CC_SHA256_DIGEST_LENGTH); + return c; + } + + SHA256Digest() { + bzero(data, CC_SHA256_DIGEST_LENGTH); + } + + SHA256Digest(const char *p_data, size_t p_length) { + CC_SHA256(p_data, (CC_LONG)p_length, data); + } +}; + +template <> +struct HashMapComparatorDefault { + static bool compare(const SHA256Digest &p_lhs, const SHA256Digest &p_rhs) { + return memcmp(p_lhs.data, p_rhs.data, CC_SHA256_DIGEST_LENGTH) == 0; + } +}; + +/** + * A cache entry for a Metal shader library. + */ +struct ShaderCacheEntry { + RenderingDeviceDriverMetal &owner; + SHA256Digest key; + CharString name; + CharString short_sha; + RD::ShaderStage stage = RD::SHADER_STAGE_VERTEX; + /** + * This reference must be weak, to ensure that when the last strong reference to the library + * is released, the cache entry is freed. + */ + MDLibrary *__weak library = nil; + + /** Notify the cache that this entry is no longer needed. */ + void notify_free() const; + + ShaderCacheEntry(RenderingDeviceDriverMetal &p_owner, SHA256Digest p_key) : + owner(p_owner), key(p_key) { + } + ~ShaderCacheEntry() = default; +}; + class API_AVAILABLE(macos(11.0), ios(14.0)) MDShader { public: CharString name; @@ -517,15 +589,14 @@ public: } push_constants; MTLSize local = {}; - id kernel; + MDLibrary *kernel; #if DEV_ENABLED CharString kernel_source; #endif void encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) final; - MDComputeShader(CharString p_name, Vector p_sets, id p_kernel); - ~MDComputeShader() override = default; + MDComputeShader(CharString p_name, Vector p_sets, MDLibrary *p_kernel); }; class API_AVAILABLE(macos(11.0), ios(14.0)) MDRenderShader final : public MDShader { @@ -541,8 +612,8 @@ public: } frag; } push_constants; - id vert; - id frag; + MDLibrary *vert; + MDLibrary *frag; #if DEV_ENABLED CharString vert_source; CharString frag_source; @@ -550,8 +621,7 @@ public: void encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) final; - MDRenderShader(CharString p_name, Vector p_sets, id p_vert, id p_frag); - ~MDRenderShader() override = default; + MDRenderShader(CharString p_name, Vector p_sets, MDLibrary *p_vert, MDLibrary *p_frag); }; enum StageResourceUsage : uint32_t { diff --git a/drivers/metal/metal_objects.mm b/drivers/metal/metal_objects.mm index 3ce00f74a30..5c7036f11ed 100644 --- a/drivers/metal/metal_objects.mm +++ b/drivers/metal/metal_objects.mm @@ -50,9 +50,12 @@ #import "metal_objects.h" +#import "metal_utils.h" #import "pixel_formats.h" #import "rendering_device_driver_metal.h" +#import + void MDCommandBuffer::begin() { DEV_ASSERT(commandBuffer == nil); commandBuffer = queue.commandBuffer; @@ -850,7 +853,7 @@ void MDCommandBuffer::_end_blit() { type = MDCommandBufferStateType::None; } -MDComputeShader::MDComputeShader(CharString p_name, Vector p_sets, id p_kernel) : +MDComputeShader::MDComputeShader(CharString p_name, Vector p_sets, MDLibrary *p_kernel) : MDShader(p_name, p_sets), kernel(p_kernel) { } @@ -868,7 +871,7 @@ void MDComputeShader::encode_push_constant_data(VectorView p_data, MDC [enc setBytes:ptr length:length atIndex:push_constants.binding]; } -MDRenderShader::MDRenderShader(CharString p_name, Vector p_sets, id _Nonnull p_vert, id _Nonnull p_frag) : +MDRenderShader::MDRenderShader(CharString p_name, Vector p_sets, MDLibrary *_Nonnull p_vert, MDLibrary *_Nonnull p_frag) : MDShader(p_name, p_sets), vert(p_vert), frag(p_frag) { } @@ -1378,3 +1381,204 @@ id MDResourceCache::get_depth_stencil_state(bool p_use_dep } return *val; } + +static const char *SHADER_STAGE_NAMES[] = { + [RD::SHADER_STAGE_VERTEX] = "vert", + [RD::SHADER_STAGE_FRAGMENT] = "frag", + [RD::SHADER_STAGE_TESSELATION_CONTROL] = "tess_ctrl", + [RD::SHADER_STAGE_TESSELATION_EVALUATION] = "tess_eval", + [RD::SHADER_STAGE_COMPUTE] = "comp", +}; + +void ShaderCacheEntry::notify_free() const { + owner.shader_cache_free_entry(key); +} + +@interface MDLibrary () +- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry; +- (ShaderCacheEntry *)entry; +@end + +@interface MDLazyLibrary : MDLibrary { + id _library; + NSError *_error; + std::shared_mutex _mu; + bool _loaded; + id _device; + NSString *_source; + MTLCompileOptions *_options; +} +- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry + device:(id)device + source:(NSString *)source + options:(MTLCompileOptions *)options; +@end + +@interface MDImmediateLibrary : MDLibrary { + id _library; + NSError *_error; + std::mutex _cv_mutex; + std::condition_variable _cv; + std::atomic _complete; + bool _ready; +} +- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry + device:(id)device + source:(NSString *)source + options:(MTLCompileOptions *)options; +@end + +@implementation MDLibrary { + ShaderCacheEntry *_entry; +} + ++ (instancetype)newLibraryWithCacheEntry:(ShaderCacheEntry *)entry + device:(id)device + source:(NSString *)source + options:(MTLCompileOptions *)options + strategy:(ShaderLoadStrategy)strategy { + switch (strategy) { + case ShaderLoadStrategy::DEFAULT: + [[fallthrough]]; + default: + return [[MDImmediateLibrary alloc] initWithCacheEntry:entry device:device source:source options:options]; + case ShaderLoadStrategy::LAZY: + return [[MDLazyLibrary alloc] initWithCacheEntry:entry device:device source:source options:options]; + } +} + +- (ShaderCacheEntry *)entry { + return _entry; +} + +- (id)library { + CRASH_NOW_MSG("Not implemented"); + return nil; +} + +- (NSError *)error { + CRASH_NOW_MSG("Not implemented"); + return nil; +} + +- (void)setLabel:(NSString *)label { +} + +- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry { + self = [super init]; + _entry = entry; + _entry->library = self; + return self; +} + +- (void)dealloc { + _entry->notify_free(); +} + +@end + +@implementation MDImmediateLibrary + +- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry + device:(id)device + source:(NSString *)source + options:(MTLCompileOptions *)options { + self = [super initWithCacheEntry:entry]; + _complete = false; + _ready = false; + + __block os_signpost_id_t compile_id = (os_signpost_id_t)(uintptr_t)self; + os_signpost_interval_begin(LOG_INTERVALS, compile_id, "shader_compile", + "shader_name=%{public}s stage=%{public}s hash=%{public}s", + entry->name.get_data(), SHADER_STAGE_NAMES[entry->stage], entry->short_sha.get_data()); + + [device newLibraryWithSource:source + options:options + completionHandler:^(id library, NSError *error) { + os_signpost_interval_end(LOG_INTERVALS, compile_id, "shader_compile"); + self->_library = library; + self->_error = error; + if (error) { + ERR_PRINT(String(U"Error compiling shader %s: %s").format(entry->name.get_data(), error.localizedDescription.UTF8String)); + } + + { + std::lock_guard lock(self->_cv_mutex); + _ready = true; + } + _cv.notify_all(); + _complete = true; + }]; + return self; +} + +- (id)library { + if (!_complete) { + std::unique_lock lock(_cv_mutex); + _cv.wait(lock, [&] { return _ready; }); + } + return _library; +} + +- (NSError *)error { + if (!_complete) { + std::unique_lock lock(_cv_mutex); + _cv.wait(lock, [&] { return _ready; }); + } + return _error; +} + +@end + +@implementation MDLazyLibrary +- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry + device:(id)device + source:(NSString *)source + options:(MTLCompileOptions *)options { + self = [super initWithCacheEntry:entry]; + _device = device; + _source = source; + _options = options; + + return self; +} + +- (void)load { + { + std::shared_lock lock(_mu); + if (_loaded) { + return; + } + } + + std::unique_lock lock(_mu); + if (_loaded) { + return; + } + + ShaderCacheEntry *entry = [self entry]; + + __block os_signpost_id_t compile_id = (os_signpost_id_t)(uintptr_t)self; + os_signpost_interval_begin(LOG_INTERVALS, compile_id, "shader_compile", + "shader_name=%{public}s stage=%{public}s hash=%{public}s", + entry->name.get_data(), SHADER_STAGE_NAMES[entry->stage], entry->short_sha.get_data()); + NSError *error; + _library = [_device newLibraryWithSource:_source options:_options error:&error]; + os_signpost_interval_end(LOG_INTERVALS, compile_id, "shader_compile"); + _device = nil; + _source = nil; + _options = nil; + _loaded = true; +} + +- (id)library { + [self load]; + return _library; +} + +- (NSError *)error { + [self load]; + return _error; +} + +@end diff --git a/drivers/metal/metal_utils.h b/drivers/metal/metal_utils.h index eed1aad89bc..f3ee395d04a 100644 --- a/drivers/metal/metal_utils.h +++ b/drivers/metal/metal_utils.h @@ -31,6 +31,8 @@ #ifndef METAL_UTILS_H #define METAL_UTILS_H +#import + #pragma mark - Boolean flags namespace flags { @@ -78,4 +80,22 @@ static constexpr uint64_t round_up_to_alignment(uint64_t p_value, uint64_t p_ali return aligned_value; } +class Defer { +public: + Defer(std::function func) : + func_(func) {} + ~Defer() { func_(); } + +private: + std::function func_; +}; + +#define CONCAT_INTERNAL(x, y) x##y +#define CONCAT(x, y) CONCAT_INTERNAL(x, y) +#define DEFER const Defer &CONCAT(defer__, __LINE__) = Defer + +extern os_log_t LOG_DRIVER; +// Used for dynamic tracing. +extern os_log_t LOG_INTERVALS; + #endif // METAL_UTILS_H diff --git a/drivers/metal/rendering_device_driver_metal.h b/drivers/metal/rendering_device_driver_metal.h index 1bb71583ab6..7c23624e43a 100644 --- a/drivers/metal/rendering_device_driver_metal.h +++ b/drivers/metal/rendering_device_driver_metal.h @@ -48,6 +48,8 @@ class RenderingContextDriverMetal; class API_AVAILABLE(macos(11.0), ios(14.0)) RenderingDeviceDriverMetal : public RenderingDeviceDriver { + friend struct ShaderCacheEntry; + template using Result = std::variant; @@ -77,6 +79,19 @@ class API_AVAILABLE(macos(11.0), ios(14.0)) RenderingDeviceDriverMetal : public Error _create_device(); Error _check_capabilities(); +#pragma mark - Shader Cache + + ShaderLoadStrategy _shader_load_strategy = ShaderLoadStrategy::DEFAULT; + + /** + * The shader cache is a map of hashes of the Metal source to shader cache entries. + * + * To prevent unbounded growth of the cache, cache entries are automatically freed when + * there are no more references to the MDLibrary associated with the cache entry. + */ + HashMap> _shader_cache; + void shader_cache_free_entry(const SHA256Digest &key); + public: Error initialize(uint32_t p_device_index, uint32_t p_frame_count) override final; @@ -270,7 +285,7 @@ public: #pragma mark Pipeline private: - Result> _create_function(id p_library, NSString *p_name, VectorView &p_specialization_constants); + Result> _create_function(MDLibrary *p_library, NSString *p_name, VectorView &p_specialization_constants); public: virtual void pipeline_free(PipelineID p_pipeline_id) override final; diff --git a/drivers/metal/rendering_device_driver_metal.mm b/drivers/metal/rendering_device_driver_metal.mm index ffa4cf92c75..ec42472cb6e 100644 --- a/drivers/metal/rendering_device_driver_metal.mm +++ b/drivers/metal/rendering_device_driver_metal.mm @@ -60,9 +60,22 @@ #import #import +#import +#import #import #import +#pragma mark - Logging + +os_log_t LOG_DRIVER; +// Used for dynamic tracing. +os_log_t LOG_INTERVALS; + +__attribute__((constructor)) static void InitializeLogging(void) { + LOG_DRIVER = os_log_create("org.stuartcarnie.godot.metal", OS_LOG_CATEGORY_POINTS_OF_INTEREST); + LOG_INTERVALS = os_log_create("org.stuartcarnie.godot.metal", "events"); +} + /*****************/ /**** GENERIC ****/ /*****************/ @@ -2258,6 +2271,15 @@ Vector RenderingDeviceDriverMetal::shader_compile_binary_from_spirv(Vec return ret; } +void RenderingDeviceDriverMetal::shader_cache_free_entry(const SHA256Digest &key) { + if (ShaderCacheEntry **pentry = _shader_cache.getptr(key); pentry != nullptr) { + ShaderCacheEntry *entry = *pentry; + _shader_cache.erase(key); + entry->library = nil; + memdelete(entry); + } +} + RDD::ShaderID RenderingDeviceDriverMetal::shader_create_from_bytecode(const Vector &p_shader_binary, ShaderDescription &r_shader_desc, String &r_name) { r_shader_desc = {}; // Driver-agnostic. @@ -2285,18 +2307,32 @@ RDD::ShaderID RenderingDeviceDriverMetal::shader_create_from_bytecode(const Vect MTLCompileOptions *options = [MTLCompileOptions new]; options.languageVersion = binary_data.get_msl_version(); - HashMap> libraries; + HashMap libraries; + for (ShaderStageData &shader_data : binary_data.stages) { - NSString *source = [[NSString alloc] initWithBytesNoCopy:(void *)shader_data.source.ptr() - length:shader_data.source.length() - encoding:NSUTF8StringEncoding - freeWhenDone:NO]; - NSError *error = nil; - id library = [device newLibraryWithSource:source options:options error:&error]; - if (error != nil) { - print_error(error.localizedDescription.UTF8String); - ERR_FAIL_V_MSG(ShaderID(), "failed to compile Metal source"); + SHA256Digest key = SHA256Digest(shader_data.source.ptr(), shader_data.source.length()); + + if (ShaderCacheEntry **p = _shader_cache.getptr(key); p != nullptr) { + libraries[shader_data.stage] = (*p)->library; + continue; } + + NSString *source = [[NSString alloc] initWithBytes:(void *)shader_data.source.ptr() + length:shader_data.source.length() + encoding:NSUTF8StringEncoding]; + + ShaderCacheEntry *cd = memnew(ShaderCacheEntry(*this, key)); + cd->name = binary_data.shader_name; + String sha_hex = String::hex_encode_buffer(key.data, CC_SHA256_DIGEST_LENGTH); + cd->short_sha = sha_hex.substr(0, 8).utf8(); + cd->stage = shader_data.stage; + + MDLibrary *library = [MDLibrary newLibraryWithCacheEntry:cd + device:device + source:source + options:options + strategy:_shader_load_strategy]; + _shader_cache[key] = cd; libraries[shader_data.stage] = library; } @@ -3062,8 +3098,13 @@ void RenderingDeviceDriverMetal::command_render_set_line_width(CommandBufferID p // ----- PIPELINE ----- -RenderingDeviceDriverMetal::Result> RenderingDeviceDriverMetal::_create_function(id p_library, NSString *p_name, VectorView &p_specialization_constants) { - id function = [p_library newFunctionWithName:p_name]; +RenderingDeviceDriverMetal::Result> RenderingDeviceDriverMetal::_create_function(MDLibrary *p_library, NSString *p_name, VectorView &p_specialization_constants) { + id library = p_library.library; + if (!library) { + ERR_FAIL_V_MSG(ERR_CANT_CREATE, "Failed to compile Metal library"); + } + + id function = [library newFunctionWithName:p_name]; ERR_FAIL_NULL_V_MSG(function, ERR_CANT_CREATE, "No function named main0"); if (function.functionConstantsDictionary.count == 0) { @@ -3141,9 +3182,9 @@ RenderingDeviceDriverMetal::Result> RenderingDeviceDriverMetal:: } NSError *err = nil; - function = [p_library newFunctionWithName:@"main0" - constantValues:constantValues - error:&err]; + function = [library newFunctionWithName:@"main0" + constantValues:constantValues + error:&err]; ERR_FAIL_NULL_V_MSG(function, ERR_CANT_CREATE, String("specialized function failed: ") + err.localizedDescription.UTF8String); return function; @@ -3188,6 +3229,14 @@ RDD::PipelineID RenderingDeviceDriverMetal::render_pipeline_create( MTLVertexDescriptor *vert_desc = rid::get(p_vertex_format); MDRenderPass *pass = (MDRenderPass *)(p_render_pass.id); + os_signpost_id_t reflect_id = os_signpost_id_make_with_pointer(LOG_INTERVALS, shader); + os_signpost_interval_begin(LOG_INTERVALS, reflect_id, "render_pipeline_create", "shader_name=%{public}s", shader->name.get_data()); + DEFER([=]() { + os_signpost_interval_end(LOG_INTERVALS, reflect_id, "render_pipeline_create"); + }); + + os_signpost_event_emit(LOG_DRIVER, OS_SIGNPOST_ID_EXCLUSIVE, "create_pipeline"); + MTLRenderPipelineDescriptor *desc = [MTLRenderPipelineDescriptor new]; { @@ -3482,9 +3531,15 @@ void RenderingDeviceDriverMetal::command_compute_dispatch_indirect(CommandBuffer RDD::PipelineID RenderingDeviceDriverMetal::compute_pipeline_create(ShaderID p_shader, VectorView p_specialization_constants) { MDComputeShader *shader = (MDComputeShader *)(p_shader.id); - id library = shader->kernel; + os_signpost_id_t reflect_id = os_signpost_id_make_with_pointer(LOG_INTERVALS, shader); + os_signpost_interval_begin(LOG_INTERVALS, reflect_id, "compute_pipeline_create", "shader_name=%{public}s", shader->name.get_data()); + DEFER([=]() { + os_signpost_interval_end(LOG_INTERVALS, reflect_id, "compute_pipeline_create"); + }); - Result> function_or_err = _create_function(library, @"main0", p_specialization_constants); + os_signpost_event_emit(LOG_DRIVER, OS_SIGNPOST_ID_EXCLUSIVE, "create_pipeline"); + + Result> function_or_err = _create_function(shader->kernel, @"main0", p_specialization_constants); ERR_FAIL_COND_V(std::holds_alternative(function_or_err), PipelineID()); id function = std::get>(function_or_err); @@ -3585,12 +3640,13 @@ void RenderingDeviceDriverMetal::set_object_name(ObjectType p_type, ID p_driver_ buffer.label = [NSString stringWithUTF8String:p_name.utf8().get_data()]; } break; case OBJECT_TYPE_SHADER: { + NSString *label = [NSString stringWithUTF8String:p_name.utf8().get_data()]; MDShader *shader = (MDShader *)(p_driver_id.id); if (MDRenderShader *rs = dynamic_cast(shader); rs != nullptr) { - rs->vert.label = [NSString stringWithUTF8String:p_name.utf8().get_data()]; - rs->frag.label = [NSString stringWithUTF8String:p_name.utf8().get_data()]; + [rs->vert setLabel:label]; + [rs->frag setLabel:label]; } else if (MDComputeShader *cs = dynamic_cast(shader); cs != nullptr) { - cs->kernel.label = [NSString stringWithUTF8String:p_name.utf8().get_data()]; + [cs->kernel setLabel:label]; } else { DEV_ASSERT(false); } @@ -3830,12 +3886,20 @@ size_t RenderingDeviceDriverMetal::get_texel_buffer_alignment_for_format(MTLPixe RenderingDeviceDriverMetal::RenderingDeviceDriverMetal(RenderingContextDriverMetal *p_context_driver) : context_driver(p_context_driver) { DEV_ASSERT(p_context_driver != nullptr); + + if (String res = OS::get_singleton()->get_environment("GODOT_MTL_SHADER_LOAD_STRATEGY"); res == U"lazy") { + _shader_load_strategy = ShaderLoadStrategy::LAZY; + } } RenderingDeviceDriverMetal::~RenderingDeviceDriverMetal() { for (MDCommandBuffer *cb : command_buffers) { delete cb; } + + for (KeyValue &kv : _shader_cache) { + memdelete(kv.value); + } } #pragma mark - Initialization