From 44264591a8c4da7090a4bfd10e04f4cb8fe60afe Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Fri, 9 Oct 2020 09:36:01 -0400 Subject: [PATCH 1/7] drm/amd/display: Fix module load hangs when connected to an eDP A recent change enabled the driver to disable streams when the pixel clock changes. Consequently, the code path executed in the disable vbios function was expanded to include an encoder verification step. The encoder loop is nested inside the pipe count loop, and both loops use the same `i` variable to control their flow. This can lead to an infinite loop because the encoder loop keeps resetting `i`, so the outer loop's exit condition is never reached. As a result, we see a soft hang during module load (modprobe amdgpu) and a series of dmesg log entries that look like this: kernel:[ 124.538727] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [modprobe:1000] RSP: 0018:ffffabbf419bf0e8 EFLAGS: 00000282 RAX: ffffffffc0809de0 RBX: ffff93b35ccc0000 RCX: ffff93b366c21800 RDX: 0000000000000000 RSI: 0000000000000141 RDI: ffff93b35ccc0000 RBP: ffffabbf419bf108 R08: ffffabbf419bf164 R09: 0000000000000001 R10: 0000000000000003 R11: 0000000000000003 R12: 0000000008677d40 R13: 0000000000000141 R14: ffff93b35cfc0000 R15: ffff93b35abc0000 FS: 00007f1400717540(0000) GS:ffff93b37f680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005649b66b0968 CR3: 00000003e0fec000 CR4: 0000000000350ee0 Call Trace: amdgpu_device_rreg+0x17/0x20 [amdgpu] amdgpu_cgs_read_register+0x14/0x20 [amdgpu] dm_read_reg_func+0x3a/0xb0 [amdgpu] get_pixel_clk_frequency_100hz+0x30/0x50 [amdgpu] dc_commit_state+0x8f1/0xae0 [amdgpu] ? drm_calc_timestamping_constants+0x101/0x160 [drm] amdgpu_dm_atomic_commit_tail+0x39d/0x21a0 [amdgpu] ? dcn21_validate_bandwidth+0xe5/0x290 [amdgpu] ? kfree+0xc3/0x390 ? dcn21_validate_bandwidth+0xe5/0x290 [amdgpu] ... 
RSP: 002b:00007fff26009bd8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 000055a8025bea50 RCX: 00007f140085c89d RDX: 0000000000000000 RSI: 000055a8025b8290 RDI: 000000000000000c RBP: 0000000000040000 R08: 0000000000000000 R09: 0000000000000000 R10: 000000000000000c R11: 0000000000000246 R12: 000055a8025b8290 R13: 0000000000000000 R14: 000055a8025bead0 R15: 000055a8025bea50 This issue was fixed by introducing a second variable for the internal loop. Fixes: 8353d30e747f4e ("drm/amd/display: disable stream if pixel clock changed with link active") Reviewed-by: Roman Li Reviewed-by: Nicholas Kazlauskas Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 2a725a5fba40..1eb29c362122 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -848,7 +848,7 @@ static void disable_vbios_mode_if_required( struct dc *dc, struct dc_state *context) { - unsigned int i; + unsigned int i, j; /* check if timing_changed, disable stream*/ for (i = 0; i < dc->res_pool->pipe_count; i++) { @@ -872,10 +872,10 @@ static void disable_vbios_mode_if_required( enc_inst = link->link_enc->funcs->get_dig_frontend(link->link_enc); if (enc_inst != ENGINE_ID_UNKNOWN) { - for (i = 0; i < dc->res_pool->stream_enc_count; i++) { - if (dc->res_pool->stream_enc[i]->id == enc_inst) { - tg_inst = dc->res_pool->stream_enc[i]->funcs->dig_source_otg( - dc->res_pool->stream_enc[i]); + for (j = 0; j < dc->res_pool->stream_enc_count; j++) { + if (dc->res_pool->stream_enc[j]->id == enc_inst) { + tg_inst = dc->res_pool->stream_enc[j]->funcs->dig_source_otg( + dc->res_pool->stream_enc[j]); break; } } From 02a1bea65bb335ebfd3a4d191742de3b6f64a414 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 12 Oct 2020 10:12:28 -0400 
Subject: [PATCH 2/7] drm/amdgpu/swsmu: init the baco mutex in early_init GPU reset might get called during init time, before sw_init has been called. Reviewed-by: Kevin Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index e41fd6ea6451..b1e5ec01527b 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -417,6 +417,9 @@ static int smu_early_init(void *handle) smu->pm_enabled = !!amdgpu_dpm; smu->is_apu = false; mutex_init(&smu->mutex); + mutex_init(&smu->smu_baco.mutex); + smu->smu_baco.state = SMU_BACO_STATE_EXIT; + smu->smu_baco.platform_support = false; return smu_set_funcs(adev); } @@ -795,10 +798,6 @@ static int smu_sw_init(void *handle) bitmap_zero(smu->smu_feature.enabled, SMU_FEATURE_MAX); bitmap_zero(smu->smu_feature.allowed, SMU_FEATURE_MAX); - mutex_init(&smu->smu_baco.mutex); - smu->smu_baco.state = SMU_BACO_STATE_EXIT; - smu->smu_baco.platform_support = false; - mutex_init(&smu->sensor_lock); mutex_init(&smu->metrics_lock); mutex_init(&smu->message_lock); From c0e35ed924e47be387205fa4beaf4134b992e0d4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 13 Oct 2020 13:54:27 +0200 Subject: [PATCH 3/7] drm/amd/display: kernel-doc: document force_timing_sync As warned when running "make htmldocs": ./drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h:345: warning: Function parameter or member 'force_timing_sync' not described in 'amdgpu_display_manager' This new struct member was not documented at kernel-doc markup. 
Fixes: 3d4e52d0cf24 ("drm/amd/display: Add debugfs for forcing stream timing sync") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index 9c1e003d9c29..34f6369bf51f 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -149,6 +149,8 @@ struct amdgpu_dm_backlight_caps { * @cached_state: Caches device atomic state for suspend/resume * @cached_dc_state: Cached state of content streams * @compressor: Frame buffer compression buffer. See &struct dm_comressor_info + * @force_timing_sync: set via debugfs. When set, indicates that all connected + * displays will be forced to synchronize. */ struct amdgpu_display_manager { From 39ec39d77170a3fe9e92dcddf9060634276ee1ee Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 13 Oct 2020 13:54:20 +0200 Subject: [PATCH 4/7] docs: amdgpu: fix a warning when building the documentation As reported by Sphinx: Documentation/gpu/amdgpu.rst:200: WARNING: Inline emphasis start-string without end-string. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Alex Deucher --- Documentation/gpu/amdgpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst index 57047dcb8d19..1f9ea8221f80 100644 --- a/Documentation/gpu/amdgpu.rst +++ b/Documentation/gpu/amdgpu.rst @@ -206,8 +206,8 @@ pp_power_profile_mode .. kernel-doc:: drivers/gpu/drm/amd/pm/amdgpu_pm.c :doc: pp_power_profile_mode -*_busy_percent -~~~~~~~~~~~~~~ +\*_busy_percent +~~~~~~~~~~~~~~~ .. 
kernel-doc:: drivers/gpu/drm/amd/pm/amdgpu_pm.c :doc: gpu_busy_percent From 83da6eea3af669ee0b1f1bc05ffd6150af984994 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Wed, 2 Sep 2020 16:10:10 +0800 Subject: [PATCH 5/7] drm/amd/pm: increase mclk switch threshold to 200 us This avoids the underflow seen on Polaris10 with some 3440x1440 144Hz displays: the previous threshold of 190 us cuts too close to the minVBlankTime of 192 us. Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c index 3bf8be4d107b..1e8919b0acdb 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c @@ -2883,7 +2883,7 @@ static int smu7_vblank_too_short(struct pp_hwmgr *hwmgr, if (hwmgr->is_kicker) switch_limit_us = data->is_memory_gddr5 ? 450 : 150; else - switch_limit_us = data->is_memory_gddr5 ? 190 : 150; + switch_limit_us = data->is_memory_gddr5 ? 200 : 150; break; case CHIP_VEGAM: switch_limit_us = 30; From 187561dd76531945126b15c9486fec7cfa5f0415 Mon Sep 17 00:00:00 2001 From: Veerabadhran G Date: Thu, 8 Oct 2020 22:30:02 +0530 Subject: [PATCH 6/7] drm/amdgpu: vcn and jpeg ring synchronization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Synchronize the ring usage for vcn1 and jpeg1 to work around a hardware bug. 
Signed-off-by: Veerabadhran Gopalakrishnan Acked-by: Christian König Reviewed-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 1 + drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c | 24 +++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c | 28 +++++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/vcn_v1_0.h | 3 ++- 5 files changed, 51 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index 495c3d7bb2b2..f3b7287e84c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -68,6 +68,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev) INIT_DELAYED_WORK(&adev->vcn.idle_work, amdgpu_vcn_idle_work_handler); mutex_init(&adev->vcn.vcn_pg_lock); + mutex_init(&adev->vcn.vcn1_jpeg1_workaround); atomic_set(&adev->vcn.total_submission_cnt, 0); for (i = 0; i < adev->vcn.num_vcn_inst; i++) atomic_set(&adev->vcn.inst[i].dpg_enc_submission_cnt, 0); @@ -237,6 +238,7 @@ int amdgpu_vcn_sw_fini(struct amdgpu_device *adev) } release_firmware(adev->vcn.fw); + mutex_destroy(&adev->vcn.vcn1_jpeg1_workaround); mutex_destroy(&adev->vcn.vcn_pg_lock); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index 7a9b804bc988..17691158f783 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -220,6 +220,7 @@ struct amdgpu_vcn { struct amdgpu_vcn_inst inst[AMDGPU_MAX_VCN_INSTANCES]; struct amdgpu_vcn_reg internal; struct mutex vcn_pg_lock; + struct mutex vcn1_jpeg1_workaround; atomic_t total_submission_cnt; unsigned harvest_config; diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c index bc300283b6ab..c600b61b5f45 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c @@ -33,6 +33,7 
@@ static void jpeg_v1_0_set_dec_ring_funcs(struct amdgpu_device *adev); static void jpeg_v1_0_set_irq_funcs(struct amdgpu_device *adev); +static void jpeg_v1_0_ring_begin_use(struct amdgpu_ring *ring); static void jpeg_v1_0_decode_ring_patch_wreg(struct amdgpu_ring *ring, uint32_t *ptr, uint32_t reg_offset, uint32_t val) { @@ -564,8 +565,8 @@ static const struct amdgpu_ring_funcs jpeg_v1_0_decode_ring_vm_funcs = { .insert_start = jpeg_v1_0_decode_ring_insert_start, .insert_end = jpeg_v1_0_decode_ring_insert_end, .pad_ib = amdgpu_ring_generic_pad_ib, - .begin_use = vcn_v1_0_ring_begin_use, - .end_use = amdgpu_vcn_ring_end_use, + .begin_use = jpeg_v1_0_ring_begin_use, + .end_use = vcn_v1_0_ring_end_use, .emit_wreg = jpeg_v1_0_decode_ring_emit_wreg, .emit_reg_wait = jpeg_v1_0_decode_ring_emit_reg_wait, .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, @@ -586,3 +587,22 @@ static void jpeg_v1_0_set_irq_funcs(struct amdgpu_device *adev) { adev->jpeg.inst->irq.funcs = &jpeg_v1_0_irq_funcs; } + +static void jpeg_v1_0_ring_begin_use(struct amdgpu_ring *ring) +{ + struct amdgpu_device *adev = ring->adev; + bool set_clocks = !cancel_delayed_work_sync(&adev->vcn.idle_work); + int cnt = 0; + + mutex_lock(&adev->vcn.vcn1_jpeg1_workaround); + + if (amdgpu_fence_wait_empty(&adev->vcn.inst->ring_dec)) + DRM_ERROR("JPEG dec: vcn dec ring may not be empty\n"); + + for (cnt = 0; cnt < adev->vcn.num_enc_rings; cnt++) { + if (amdgpu_fence_wait_empty(&adev->vcn.inst->ring_enc[cnt])) + DRM_ERROR("JPEG dec: vcn enc ring[%d] may not be empty\n", cnt); + } + + vcn_v1_0_set_pg_for_begin_use(ring, set_clocks); +} diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c index 73699eafb51e..86e1ef732ebe 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c @@ -54,6 +54,7 @@ static int vcn_v1_0_pause_dpg_mode(struct amdgpu_device *adev, int inst_idx, struct dpg_pause_state *new_state); static void 
vcn_v1_0_idle_work_handler(struct work_struct *work); +static void vcn_v1_0_ring_begin_use(struct amdgpu_ring *ring); /** * vcn_v1_0_early_init - set function pointers @@ -1804,10 +1805,23 @@ static void vcn_v1_0_idle_work_handler(struct work_struct *work) } } -void vcn_v1_0_ring_begin_use(struct amdgpu_ring *ring) +static void vcn_v1_0_ring_begin_use(struct amdgpu_ring *ring) +{ + struct amdgpu_device *adev = ring->adev; + bool set_clocks = !cancel_delayed_work_sync(&adev->vcn.idle_work); + + mutex_lock(&adev->vcn.vcn1_jpeg1_workaround); + + if (amdgpu_fence_wait_empty(&ring->adev->jpeg.inst->ring_dec)) + DRM_ERROR("VCN dec: jpeg dec ring may not be empty\n"); + + vcn_v1_0_set_pg_for_begin_use(ring, set_clocks); + +} + +void vcn_v1_0_set_pg_for_begin_use(struct amdgpu_ring *ring, bool set_clocks) { struct amdgpu_device *adev = ring->adev; - bool set_clocks = !cancel_delayed_work_sync(&adev->vcn.idle_work); if (set_clocks) { amdgpu_gfx_off_ctrl(adev, false); @@ -1844,6 +1858,12 @@ void vcn_v1_0_ring_begin_use(struct amdgpu_ring *ring) } } +void vcn_v1_0_ring_end_use(struct amdgpu_ring *ring) +{ + schedule_delayed_work(&ring->adev->vcn.idle_work, VCN_IDLE_TIMEOUT); + mutex_unlock(&ring->adev->vcn.vcn1_jpeg1_workaround); +} + static const struct amd_ip_funcs vcn_v1_0_ip_funcs = { .name = "vcn_v1_0", .early_init = vcn_v1_0_early_init, @@ -1891,7 +1911,7 @@ static const struct amdgpu_ring_funcs vcn_v1_0_dec_ring_vm_funcs = { .insert_end = vcn_v1_0_dec_ring_insert_end, .pad_ib = amdgpu_ring_generic_pad_ib, .begin_use = vcn_v1_0_ring_begin_use, - .end_use = amdgpu_vcn_ring_end_use, + .end_use = vcn_v1_0_ring_end_use, .emit_wreg = vcn_v1_0_dec_ring_emit_wreg, .emit_reg_wait = vcn_v1_0_dec_ring_emit_reg_wait, .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, @@ -1923,7 +1943,7 @@ static const struct amdgpu_ring_funcs vcn_v1_0_enc_ring_vm_funcs = { .insert_end = vcn_v1_0_enc_ring_insert_end, .pad_ib = amdgpu_ring_generic_pad_ib, .begin_use = 
vcn_v1_0_ring_begin_use, - .end_use = amdgpu_vcn_ring_end_use, + .end_use = vcn_v1_0_ring_end_use, .emit_wreg = vcn_v1_0_enc_ring_emit_wreg, .emit_reg_wait = vcn_v1_0_enc_ring_emit_reg_wait, .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.h b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.h index f67d7391fc21..1f1cc7f0ece7 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.h +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.h @@ -24,7 +24,8 @@ #ifndef __VCN_V1_0_H__ #define __VCN_V1_0_H__ -void vcn_v1_0_ring_begin_use(struct amdgpu_ring *ring); +void vcn_v1_0_ring_end_use(struct amdgpu_ring *ring); +void vcn_v1_0_set_pg_for_begin_use(struct amdgpu_ring *ring, bool set_clocks); extern const struct amdgpu_ip_block_version vcn_v1_0_ip_block; From 8f4729e880647c419de0bbe3ff21d7efb4e65676 Mon Sep 17 00:00:00 2001 From: Kent Russell Date: Wed, 14 Oct 2020 07:47:32 -0400 Subject: [PATCH 7/7] drm/amdkfd: Use kvfree in destroy_crat_image Now that we use kvmalloc for the crat_image, we need to use kvfree when we destroy this. Fixes: d0e63b343e575e ("drm/amdkfd: Use kvmalloc instead of kmalloc for VCRAT") Reported-by: Morris Zhang Signed-off-by: Kent Russell Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index d2981524dba0..5e2254b9e931 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -1426,5 +1426,5 @@ int kfd_create_crat_image_virtual(void **crat_image, size_t *size, */ void kfd_destroy_crat_image(void *crat_image) { - kfree(crat_image); + kvfree(crat_image); }