drm/amdgpu: Use delayed work to collect RAS error counters
On Context Query2 IOCTL return the correctable and
uncorrectable errors in O(1) fashion, from cached
values, and schedule a delayed work function to
calculate and cache them for the next such IOCTL.
v2: Cancel pending delayed work at ras_fini().
v3: Remove conditionals when dealing with delayed
    work manipulation as they're inherently racy.
Cc: Alexander Deucher <Alexander.Deucher@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: John Clements <john.clements@amd.com>
Cc: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
Reviewed-by: Alexander Deucher <Alexander.Deucher@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
			
			
This commit is contained in:
		
							parent
							
								
									a46751fbcd
								
							
						
					
					
						commit
						05adfd80cc
					
				| @ -331,10 +331,13 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev, | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| #define AMDGPU_RAS_COUNTE_DELAY_MS 3000 | ||||
| 
 | ||||
| static int amdgpu_ctx_query2(struct amdgpu_device *adev, | ||||
| 	struct amdgpu_fpriv *fpriv, uint32_t id, | ||||
| 	union drm_amdgpu_ctx_out *out) | ||||
| 			     struct amdgpu_fpriv *fpriv, uint32_t id, | ||||
| 			     union drm_amdgpu_ctx_out *out) | ||||
| { | ||||
| 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | ||||
| 	struct amdgpu_ctx *ctx; | ||||
| 	struct amdgpu_ctx_mgr *mgr; | ||||
| 
 | ||||
| @ -361,6 +364,30 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, | ||||
| 	if (atomic_read(&ctx->guilty)) | ||||
| 		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; | ||||
| 
 | ||||
| 	if (adev->ras_enabled && con) { | ||||
| 		/* Return the cached values in O(1),
 | ||||
| 		 * and schedule delayed work to cache | ||||
| 		 * new values. | ||||
| 		 */ | ||||
| 		int ce_count, ue_count; | ||||
| 
 | ||||
| 		ce_count = atomic_read(&con->ras_ce_count); | ||||
| 		ue_count = atomic_read(&con->ras_ue_count); | ||||
| 
 | ||||
| 		if (ce_count != ctx->ras_counter_ce) { | ||||
| 			ctx->ras_counter_ce = ce_count; | ||||
| 			out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; | ||||
| 		} | ||||
| 
 | ||||
| 		if (ue_count != ctx->ras_counter_ue) { | ||||
| 			ctx->ras_counter_ue = ue_count; | ||||
| 			out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; | ||||
| 		} | ||||
| 
 | ||||
| 		schedule_delayed_work(&con->ras_counte_delay_work, | ||||
| 				      msecs_to_jiffies(AMDGPU_RAS_COUNTE_DELAY_MS)); | ||||
| 	} | ||||
| 
 | ||||
| 	mutex_unlock(&mgr->lock); | ||||
| 	return 0; | ||||
| } | ||||
|  | ||||
| @ -27,6 +27,7 @@ | ||||
| #include <linux/uaccess.h> | ||||
| #include <linux/reboot.h> | ||||
| #include <linux/syscalls.h> | ||||
| #include <linux/pm_runtime.h> | ||||
| 
 | ||||
| #include "amdgpu.h" | ||||
| #include "amdgpu_ras.h" | ||||
| @ -2116,6 +2117,30 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) | ||||
| 		adev->ras_hw_enabled & amdgpu_ras_mask; | ||||
| } | ||||
| 
 | ||||
/* amdgpu_ras_counte_dw -- delayed-work handler that refreshes the cached
 * RAS error counters.
 *
 * Scheduled from amdgpu_ctx_query2() so that the IOCTL path can return the
 * cached ras_ce_count/ras_ue_count in O(1) instead of querying the hardware
 * synchronously.  Reads the correctable (CE) and uncorrectable (UE) error
 * counts from the hardware and stores them in the amdgpu_ras context.
 */
static void amdgpu_ras_counte_dw(struct work_struct *work)
{
	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
					      ras_counte_delay_work.work);
	struct amdgpu_device *adev = con->adev;
	struct drm_device *dev = &adev->ddev;
	unsigned long ce_count, ue_count;
	int res;

	/* Wake the device; querying error counts touches hardware registers. */
	res = pm_runtime_get_sync(dev->dev);
	if (res < 0)
		goto Out;

	/* Cache new values.
	 */
	amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
	atomic_set(&con->ras_ce_count, ce_count);
	atomic_set(&con->ras_ue_count, ue_count);

	pm_runtime_mark_last_busy(dev->dev);
Out:
	/* NOTE(review): pm_runtime_get_sync() increments the usage count even
	 * on failure, so the put must run on the error path as well — which is
	 * why the Out label sits above it.
	 */
	pm_runtime_put_autosuspend(dev->dev);
}
| 
 | ||||
| int amdgpu_ras_init(struct amdgpu_device *adev) | ||||
| { | ||||
| 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | ||||
| @ -2130,6 +2155,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev) | ||||
| 	if (!con) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	con->adev = adev; | ||||
| 	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw); | ||||
| 	atomic_set(&con->ras_ce_count, 0); | ||||
| 	atomic_set(&con->ras_ue_count, 0); | ||||
| 
 | ||||
| 	con->objs = (struct ras_manager *)(con + 1); | ||||
| 
 | ||||
| 	amdgpu_ras_set_context(adev, con); | ||||
| @ -2233,6 +2263,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, | ||||
| 			 struct ras_fs_if *fs_info, | ||||
| 			 struct ras_ih_if *ih_info) | ||||
| { | ||||
| 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | ||||
| 	unsigned long ue_count, ce_count; | ||||
| 	int r; | ||||
| 
 | ||||
| 	/* disable RAS feature per IP block if it is not supported */ | ||||
| @ -2273,6 +2305,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, | ||||
| 	if (r) | ||||
| 		goto sysfs; | ||||
| 
 | ||||
| 	/* These are the cached values at init.
 | ||||
| 	 */ | ||||
| 	amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); | ||||
| 	atomic_set(&con->ras_ce_count, ce_count); | ||||
| 	atomic_set(&con->ras_ue_count, ue_count); | ||||
| 
 | ||||
| 	return 0; | ||||
| cleanup: | ||||
| 	amdgpu_ras_sysfs_remove(adev, ras_block); | ||||
| @ -2390,6 +2428,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) | ||||
| 	if (con->features) | ||||
| 		amdgpu_ras_disable_all_features(adev, 1); | ||||
| 
 | ||||
| 	cancel_delayed_work_sync(&con->ras_counte_delay_work); | ||||
| 
 | ||||
| 	amdgpu_ras_set_context(adev, NULL); | ||||
| 	kfree(con); | ||||
| 
 | ||||
|  | ||||
| @ -340,6 +340,11 @@ struct amdgpu_ras { | ||||
| 
 | ||||
| 	/* disable ras error count harvest in recovery */ | ||||
| 	bool disable_ras_err_cnt_harvest; | ||||
| 
 | ||||
| 	/* RAS count errors delayed work */ | ||||
| 	struct delayed_work ras_counte_delay_work; | ||||
| 	atomic_t ras_ue_count; | ||||
| 	atomic_t ras_ce_count; | ||||
| }; | ||||
| 
 | ||||
| struct ras_fs_data { | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user