From a9fe4ac7a3a25f0242406db7aa237850692fb29c Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 3 May 2024 12:46:21 +0530 Subject: [PATCH] perf vendor events amd: Add Zen 5 metrics Add metrics taken from Section 1.2 "Performance Measurement" of the Performance Monitor Counters for AMD Family 1Ah Model 00h-0Fh Processors document available at the link below. The recommended metrics are sourced from Table 1 "Guidance for Common Performance Statistics with Complex Event Selects". The pipeline utilization metrics are sourced from Table 2 "Guidance for Pipeline Utilization Analysis Statistics". These are useful for finding performance bottlenecks by analyzing activity at different stages of the pipeline. There are metric groups available for Level 1 and Level 2 analysis. Reviewed-by: Ian Rogers Signed-off-by: Sandipan Das Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Link: https://bugzilla.kernel.org/attachment.cgi?id=305974 Link: https://lore.kernel.org/r/ee21ff77d89efa99997d3c2ebeeae22ddb6e7e12.1714717230.git.sandipan.das@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/amdzen5/pipeline.json | 99 +++++ .../arch/x86/amdzen5/recommended.json | 345 ++++++++++++++++++ 2 files changed, 444 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/recommended.json diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json b/tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json new file mode 100644 index 000000000000..d860bf599cf2 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json @@ -0,0 +1,99 @@ +[ + { + "MetricName": "total_dispatch_slots", + "BriefDescription": "Total dispatch slots (up to 8 instructions can be dispatched in each cycle).", + "MetricExpr": "8 * ls_not_halted_cyc", + "ScaleUnit": "1slots" + }, + { + "MetricName": "frontend_bound", + "BriefDescription": "Percentage of dispatch slots that remained unused because the frontend did not supply enough instructions/ops.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "bad_speculation", + "BriefDescription": "Percentage of dispatched ops that did not retire.", + "MetricExpr": "d_ratio(de_src_op_disp.all - ex_ret_ops, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "backend_bound", + "BriefDescription": "Percentage of dispatch slots that remained unused because of backend stalls.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.backend_stalls, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "smt_contention", + "BriefDescription": "Percentage of dispatch slots that remained unused because the other thread was selected.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.smt_contention, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring", + "BriefDescription": "Percentage of dispatch slots used by ops that retired.", + "MetricExpr": "d_ratio(ex_ret_ops, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "frontend_bound_by_latency", + "BriefDescription": "Percentage of dispatch slots that remained unused because of a latency bottleneck in the frontend (such as instruction cache or TLB misses).", + "MetricExpr": "d_ratio((8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)", + "MetricGroup": "PipelineL2;frontend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "frontend_bound_by_bandwidth", + "BriefDescription": "Percentage of dispatch slots that remained unused because of a bandwidth bottleneck in the frontend (such as decode or op cache fetch bandwidth).", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend - (8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)", + "MetricGroup": "PipelineL2;frontend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "bad_speculation_from_mispredicts", + "BriefDescription": "Percentage of dispatched ops that were flushed due to branch mispredicts.", + "MetricExpr": "d_ratio(bad_speculation * ex_ret_brn_misp, ex_ret_brn_misp + bp_redirects.resync)", + "MetricGroup": "PipelineL2;bad_speculation_group", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "bad_speculation_from_pipeline_restarts", + "BriefDescription": "Percentage of dispatched ops that were flushed due to pipeline restarts (resyncs).", + "MetricExpr": "d_ratio(bad_speculation * bp_redirects.resync, ex_ret_brn_misp + bp_redirects.resync)", + "MetricGroup": "PipelineL2;bad_speculation_group", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "backend_bound_by_memory", + "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls due to the memory subsystem.", + "MetricExpr": "backend_bound * d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete)", + "MetricGroup": "PipelineL2;backend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "backend_bound_by_cpu", + "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls not related to the memory subsystem.", + "MetricExpr": "backend_bound * (1 - d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete))", + "MetricGroup": "PipelineL2;backend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring_from_fastpath", + "BriefDescription": "Percentage of dispatch slots used by fastpath ops that retired.", + "MetricExpr": "retiring * (1 - d_ratio(ex_ret_ucode_ops, ex_ret_ops))", + "MetricGroup": "PipelineL2;retiring_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring_from_microcode", + "BriefDescription": "Percentage of dispatch slots used by microcode ops that retired.", + "MetricExpr": "retiring * d_ratio(ex_ret_ucode_ops, ex_ret_ops)", + "MetricGroup": "PipelineL2;retiring_group", + "ScaleUnit": "100%slots" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen5/recommended.json new file mode 100644 index 000000000000..c97874039c1e --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/recommended.json @@ -0,0 +1,345 @@ +[ + { + "MetricName": "branch_misprediction_rate", + "BriefDescription": "Execution-time branch misprediction rate (non-speculative).", + "MetricExpr": "d_ratio(ex_ret_brn_misp, ex_ret_brn)", + "MetricGroup": "branch_prediction", + "ScaleUnit": "1per_branch" + }, + { + "MetricName": "all_data_cache_accesses_pti", + "BriefDescription": "All data cache accesses per thousand instructions.", + "MetricExpr": "ls_dispatch.all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_accesses_pti", + "BriefDescription": "All L2 cache accesses per thousand instructions.", + "MetricExpr": "(l2_request_g1.all_no_prefetch + l2_pf_hit_l2.l2_hwpf + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l1_ic_misses_pti", + "BriefDescription": "L2 cache accesses from L1 instruction cache misses (including prefetch) per thousand instructions.", + "MetricExpr": "l2_request_g1.cacheable_ic_read / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l1_dc_misses_pti", + "BriefDescription": "L2 cache accesses from L1 data cache misses (including prefetch) per thousand instructions.", + "MetricExpr": "l2_request_g1.all_dc / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l2_hwpf_pti", + "BriefDescription": "L2 cache accesses from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "(l2_pf_hit_l2.l1_dc_l2_hwpf + l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_misses_pti", + "BriefDescription": "All L2 cache misses per thousand instructions.", + "MetricExpr": "(l2_cache_req_stat.ic_dc_miss_in_l2 + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l1_ic_miss_pti", + "BriefDescription": "L2 cache misses from L1 instruction cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ic_fill_miss / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l1_dc_miss_pti", + "BriefDescription": "L2 cache misses from L1 data cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ls_rd_blk_c / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l2_hwpf_pti", + "BriefDescription": "L2 cache misses from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "(l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_hits_pti", + "BriefDescription": "All L2 cache hits per thousand instructions.", + "MetricExpr": "(l2_cache_req_stat.ic_dc_hit_in_l2 + l2_pf_hit_l2.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_hits_from_l1_ic_miss_pti", + "BriefDescription": "L2 cache hits from L1 instruction cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ic_hit_in_l2 / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_hits_from_l1_dc_miss_pti", + "BriefDescription": "L2 cache hits from L1 data cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.dc_hit_in_l2 / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_hits_from_l2_hwpf_pti", + "BriefDescription": "L2 cache hits from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "l2_pf_hit_l2.l1_dc_l2_hwpf / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l3_cache_accesses", + "BriefDescription": "L3 cache accesses.", + "MetricExpr": "l3_lookup_state.all_coherent_accesses_to_l3", + "MetricGroup": "l3_cache" + }, + { + "MetricName": "l3_misses", + "BriefDescription": "L3 misses (including cacheline state change requests).", + "MetricExpr": "l3_lookup_state.l3_miss", + "MetricGroup": "l3_cache" + }, + { + "MetricName": "l3_read_miss_latency", + "BriefDescription": "Average L3 read miss latency (in core clocks).", + "MetricExpr": "(l3_xi_sampled_latency.all * 10) / l3_xi_sampled_latency_requests.all", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "l3_read_miss_latency_for_local_dram", + "BriefDescription": "Average L3 read miss latency (in core clocks) for local DRAM.", + "MetricExpr": "(l3_xi_sampled_latency.dram_near * 10) / l3_xi_sampled_latency_requests.dram_near", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "l3_read_miss_latency_for_remote_dram", + "BriefDescription": "Average L3 read miss latency (in core clocks) for remote DRAM.", + "MetricExpr": "(l3_xi_sampled_latency.dram_far * 10) / l3_xi_sampled_latency_requests.dram_far", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "op_cache_fetch_miss_ratio", + "BriefDescription": "Op cache miss ratio for all fetches.", + "MetricExpr": "d_ratio(op_cache_hit_miss.op_cache_miss, op_cache_hit_miss.all_op_cache_accesses)", + "ScaleUnit": "100%" + }, + { + "MetricName": "ic_fetch_miss_ratio", + "BriefDescription": "Instruction cache miss ratio for all fetches. An instruction cache miss will not be counted by this metric if it is an OC hit.", + "MetricExpr": "d_ratio(ic_tag_hit_miss.instruction_cache_miss, ic_tag_hit_miss.all_instruction_cache_accesses)", + "ScaleUnit": "100%" + }, + { + "MetricName": "l1_data_cache_fills_from_memory_pti", + "BriefDescription": "L1 data cache fills from DRAM or MMIO in any NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.dram_io_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_remote_node_pti", + "BriefDescription": "L1 data cache fills from a different NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.far_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_same_ccx_pti", + "BriefDescription": "L1 data cache fills from within the same CCX per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.local_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_different_ccx_pti", + "BriefDescription": "L1 data cache fills from another CCX cache in any NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.remote_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l1_data_cache_fills_pti", + "BriefDescription": "All L1 data cache fills per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_local_l2_pti", + "BriefDescription": "L1 demand data cache fills from local L2 cache per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.local_l2 / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_same_ccx_pti", + "BriefDescription": "L1 demand data cache fills from within the same CCX per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.local_ccx / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_near_cache_pti", + "BriefDescription": "L1 demand data cache fills from another CCX cache in the same NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.near_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_near_memory_pti", + "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in the same NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_near / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_far_cache_pti", + "BriefDescription": "L1 demand data cache fills from another CCX cache in a different NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.far_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_far_memory_pti", + "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in a different NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_far / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_itlb_misses_pti", + "BriefDescription": "L1 instruction TLB misses per thousand instructions.", + "MetricExpr": "(bp_l1_tlb_miss_l2_tlb_hit + bp_l1_tlb_miss_l2_tlb_miss.all) / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_itlb_misses_pti", + "BriefDescription": "L2 instruction TLB misses and instruction page walks per thousand instructions.", + "MetricExpr": "bp_l1_tlb_miss_l2_tlb_miss.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_dtlb_misses_pti", + "BriefDescription": "L1 data TLB misses per thousand instructions.", + "MetricExpr": "ls_l1_d_tlb_miss.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_dtlb_misses_pti", + "BriefDescription": "L2 data TLB misses and data page walks per thousand instructions.", + "MetricExpr": "ls_l1_d_tlb_miss.all_l2_miss / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_tlbs_flushed_pti", + "BriefDescription": "All TLBs flushed per thousand instructions.", + "MetricExpr": "ls_tlb_flush.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "macro_ops_dispatched", + "BriefDescription": "Macro-ops dispatched.", + "MetricExpr": "de_src_op_disp.all", + "MetricGroup": "decoder" + }, + { + "MetricName": "sse_avx_stalls", + "BriefDescription": "Mixed SSE/AVX stalls.", + "MetricExpr": "fp_disp_faults.sse_avx_all" + }, + { + "MetricName": "macro_ops_retired", + "BriefDescription": "Macro-ops retired.", + "MetricExpr": "ex_ret_ops" + }, + { + "MetricName": "umc_data_bus_utilization", + "BriefDescription": "Memory controller data bus utilization.", + "MetricExpr": "d_ratio(umc_data_slot_clks.all / 2, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_cas_cmd_rate", + "BriefDescription": "Memory controller CAS command rate.", + "MetricExpr": "d_ratio(umc_cas_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + }, + { + "MetricName": "umc_cas_cmd_read_ratio", + "BriefDescription": "Ratio of memory controller CAS commands for reads.", + "MetricExpr": "d_ratio(umc_cas_cmd.rd, umc_cas_cmd.all)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_cas_cmd_write_ratio", + "BriefDescription": "Ratio of memory controller CAS commands for writes.", + "MetricExpr": "d_ratio(umc_cas_cmd.wr, umc_cas_cmd.all)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_mem_read_bandwidth", + "BriefDescription": "Estimated memory read bandwidth.", + "MetricExpr": "(umc_cas_cmd.rd * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_mem_write_bandwidth", + "BriefDescription": "Estimated memory write bandwidth.", + "MetricExpr": "(umc_cas_cmd.wr * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_mem_bandwidth", + "BriefDescription": "Estimated combined memory bandwidth.", + "MetricExpr": "(umc_cas_cmd.all * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_activate_cmd_rate", + "BriefDescription": "Memory controller ACTIVATE command rate.", + "MetricExpr": "d_ratio(umc_act_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + }, + { + "MetricName": "umc_precharge_cmd_rate", + "BriefDescription": "Memory controller PRECHARGE command rate.", + "MetricExpr": "d_ratio(umc_pchg_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + } +]