ipistorm [*] can be used to benchmark the raw interrupt rate of an
interrupt controller by measuring the number of IPIs a system can
sustain. When applied to the XIVE interrupt controller of POWER9 and
POWER10 systems, a significant drop in the interrupt rate can be
observed when crossing the second node boundary.

This is because a single IPI interrupt is used for all CPUs of the
system. The structure is shared, and the cache line updates greatly
impact the traffic between nodes and the overall IPI performance.

As a workaround, the impact can be reduced by deactivating the IRQ
lockup detector ("noirqdebug"), which does a lot of accounting in the
Linux IRQ descriptor structure and is responsible for most of the
performance penalty.

As a fix, this proposal allocates an IPI interrupt per node, to be
shared by all CPUs of that node. It solves the scaling issue: the IRQ
lockup detector still has an impact, but the XIVE interrupt rate now
scales linearly. It also improves the "noirqdebug" case, as shown in
the tables below.

 * P9 DD2.2 - 2s * 64 threads

                                             "noirqdebug"
                       Mint/s                   Mint/s
 chips  cpus      IPI/sys   IPI/chip      IPI/chip    IPI/sys
 --------------------------------------------------------------
 1      0-15     4.984023   4.875405      4.996536   5.048892
        0-31    10.879164  10.544040     10.757632  11.037859
        0-47    15.345301  14.688764     14.926520  15.310053
        0-63    17.064907  17.066812     17.613416  17.874511
 2      0-79    11.768764  21.650749     22.689120  22.566508
        0-95    10.616812  26.878789     28.434703  28.320324
        0-111   10.151693  31.397803     31.771773  32.388122
        0-127    9.948502  33.139336     34.875716  35.224548

 * P10 DD1 - 4s (not homogeneous) 352 threads

                                             "noirqdebug"
                       Mint/s                   Mint/s
 chips  cpus      IPI/sys   IPI/chip      IPI/chip    IPI/sys
 --------------------------------------------------------------
 1      0-15     2.409402   2.364108      2.383303   2.395091
        0-31     6.028325   6.046075      6.089999   6.073750
        0-47     8.655178   8.644531      8.712830   8.724702
        0-63    11.629652  11.735953     12.088203  12.055979
        0-79    14.392321  14.729959     14.986701  14.973073
        0-95    12.604158  13.004034     17.528748  17.568095
 2      0-111    9.767753  13.719831     19.968606  20.024218
        0-127    6.744566  16.418854     22.898066  22.995110
        0-143    6.005699  19.174421     25.425622  25.417541
        0-159    5.649719  21.938836     27.952662  28.059603
        0-175    5.441410  24.109484     31.133915  31.127996
 3      0-191    5.318341  24.405322     33.999221  33.775354
        0-207    5.191382  26.449769     36.050161  35.867307
        0-223    5.102790  29.356943     39.544135  39.508169
        0-239    5.035295  31.933051     42.135075  42.071975
        0-255    4.969209  34.477367     44.655395  44.757074
 4      0-271    4.907652  35.887016     47.080545  47.318537
        0-287    4.839581  38.076137     50.464307  50.636219
        0-303    4.786031  40.881319     53.478684  53.310759
        0-319    4.743750  43.448424     56.388102  55.973969
        0-335    4.709936  45.623532     59.400930  58.926857
        0-351    4.681413  45.646151     62.035804  61.830057

[*] https://github.com/antonblanchard/ipistorm

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210331144514.892250-9-clg@kaod.org
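The core of the change can be sketched as below. This is an illustrative
outline only, not the patch itself; the identifiers (xive_ipi_desc,
xive_ipis, xive_ipi_cpu_to_irq) are placeholders used to show the idea of
one IPI descriptor per node, selected through the CPU's node:

    /* One IPI descriptor per node instead of a single system-wide one */
    struct xive_ipi_desc {
            unsigned int irq;       /* Linux IRQ number of the node's IPI */
            char name[16];          /* "IPI-0", "IPI-1", ... */
    };

    static struct xive_ipi_desc *xive_ipis;     /* indexed by node id */

    /* All CPUs of a node share the same IPI interrupt */
    static unsigned int xive_ipi_cpu_to_irq(unsigned int cpu)
    {
            return xive_ipis[cpu_to_node(cpu)].irq;
    }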
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright 2016,2017 IBM Corporation.
 */
#ifndef __XIVE_INTERNAL_H
#define __XIVE_INTERNAL_H

/*
 * A "disabled" interrupt should never fire; to catch problems
 * we set its logical number to this
 */
#define XIVE_BAD_IRQ 0x7fffffff
#define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1)

/* Each CPU carries one of these with various per-CPU state */
struct xive_cpu {
#ifdef CONFIG_SMP
        /* HW irq number and data of IPI */
        u32 hw_ipi;
        struct xive_irq_data ipi_data;
#endif /* CONFIG_SMP */

        int chip_id;

        /* Queue data. Only one is populated */
#define XIVE_MAX_QUEUES 8
        struct xive_q queue[XIVE_MAX_QUEUES];

        /*
         * Pending mask. Each bit corresponds to a priority that
         * potentially has pending interrupts.
         */
        u8 pending_prio;

        /* Cache of HW CPPR */
        u8 cppr;
};
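
/*
 * Illustrative sketch, not part of this header: how the pending_prio
 * mask above is typically consumed. The helper name is hypothetical;
 * it only shows the intended use of the bitmap, assuming (as on XIVE)
 * that bit N set means priority N may have pending interrupts and that
 * lower priority numbers are more favoured. ffs() is the usual kernel
 * find-first-set helper from <linux/bitops.h>.
 */
static inline int xive_example_next_pending_prio(struct xive_cpu *xc)
{
        if (!xc->pending_prio)
                return -1;                       /* nothing pending */
        return ffs(xc->pending_prio) - 1;        /* lowest set bit = most favoured prio */
}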

/* Backend ops */
struct xive_ops {
        int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data);
        int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
        int (*get_irq_config)(u32 hw_irq, u32 *target, u8 *prio,
                              u32 *sw_irq);
        int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
        void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
        void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc);
        void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc);
        bool (*match)(struct device_node *np);
        void (*shutdown)(void);

        void (*update_pending)(struct xive_cpu *xc);
        void (*sync_source)(u32 hw_irq);
        u64 (*esb_rw)(u32 hw_irq, u32 offset, u64 data, bool write);
#ifdef CONFIG_SMP
        int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc);
        void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc);
#endif
        int (*debug_show)(struct seq_file *m, void *private);
        const char *name;
};

bool xive_core_init(struct device_node *np, const struct xive_ops *ops,
                    void __iomem *area, u32 offset, u8 max_prio);
__be32 *xive_queue_page_alloc(unsigned int cpu, u32 queue_shift);
int xive_core_debug_init(void);

static inline u32 xive_alloc_order(u32 queue_shift)
{
        return (queue_shift > PAGE_SHIFT) ? (queue_shift - PAGE_SHIFT) : 0;
}
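
/*
 * Worked example (editorial note, not in the original header): a queue
 * of 2^queue_shift bytes is backed by 2^order contiguous pages. With
 * 64K pages (PAGE_SHIFT == 16), a 64K queue gives order 0 and a 1M
 * queue (queue_shift == 20) gives order 4; xive_queue_page_alloc()
 * above is the helper that performs the allocation for a given CPU.
 */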

extern bool xive_cmdline_disabled;

#endif /* __XIVE_INTERNAL_H */
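
/*
 * Illustrative backend skeleton, not part of this header: a sketch of
 * how a backend would provide a struct xive_ops instance and hand it
 * to xive_core_init(). All my_* names and the compatible string are
 * hypothetical; the real backends live in native.c and spapr.c.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/of.h>
#include <asm/xive.h>           /* struct xive_irq_data */

#include "xive-internal.h"

static int my_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
{
        /* Fill in the ESB and trigger page information for hw_irq (stub) */
        return -ENODEV;
}

static int my_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
{
        /* Route hw_irq to CPU @target at priority @prio (stub) */
        return -ENODEV;
}

static int my_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
{
        /* Allocate and enable the event queue for @prio on @cpu (stub) */
        return -ENODEV;
}

static void my_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
{
        /* Disable and free the event queue (stub) */
}

static bool my_match(struct device_node *np)
{
        /* Probe from the device tree; the compatible string is made up */
        return of_device_is_compatible(np, "vendor,example-xive");
}

static const struct xive_ops my_xive_ops = {
        .populate_irq_data      = my_populate_irq_data,
        .configure_irq          = my_configure_irq,
        .setup_queue            = my_setup_queue,
        .cleanup_queue          = my_cleanup_queue,
        .match                  = my_match,
        .name                   = "example-xive",
};

static bool __init my_xive_init(struct device_node *np, void __iomem *tima,
                                u32 tima_offset, u8 max_prio)
{
        /* Register the ops and the TIMA mapping with the common core */
        return xive_core_init(np, &my_xive_ops, tima, tima_offset, max_prio);
}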