linux/arch/powerpc/sysdev/xive/xive-internal.h
Cédric Le Goater 7dcc37b3ef powerpc/xive: Map one IPI interrupt per node
ipistorm [*] can be used to benchmark the raw interrupt rate of an
interrupt controller by measuring the number of IPIs a system can
sustain. When applied to the XIVE interrupt controller of POWER9 and
POWER10 systems, a significant drop of the interrupt rate can be
observed when crossing the second node boundary.

This is due to the fact that a single IPI interrupt is used for all
CPUs of the system. The structure is shared and the cache line updates
impact greatly the traffic between nodes and the overall IPI
performance.

As a workaround, the impact can be reduced by deactivating the IRQ
lockup detector ("noirqdebug") which does a lot of accounting in the
Linux IRQ descriptor structure and is responsible for most of the
performance penalty.

As a fix, this proposal allocates an IPI interrupt per node, to be
shared by all CPUs of that node. It solves the scaling issue, the IRQ
lockup detector still has an impact but the XIVE interrupt rate scales
linearly. It also improves the "noirqdebug" case as showed in the
tables below.

 * P9 DD2.2 - 2s * 64 threads

                                               "noirqdebug"
                        Mint/s                    Mint/s
 chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
 --------------------------------------------------------------
 1      0-15     4.984023   4.875405       4.996536   5.048892
        0-31    10.879164  10.544040      10.757632  11.037859
        0-47    15.345301  14.688764      14.926520  15.310053
        0-63    17.064907  17.066812      17.613416  17.874511
 2      0-79    11.768764  21.650749      22.689120  22.566508
        0-95    10.616812  26.878789      28.434703  28.320324
        0-111   10.151693  31.397803      31.771773  32.388122
        0-127    9.948502  33.139336      34.875716  35.224548

 * P10 DD1 - 4s (not homogeneous) 352 threads

                                               "noirqdebug"
                        Mint/s                    Mint/s
 chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
 --------------------------------------------------------------
 1      0-15     2.409402   2.364108       2.383303   2.395091
        0-31     6.028325   6.046075       6.089999   6.073750
        0-47     8.655178   8.644531       8.712830   8.724702
        0-63    11.629652  11.735953      12.088203  12.055979
        0-79    14.392321  14.729959      14.986701  14.973073
        0-95    12.604158  13.004034      17.528748  17.568095
 2      0-111    9.767753  13.719831      19.968606  20.024218
        0-127    6.744566  16.418854      22.898066  22.995110
        0-143    6.005699  19.174421      25.425622  25.417541
        0-159    5.649719  21.938836      27.952662  28.059603
        0-175    5.441410  24.109484      31.133915  31.127996
 3      0-191    5.318341  24.405322      33.999221  33.775354
        0-207    5.191382  26.449769      36.050161  35.867307
        0-223    5.102790  29.356943      39.544135  39.508169
        0-239    5.035295  31.933051      42.135075  42.071975
        0-255    4.969209  34.477367      44.655395  44.757074
 4      0-271    4.907652  35.887016      47.080545  47.318537
        0-287    4.839581  38.076137      50.464307  50.636219
        0-303    4.786031  40.881319      53.478684  53.310759
        0-319    4.743750  43.448424      56.388102  55.973969
        0-335    4.709936  45.623532      59.400930  58.926857
        0-351    4.681413  45.646151      62.035804  61.830057

[*] https://github.com/antonblanchard/ipistorm

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210331144514.892250-9-clg@kaod.org
2021-04-14 23:04:15 +10:00

76 lines
2.1 KiB
C

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright 2016,2017 IBM Corporation.
*/
#ifndef __XIVE_INTERNAL_H
#define __XIVE_INTERNAL_H
/*
* A "disabled" interrupt should never fire, to catch problems
* we set its logical number to this
*/
#define XIVE_BAD_IRQ 0x7fffffff
#define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1)
/* Each CPU carry one of these with various per-CPU state */
struct xive_cpu {
#ifdef CONFIG_SMP
/* HW irq number and data of IPI */
u32 hw_ipi;
struct xive_irq_data ipi_data;
#endif /* CONFIG_SMP */
int chip_id;
/* Queue datas. Only one is populated */
#define XIVE_MAX_QUEUES 8
struct xive_q queue[XIVE_MAX_QUEUES];
/*
* Pending mask. Each bit corresponds to a priority that
* potentially has pending interrupts.
*/
u8 pending_prio;
/* Cache of HW CPPR */
u8 cppr;
};
/* Backend ops */
struct xive_ops {
int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data);
int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
int (*get_irq_config)(u32 hw_irq, u32 *target, u8 *prio,
u32 *sw_irq);
int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc);
void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc);
bool (*match)(struct device_node *np);
void (*shutdown)(void);
void (*update_pending)(struct xive_cpu *xc);
void (*sync_source)(u32 hw_irq);
u64 (*esb_rw)(u32 hw_irq, u32 offset, u64 data, bool write);
#ifdef CONFIG_SMP
int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc);
void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc);
#endif
int (*debug_show)(struct seq_file *m, void *private);
const char *name;
};
bool xive_core_init(struct device_node *np, const struct xive_ops *ops,
void __iomem *area, u32 offset, u8 max_prio);
__be32 *xive_queue_page_alloc(unsigned int cpu, u32 queue_shift);
int xive_core_debug_init(void);
static inline u32 xive_alloc_order(u32 queue_shift)
{
return (queue_shift > PAGE_SHIFT) ? (queue_shift - PAGE_SHIFT) : 0;
}
extern bool xive_cmdline_disabled;
#endif /* __XIVE_INTERNAL_H */