x86/apic: Reduce cache line misses in __x2apic_send_IPI_mask()
Using per-cpu storage for @x86_cpu_to_logical_apicid is not optimal. A broadcast IPI needs at least one cache line per CPU just to access this field. __x2apic_send_IPI_mask() is using standard bitmask operators. By converting x86_cpu_to_logical_apicid to an array, we divide the number of needed cache lines by 16, because 16 values fit in each cache line, and the CPU prefetcher can kick in nicely. Also move @cluster_masks to the READ_MOSTLY section to avoid false sharing. Tested on a dual-socket host with 256 CPUs: the cost of a full broadcast is now 11 usec instead of 33 usec. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lkml.kernel.org/r/20211007143556.574911-1-eric.dumazet@gmail.com
This commit is contained in:
committed by
Peter Zijlstra
parent
3906fe9bb7
commit
cc95a07fef
@@ -15,9 +15,15 @@ struct cluster_mask {
|
|||||||
struct cpumask mask;
|
struct cpumask mask;
|
||||||
};
|
};
|
||||||
|
|
||||||
static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
|
/*
|
||||||
|
* __x2apic_send_IPI_mask() possibly needs to read
|
||||||
|
* x86_cpu_to_logical_apicid for all online cpus in a sequential way.
|
||||||
|
* Using per cpu variable would cost one cache line per cpu.
|
||||||
|
*/
|
||||||
|
static u32 *x86_cpu_to_logical_apicid __read_mostly;
|
||||||
|
|
||||||
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
|
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
|
||||||
static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
|
static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
|
||||||
static struct cluster_mask *cluster_hotplug_mask;
|
static struct cluster_mask *cluster_hotplug_mask;
|
||||||
|
|
||||||
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
|
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
|
||||||
@@ -27,7 +33,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
|
|||||||
|
|
||||||
static void x2apic_send_IPI(int cpu, int vector)
|
static void x2apic_send_IPI(int cpu, int vector)
|
||||||
{
|
{
|
||||||
u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
|
u32 dest = x86_cpu_to_logical_apicid[cpu];
|
||||||
|
|
||||||
/* x2apic MSRs are special and need a special fence: */
|
/* x2apic MSRs are special and need a special fence: */
|
||||||
weak_wrmsr_fence();
|
weak_wrmsr_fence();
|
||||||
@@ -58,7 +64,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
|
|||||||
|
|
||||||
dest = 0;
|
dest = 0;
|
||||||
for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
|
for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
|
||||||
dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
|
dest |= x86_cpu_to_logical_apicid[clustercpu];
|
||||||
|
|
||||||
if (!dest)
|
if (!dest)
|
||||||
continue;
|
continue;
|
||||||
@@ -94,7 +100,7 @@ static void x2apic_send_IPI_all(int vector)
|
|||||||
|
|
||||||
static u32 x2apic_calc_apicid(unsigned int cpu)
|
static u32 x2apic_calc_apicid(unsigned int cpu)
|
||||||
{
|
{
|
||||||
return per_cpu(x86_cpu_to_logical_apicid, cpu);
|
return x86_cpu_to_logical_apicid[cpu];
|
||||||
}
|
}
|
||||||
|
|
||||||
static void init_x2apic_ldr(void)
|
static void init_x2apic_ldr(void)
|
||||||
@@ -103,7 +109,7 @@ static void init_x2apic_ldr(void)
|
|||||||
u32 cluster, apicid = apic_read(APIC_LDR);
|
u32 cluster, apicid = apic_read(APIC_LDR);
|
||||||
unsigned int cpu;
|
unsigned int cpu;
|
||||||
|
|
||||||
this_cpu_write(x86_cpu_to_logical_apicid, apicid);
|
x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;
|
||||||
|
|
||||||
if (cmsk)
|
if (cmsk)
|
||||||
goto update;
|
goto update;
|
||||||
@@ -166,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu)
|
|||||||
|
|
||||||
static int x2apic_cluster_probe(void)
|
static int x2apic_cluster_probe(void)
|
||||||
{
|
{
|
||||||
|
u32 slots;
|
||||||
|
|
||||||
if (!x2apic_mode)
|
if (!x2apic_mode)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids);
|
||||||
|
x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL);
|
||||||
|
if (!x86_cpu_to_logical_apicid)
|
||||||
|
return 0;
|
||||||
|
|
||||||
if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
|
if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
|
||||||
x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
|
x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
|
||||||
pr_err("Failed to register X2APIC_PREPARE\n");
|
pr_err("Failed to register X2APIC_PREPARE\n");
|
||||||
|
kfree(x86_cpu_to_logical_apicid);
|
||||||
|
x86_cpu_to_logical_apicid = NULL;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
init_x2apic_ldr();
|
init_x2apic_ldr();
|
||||||
|
|||||||
Reference in New Issue
Block a user