linux/arch/x86/xen/spinlock.c
Jeremy Fitzhardinge 545ac13892 x86, spinlock: Replace pv spinlocks with pv ticketlocks
Rather than outright replacing the entire spinlock implementation in
order to paravirtualize it, keep the ticket lock implementation but add
a couple of pvops hooks on the slow patch (long spin on lock, unlocking
a contended lock).

Ticket locks have a number of nice properties, but they also have some
surprising behaviours in virtual environments.  They enforce a strict
FIFO ordering on cpus trying to take a lock; however, if the hypervisor
scheduler does not schedule the cpus in the correct order, the system can
waste a huge amount of time spinning until the next cpu can take the lock.

(See Thomas Friebel's talk "Prevent Guests from Spinning Around"
http://www.xen.org/files/xensummitboston08/LHP.pdf for more details.)

To address this, we add two hooks:
 - __ticket_spin_lock which is called after the cpu has been
   spinning on the lock for a significant number of iterations but has
   failed to take the lock (presumably because the cpu holding the lock
   has been descheduled).  The lock_spinning pvop is expected to block
   the cpu until it has been kicked by the current lock holder.
 - __ticket_spin_unlock, which on releasing a contended lock
   (there are more cpus with tail tickets), it looks to see if the next
   cpu is blocked and wakes it if so.

When compiled with CONFIG_PARAVIRT_SPINLOCKS disabled, a set of stub
functions causes all the extra code to go away.

Results:
=======
setup: 32 core machine with 32 vcpu KVM guest (HT off)  with 8GB RAM
base = 3.11-rc
patched = base + pvspinlock V12

+-----------------+----------------+--------+
 dbench (Throughput in MB/sec. Higher is better)
+-----------------+----------------+--------+
|   base (stdev %)|patched(stdev%) | %gain  |
+-----------------+----------------+--------+
| 15035.3   (0.3) |15150.0   (0.6) |   0.8  |
|  1470.0   (2.2) | 1713.7   (1.9) |  16.6  |
|   848.6   (4.3) |  967.8   (4.3) |  14.0  |
|   652.9   (3.5) |  685.3   (3.7) |   5.0  |
+-----------------+----------------+--------+

pvspinlock shows benefits for overcommit ratio > 1 for PLE enabled cases,
and undercommits results are flat

Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Link: http://lkml.kernel.org/r/1376058122-8248-2-git-send-email-raghavendra.kt@linux.vnet.ibm.com
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Attilio Rao <attilio.rao@citrix.com>
[ Raghavendra: Changed SPIN_THRESHOLD, fixed redefinition of arch_spinlock_t]
Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2013-08-09 07:53:05 -07:00

488 lines
12 KiB
C

/*
* Split spinlock implementation out into its own file, so it can be
* compiled in a FTRACE-compatible way.
*/
#include <linux/kernel_stat.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/log2.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <asm/paravirt.h>
#include <xen/interface/xen.h>
#include <xen/events.h>
#include "xen-ops.h"
#include "debugfs.h"
#ifdef CONFIG_XEN_DEBUG_FS
static struct xen_spinlock_stats
{
u64 taken;
u32 taken_slow;
u32 taken_slow_nested;
u32 taken_slow_pickup;
u32 taken_slow_spurious;
u32 taken_slow_irqenable;
u64 released;
u32 released_slow;
u32 released_slow_kicked;
#define HISTO_BUCKETS 30
u32 histo_spin_total[HISTO_BUCKETS+1];
u32 histo_spin_spinning[HISTO_BUCKETS+1];
u32 histo_spin_blocked[HISTO_BUCKETS+1];
u64 time_total;
u64 time_spinning;
u64 time_blocked;
} spinlock_stats;
static u8 zero_stats;
static unsigned lock_timeout = 1 << 10;
#define TIMEOUT lock_timeout
static inline void check_zero(void)
{
if (unlikely(zero_stats)) {
memset(&spinlock_stats, 0, sizeof(spinlock_stats));
zero_stats = 0;
}
}
#define ADD_STATS(elem, val) \
do { check_zero(); spinlock_stats.elem += (val); } while(0)
static inline u64 spin_time_start(void)
{
return xen_clocksource_read();
}
static void __spin_time_accum(u64 delta, u32 *array)
{
unsigned index = ilog2(delta);
check_zero();
if (index < HISTO_BUCKETS)
array[index]++;
else
array[HISTO_BUCKETS]++;
}
static inline void spin_time_accum_spinning(u64 start)
{
u32 delta = xen_clocksource_read() - start;
__spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
spinlock_stats.time_spinning += delta;
}
static inline void spin_time_accum_total(u64 start)
{
u32 delta = xen_clocksource_read() - start;
__spin_time_accum(delta, spinlock_stats.histo_spin_total);
spinlock_stats.time_total += delta;
}
static inline void spin_time_accum_blocked(u64 start)
{
u32 delta = xen_clocksource_read() - start;
__spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
spinlock_stats.time_blocked += delta;
}
#else /* !CONFIG_XEN_DEBUG_FS */
#define TIMEOUT (1 << 10)
#define ADD_STATS(elem, val) do { (void)(val); } while(0)
static inline u64 spin_time_start(void)
{
return 0;
}
static inline void spin_time_accum_total(u64 start)
{
}
static inline void spin_time_accum_spinning(u64 start)
{
}
static inline void spin_time_accum_blocked(u64 start)
{
}
#endif /* CONFIG_XEN_DEBUG_FS */
/*
* Size struct xen_spinlock so it's the same as arch_spinlock_t.
*/
#if NR_CPUS < 256
typedef u8 xen_spinners_t;
# define inc_spinners(xl) \
asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory");
# define dec_spinners(xl) \
asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory");
#else
typedef u16 xen_spinners_t;
# define inc_spinners(xl) \
asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory");
# define dec_spinners(xl) \
asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
#endif
struct xen_spinlock {
unsigned char lock; /* 0 -> free; 1 -> locked */
xen_spinners_t spinners; /* count of waiting cpus */
};
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
#if 0
static int xen_spin_is_locked(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
return xl->lock != 0;
}
static int xen_spin_is_contended(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
/* Not strictly true; this is only the count of contended
lock-takers entering the slow path. */
return xl->spinners != 0;
}
static int xen_spin_trylock(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
u8 old = 1;
asm("xchgb %b0,%1"
: "+q" (old), "+m" (xl->lock) : : "memory");
return old == 0;
}
static DEFINE_PER_CPU(char *, irq_name);
static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
/*
* Mark a cpu as interested in a lock. Returns the CPU's previous
* lock of interest, in case we got preempted by an interrupt.
*/
static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
{
struct xen_spinlock *prev;
prev = __this_cpu_read(lock_spinners);
__this_cpu_write(lock_spinners, xl);
wmb(); /* set lock of interest before count */
inc_spinners(xl);
return prev;
}
/*
* Mark a cpu as no longer interested in a lock. Restores previous
* lock of interest (NULL for none).
*/
static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
{
dec_spinners(xl);
wmb(); /* decrement count before restoring lock */
__this_cpu_write(lock_spinners, prev);
}
static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
struct xen_spinlock *prev;
int irq = __this_cpu_read(lock_kicker_irq);
int ret;
u64 start;
/* If kicker interrupts not initialized yet, just spin */
if (irq == -1)
return 0;
start = spin_time_start();
/* announce we're spinning */
prev = spinning_lock(xl);
ADD_STATS(taken_slow, 1);
ADD_STATS(taken_slow_nested, prev != NULL);
do {
unsigned long flags;
/* clear pending */
xen_clear_irq_pending(irq);
/* check again make sure it didn't become free while
we weren't looking */
ret = xen_spin_trylock(lock);
if (ret) {
ADD_STATS(taken_slow_pickup, 1);
/*
* If we interrupted another spinlock while it
* was blocking, make sure it doesn't block
* without rechecking the lock.
*/
if (prev != NULL)
xen_set_irq_pending(irq);
goto out;
}
flags = arch_local_save_flags();
if (irq_enable) {
ADD_STATS(taken_slow_irqenable, 1);
raw_local_irq_enable();
}
/*
* Block until irq becomes pending. If we're
* interrupted at this point (after the trylock but
* before entering the block), then the nested lock
* handler guarantees that the irq will be left
* pending if there's any chance the lock became free;
* xen_poll_irq() returns immediately if the irq is
* pending.
*/
xen_poll_irq(irq);
raw_local_irq_restore(flags);
ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
out:
unspinning_lock(xl, prev);
spin_time_accum_blocked(start);
return ret;
}
static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
unsigned timeout;
u8 oldval;
u64 start_spin;
ADD_STATS(taken, 1);
start_spin = spin_time_start();
do {
u64 start_spin_fast = spin_time_start();
timeout = TIMEOUT;
asm("1: xchgb %1,%0\n"
" testb %1,%1\n"
" jz 3f\n"
"2: rep;nop\n"
" cmpb $0,%0\n"
" je 1b\n"
" dec %2\n"
" jnz 2b\n"
"3:\n"
: "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
: "1" (1)
: "memory");
spin_time_accum_spinning(start_spin_fast);
} while (unlikely(oldval != 0 &&
(TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
spin_time_accum_total(start_spin);
}
static void xen_spin_lock(struct arch_spinlock *lock)
{
__xen_spin_lock(lock, false);
}
static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
{
__xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
}
static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
{
int cpu;
ADD_STATS(released_slow, 1);
for_each_online_cpu(cpu) {
/* XXX should mix up next cpu selection */
if (per_cpu(lock_spinners, cpu) == xl) {
ADD_STATS(released_slow_kicked, 1);
xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
}
}
}
static void xen_spin_unlock(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
ADD_STATS(released, 1);
smp_wmb(); /* make sure no writes get moved after unlock */
xl->lock = 0; /* release lock */
/*
* Make sure unlock happens before checking for waiting
* spinners. We need a strong barrier to enforce the
* write-read ordering to different memory locations, as the
* CPU makes no implied guarantees about their ordering.
*/
mb();
if (unlikely(xl->spinners))
xen_spin_unlock_slow(xl);
}
#endif
static irqreturn_t dummy_handler(int irq, void *dev_id)
{
BUG();
return IRQ_HANDLED;
}
void xen_init_lock_cpu(int cpu)
{
int irq;
char *name;
WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
cpu, per_cpu(lock_kicker_irq, cpu));
/*
* See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
* (xen: disable PV spinlocks on HVM)
*/
if (xen_hvm_domain())
return;
name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
cpu,
dummy_handler,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
name,
NULL);
if (irq >= 0) {
disable_irq(irq); /* make sure it's never delivered */
per_cpu(lock_kicker_irq, cpu) = irq;
per_cpu(irq_name, cpu) = name;
}
printk("cpu %d spinlock event irq %d\n", cpu, irq);
}
void xen_uninit_lock_cpu(int cpu)
{
/*
* See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
* (xen: disable PV spinlocks on HVM)
*/
if (xen_hvm_domain())
return;
unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
per_cpu(lock_kicker_irq, cpu) = -1;
kfree(per_cpu(irq_name, cpu));
per_cpu(irq_name, cpu) = NULL;
}
void __init xen_init_spinlocks(void)
{
/*
* See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
* (xen: disable PV spinlocks on HVM)
*/
if (xen_hvm_domain())
return;
BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t));
#if 0
pv_lock_ops.spin_is_locked = xen_spin_is_locked;
pv_lock_ops.spin_is_contended = xen_spin_is_contended;
pv_lock_ops.spin_lock = xen_spin_lock;
pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
pv_lock_ops.spin_trylock = xen_spin_trylock;
pv_lock_ops.spin_unlock = xen_spin_unlock;
#endif
}
#ifdef CONFIG_XEN_DEBUG_FS
static struct dentry *d_spin_debug;
static int __init xen_spinlock_debugfs(void)
{
struct dentry *d_xen = xen_init_debugfs();
if (d_xen == NULL)
return -ENOMEM;
d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
debugfs_create_u32("taken_slow", 0444, d_spin_debug,
&spinlock_stats.taken_slow);
debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
&spinlock_stats.taken_slow_nested);
debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
&spinlock_stats.taken_slow_pickup);
debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
&spinlock_stats.taken_slow_spurious);
debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
&spinlock_stats.taken_slow_irqenable);
debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
debugfs_create_u32("released_slow", 0444, d_spin_debug,
&spinlock_stats.released_slow);
debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
&spinlock_stats.released_slow_kicked);
debugfs_create_u64("time_spinning", 0444, d_spin_debug,
&spinlock_stats.time_spinning);
debugfs_create_u64("time_blocked", 0444, d_spin_debug,
&spinlock_stats.time_blocked);
debugfs_create_u64("time_total", 0444, d_spin_debug,
&spinlock_stats.time_total);
debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
return 0;
}
fs_initcall(xen_spinlock_debugfs);
#endif /* CONFIG_XEN_DEBUG_FS */