Merge branch 'bcache-for-upstream' of http://evilpiepirate.org/git/linux-bcache into for-3.10/drivers

2024-12-19 09:32:32 +00:00 · 2013-03-24 21:42:45 -06:00 · 2013-03-24 21:42:45 -06:00 · e226e34165
commit e226e34165
parent 5bbcf5e6ab cafe563591
41 changed files with 16503 additions and 0 deletions
--- a/Documentation/ABI/testing/sysfs-block-bcache
+++ b/Documentation/ABI/testing/sysfs-block-bcache
@ -0,0 +1,156 @@
 What:		/sys/block/<disk>/bcache/unregister
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		A write to this file causes the backing device or cache to be
 		unregistered. If a backing device had dirty data in the cache,
 		writeback mode is automatically disabled and all dirty data is
 		flushed before the device is unregistered. Caches unregister
 		all associated backing devices before unregistering themselves.
 What:		/sys/block/<disk>/bcache/clear_stats
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		Writing to this file resets all the statistics for the device.
 What:		/sys/block/<disk>/bcache/cache
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For a backing device that has cache, a symlink to
 		the bcache/ dir of that cache.
 What:		/sys/block/<disk>/bcache/cache_hits
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For backing devices: integer number of full cache hits,
 		counted per bio. A partial cache hit counts as a miss.
 What:		/sys/block/<disk>/bcache/cache_misses
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For backing devices: integer number of cache misses.
 What:		/sys/block/<disk>/bcache/cache_hit_ratio
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For backing devices: cache hits as a percentage.
 What:		/sys/block/<disk>/bcache/sequential_cutoff
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For backing devices: Threshold past which sequential IO will
 		skip the cache. Read and written as bytes in human readable
 		units (i.e. echo 10M > sequential_cutoff).
 What:		/sys/block/<disk>/bcache/bypassed
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		Sum of all reads and writes that have bypassed the cache (due
 		to the sequential cutoff).  Expressed as bytes in human
 		readable units.
 What:		/sys/block/<disk>/bcache/writeback
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For backing devices: When on, writeback caching is enabled and
 		writes will be buffered in the cache. When off, caching is in
 		writethrough mode; reads and writes will be added to the
 		cache but no write buffering will take place.
 What:		/sys/block/<disk>/bcache/writeback_running
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For backing devices: when off, dirty data will not be written
 		from the cache to the backing device. The cache will still be
 		used to buffer writes until it is mostly full, at which point
 		writes transparently revert to writethrough mode. Intended only
 		for benchmarking/testing.
 What:		/sys/block/<disk>/bcache/writeback_delay
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For backing devices: In writeback mode, when dirty data is
 		written to the cache and the cache held no dirty data for that
 		backing device, writeback from cache to backing device starts
 		after this delay, expressed as an integer number of seconds.
 What:		/sys/block/<disk>/bcache/writeback_percent
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For backing devices: If nonzero, writeback from cache to
 		backing device only takes place when more than this percentage
 		of the cache is used, allowing more write coalescing to take
 		place and reducing total number of writes sent to the backing
 		device. Integer between 0 and 40.
 What:		/sys/block/<disk>/bcache/synchronous
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For a cache, a boolean that allows synchronous mode to be
 		switched on and off. In synchronous mode all writes are ordered
 		such that the cache can reliably recover from unclean shutdown;
 		if disabled bcache will not generally wait for writes to
 		complete but if the cache is not shut down cleanly all data
 		will be discarded from the cache. Should not be turned off with
 		writeback caching enabled.
 What:		/sys/block/<disk>/bcache/discard
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For a cache, a boolean allowing discard/TRIM to be turned off
 		or back on if the device supports it.
 What:		/sys/block/<disk>/bcache/bucket_size
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For a cache, bucket size in human readable units, as set at
 		cache creation time; should match the erase block size of the
 		SSD for optimal performance.
 What:		/sys/block/<disk>/bcache/nbuckets
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For a cache, the number of usable buckets.
 What:		/sys/block/<disk>/bcache/tree_depth
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For a cache, height of the btree excluding leaf nodes (i.e. a
 		one node tree will have a depth of 0).
 What:		/sys/block/<disk>/bcache/btree_cache_size
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		Number of btree buckets/nodes that are currently cached in
 		memory; cache dynamically grows and shrinks in response to
 		memory pressure from the rest of the system.
 What:		/sys/block/<disk>/bcache/written
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For a cache, total amount of data in human readable units
 		written to the cache, excluding all metadata.
 What:		/sys/block/<disk>/bcache/btree_written
 Date:		November 2010
 Contact:	Kent Overstreet <kent.overstreet@gmail.com>
 Description:
 		For a cache, sum of all btree writes in human readable units.
--- a/Documentation/bcache.txt
+++ b/Documentation/bcache.txt
@ -0,0 +1,343 @@
 Say you've got a big slow raid 6, and an X-25E or three. Wouldn't it be
 nice if you could use them as cache... Hence bcache.
 Wiki and git repositories are at:
  http://bcache.evilpiepirate.org
  http://evilpiepirate.org/git/linux-bcache.git
  http://evilpiepirate.org/git/bcache-tools.git
 It's designed around the performance characteristics of SSDs - it only allocates
 in erase block sized buckets, and it uses a hybrid btree/log to track cached
 extents (which can be anywhere from a single sector to the bucket size). It's
 designed to avoid random writes at all costs; it fills up an erase block
 sequentially, then issues a discard before reusing it.
 Both writethrough and writeback caching are supported. Writeback defaults to
 off, but can be switched on and off arbitrarily at runtime. Bcache goes to
 great lengths to protect your data - it reliably handles unclean shutdown. (It
 doesn't even have a notion of a clean shutdown; bcache simply doesn't return
 writes as completed until they're on stable storage).
 Writeback caching can use most of the cache for buffering writes - writing
 dirty data to the backing device is always done sequentially, scanning from the
 start to the end of the index.
 Since random IO is what SSDs excel at, there generally won't be much benefit
 to caching large sequential IO. Bcache detects sequential IO and skips it;
 it also keeps a rolling average of the IO sizes per task, and as long as the
 average is above the cutoff it will skip all IO from that task - instead of
 caching the first 512k after every seek. Backups and large file copies should
 thus entirely bypass the cache.
 In the event of a data IO error on the flash it will try to recover by reading
 from disk or invalidating cache entries.  For unrecoverable errors (meta data
 or dirty data), caching is automatically disabled; if dirty data was present
 in the cache it first disables writeback caching and waits for all dirty data
 to be flushed.
 Getting started:
 You'll need make-bcache from the bcache-tools repository. Both the cache device
 and backing device must be formatted before use.
  make-bcache -B /dev/sdb
  make-bcache -C /dev/sdc
 make-bcache has the ability to format multiple devices at the same time - if
 you format your backing devices and cache device at the same time, you won't
 have to manually attach:
  make-bcache -B /dev/sda /dev/sdb -C /dev/sdc
 To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register:
  echo /dev/sdb > /sys/fs/bcache/register
  echo /dev/sdc > /sys/fs/bcache/register
 To register your bcache devices automatically, you could add something like
 this to an init script:
  echo /dev/sd* > /sys/fs/bcache/register_quiet
 It'll look for bcache superblocks and ignore everything that doesn't have one.
 Registering the backing device makes the bcache show up in /dev; you can now
 format it and use it as normal. But the first time using a new bcache device,
 it'll be running in passthrough mode until you attach it to a cache. See the
 section on attaching.
 The devices show up at /dev/bcacheN, and can be controlled via sysfs from
 /sys/block/bcacheN/bcache:
  mkfs.ext4 /dev/bcache0
  mount /dev/bcache0 /mnt
 Cache devices are managed as sets; multiple caches per set isn't supported yet
 but will allow for mirroring of metadata and dirty data in the future. Your new
 cache set shows up as /sys/fs/bcache/<UUID>
 ATTACHING:
 After your cache device and backing device are registered, the backing device
 must be attached to your cache set to enable caching. Attaching a backing
 device to a cache set is done thusly, with the UUID of the cache set in
 /sys/fs/bcache:
  echo <UUID> > /sys/block/bcache0/bcache/attach
 This only has to be done once. The next time you reboot, just reregister all
 your bcache devices. If a backing device has data in a cache somewhere, the
 /dev/bcache# device won't be created until the cache shows up - particularly
 important if you have writeback caching turned on.
 If you're booting up and your cache device is gone and never coming back, you
 can force run the backing device:
  echo 1 > /sys/block/sdb/bcache/running
 (You need to use /sys/block/sdb (or whatever your backing device is called), not
 /sys/block/bcache0, because bcache0 doesn't exist yet. If you're using a
 partition, the bcache directory would be at /sys/block/sdb/sdb2/bcache)
 The backing device will still use that cache set if it shows up in the future,
 but all the cached data will be invalidated. If there was dirty data in the
 cache, don't expect the filesystem to be recoverable - you will have massive
 filesystem corruption, though ext4's fsck does work miracles.
 SYSFS - BACKING DEVICE:
 attach
  Echo the UUID of a cache set to this file to enable caching.
 cache_mode
  Can be one of either writethrough, writeback, writearound or none.
 clear_stats
  Writing to this file resets the running total stats (not the day/hour/5 minute
  decaying versions).
 detach
  Write to this file to detach from a cache set. If there is dirty data in the
  cache, it will be flushed first.
 dirty_data
  Amount of dirty data for this backing device in the cache. Continuously
  updated unlike the cache set's version, but may be slightly off.
 label
  Name of underlying device.
 readahead
  Size of readahead that should be performed.  Defaults to 0.  If set to e.g.
  1M, it will round cache miss reads up to that size, but without overlapping
  existing cache entries.
 running
  1 if bcache is running (i.e. whether the /dev/bcache device exists, whether
  it's in passthrough mode or caching).
 sequential_cutoff
  A sequential IO will bypass the cache once it passes this threshold; the
  most recent 128 IOs are tracked so sequential IO can be detected even when
  it isn't all done at once.
 sequential_merge
  If non zero, bcache keeps a list of the last 128 requests submitted to compare
  against all new requests to determine which new requests are sequential
  continuations of previous requests for the purpose of determining sequential
  cutoff. This is necessary if the sequential cutoff value is greater than the
  maximum acceptable sequential size for any single request. 
 state
  The backing device can be in one of four different states:
  no cache: Has never been attached to a cache set.
  clean: Part of a cache set, and there is no cached dirty data.
  dirty: Part of a cache set, and there is cached dirty data.
  inconsistent: The backing device was forcibly run by the user when there was
  dirty data cached but the cache set was unavailable; whatever data was on the
  backing device has likely been corrupted.
 stop
  Write to this file to shut down the bcache device and close the backing
  device.
 writeback_delay
  When dirty data is written to the cache and it previously did not contain
  any, waits some number of seconds before initiating writeback. Defaults to
  30.
 writeback_percent
  If nonzero, bcache tries to keep around this percentage of the cache dirty by
  throttling background writeback and using a PD controller to smoothly adjust
  the rate.
 writeback_rate
  Rate in sectors per second - if writeback_percent is nonzero, background
  writeback is throttled to this rate. Continuously adjusted by bcache but may
  also be set by the user.
 writeback_running
  If off, writeback of dirty data will not take place at all. Dirty data will
  still be added to the cache until it is mostly full; only meant for
  benchmarking. Defaults to on.
 SYSFS - BACKING DEVICE STATS:
 There are directories with these numbers for a running total, as well as
 versions that decay over the past day, hour and 5 minutes; they're also
 aggregated in the cache set directory as well.
 bypassed
  Amount of IO (both reads and writes) that has bypassed the cache
 cache_hits
 cache_misses
 cache_hit_ratio
  Hits and misses are counted per individual IO as bcache sees them; a
  partial hit is counted as a miss.
 cache_bypass_hits
 cache_bypass_misses
  Hits and misses for IO that is intended to skip the cache are still counted,
  but broken out here.
 cache_miss_collisions
  Counts instances where data was going to be inserted into the cache from a
  cache miss, but raced with a write and data was already present (usually 0
  since the synchronization for cache misses was rewritten)
 cache_readaheads
  Count of times readahead occurred.
 SYSFS - CACHE SET:
 average_key_size
  Average data per key in the btree.
 bdev<0..n>
  Symlink to each of the attached backing devices.
 block_size
  Block size of the cache devices.
 btree_cache_size
  Amount of memory currently used by the btree cache
 bucket_size
  Size of buckets
 cache<0..n>
  Symlink to each of the cache devices comprising this cache set. 
 cache_available_percent
  Percentage of cache device free.
 clear_stats
  Clears the statistics associated with this cache
 dirty_data
  Amount of dirty data in the cache (updated when garbage collection runs).
 flash_vol_create
  Echoing a size to this file (in human readable units, k/M/G) creates a thinly
  provisioned volume backed by the cache set.
 io_error_halflife
 io_error_limit
  These determine how many errors we accept before disabling the cache.
  Each error is decayed by the half life (in # ios).  If the decaying count
  reaches io_error_limit dirty data is written out and the cache is disabled.
 journal_delay_ms
  Journal writes will delay for up to this many milliseconds, unless a cache
  flush happens sooner. Defaults to 100.
 root_usage_percent
  Percentage of the root btree node in use.  If this gets too high the node
  will split, increasing the tree depth.
 stop
  Write to this file to shut down the cache set - waits until all attached
  backing devices have been shut down.
 tree_depth
  Depth of the btree (A single node btree has depth 0).
 unregister
  Detaches all backing devices and closes the cache devices; if dirty data is
  present it will disable writeback caching and wait for it to be flushed.
 SYSFS - CACHE SET INTERNAL:
 This directory also exposes timings for a number of internal operations, with
 separate files for average duration, average frequency, last occurrence and max
 duration: garbage collection, btree read, btree node sorts and btree splits.
 active_journal_entries
  Number of journal entries that are newer than the index.
 btree_nodes
  Total nodes in the btree.
 btree_used_percent
  Average fraction of btree in use.
 bset_tree_stats
  Statistics about the auxiliary search trees
 btree_cache_max_chain
  Longest chain in the btree node cache's hash table
 cache_read_races
  Counts instances where while data was being read from the cache, the bucket
  was reused and invalidated - i.e. where the pointer was stale after the read
  completed. When this occurs the data is reread from the backing device.
 trigger_gc
  Writing to this file forces garbage collection to run.
 SYSFS - CACHE DEVICE:
 block_size
  Minimum granularity of writes - should match hardware sector size.
 btree_written
  Sum of all btree writes, in (kilo/mega/giga) bytes
 bucket_size
  Size of buckets
 cache_replacement_policy
  One of either lru, fifo or random.
 discard
  Boolean; if on a discard/TRIM will be issued to each bucket before it is
  reused. Defaults to off, since SATA TRIM is an unqueued command (and thus
  slow).
 freelist_percent
  Size of the freelist as a percentage of nbuckets. Can be written to to
  increase the number of buckets kept on the freelist, which lets you
  artificially reduce the size of the cache at runtime. Mostly for testing
  purposes (i.e. testing how different size caches affect your hit rate), but
  since buckets are discarded when they move on to the freelist will also make
  the SSD's garbage collection easier by effectively giving it more reserved
  space.
 io_errors
  Number of errors that have occurred, decayed by io_error_halflife.
 metadata_written
  Sum of all non data writes (btree writes and all other metadata).
 nbuckets
  Total buckets in this cache
 priority_stats
  Statistics about how recently data in the cache has been accessed.  This can
  reveal your working set size.
 written
  Sum of all data that has been written to the cache; comparison with
  btree_written gives the amount of write inflation in bcache.
--- a/7
+++ b/7
@ -1616,6 +1616,13 @@ W:	http://www.baycom.org/~tom/ham/ham.html
 S:	Maintained
 F:	drivers/net/hamradio/baycom*
 BCACHE (BLOCK LAYER CACHE)
 M:	Kent Overstreet <koverstreet@google.com>
 L:	linux-bcache@vger.kernel.org
 W:	http://bcache.evilpiepirate.org
 S:	Maintained
 F:	drivers/md/bcache/
 BEFS FILE SYSTEM
 S:	Orphan
 F:	Documentation/filesystems/befs.txt
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@ -1485,6 +1485,7 @@ unsigned int get_random_int(void)
 	return ret;
 }
 EXPORT_SYMBOL(get_random_int);
 /*
 * randomize_range() returns a start address such that
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@ -174,6 +174,8 @@ config MD_FAULTY
 	  If unsure, say N.
 source "drivers/md/bcache/Kconfig"
 config BLK_DEV_DM
 	tristate "Device mapper support"
 	---help---
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@ -29,6 +29,7 @@ obj-$(CONFIG_MD_RAID10)		+= raid10.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
 obj-$(CONFIG_BCACHE)		+= bcache/
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 obj-$(CONFIG_DM_BUFIO)		+= dm-bufio.o
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@ -0,0 +1,42 @@
 # Main bcache option; may be built in or as a module (tristate).
 # NOTE(review): no CLOSURES symbol is defined in this Kconfig file, and
 # drivers/md/bcache/Makefile links closure.o into bcache unconditionally -
 # confirm that "select CLOSURES" refers to a symbol that actually exists.
 config BCACHE
 	tristate "Block device as cache"
 	select CLOSURES
 	---help---
 	Allows a block device to be used as cache for other devices; uses
 	a btree for indexing and the layout is optimized for SSDs.
 	See Documentation/bcache.txt for details.
 # Developer-only debugging tools.
 config BCACHE_DEBUG
 	bool "Bcache debugging"
 	depends on BCACHE
 	---help---
 	Don't select this option unless you're a developer
 	Enables extra debugging tools (primarily a fuzz tester)
 # Developer-only expensive consistency checks; alloc.c uses
 # CONFIG_BCACHE_EDEBUG to guard extra freelist checks in bch_bucket_alloc().
 config BCACHE_EDEBUG
 	bool "Extended runtime checks"
 	depends on BCACHE
 	---help---
 	Don't select this option unless you're a developer
 	Enables extra runtime checks which significantly affect performance
 # Closure tracking via debugfs; pulls in DEBUG_FS.
 config BCACHE_CLOSURES_DEBUG
 	bool "Debug closures"
 	depends on BCACHE
 	select DEBUG_FS
 	---help---
 	Keeps all active closures in a linked list and provides a debugfs
 	interface to list them, which makes it possible to see asynchronous
 	operations that get stuck.
 # cgroup code needs to be updated:
 #
 #config CGROUP_BCACHE
 #	bool "Cgroup controls for bcache"
 #	depends on BCACHE && BLK_CGROUP
 #	---help---
 #	TODO
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@ -0,0 +1,7 @@
 # Build bcache.o (built-in or module, following CONFIG_BCACHE).
 obj-$(CONFIG_BCACHE)	+= bcache.o
 # Objects linked together to form bcache.o.
 bcache-y		:= alloc.o btree.o bset.o io.o journal.o writeback.o\
 	movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o
 # Extra include path for request.c only: the top-level block/ directory
 # (presumably for block-layer private headers - confirm against request.c).
 CFLAGS_request.o	+= -Iblock
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@ -0,0 +1,583 @@
 /*
 * Primary bucket allocation code
 *
 * Copyright 2012 Google, Inc.
 *
 * Allocation in bcache is done in terms of buckets:
 *
 * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
 * btree pointers - they must match for the pointer to be considered valid.
 *
 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
 * bucket simply by incrementing its gen.
 *
 * The gens (along with the priorities; it's really the gens that are important but
 * the code is named as if it's the priorities) are written in an arbitrary list
 * of buckets on disk, with a pointer to them in the journal header.
 *
 * When we invalidate a bucket, we have to write its new gen to disk and wait
 * for that write to complete before we use it - otherwise after a crash we
 * could have pointers that appeared to be good but pointed to data that had
 * been overwritten.
 *
 * Since the gens and priorities are all stored contiguously on disk, we can
 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
 * call prio_write(), and when prio_write() finishes we pull buckets off the
 * free_inc list and optionally discard them.
 *
 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
 * priorities and gens were being written before we could allocate. c->free is a
 * smaller freelist, and buckets on that list are always ready to be used.
 *
 * If we've got discards enabled, that happens when a bucket moves from the
 * free_inc list to the free list.
 *
 * There is another freelist, because sometimes we have buckets that we know
 * have nothing pointing into them - these we can reuse without waiting for
 * priorities to be rewritten. These come from freed btree nodes and buckets
 * that garbage collection discovered no longer had valid keys pointing into
 * them (because they were overwritten). That's the unused list - buckets on the
 * unused list move to the free list, optionally being discarded in the process.
 *
 * It's also important to ensure that gens don't wrap around - with respect to
 * either the oldest gen in the btree or the gen on disk. This is quite
 * difficult to do in practice, but we explicitly guard against it anyways - if
 * a bucket is in danger of wrapping around we simply skip invalidating it that
 * time around, and we garbage collect or rewrite the priorities sooner than we
 * would have otherwise.
 *
 * bch_bucket_alloc() allocates a single bucket from a specific cache.
 *
 * bch_bucket_alloc_set() allocates one or more buckets from different caches
 * out of a cache set.
 *
 * free_some_buckets() drives all the processes described above. It's called
 * from bch_bucket_alloc() and a few other places that need to make sure free
 * buckets are ready.
 *
 * invalidate_buckets_(lru|fifo)() find buckets that are available to be
 * invalidated, and then invalidate them and stick them on the free_inc list -
 * in either lru or fifo order.
 */
 #include "bcache.h"
 #include "btree.h"
 #include <linux/random.h>
 #define MAX_IN_FLIGHT_DISCARDS		8U
 /* Bucket heap / gen */
 /*
 * Increment the gen of bucket @b (belonging to cache @ca) and return the new
 * gen.
 *
 * The cache set's need_gc is raised to at least bucket_gc_gen(b); when the
 * cache set is in synchronous mode, ca->need_save_prio is likewise raised to
 * at least bucket_disk_gen(b). A one-time warning fires if either value
 * exceeds its corresponding *_GEN_MAX limit.
 */
 uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
 {
 	uint8_t gen = ++b->gen;
 	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
 	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
 	/* The on-disk gen only matters for a synchronous cache set. */
 	if (!CACHE_SYNC(&ca->set->sb))
 		return gen;
 	ca->need_save_prio = max(ca->need_save_prio, bucket_disk_gen(b));
 	WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
 	return gen;
 }
 /*
 * Account @sectors against c->rescale and, once that counter has gone
 * negative, age every bucket's priority by one.
 *
 * The counter is topped up by nbuckets * bucket_size / 1024 with a cmpxchg
 * loop, so only the caller whose cmpxchg succeeds goes on to do the rescale;
 * everyone else returns as soon as they observe a non-negative counter.
 *
 * Buckets with prio 0, with prio BTREE_PRIO, or that are pinned are left
 * alone. c->min_prio is recomputed from the buckets that were decremented.
 */
 void bch_rescale_priorities(struct cache_set *c, int sectors)
 {
 	unsigned refill = c->nbuckets * c->sb.bucket_size / 1024;
 	struct cache *ca;
 	struct bucket *b;
 	unsigned iter;
 	int old;
 	atomic_sub(sectors, &c->rescale);
 	for (;;) {
 		old = atomic_read(&c->rescale);
 		if (old >= 0)
 			return;
 		if (atomic_cmpxchg(&c->rescale, old, old + refill) == old)
 			break;
 	}
 	mutex_lock(&c->bucket_lock);
 	c->min_prio = USHRT_MAX;
 	for_each_cache(ca, c, iter)
 		for_each_bucket(b, ca) {
 			if (!b->prio ||
 			    b->prio == BTREE_PRIO ||
 			    atomic_read(&b->pin))
 				continue;
 			b->prio--;
 			c->min_prio = min(c->min_prio, b->prio);
 		}
 	mutex_unlock(&c->bucket_lock);
 }
 /* Discard/TRIM */
 /*
 * State for one in-flight (or idle) discard request against a cache device.
 * Idle entries sit on ca->discards; do_discard() takes one off the list and
 * discard_finish() puts it back.
 */
 struct discard {
 	/* Link on ca->discards while this entry is not in flight */
 	struct list_head	list;
 	/* Runs discard_finish(); scheduled from discard_endio() */
 	struct work_struct	work;
 	/* Cache device this discard belongs to */
 	struct cache		*ca;
 	/* Bucket being discarded; pushed onto ca->free on completion */
 	long			bucket;
 	/* Embedded bio submitted by do_discard() */
 	struct bio		bio;
 	/*
 	 * NOTE(review): not referenced by name in the visible code; presumably
 	 * it provides the storage behind bio.bi_inline_vecs (do_discard() sets
 	 * bi_max_vecs = 1), so it must stay directly after 'bio' - confirm.
 	 */
 	struct bio_vec		bv;
 };
 static void discard_finish(struct work_struct *w)
 {
 	struct discard *d = container_of(w, struct discard, work);
 	struct cache *ca = d->ca;
 	char buf[BDEVNAME_SIZE];
 	if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
 		pr_notice("discard error on %s, disabling",
 			 bdevname(ca->bdev, buf));
 		d->ca->discard = 0;
 	}
 	mutex_lock(&ca->set->bucket_lock);
 	fifo_push(&ca->free, d->bucket);
 	list_add(&d->list, &ca->discards);
 	atomic_dec(&ca->discards_in_flight);
 	mutex_unlock(&ca->set->bucket_lock);
 	closure_wake_up(&ca->set->bucket_wait);
 	wake_up(&ca->set->alloc_wait);
 	closure_put(&ca->set->cl);
 }
 /*
 * bio completion callback for discards: hand the rest of the completion off
 * to discard_finish() via the work item embedded in the owning struct
 * discard. @error is not examined here.
 */
 static void discard_endio(struct bio *bio, int error)
 {
 	schedule_work(&container_of(bio, struct discard, bio)->work);
 }
 static void do_discard(struct cache *ca, long bucket)
 {
 	struct discard *d = list_first_entry(&ca->discards,
 					     struct discard, list);
 	list_del(&d->list);
 	d->bucket = bucket;
 	atomic_inc(&ca->discards_in_flight);
 	closure_get(&ca->set->cl);
 	bio_init(&d->bio);
 	d->bio.bi_sector	= bucket_to_sector(ca->set, d->bucket);
 	d->bio.bi_bdev		= ca->bdev;
 	d->bio.bi_rw		= REQ_WRITE|REQ_DISCARD;
 	d->bio.bi_max_vecs	= 1;
 	d->bio.bi_io_vec	= d->bio.bi_inline_vecs;
 	d->bio.bi_size		= bucket_bytes(ca);
 	d->bio.bi_end_io	= discard_endio;
 	bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
 	submit_bio(0, &d->bio);
 }
 /* Allocation */
 static inline bool can_inc_bucket_gen(struct bucket *b)
 {
 	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
 		bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
 }
 /*
 * Try to put bucket @b on ca->unused.
 *
 * The bucket must have no GC mark and no sectors in use (BUG otherwise).
 * Returns false without touching the bucket when the replacement policy is
 * FIFO and ca->free already holds more than the moving-GC watermark.
 * Otherwise the bucket's prio is reset to 0 and, if its gen can still be
 * incremented and the unused fifo has room, it is pushed and pinned.
 *
 * Returns true only if the bucket was added.
 */
 bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
 {
 	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
 	if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO &&
 	    fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC])
 		return false;
 	b->prio = 0;
 	if (!can_inc_bucket_gen(b))
 		return false;
 	if (!fifo_push(&ca->unused, b - ca->buckets))
 		return false;
 	atomic_inc(&b->pin);
 	return true;
 }
 static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
 	return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
 		!atomic_read(&b->pin) &&
 		can_inc_bucket_gen(b);
 }
 /*
 * Invalidate bucket @b: bump its gen, reset its prio to INITIAL_PRIO, pin it
 * and queue its index on ca->free_inc. The result of the push is not checked.
 */
 static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
 {
 	long idx = b - ca->buckets;
 	bch_inc_gen(ca, b);
 	b->prio = INITIAL_PRIO;
 	atomic_inc(&b->pin);
 	fifo_push(&ca->free_inc, idx);
 }
 /*
 * LRU replacement: refill ca->free_inc with the invalidatable buckets that
 * have the lowest bucket_prio(), using ca->heap as scratch space.
 *
 * Buckets with no sectors in use are handed to bch_bucket_add_unused()
 * instead; if that fails the whole pass is abandoned.
 *
 * The helpers below are GCC nested functions; they read 'ca' from the
 * enclosing scope.
 */
 static void invalidate_buckets_lru(struct cache *ca)
 {
 	/* Priority relative to the set's minimum, weighted by sectors in use */
 	unsigned bucket_prio(struct bucket *b)
 	{
 		return ((unsigned) (b->prio - ca->set->min_prio)) *
 			GC_SECTORS_USED(b);
 	}
 	/* Comparison used while collecting candidates */
 	bool bucket_max_cmp(struct bucket *l, struct bucket *r)
 	{
 		return bucket_prio(l) < bucket_prio(r);
 	}
 	/* Opposite ordering, used when popping candidates */
 	bool bucket_min_cmp(struct bucket *l, struct bucket *r)
 	{
 		return bucket_prio(l) > bucket_prio(r);
 	}
 	struct bucket *b;
 	ssize_t i;
 	ca->heap.used = 0;
 	/* Pass 1: collect candidates into the heap */
 	for_each_bucket(b, ca) {
 		if (!can_invalidate_bucket(ca, b))
 			continue;
 		if (!GC_SECTORS_USED(b)) {
 			if (!bch_bucket_add_unused(ca, b))
 				return;
 		} else {
 			if (!heap_full(&ca->heap))
 				heap_add(&ca->heap, b, bucket_max_cmp);
 			else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
 				/*
 				 * Heap is full and b has a lower bucket_prio
 				 * than the current top: replace the top.
 				 */
 				ca->heap.data[0] = b;
 				heap_sift(&ca->heap, 0, bucket_max_cmp);
 			}
 		}
 	}
 	/* Fewer candidates than half the heap capacity: ask for a GC run */
 	if (ca->heap.used * 2 < ca->heap.size)
 		bch_queue_gc(ca->set);
 	/* Re-heapify with the opposite comparison before popping */
 	for (i = ca->heap.used / 2 - 1; i >= 0; --i)
 		heap_sift(&ca->heap, i, bucket_min_cmp);
 	/* Pass 2: pop candidates until free_inc is full or the heap runs dry */
 	while (!fifo_full(&ca->free_inc)) {
 		if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
 			/* We don't want to be calling invalidate_buckets()
 			 * multiple times when it can't do anything
 			 */
 			ca->invalidate_needs_gc = 1;
 			bch_queue_gc(ca->set);
 			return;
 		}
 		invalidate_one_bucket(ca, b);
 	}
 }
 static void invalidate_buckets_fifo(struct cache *ca)
 {
 	struct bucket *b;
 	size_t checked = 0;
 	while (!fifo_full(&ca->free_inc)) {
 		if (ca->fifo_last_bucket <  ca->sb.first_bucket ||
 		    ca->fifo_last_bucket >= ca->sb.nbuckets)
 			ca->fifo_last_bucket = ca->sb.first_bucket;
 		b = ca->buckets + ca->fifo_last_bucket++;
 		if (can_invalidate_bucket(ca, b))
 			invalidate_one_bucket(ca, b);
 		if (++checked >= ca->sb.nbuckets) {
 			ca->invalidate_needs_gc = 1;
 			bch_queue_gc(ca->set);
 			return;
 		}
 	}
 }
 /*
 * Random replacement: pick random buckets in [sb.first_bucket, sb.nbuckets)
 * and invalidate each one that can be invalidated, until ca->free_inc is
 * full.
 *
 * After sb.nbuckets / 2 picks without filling free_inc, give up, set
 * invalidate_needs_gc and queue a GC run.
 */
 static void invalidate_buckets_random(struct cache *ca)
 {
 	size_t tries;
 	for (tries = 1; !fifo_full(&ca->free_inc); tries++) {
 		struct bucket *b;
 		size_t n;
 		get_random_bytes(&n, sizeof(n));
 		n = n % (size_t) (ca->sb.nbuckets - ca->sb.first_bucket) +
 			ca->sb.first_bucket;
 		b = ca->buckets + n;
 		if (can_invalidate_bucket(ca, b))
 			invalidate_one_bucket(ca, b);
 		if (tries < ca->sb.nbuckets / 2)
 			continue;
 		ca->invalidate_needs_gc = 1;
 		bch_queue_gc(ca->set);
 		return;
 	}
 }
 /*
 * Refill ca->free_inc using the replacement policy recorded in the cache's
 * superblock. Does nothing while ca->invalidate_needs_gc is set.
 */
 static void invalidate_buckets(struct cache *ca)
 {
 	if (!ca->invalidate_needs_gc) {
 		switch (CACHE_REPLACEMENT(&ca->sb)) {
 		case CACHE_REPLACEMENT_LRU:
 			invalidate_buckets_lru(ca);
 			return;
 		case CACHE_REPLACEMENT_FIFO:
 			invalidate_buckets_fifo(ca);
 			return;
 		case CACHE_REPLACEMENT_RANDOM:
 			invalidate_buckets_random(ca);
 			return;
 		}
 	}
 }
 /*
 * allocator_wait(ca, cond) - sleep on the cache set's alloc_wait queue until
 * @cond evaluates true.
 *
 * Must be invoked with (ca)->set->bucket_lock held. @cond is evaluated with
 * the lock held; the lock is dropped around schedule() and retaken before
 * @cond is evaluated again, so it is held again when the macro completes.
 *
 * Hidden control flow: if CACHE_SET_STOPPING_2 is set on the cache set, the
 * macro invokes closure_return(cl) with bucket_lock *released*. It therefore
 * relies on a 'struct closure *cl' being in scope at the call site.
 *
 * Every use of the @ca argument is parenthesized so that any pointer
 * expression may be passed, not just a plain identifier.
 */
 #define allocator_wait(ca, cond)					\
 do {									\
 	DEFINE_WAIT(__wait);						\
 									\
 	while (!(cond)) {						\
 		prepare_to_wait(&(ca)->set->alloc_wait,			\
 				&__wait, TASK_INTERRUPTIBLE);		\
 									\
 		mutex_unlock(&(ca)->set->bucket_lock);			\
 		if (test_bit(CACHE_SET_STOPPING_2,			\
 			     &(ca)->set->flags)) {			\
 			finish_wait(&(ca)->set->alloc_wait, &__wait);	\
 			closure_return(cl);				\
 		}							\
 									\
 		schedule();						\
 		__set_current_state(TASK_RUNNING);			\
 		mutex_lock(&(ca)->set->bucket_lock);			\
 	}								\
 									\
 	finish_wait(&(ca)->set->alloc_wait, &__wait);			\
 } while (0)
 /*
 * Per-cache allocator loop, run as the closure function of ca->alloc.
 *
 * Holds the cache set's bucket_lock throughout, except while sleeping inside
 * allocator_wait(). The function has no normal exit: it leaves only through
 * the closure_return(cl) hidden inside allocator_wait(), which fires when
 * CACHE_SET_STOPPING_2 is set (with bucket_lock already released).
 *
 * Each outer iteration:
 *  1. moves buckets from ca->unused / ca->free_inc to ca->free (via a discard
 *     when discards are enabled) until both source lists are unusable;
 *  2. waits for valid GC marks and invalidates more buckets into free_inc;
 *  3. in synchronous mode, writes out priorities/gens when needed.
 */
 void bch_allocator_thread(struct closure *cl)
 {
 	struct cache *ca = container_of(cl, struct cache, alloc);
 	mutex_lock(&ca->set->bucket_lock);
 	while (1) {
 		while (1) {
 			long bucket;
 			/*
 			 * Prefer the unused list, but only when no prio write
 			 * is blocking (or the set isn't synchronous);
 			 * otherwise fall back to free_inc.
 			 */
 			if ((!atomic_read(&ca->set->prio_blocked) ||
 			     !CACHE_SYNC(&ca->set->sb)) &&
 			    !fifo_empty(&ca->unused))
 				fifo_pop(&ca->unused, bucket);
 			else if (!fifo_empty(&ca->free_inc))
 				fifo_pop(&ca->free_inc, bucket);
 			else
 				break;
 			/*
 			 * Wait until ca->free has more empty slots than there
 			 * are discards in flight: discard_finish() pushes onto
 			 * ca->free when each of those completes.
 			 */
 			allocator_wait(ca, (int) fifo_free(&ca->free) >
 				       atomic_read(&ca->discards_in_flight));
 			if (ca->discard) {
 				/* Need an idle struct discard to issue with */
 				allocator_wait(ca, !list_empty(&ca->discards));
 				do_discard(ca, bucket);
 			} else {
 				fifo_push(&ca->free, bucket);
 				closure_wake_up(&ca->set->bucket_wait);
 			}
 		}
 		/* Invalidation relies on GC marks, so wait until they are valid */
 		allocator_wait(ca, ca->set->gc_mark_valid);
 		invalidate_buckets(ca);
 		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked) ||
 			       !CACHE_SYNC(&ca->set->sb));
 		/*
 		 * Synchronous mode: write prios/gens if buckets were just
 		 * invalidated or need_save_prio has grown past 64.
 		 */
 		if (CACHE_SYNC(&ca->set->sb) &&
 		    (!fifo_empty(&ca->free_inc) ||
 		     ca->need_save_prio > 64)) {
 			bch_prio_write(ca);
 		}
 	}
 }
 /*
 * bch_bucket_alloc - take one bucket from the cache's free fifo
 * @ca:		cache to allocate from
 * @watermark:	reserve class; the allocation only succeeds while the free
 *		fifo holds more than ca->watermark[watermark] entries
 * @cl:		optional closure to put on the set's bucket_wait list on failure
 *
 * Returns the bucket index, or -1 on failure. With a blocking @cl the function
 * drops bucket_lock, waits with closure_sync() and retries; with a
 * non-blocking @cl it queues the closure and returns -1; with @cl == NULL it
 * just returns -1. bucket_lock is unlocked/relocked around the wait, so the
 * caller is expected to hold it.
 */
 long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
 {
 	long r = -1;
 again:
 	/* Kick the allocator thread on every attempt */
 	wake_up(&ca->set->alloc_wait);
 	if (fifo_used(&ca->free) > ca->watermark[watermark] &&
 	    fifo_pop(&ca->free, r)) {
 		struct bucket *b = ca->buckets + r;
 #ifdef CONFIG_BCACHE_EDEBUG
 		size_t iter;
 		long i;
 		/* The new bucket must not be a prio bucket or sit on any fifo */
 		for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
 			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
 		fifo_for_each(i, &ca->free, iter)
 			BUG_ON(i == r);
 		fifo_for_each(i, &ca->free_inc, iter)
 			BUG_ON(i == r);
 		fifo_for_each(i, &ca->unused, iter)
 			BUG_ON(i == r);
 #endif
 		BUG_ON(atomic_read(&b->pin) != 1);
 		SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
 		/* Metadata-class (or stricter) allocations get btree prio/mark */
 		if (watermark <= WATERMARK_METADATA) {
 			SET_GC_MARK(b, GC_MARK_METADATA);
 			b->prio = BTREE_PRIO;
 		} else {
 			SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
 			b->prio = INITIAL_PRIO;
 		}
 		return r;
 	}
 	pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
 		 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
 		 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
 	if (cl) {
 		closure_wait(&ca->set->bucket_wait, cl);
 		if (closure_blocking(cl)) {
 			mutex_unlock(&ca->set->bucket_lock);
 			closure_sync(cl);
 			mutex_lock(&ca->set->bucket_lock);
 			goto again;
 		}
 	}
 	return -1;
 }
 /*
 * Release every bucket that key @k points to: clear its gc mark and sector
 * count, then hand it to bch_bucket_add_unused() for the owning cache.
 */
 void bch_bucket_free(struct cache_set *c, struct bkey *k)
 {
 	unsigned ptr = 0;
 	while (ptr < KEY_PTRS(k)) {
 		struct bucket *bucket = PTR_BUCKET(c, k, ptr);
 		SET_GC_MARK(bucket, 0);
 		SET_GC_SECTORS_USED(bucket, 0);
 		bch_bucket_add_unused(PTR_CACHE(c, k, ptr), bucket);
 		ptr++;
 	}
 }
 /*
 * __bch_bucket_alloc_set - allocate @n buckets, one per cache, into key @k
 * @c:		cache set; c->bucket_lock must be held (lockdep asserted)
 * @watermark:	reserve class passed through to bch_bucket_alloc()
 * @k:		key that is initialised and filled with one pointer per bucket
 * @n:		number of pointers wanted; must be 1..8 and <= c->caches_loaded
 * @cl:		closure passed through to bch_bucket_alloc()
 *
 * Returns 0 on success. If any allocation fails, the buckets obtained so far
 * are freed again, the key is put and -1 is returned.
 */
 int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
 			   struct bkey *k, int n, struct closure *cl)
 {
 	int i;
 	lockdep_assert_held(&c->bucket_lock);
 	BUG_ON(!n || n > c->caches_loaded || n > 8);
 	bkey_init(k);
 	/* sort by free space/prio of oldest data in caches */
 	for (i = 0; i < n; i++) {
 		struct cache *ca = c->cache_by_alloc[i];
 		long b = bch_bucket_alloc(ca, watermark, cl);
 		if (b == -1)
 			goto err;
 		k->ptr[i] = PTR(ca->buckets[b].gen,
 				bucket_to_sector(c, b),
 				ca->sb.nr_this_dev);
 		/* Keep KEY_PTRS in step so the error path frees what we hold */
 		SET_KEY_PTRS(k, i + 1);
 	}
 	return 0;
 err:
 	bch_bucket_free(c, k);
 	__bkey_put(c, k);
 	return -1;
 }
 /*
 * Locked wrapper: runs __bch_bucket_alloc_set() with c->bucket_lock held and
 * passes its return value straight through.
 */
 int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
 			 struct bkey *k, int n, struct closure *cl)
 {
 	int status;
 	mutex_lock(&c->bucket_lock);
 	status = __bch_bucket_alloc_set(c, watermark, k, n, cl);
 	mutex_unlock(&c->bucket_lock);
 	return status;
 }
 /* Init */
 /*
 * Tear down the discard pool: for each entry on ca->discards cancel its work
 * item (synchronously), unlink it and free it.
 */
 void bch_cache_allocator_exit(struct cache *ca)
 {
 	while (!list_empty(&ca->discards)) {
 		struct discard *entry =
 			list_first_entry(&ca->discards, struct discard, list);
 		cancel_work_sync(&entry->work);
 		list_del(&entry->list);
 		kfree(entry);
 	}
 }
 /*
 * Set up the per-class free-fifo watermarks and preallocate
 * MAX_IN_FLIGHT_DISCARDS discard descriptors onto ca->discards.
 *
 * Returns 0 on success or -ENOMEM if a descriptor cannot be allocated;
 * descriptors already allocated stay on ca->discards in that case.
 */
 int bch_cache_allocator_init(struct cache *ca)
 {
 	unsigned remaining = MAX_IN_FLIGHT_DISCARDS;
 	/*
 	 * Each watermark stacks on top of the previous reserve:
 	 * prio/gen writes may use everything, metadata must leave
 	 * prio_buckets(ca) free, moving gc a further 8, and ordinary
 	 * allocations a further half of the free fifo.
 	 */
 	ca->watermark[WATERMARK_PRIO] = 0;
 	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
 	ca->watermark[WATERMARK_MOVINGGC] =
 		ca->watermark[WATERMARK_METADATA] + 8;
 	ca->watermark[WATERMARK_NONE] =
 		ca->watermark[WATERMARK_MOVINGGC] + ca->free.size / 2;
 	while (remaining--) {
 		struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
 		if (!d)
 			return -ENOMEM;
 		d->ca = ca;
 		INIT_WORK(&d->work, discard_finish);
 		list_add(&d->list, &ca->discards);
 	}
 	return 0;
 }
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@ -0,0 +1,379 @@
 #ifndef _BCACHE_BSET_H
 #define _BCACHE_BSET_H
 /*
 * BKEYS:
 *
 * A bkey contains a key, a size field, a variable number of pointers, and some
 * ancillary flag bits.
 *
 * We use two different functions for validating bkeys, bch_ptr_invalid and
 * bch_ptr_bad().
 *
 * bch_ptr_invalid() primarily filters out keys and pointers that would be
 * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and
 * pointers that occur in normal practice but don't point to real data.
 *
 * The one exception to the rule that ptr_invalid() filters out invalid keys is
 * that it also filters out keys of size 0 - these are keys that have been
 * completely overwritten. It'd be safe to delete these in memory while leaving
 * them on disk, just unnecessary work - so we filter them out when resorting
 * instead.
 *
 * We can't filter out stale keys when we're resorting, because garbage
 * collection needs to find them to ensure bucket gens don't wrap around -
 * unless we're rewriting the btree node those stale keys still exist on disk.
 *
 * We also implement functions here for removing some number of sectors from the
 * front or the back of a bkey - this is mainly used for fixing overlapping
 * extents, by removing the overlapping sectors from the older key.
 *
 * BSETS:
 *
 * A bset is an array of bkeys laid out contiguously in memory in sorted order,
 * along with a header. A btree node is made up of a number of these, written at
 * different times.
 *
 * There could be many of them on disk, but we never allow there to be more than
 * 4 in memory - we lazily resort as needed.
 *
 * We implement code here for creating and maintaining auxiliary search trees
 * (described below) for searching an individual bset, and on top of that we
 * implement a btree iterator.
 *
 * BTREE ITERATOR:
 *
 * Most of the code in bcache doesn't care about an individual bset - it needs
 * to search entire btree nodes and iterate over them in sorted order.
 *
 * The btree iterator code serves both functions; it iterates through the keys
 * in a btree node in sorted order, starting from either keys after a specific
 * point (if you pass it a search key) or the start of the btree node.
 *
 * AUXILIARY SEARCH TREES:
 *
 * Since keys are variable length, we can't use a binary search on a bset - we
 * wouldn't be able to find the start of the next key. But binary searches are
 * slow anyways, due to terrible cache behaviour; bcache originally used binary
 * searches and that code topped out at under 50k lookups/second.
 *
 * So we need to construct some sort of lookup table. Since we only insert keys
 * into the last (unwritten) set, most of the keys within a given btree node are
 * usually in sets that are mostly constant. We use two different types of
 * lookup tables to take advantage of this.
 *
 * Both lookup tables share in common that they don't index every key in the
 * set; they index one key every BSET_CACHELINE bytes, and then a linear search
 * is used for the rest.
 *
 * For sets that have been written to disk and are no longer being inserted
 * into, we construct a binary search tree in an array - traversing a binary
 * search tree in an array gives excellent locality of reference and is very
 * fast, since both children of any node are adjacent to each other in memory
 * (and their grandchildren, and great grandchildren...) - this means
 * prefetching can be used to great effect.
 *
 * It's quite useful performance wise to keep these nodes small - not just
 * because they're more likely to be in L2, but also because we can prefetch
 * more nodes on a single cacheline and thus prefetch more iterations in advance
 * when traversing this tree.
 *
 * Nodes in the auxiliary search tree must contain both a key to compare against
 * (we don't want to fetch the key from the set, that would defeat the purpose),
 * and a pointer to the key. We use a few tricks to compress both of these.
 *
 * To compress the pointer, we take advantage of the fact that one node in the
 * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
 * a function (to_inorder()) that takes the index of a node in a binary tree and
 * returns what its index would be in an inorder traversal, so we only have to
 * store the low bits of the offset.
 *
 * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
 * compress that,  we take advantage of the fact that when we're traversing the
 * search tree at every iteration we know that both our search key and the key
 * we're looking for lie within some range - bounded by our previous
 * comparisons. (We special case the start of a search so that this is true even
 * at the root of the tree).
 *
 * So we know the key we're looking for is between a and b, and a and b don't
 * differ higher than bit 50, we don't need to check anything higher than bit
 * 50.
 *
 * We don't usually need the rest of the bits, either; we only need enough bits
 * to partition the key range we're currently checking.  Consider key n - the
 * key our auxiliary search tree node corresponds to, and key p, the key
 * immediately preceding n.  The lowest bit we need to store in the auxiliary
 * search tree is the highest bit that differs between n and p.
 *
 * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
 * comparison. But we'd really like our nodes in the auxiliary search tree to be
 * of fixed size.
 *
 * The solution is to make them fixed size, and when we're constructing a node
 * check if p and n differed in the bits we needed them to. If they don't we
 * flag that node, and when doing lookups we fallback to comparing against the
 * real key. As long as this doesn't happen too often (and it seems to reliably
 * happen a bit less than 1% of the time), we win - even on failures, that key
 * is then more likely to be in cache than if we were doing binary searches all
 * the way, since we're touching so much less memory.
 *
 * The keys in the auxiliary search tree are stored in (software) floating
 * point, with an exponent and a mantissa. The exponent needs to be big enough
 * to address all the bits in the original key, but the number of bits in the
 * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
 *
 * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
 * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
 * We need one node per 128 bytes in the btree node, which means the auxiliary
 * search trees take up 3% as much memory as the btree itself.
 *
 * Constructing these auxiliary search trees is moderately expensive, and we
 * don't want to be constantly rebuilding the search tree for the last set
 * whenever we insert another key into it. For the unwritten set, we use a much
 * simpler lookup table - it's just a flat array, so index i in the lookup table
 * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing
 * within each byte range works the same as with the auxiliary search trees.
 *
 * These are much easier to keep up to date when we insert a key - we do it
 * somewhat lazily; when we shift a key up we usually just increment the pointer
 * to it, only when it would overflow do we go to the trouble of finding the
 * first key in that range of bytes again.
 */
 /* Btree key comparison/iteration */
 /*
 * Iterator state used by the bch_btree_iter_*() functions declared in this
 * header; data[] has room for one entry per bset (MAX_BSETS).
 * NOTE(review): size/used look like the capacity and fill count of data[],
 * and each entry like a [k, end) range of keys - confirm against bset.c.
 */
 struct btree_iter {
 	size_t size, used;
 	struct btree_iter_set {
 		struct bkey *k, *end;
 	} data[MAX_BSETS];
 };
 /*
 * Lookup state for one sorted set: the auxiliary search structures
 * (tree/prev, both "size" entries long) plus a pointer to the set itself.
 */
 struct bset_tree {
 	/*
 	 * We construct a binary tree in an array as if the array
 	 * started at 1, so that things line up on the same cachelines
 	 * better: see comments in bset.c at cacheline_to_bkey() for
 	 * details
 	 */
 	/* size of the binary tree and prev array */
 	unsigned	size;
 	/* function of size - precalculated for to_inorder() */
 	unsigned	extra;
 	/* copy of the last key in the set */
 	struct bkey	end;
 	struct bkey_float *tree;
 	/*
 	 * The nodes in the bset tree point to specific keys - this
 	 * array holds the sizes of the previous key.
 	 *
 	 * Conceptually it's a member of struct bkey_float, but we want
 	 * to keep bkey_float to 4 bytes and prev isn't used in the fast
 	 * path.
 	 */
 	uint8_t		*prev;
 	/* The actual btree node, with pointers to each sorted set */
 	struct bset	*data;
 };
 /*
 * Three-way key comparison: orders by inode first, then by offset. The result
 * is negative, zero or positive as @l sorts before, equal to or after @r.
 */
 static __always_inline int64_t bkey_cmp(const struct bkey *l,
 					const struct bkey *r)
 {
 	if (unlikely(KEY_INODE(l) != KEY_INODE(r)))
 		return (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r);
 	return (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
 }
 /*
 * Size of key @k in 64-bit words: two header words, one word per pointer, and
 * one more when the checksum flag is set.
 */
 static inline size_t bkey_u64s(const struct bkey *k)
 {
 	BUG_ON(KEY_CSUM(k) > 1);
 	if (KEY_CSUM(k))
 		return 2 + KEY_PTRS(k) + 1;
 	return 2 + KEY_PTRS(k) + 0;
 }
 /* Size of key @k in bytes (its u64 count times the size of a u64). */
 static inline size_t bkey_bytes(const struct bkey *k)
 {
 	size_t words = bkey_u64s(k);
 	return words * sizeof(uint64_t);
 }
 static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
 {
 	memcpy(dest, src, bkey_bytes(src));
 }
 static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
 {
 	if (!src)
 		src = &KEY(0, 0, 0);
 	SET_KEY_INODE(dest, KEY_INODE(src));
 	SET_KEY_OFFSET(dest, KEY_OFFSET(src));
 }
 /* Address of the key that immediately follows @k in memory. */
 static inline struct bkey *bkey_next(const struct bkey *k)
 {
 	uint64_t *words = (void *) k;
 	words += bkey_u64s(k);
 	return (struct bkey *) words;
 }
 /* Keylists */
 /*
 * A growable list of keys. "list"/"bottom" alias the start of the storage and
 * "top" is where the next key goes; storage starts out as the inline array d
 * (see bch_keylist_init()) and is kfree()d only when it is not d (see
 * bch_keylist_free()).
 */
 struct keylist {
 	struct bkey		*top;
 	union {
 		uint64_t		*list;
 		struct bkey		*bottom;
 	};
 	/* Enough room for btree_split's keys without realloc */
 #define KEYLIST_INLINE		16
 	uint64_t		d[KEYLIST_INLINE];
 };
 /* Reset @l to an empty list backed by its inline storage. */
 static inline void bch_keylist_init(struct keylist *l)
 {
 	l->list = l->d;
 	l->top = (void *) l->list;
 }
 /* Commit the key currently at l->top by advancing top past it. */
 static inline void bch_keylist_push(struct keylist *l)
 {
 	struct bkey *after = bkey_next(l->top);
 	l->top = after;
 }
 /* Append a copy of @k to @l; no bounds check is done here. */
 static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
 {
 	bkey_copy(l->top, k);
 	/* same step as bch_keylist_push() */
 	l->top = bkey_next(l->top);
 }
 /* True when top still sits at the start of the list, i.e. no keys pushed. */
 static inline bool bch_keylist_empty(struct keylist *l)
 {
 	void *start = l->list;
 	return l->top == start;
 }
 /* Free the list storage unless it is still the inline array. */
 static inline void bch_keylist_free(struct keylist *l)
 {
 	if (l->list == l->d)
 		return;
 	kfree(l->list);
 }
 /*
 * Out-of-line keylist and key-trimming helpers; their definitions are not
 * part of this header. __bch_cut_front()/__bch_cut_back() are the unchecked
 * versions wrapped by bch_cut_front()/bch_cut_back() below.
 */
 void bch_keylist_copy(struct keylist *, struct keylist *);
 struct bkey *bch_keylist_pop(struct keylist *);
 int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
 void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
 			      unsigned);
 bool __bch_cut_front(const struct bkey *, struct bkey *);
 bool __bch_cut_back(const struct bkey *, struct bkey *);
 /*
 * Checked wrapper around __bch_cut_front(): BUGs unless @where compares
 * less than or equal to @k, then forwards and returns its result.
 */
 static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
 {
 	BUG_ON(!(bkey_cmp(where, k) <= 0));
 	return __bch_cut_front(where, k);
 }
 /*
 * Checked wrapper around __bch_cut_back(): BUGs unless @where compares
 * greater than or equal to START_KEY(@k), then forwards and returns its
 * result.
 */
 static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
 {
 	BUG_ON(!(bkey_cmp(where, &START_KEY(k)) >= 0));
 	return __bch_cut_back(where, k);
 }
 /*
 * Key/pointer validation entry points (defined elsewhere); see the BKEYS
 * section of the comment at the top of this header for the invalid vs. bad
 * distinction.
 */
 const char *bch_ptr_status(struct cache_set *, const struct bkey *);
 bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *);
 bool bch_ptr_bad(struct btree *, const struct bkey *);
 /*
 * How far generation @a is ahead of @b, computed modulo 256. Differences
 * above 128 are treated as "@a is not after @b" and yield 0.
 */
 static inline uint8_t gen_after(uint8_t a, uint8_t b)
 {
 	uint8_t delta = a - b;
 	if (delta > 128U)
 		return 0;
 	return delta;
 }
 /*
 * Number of generations by which the bucket behind pointer @i of @k has moved
 * past the generation recorded in the pointer; 0 means the pointer is current.
 */
 static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
 				unsigned i)
 {
 	uint8_t bucket_gen = PTR_BUCKET(c, k, i)->gen;
 	uint8_t ptr_gen = PTR_GEN(k, i);
 	return gen_after(bucket_gen, ptr_gen);
 }
 static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
 				 unsigned i)
 {
 	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
 }
 /*
 * Predicate type taken by bch_btree_iter_next_filter(); bch_ptr_bad() and
 * bch_ptr_invalid() have this signature.
 * NOTE(review): presumably keys for which the predicate returns true are
 * skipped - confirm in bset.c.
 */
 typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
 /* Btree iterator entry points, defined elsewhere */
 struct bkey *bch_next_recurse_key(struct btree *, struct bkey *);
 struct bkey *bch_btree_iter_next(struct btree_iter *);
 struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
 					struct btree *, ptr_filter_fn);
 void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
 struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
 				   struct bkey *, struct bset_tree *);
 /* 32 bits total: */
 #define BKEY_MID_BITS		3
 #define BKEY_EXPONENT_BITS	7
 #define BKEY_MANTISSA_BITS	22
 /* Low BKEY_MANTISSA_BITS bits set */
 #define BKEY_MANTISSA_MASK	((1 << BKEY_MANTISSA_BITS) - 1)
 /*
 * One node of the auxiliary search tree: 7 + 3 + 22 bitfield bits, declared
 * __packed. The roles of exponent and mantissa are described in the
 * "AUXILIARY SEARCH TREES" comment at the top of this header.
 */
 struct bkey_float {
 	unsigned	exponent:BKEY_EXPONENT_BITS;
 	unsigned	m:BKEY_MID_BITS;
 	unsigned	mantissa:BKEY_MANTISSA_BITS;
 } __packed;
 /*
 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
 * it used to be 64, but I realized the lookup code would touch slightly less
 * memory if it was 128.
 *
 * It defines the number of bytes (in struct bset) per struct bkey_float in
 * the auxiliary search tree - when we're done searching the bset_float tree we
 * have this many bytes left that we do a linear search over.
 *
 * Since (after level 5) every level of the bset_tree is on a new cacheline,
 * we're touching one fewer cacheline in the bset tree in exchange for one more
 * cacheline in the linear search - but the linear search might stop before it
 * gets to the second cacheline.
 */
 #define BSET_CACHELINE		128
 #define bset_tree_space(b)	(btree_data_space(b) / BSET_CACHELINE)
 #define bset_tree_bytes(b)	(bset_tree_space(b) * sizeof(struct bkey_float))
 #define bset_prev_bytes(b)	(bset_tree_space(b) * sizeof(uint8_t))
 /*
 * Bset maintenance and search entry points, defined elsewhere.
 * __bch_bset_search() is the worker behind bch_bset_search() below and
 * is only called there with a non-NULL search key.
 */
 void bch_bset_init_next(struct btree *);
 void bch_bset_fix_invalidated_key(struct btree *, struct bkey *);
 void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
 struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
 			   const struct bkey *);
 /*
 * Search set @t of node @b for @search. A NULL @search means "start from the
 * beginning" and returns the first key of the set without searching.
 */
 static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
 					   const struct bkey *search)
 {
 	if (!search)
 		return t->data->start;
 	return __bch_bset_search(b, t, search);
 }
 /* Key merging and btree node (re)sorting entry points, defined elsewhere */
 bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
 void bch_btree_sort_lazy(struct btree *);
 void bch_btree_sort_into(struct btree *, struct btree *);
 void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
 void bch_btree_sort_partial(struct btree *, unsigned);
 /*
 * Convenience wrapper: bch_btree_sort_partial() with a second argument of 0.
 * NOTE(review): presumably 0 means "starting from the first set", i.e. a
 * full sort - confirm in bset.c.
 */
 static inline void bch_btree_sort(struct btree *b)
 {
 	bch_btree_sort_partial(b, 0);
 }
 /* Stats formatter; takes a cache set and an output buffer */
 int bch_bset_print_stats(struct cache_set *, char *);
 #endif
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@ -0,0 +1,405 @@
 #ifndef _BCACHE_BTREE_H
 #define _BCACHE_BTREE_H
 /*
 * THE BTREE:
 *
 * At a high level, bcache's btree is a relatively standard b+ tree. All keys and
 * pointers are in the leaves; interior nodes only have pointers to the child
 * nodes.
 *
 * In the interior nodes, a struct bkey always points to a child btree node, and
 * the key is the highest key in the child node - except that the highest key in
 * an interior node is always MAX_KEY. The size field refers to the size on disk
 * of the child node - this would allow us to have variable sized btree nodes
 * (handy for keeping the depth of the btree 1 by expanding just the root).
 *
 * Btree nodes are themselves log structured, but this is hidden fairly
 * thoroughly. Btree nodes on disk will in practice have extents that overlap
 * (because they were written at different times), but in memory we never have
 * overlapping extents - when we read in a btree node from disk, the first thing
 * we do is resort all the sets of keys with a mergesort, and in the same pass
 * we check for overlapping extents and adjust them appropriately.
 *
 * struct btree_op is a central interface to the btree code. It's used for
 * specifying read vs. write locking, and the embedded closure is used for
 * waiting on IO or reserve memory.
 *
 * BTREE CACHE:
 *
 * Btree nodes are cached in memory; traversing the btree might require reading
 * in btree nodes which is handled mostly transparently.
 *
 * bch_btree_node_get() looks up a btree node in the cache and reads it in from
 * disk if necessary. This function is almost never called directly though - the
 * btree() macro is used to get a btree node, call some function on it, and
 * unlock the node after the function returns.
 *
 * The root is special cased - it's taken out of the cache's lru (thus pinning
 * it in memory), so we can find the root of the btree by just dereferencing a
 * pointer instead of looking it up in the cache. This makes locking a bit
 * tricky, since the root pointer is protected by the lock in the btree node it
 * points to - the btree_root() macro handles this.
 *
 * In various places we must be able to allocate memory for multiple btree nodes
 * in order to make forward progress. To do this we use the btree cache itself
 * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
 * cache we can reuse. We can't allow more than one thread to be doing this at a
 * time, so there's a lock, implemented by a pointer to the btree_op closure -
 * this allows the btree_root() macro to implicitly release this lock.
 *
 * BTREE IO:
 *
 * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
 * this.
 *
 * For writing, we have two btree_write structs embedded in struct btree - one
 * write in flight, and one being set up, and we toggle between them.
 *
 * Writing is done with a single function -  bch_btree_write() really serves two
 * different purposes and should be broken up into two different functions. When
 * passing now = false, it merely indicates that the node is now dirty - calling
 * it ensures that the dirty keys will be written at some point in the future.
 *
 * When passing now = true, bch_btree_write() causes a write to happen
 * "immediately" (if there was already a write in flight, it'll cause the write
 * to happen as soon as the previous write completes). It returns immediately
 * though - but it takes a refcount on the closure in struct btree_op you passed
 * to it, so a closure_sync() later can be used to wait for the write to
 * complete.
 *
 * This is handy because btree_split() and garbage collection can issue writes
 * in parallel, reducing the amount of time they have to hold write locks.
 *
 * LOCKING:
 *
 * When traversing the btree, we may need write locks starting at some level -
 * inserting a key into the btree will typically only require a write lock on
 * the leaf node.
 *
 * This is specified with the lock field in struct btree_op; lock = 0 means we
 * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
 * checks this field and returns the node with the appropriate lock held.
 *
 * If, after traversing the btree, the insertion code discovers it has to split
 * then it must restart from the root and take new locks - to do this it changes
 * the lock field and returns -EINTR, which causes the btree_root() macro to
 * loop.
 *
 * Handling cache misses require a different mechanism for upgrading to a write
 * lock. We do cache lookups with only a read lock held, but if we get a cache
 * miss and we wish to insert this data into the cache, we have to insert a
 * placeholder key to detect races - otherwise, we could race with a write and
 * overwrite the data that was just written to the cache with stale data from
 * the backing device.
 *
 * For this we use a sequence number that write locks and unlocks increment - to
 * insert the check key it unlocks the btree node and then takes a write lock,
 * and fails if the sequence number doesn't match.
 */
 #include "bset.h"
 #include "debug.h"
 /*
 * Bookkeeping for one write of a btree node; struct btree embeds two of these
 * (writes[2]) and selects between them with the write_idx flag.
 */
 struct btree_write {
 	struct closure		*owner;
 	atomic_t		*journal;
 	/* If btree_split() frees a btree node, it writes a new pointer to that
 	 * btree node indicating it was freed; it takes a refcount on
 	 * c->prio_blocked because we can't write the gens until the new
 	 * pointer is on disk. This allows btree_write_endio() to release the
 	 * refcount that btree_split() took.
 	 */
 	int			prio_blocked;
 };
 /* In-memory representation of one cached btree node. */
 struct btree {
 	/* Hottest entries first */
 	struct hlist_node	hash;
 	/* Key/pointer for this btree node */
 	BKEY_PADDED(key);
 	/* Single bit - set when accessed, cleared by shrinker */
 	unsigned long		accessed;
 	/* Bumped by rw_lock()/rw_unlock() on every write lock and unlock */
 	unsigned long		seq;
 	struct rw_semaphore	lock;
 	struct cache_set	*c;
 	/* Bits from enum btree_flags */
 	unsigned long		flags;
 	/* Multiplied by block_bytes() in write_block(), so counted in blocks */
 	uint16_t		written;	/* would be nice to kill */
 	uint8_t			level;
 	uint8_t			nsets;
 	uint8_t			page_order;
 	/*
 	 * Set of sorted keys - the real btree node - plus a binary search tree
 	 *
 	 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
 	 * to the memory we have allocated for this btree node. Additionally,
 	 * set[0]->data points to the entire btree node as it exists on disk.
 	 */
 	struct bset_tree	sets[MAX_BSETS];
 	/* Used to refcount bio splits, also protects b->bio */
 	struct closure_with_waitlist	io;
 	/* Gets transferred to w->prio_blocked - see the comment there */
 	int			prio_blocked;
 	struct list_head	list;
 	struct delayed_work	work;
 	uint64_t		io_start_time;
 	/* Indexed by the write_idx flag, see btree_current_write() */
 	struct btree_write	writes[2];
 	struct bio		*bio;
 };
 /*
 * BTREE_FLAG(flag) - generate accessors for one bit of btree->flags:
 *   btree_node_<flag>(b)      tests bit BTREE_NODE_<flag>
 *   set_btree_node_<flag>(b)  sets bit BTREE_NODE_<flag>
 *
 * The final line of the definition deliberately carries no continuation
 * backslash, so the macro cannot swallow whatever line follows it.
 */
 #define BTREE_FLAG(flag)						\
 static inline bool btree_node_ ## flag(struct btree *b)			\
 {	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
 									\
 static inline void set_btree_node_ ## flag(struct btree *b)		\
 {	set_bit(BTREE_NODE_ ## flag, &b->flags); }
 /* Bit numbers within btree->flags, used through the BTREE_FLAG() accessors */
 enum btree_flags {
 	BTREE_NODE_read_done,
 	BTREE_NODE_io_error,
 	BTREE_NODE_dirty,
 	/* Selects which entry of btree->writes[] is current */
 	BTREE_NODE_write_idx,
 };
 /* Generates btree_node_<flag>() and set_btree_node_<flag>() for each bit */
 BTREE_FLAG(read_done);
 BTREE_FLAG(io_error);
 BTREE_FLAG(dirty);
 BTREE_FLAG(write_idx);
 /* The writes[] entry selected by the node's write_idx flag. */
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
 	return &b->writes[btree_node_write_idx(b)];
 }
 /* The other writes[] entry, i.e. the one write_idx does not select. */
 static inline struct btree_write *btree_prev_write(struct btree *b)
 {
 	return &b->writes[btree_node_write_idx(b) ^ 1];
 }
 static inline unsigned bset_offset(struct btree *b, struct bset *i)
 {
 	return (((size_t) i) - ((size_t) b->sets->data)) >> 9;
 }
 /*
 * Address inside node @b's buffer that lies b->written blocks past the start
 * of sets[0].data.
 */
 static inline struct bset *write_block(struct btree *b)
 {
 	void *node_start = (void *) b->sets[0].data;
 	return node_start + b->written * block_bytes(b->c);
 }
 /* True when set @t starts below write_block(@b). */
 static inline bool bset_written(struct btree *b, struct bset_tree *t)
 {
 	struct bset *boundary = write_block(b);
 	return t->data < boundary;
 }
 static inline bool bkey_written(struct btree *b, struct bkey *k)
 {
 	return k < write_block(b)->start;
 }
 /*
 * Reset c->sectors_to_gc to bucket_size * nbuckets / 8.
 * NOTE(review): presumably one eighth of the cache set's capacity in
 * sectors, counted down elsewhere to trigger gc - confirm.
 */
 static inline void set_gc_sectors(struct cache_set *c)
 {
 	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8);
 }
 /* __bch_ptr_invalid() for key @k, using node @b's cache set and level. */
 static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
 {
 	struct cache_set *set = b->c;
 	int level = b->level;
 	return __bch_ptr_invalid(set, level, k);
 }
 /*
 * Initialise @iter over node @b beginning with its first set (b->sets);
 * @search is passed through to __bch_btree_iter_init() unchanged.
 */
 static inline struct bkey *bch_btree_iter_init(struct btree *b,
 					       struct btree_iter *iter,
 					       struct bkey *search)
 {
 	struct bset_tree *first_set = b->sets;
 	return __bch_btree_iter_init(b, iter, search, first_set);
 }
 /* Looping macros */
 /*
 * for_each_cached_btree - visit every btree node hashed in (c)->bucket_hash
 * @b:		struct btree * cursor
 * @c:		cache set
 * @iter:	integer cursor over the hash buckets
 *
 * Uses hlist_for_each_entry_rcu() on each hash chain.
 */
 #define for_each_cached_btree(b, c, iter)				\
 	for (iter = 0;							\
 	     iter < ARRAY_SIZE((c)->bucket_hash);			\
 	     iter++)							\
 		hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
 /*
 * for_each_key_filter - iterate over the keys of btree node @b via
 * bch_btree_iter_next_filter()
 * @b:		btree node
 * @k:		struct bkey * cursor; the loop ends when it becomes NULL
 * @iter:	pointer to a struct btree_iter used as scratch state
 * @filter:	ptr_filter_fn handed to bch_btree_iter_next_filter()
 *
 * All arguments are parenthesised so arbitrary expressions can be passed.
 */
 #define for_each_key_filter(b, k, iter, filter)				\
 	for (bch_btree_iter_init((b), (iter), NULL);			\
 	     ((k) = bch_btree_iter_next_filter((iter), (b), (filter)));)
 /*
 * for_each_key - iterate over the keys of btree node @b, unfiltered
 * @b:		btree node
 * @k:		struct bkey * cursor; the loop ends when it becomes NULL
 * @iter:	pointer to a struct btree_iter used as scratch state
 */
 #define for_each_key(b, k, iter)					\
 	for (bch_btree_iter_init((b), (iter), NULL);			\
 	     ((k) = bch_btree_iter_next(iter));)
 /* Recursing down the btree */
 /*
 * Per-operation state threaded through the btree code; see the comment at
 * the top of this header ("struct btree_op is a central interface...").
 */
 struct btree_op {
 	struct closure		cl;
 	struct cache_set	*c;
 	/* Journal entry we have a refcount on */
 	atomic_t		*journal;
 	/* Bio to be inserted into the cache */
 	struct bio		*cache_bio;
 	unsigned		inode;
 	uint16_t		write_prio;
 	/* Btree level at which we start taking write locks */
 	short			lock;
 	/* Btree insertion type */
 	enum {
 		BTREE_INSERT,
 		BTREE_REPLACE
 	} type:8;
 	/* Single-bit status/option flags */
 	unsigned		csum:1;
 	unsigned		skip:1;
 	unsigned		flush_journal:1;
 	unsigned		insert_data_done:1;
 	unsigned		lookup_done:1;
 	unsigned		insert_collision:1;
 	/* Anything after this point won't get zeroed in do_bio_hook() */
 	/* Keys to be inserted */
 	struct keylist		keys;
 	BKEY_PADDED(replace);
 };
 /* Initialiser for a struct btree_op; defined elsewhere */
 void bch_btree_op_init_stack(struct btree_op *);
 /*
 * Take node @b's lock for writing (@w true) or reading, using lockdep
 * subclass @level + 1. A write lock also bumps b->seq.
 */
 static inline void rw_lock(bool w, struct btree *b, int level)
 {
 	if (w) {
 		down_write_nested(&b->lock, level + 1);
 		b->seq++;
 	} else {
 		down_read_nested(&b->lock, level + 1);
 	}
 }
 /*
 * Release node @b's lock; @w must match the mode it was taken in. A write
 * unlock bumps b->seq before the lock is dropped. With CONFIG_BCACHE_EDEBUG,
 * a write unlock of a read-in node with a non-zero first pointer also checks
 * the key order of every set.
 */
 static inline void rw_unlock(bool w, struct btree *b)
 {
 #ifdef CONFIG_BCACHE_EDEBUG
 	unsigned set;
 	if (w &&
 	    b->key.ptr[0] &&
 	    btree_node_read_done(b))
 		for (set = 0; set <= b->nsets; set++)
 			bch_check_key_order(b, b->sets[set].data);
 #endif
 	if (w) {
 		b->seq++;
 		up_write(&b->lock);
 	} else {
 		up_read(&b->lock);
 	}
 }
 /* True when node @b is at or below the level where op @s takes write locks */
 #define insert_lock(s, b)	((b)->level <= (s)->lock)
 /*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; btree_root() will
 * call you again and you'll have the correct lock.
 */
 /**
 * btree - recurse down the btree on a specified key
 * @fn:		function to call, which will be passed the child node
 * @key:	key to recurse on
 * @b:		parent btree node
 * @op:		pointer to struct btree_op
 *
 * Looks up the child of @b for @key one level down with
 * bch_btree_node_get(), calls bch_btree_<fn>(child, op, ...) on it and
 * unlocks the child afterwards. Evaluates to the callee's return value, or to
 * PTR_ERR() of the lookup result if the child could not be obtained.
 *
 * All macro-local variables carry a leading underscore so they cannot shadow
 * identifiers used in the caller's argument expressions.
 */
 #define btree(fn, key, b, op, ...)					\
 ({									\
 	int _r, _l = (b)->level - 1;					\
 	bool _w = _l <= (op)->lock;					\
 	struct btree *_b = bch_btree_node_get((b)->c, (key), _l, op);	\
 	if (!IS_ERR(_b)) {						\
 		_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);		\
 		rw_unlock(_w, _b);					\
 	} else								\
 		_r = PTR_ERR(_b);					\
 	_r;								\
 })
 /**
 * btree_root - call a function on the root of the btree
 * @fn:		function to call, which will be passed the child node
 * @c:		cache set
 * @op:		pointer to struct btree_op
 *
 * Locks the current root in the mode insert_lock() selects, then re-checks
 * that it is still the root and that the lock mode is still the right one
 * before calling bch_btree_<fn>(). If either check fails, or the callee
 * returns -EINTR, the whole sequence is retried. bch_cannibalize_unlock() is
 * called after every attempt. Evaluates to the callee's return value.
 */
 #define btree_root(fn, c, op, ...)					\
 ({									\
 	int _r = -EINTR;						\
 	do {								\
 		struct btree *_b = (c)->root;				\
 		bool _w = insert_lock(op, _b);				\
 		rw_lock(_w, _b, _b->level);				\
 		if (_b == (c)->root &&					\
 		    _w == insert_lock(op, _b))				\
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
 		rw_unlock(_w, _b);					\
 		bch_cannibalize_unlock(c, &(op)->cl);		\
 	} while (_r == -EINTR);						\
 									\
 	_r;								\
 })
 static inline bool should_split(struct btree *b)
 {
 	struct bset *i = write_block(b);
 	return b->written >= btree_blocks(b) ||
 		(i->seq == b->sets[0].data->seq &&
 		 b->written + __set_blocks(i, i->keys + 15, b->c)
 		 > btree_blocks(b));
 }
 /*
 * Out-of-line btree entry points; their definitions are not part of this
 * header. Grouped below as: node IO, cache/root management, insertion and
 * lookup, garbage collection, and keybuf helpers.
 */
 /* Node IO */
 void bch_btree_read_done(struct closure *);
 void bch_btree_read(struct btree *);
 void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
 /* Btree cache and root management */
 void bch_cannibalize_unlock(struct cache_set *, struct closure *);
 void bch_btree_set_root(struct btree *);
 struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
 struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
 				int, struct btree_op *);
 /* Insertion and lookup */
 bool bch_btree_insert_keys(struct btree *, struct btree_op *);
 bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
 				   struct bio *);
 int bch_btree_insert(struct btree_op *, struct cache_set *);
 int bch_btree_search_recurse(struct btree *, struct btree_op *);
 /* Garbage collection */
 void bch_queue_gc(struct cache_set *);
 size_t bch_btree_gc_finish(struct cache_set *);
 void bch_moving_gc(struct closure *);
 int bch_btree_check(struct cache_set *, struct btree_op *);
 uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
 /* Keybufs */
 void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *);
 void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *);
 bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
 				  struct bkey *);
 void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
 struct keybuf_key *bch_keybuf_next(struct keybuf *);
 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *,
 					  struct keybuf *, struct bkey *);
 #endif
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@ -0,0 +1,348 @@
 /*
 * Asynchronous refcounty things
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
 #include <linux/debugfs.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include "closure.h"
 /*
  * closure_queue() - run a closure's function
  * @cl:	closure to run
  *
  * With no workqueue set, cl->fn is called directly in the caller's context;
  * otherwise the embedded work item is queued on cl->wq.
  */
 void closure_queue(struct closure *cl)
 {
 	/* cl->wq shares storage with cl->work (union), so fetch it first */
 	struct workqueue_struct *wq = cl->wq;
 	if (!wq) {
 		cl->fn(cl);
 		return;
 	}
 	/* presumably work.func overlays cl->fn in the union - TODO confirm */
 	INIT_WORK(&cl->work, cl->work.func);
 	BUG_ON(!queue_work(wq, &cl->work));
 }
 EXPORT_SYMBOL_GPL(closure_queue);
 /*
  * CL_FIELD() - emit a switch case for TYPE_<type> that returns the address of
  * @field inside the struct <type> embedding the local variable cl.
  */
 #define CL_FIELD(type, field)					\
 	case TYPE_ ## type:					\
 	return &container_of(cl, struct type, cl)->field
 /*
  * Return the waitlist embedded next to @cl, or NULL when cl->type is not one
  * of the closure types that carries a waitlist.
  */
 static struct closure_waitlist *closure_waitlist(struct closure *cl)
 {
 	switch (cl->type) {
 		CL_FIELD(closure_with_waitlist, wait);
 		CL_FIELD(closure_with_waitlist_and_timer, wait);
 	default:
 		return NULL;
 	}
 }
 /*
  * Return the timer embedded next to @cl, or NULL when cl->type is not one of
  * the closure types that carries a timer.
  */
 static struct timer_list *closure_timer(struct closure *cl)
 {
 	switch (cl->type) {
 		CL_FIELD(closure_with_timer, timer);
 		CL_FIELD(closure_with_waitlist_and_timer, timer);
 	default:
 		return NULL;
 	}
 }
 static inline void closure_put_after_sub(struct closure *cl, int flags)
 {
 	int r = flags & CLOSURE_REMAINING_MASK;
 	BUG_ON(flags & CLOSURE_GUARD_MASK);
 	BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING)));
 	/* Must deliver precisely one wakeup */
 	if (r == 1 && (flags & CLOSURE_SLEEPING))
 		wake_up_process(cl->task);
 	if (!r) {
 		if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
 			/* CLOSURE_BLOCKING might be set - clear it */
 			atomic_set(&cl->remaining,
 				   CLOSURE_REMAINING_INITIALIZER);
 			closure_queue(cl);
 		} else {
 			struct closure *parent = cl->parent;
 			struct closure_waitlist *wait = closure_waitlist(cl);
 			closure_debug_destroy(cl);
 			atomic_set(&cl->remaining, -1);
 			if (wait)
 				closure_wake_up(wait);
 			if (cl->fn)
 				cl->fn(cl);
 			if (parent)
 				closure_put(parent);
 		}
 	}
 }
 /*
  * closure_sub() - subtract @v from cl->remaining and act on the result
  *
  * Lets a caller clear state flags and drop a reference in a single atomic
  * operation.
  */
 void closure_sub(struct closure *cl, int v)
 {
 	int remaining = atomic_sub_return(v, &cl->remaining);
 	closure_put_after_sub(cl, remaining);
 }
 EXPORT_SYMBOL_GPL(closure_sub);
 /* closure_put() - drop one reference on @cl and act on the new count */
 void closure_put(struct closure *cl)
 {
 	int remaining = atomic_dec_return(&cl->remaining);
 	closure_put_after_sub(cl, remaining);
 }
 EXPORT_SYMBOL_GPL(closure_put);
 /*
  * Record @f in cl->waiting_on; a no-op unless CONFIG_BCACHE_CLOSURES_DEBUG is
  * set. Callers in this file pass _RET_IP_ when waiting and 0 when woken.
  */
 static void set_waiting(struct closure *cl, unsigned long f)
 {
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 	cl->waiting_on = f;
 #endif
 }
 /*
  * __closure_wake_up() - wake every closure currently on @wait_list
  *
  * Detaches the whole list, then for each closure clears CLOSURE_WAITING and
  * drops the reference that closure_wait() added.
  */
 void __closure_wake_up(struct closure_waitlist *wait_list)
 {
 	struct llist_node *pending = llist_del_all(&wait_list->list);
 	struct llist_node *fifo = NULL;
 	/* We first reverse the list to preserve FIFO ordering and fairness */
 	while (pending) {
 		struct llist_node *node = pending;
 		pending = llist_next(pending);
 		node->next = fifo;
 		fifo = node;
 	}
 	/* Then do the wakeups */
 	while (fifo) {
 		struct closure *cl = container_of(fifo, struct closure, list);
 		/* advance before closure_sub(): cl is not touched afterwards */
 		fifo = llist_next(fifo);
 		set_waiting(cl, 0);
 		closure_sub(cl, CLOSURE_WAITING + 1);
 	}
 }
 EXPORT_SYMBOL_GPL(__closure_wake_up);
 /*
  * closure_wait() - put @cl on @list
  *
  * Returns false without doing anything if @cl already has CLOSURE_WAITING
  * set. Otherwise sets the flag and takes a reference in one atomic add,
  * pushes @cl onto @list and returns true.
  */
 bool closure_wait(struct closure_waitlist *list, struct closure *cl)
 {
 	if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
 		return false;
 	/* debug builds remember who put the closure on the list */
 	set_waiting(cl, _RET_IP_);
 	atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
 	llist_add(&cl->list, &list->list);
 	return true;
 }
 EXPORT_SYMBOL_GPL(closure_wait);
 /**
 * closure_sync() - sleep until a closure has nothing left to wait on
 *
 * Sleeps until the refcount hits 1 - the thread that's running the closure owns
 * the last refcount.
 */
 void closure_sync(struct closure *cl)
 {
 	for (;;) {
 		int refs;
 		/* mark ourselves sleeping before sampling the count */
 		__closure_start_sleep(cl);
 		closure_set_ret_ip(cl);
 		refs = atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK;
 		if (refs == 1)
 			break;
 		schedule();
 	}
 	__closure_end_sleep(cl);
 }
 EXPORT_SYMBOL_GPL(closure_sync);
 /**
 * closure_trylock() - try to acquire the closure, without waiting
 * @cl:		closure to lock
 * @parent:	parent to attach to @cl on success (a ref is taken on it); may be NULL
 *
 * Returns true if the closure was successfully locked.
 */
 bool closure_trylock(struct closure *cl, struct closure *parent)
 {
 	/* -1 is the "unlocked" value; only one caller can swap it away */
 	if (atomic_cmpxchg(&cl->remaining, -1,
 			   CLOSURE_REMAINING_INITIALIZER) != -1)
 		return false;
 	closure_set_ret_ip(cl);
 	smp_mb();
 	cl->parent = parent;
 	/* the closure holds a reference on its parent while it is locked */
 	if (parent)
 		closure_get(parent);
 	closure_debug_create(cl);
 	return true;
 }
 EXPORT_SYMBOL_GPL(closure_trylock);
 /*
  * __closure_lock() - lock @cl, sleeping on @wait_list while it is held
  * @cl:		closure to lock
  * @parent:		passed through to closure_trylock()
  * @wait_list:	waitlist to sleep on between attempts
  *
  * Retries closure_trylock() until it succeeds; between attempts an on-stack
  * closure waits synchronously for cl->remaining to read -1.
  */
 void __closure_lock(struct closure *cl, struct closure *parent,
 		    struct closure_waitlist *wait_list)
 {
 	struct closure wait;
 	closure_init_stack(&wait);
 	while (!closure_trylock(cl, parent))
 		closure_wait_event_sync(wait_list, &wait,
 					atomic_read(&cl->remaining) == -1);
 }
 EXPORT_SYMBOL_GPL(__closure_lock);
 /*
  * Timer callback installed by do_closure_timer_init(): @data is the closure
  * pointer; clears CLOSURE_TIMER and drops the reference that
  * __closure_delay() took.
  */
 static void closure_delay_timer_fn(unsigned long data)
 {
 	closure_sub((struct closure *) data, CLOSURE_TIMER + 1);
 }
 /*
  * do_closure_timer_init() - set up the timer embedded alongside @cl
  *
  * Must only be called for closure types that embed a timer: closure_timer()
  * returns NULL for the others and the result is used unchecked.
  */
 void do_closure_timer_init(struct closure *cl)
 {
 	struct timer_list *timer = closure_timer(cl);
 	init_timer(timer);
 	timer->data	= (unsigned long) cl;
 	timer->function = closure_delay_timer_fn;
 }
 EXPORT_SYMBOL_GPL(do_closure_timer_init);
 /*
  * __closure_delay() - arm @timer to drop a reference on @cl after @delay
  * @cl:	closure to hold a reference on
  * @delay:	delay in jiffies
  * @timer:	timer to arm
  *
  * Returns false if @cl already has CLOSURE_TIMER set, true once the timer
  * has been armed.
  */
 bool __closure_delay(struct closure *cl, unsigned long delay,
 		     struct timer_list *timer)
 {
 	if (atomic_read(&cl->remaining) & CLOSURE_TIMER)
 		return false;
 	BUG_ON(timer_pending(timer));
 	timer->expires	= jiffies + delay;
 	/* flag and reference are taken before the timer can possibly fire */
 	atomic_add(CLOSURE_TIMER + 1, &cl->remaining);
 	add_timer(timer);
 	return true;
 }
 EXPORT_SYMBOL_GPL(__closure_delay);
 /*
  * __closure_flush() - cancel a pending delay on @cl
  *
  * If del_timer() reports that @timer was removed, clear CLOSURE_TIMER and
  * drop the reference taken by __closure_delay().
  */
 void __closure_flush(struct closure *cl, struct timer_list *timer)
 {
 	if (!del_timer(timer))
 		return;
 	closure_sub(cl, CLOSURE_TIMER + 1);
 }
 EXPORT_SYMBOL_GPL(__closure_flush);
 /*
  * __closure_flush_sync() - like __closure_flush() but uses del_timer_sync()
  */
 void __closure_flush_sync(struct closure *cl, struct timer_list *timer)
 {
 	if (!del_timer_sync(timer))
 		return;
 	closure_sub(cl, CLOSURE_TIMER + 1);
 }
 EXPORT_SYMBOL_GPL(__closure_flush_sync);
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 static LIST_HEAD(closure_list);
 static DEFINE_SPINLOCK(closure_list_lock);
 /*
  * closure_debug_create() - mark @cl alive and add it to the global debug list
  *
  * BUGs if @cl is already marked alive.
  */
 void closure_debug_create(struct closure *cl)
 {
 	unsigned long flags;
 	BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
 	cl->magic = CLOSURE_MAGIC_ALIVE;
 	spin_lock_irqsave(&closure_list_lock, flags);
 	list_add(&cl->all, &closure_list);
 	spin_unlock_irqrestore(&closure_list_lock, flags);
 }
 EXPORT_SYMBOL_GPL(closure_debug_create);
 /*
  * closure_debug_destroy() - mark @cl dead and remove it from the debug list
  *
  * BUGs if @cl is not currently marked alive.
  */
 void closure_debug_destroy(struct closure *cl)
 {
 	unsigned long flags;
 	BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
 	cl->magic = CLOSURE_MAGIC_DEAD;
 	spin_lock_irqsave(&closure_list_lock, flags);
 	list_del(&cl->all);
 	spin_unlock_irqrestore(&closure_list_lock, flags);
 }
 EXPORT_SYMBOL_GPL(closure_debug_destroy);
 static struct dentry *debug;
 /* View a work_struct's data word as a bitmap for test_bit() */
 #define work_data_bits(work) ((unsigned long *)(&(work)->data))
 /*
  * seq_file show callback: dumps every closure on closure_list.
  *
  * Per closure it prints the address, recorded ip, fn, parent and refcount,
  * then one letter group per state bit that is set (Q = work pending,
  * R = running, B = blocking, S = stack, Sl = sleeping, T = timer), and for
  * waiting closures the address recorded in waiting_on.
  */
 static int debug_seq_show(struct seq_file *f, void *data)
 {
 	struct closure *cl;
 	spin_lock_irq(&closure_list_lock);
 	list_for_each_entry(cl, &closure_list, all) {
 		/* sample once so all flags printed come from the same value */
 		int r = atomic_read(&cl->remaining);
 		seq_printf(f, "%p: %pF -> %pf p %p r %i ",
 			   cl, (void *) cl->ip, cl->fn, cl->parent,
 			   r & CLOSURE_REMAINING_MASK);
 		seq_printf(f, "%s%s%s%s%s%s\n",
 			   test_bit(WORK_STRUCT_PENDING,
 				    work_data_bits(&cl->work)) ? "Q" : "",
 			   r & CLOSURE_RUNNING	? "R" : "",
 			   r & CLOSURE_BLOCKING	? "B" : "",
 			   r & CLOSURE_STACK	? "S" : "",
 			   r & CLOSURE_SLEEPING	? "Sl" : "",
 			   r & CLOSURE_TIMER	? "T" : "");
 		if (r & CLOSURE_WAITING)
 			seq_printf(f, " W %pF\n",
 				   (void *) cl->waiting_on);
 		seq_printf(f, "\n");
 	}
 	spin_unlock_irq(&closure_list_lock);
 	return 0;
 }
 /* open() handler: binds debug_seq_show() to @file via single_open() */
 static int debug_seq_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, debug_seq_show, NULL);
 }
 /* File operations for the "closures" debugfs file (read-only seq_file) */
 static const struct file_operations debug_ops = {
 	.owner		= THIS_MODULE,
 	.open		= debug_seq_open,
 	.read		= seq_read,
 	.release	= single_release
 };
 /*
  * Create the "closures" debugfs file (mode 0400, NULL parent). Always returns
  * 0; the result of debugfs_create_file() is stored in debug but not checked.
  */
 int __init closure_debug_init(void)
 {
 	debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
 	return 0;
 }
 module_init(closure_debug_init);
 #endif
 MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
 MODULE_LICENSE("GPL");
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@ -0,0 +1,670 @@
 #ifndef _LINUX_CLOSURE_H
 #define _LINUX_CLOSURE_H
 #include <linux/llist.h>
 #include <linux/sched.h>
 #include <linux/workqueue.h>
 /*
 * Closure is perhaps the most overused and abused term in computer science, but
 * since I've been unable to come up with anything better you're stuck with it
 * again.
 *
 * What are closures?
 *
 * They embed a refcount. The basic idea is they count "things that are in
 * progress" - in flight bios, some other thread that's doing something else -
 * anything you might want to wait on.
 *
 * The refcount may be manipulated with closure_get() and closure_put().
 * closure_put() is where many of the interesting things happen, when it causes
 * the refcount to go to 0.
 *
 * Closures can be used to wait on things both synchronously and asynchronously,
 * and synchronous and asynchronous use can be mixed without restriction. To
 * wait synchronously, use closure_sync() - you will sleep until your closure's
 * refcount hits 1.
 *
 * To wait asynchronously, use
 *   continue_at(cl, next_function, workqueue);
 *
 * passing it, as you might expect, the function to run when nothing is pending
 * and the workqueue to run that function out of.
 *
 * continue_at() also, critically, is a macro that returns the calling function.
 * There's good reason for this.
 *
 * To use closures asynchronously and safely, they must always have a refcount while
 * they are running owned by the thread that is running them. Otherwise, suppose
 * you submit some bios and wish to have a function run when they all complete:
 *
 * foo_endio(struct bio *bio, int error)
 * {
 *	closure_put(cl);
 * }
 *
 * closure_init(cl);
 *
 * do_stuff();
 * closure_get(cl);
 * bio1->bi_endio = foo_endio;
 * bio_submit(bio1);
 *
 * do_more_stuff();
 * closure_get(cl);
 * bio2->bi_endio = foo_endio;
 * bio_submit(bio2);
 *
 * continue_at(cl, complete_some_read, system_wq);
 *
 * If closure's refcount started at 0, complete_some_read() could run before the
 * second bio was submitted - which is almost always not what you want! More
 * importantly, it wouldn't be possible to say whether the original thread or
 * complete_some_read()'s thread owned the closure - and whatever state it was
 * associated with!
 *
 * So, closure_init() initializes a closure's refcount to 1 - and when a
 * closure_fn is run, the refcount will be reset to 1 first.
 *
 * Then, the rule is - if you got the refcount with closure_get(), release it
 * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
 * on a closure because you called closure_init() or you were run out of a
 * closure - _always_ use continue_at(). Doing so consistently will help
 * eliminate an entire class of particularly pernicious races.
 *
 * For a closure to wait on an arbitrary event, we need to introduce waitlists:
 *
 * struct closure_waitlist list;
 * closure_wait_event(list, cl, condition);
 * closure_wake_up(wait_list);
 *
 * These work analogously to wait_event() and wake_up() - except that instead of
 * operating on the current thread (for wait_event()) and lists of threads, they
 * operate on an explicit closure and lists of closures.
 *
 * Because it's a closure we can now wait either synchronously or
 * asynchronously. closure_wait_event() returns the current value of the
 * condition, and if it returned false continue_at() or closure_sync() can be
 * used to wait for it to become true.
 *
 * It's useful for waiting on things when you can't sleep in the context in
 * which you must check the condition (perhaps a spinlock held, or you might be
 * beneath generic_make_request() - in which case you can't sleep on IO).
 *
 * closure_wait_event() will wait either synchronously or asynchronously,
 * depending on whether the closure is in blocking mode or not. You can pick a
 * mode explicitly with closure_wait_event_sync() and
 * closure_wait_event_async(), which do just what you might expect.
 *
 * Lastly, you might have a wait list dedicated to a specific event, and have no
 * need for specifying the condition - you just want to wait until someone runs
 * closure_wake_up() on the appropriate wait list. In that case, just use
 * closure_wait(). It will return either true or false, depending on whether the
 * closure was already on a wait list or not - a closure can only be on one wait
 * list at a time.
 *
 * Parents:
 *
 * closure_init() takes two arguments - it takes the closure to initialize, and
 * a (possibly null) parent.
 *
 * If parent is non null, the new closure will have a refcount for its lifetime;
 * a closure is considered to be "finished" when its refcount hits 0 and the
 * function to run is null. Hence
 *
 * continue_at(cl, NULL, NULL);
 *
 * returns up the (spaghetti) stack of closures, precisely like normal return
 * returns up the C stack. continue_at() with non null fn is better thought of
 * as doing a tail call.
 *
 * All this implies that a closure should typically be embedded in a particular
 * struct (which its refcount will normally control the lifetime of), and that
 * struct can very much be thought of as a stack frame.
 *
 * Locking:
 *
 * Closures are based on work items but they can be thought of as more like
 * threads - in that like threads and unlike work items they have a well
 * defined lifetime; they are created (with closure_init()) and eventually
 * complete after a continue_at(cl, NULL, NULL).
 *
 * Suppose you've got some larger structure with a closure embedded in it that's
 * used for periodically doing garbage collection. You only want one garbage
 * collection happening at a time, so the natural thing to do is protect it with
 * a lock. However, it's difficult to use a lock protecting a closure correctly
 * because the unlock should come after the last continue_at() (additionally, if
 * you're using the closure asynchronously a mutex won't work since a mutex has
 * to be unlocked by the same process that locked it).
 *
 * So to make it less error prone and more efficient, we also have the ability
 * to use closures as locks:
 *
 * closure_init_unlocked();
 * closure_trylock();
 *
 * That's all we need for trylock() - the last closure_put() implicitly unlocks
 * it for you.  But for closure_lock(), we also need a wait list:
 *
 * struct closure_with_waitlist frobnicator_cl;
 *
 * closure_init_unlocked(&frobnicator_cl);
 * closure_lock(&frobnicator_cl);
 *
 * A closure_with_waitlist embeds a closure and a wait list - much like struct
 * delayed_work embeds a work item and a timer_list. The important thing is, use
 * it exactly like you would a regular closure and closure_put() will magically
 * handle everything for you.
 *
 * We've got closures that embed timers, too. They're called, appropriately
 * enough:
 * struct closure_with_timer;
 *
 * This gives you access to closure_delay(). It takes a refcount for a specified
 * number of jiffies - you could then call closure_sync() (for a slightly
 * convoluted version of msleep()) or continue_at() - which gives you the same
 * effect as using a delayed work item, except you can reuse the work_struct
 * already embedded in struct closure.
 *
 * Lastly, there's struct closure_with_waitlist_and_timer. It does what you
 * probably expect, if you happen to need the features of both. (You don't
 * really want to know how all this is implemented, but if I've done my job
 * right you shouldn't have to care).
 */
 struct closure;
 typedef void (closure_fn) (struct closure *);
 /* A list of closures waiting for an event; linked through closure.list */
 struct closure_waitlist {
 	struct llist_head	list;
 };
 /*
  * Identifies which wrapper struct a closure is embedded in. The names after
  * TYPE_ match the struct tags so that macros can paste them together.
  */
 enum closure_type {
 	TYPE_closure				= 0,
 	TYPE_closure_with_waitlist		= 1,
 	TYPE_closure_with_timer			= 2,
 	TYPE_closure_with_waitlist_and_timer	= 3,
 	MAX_CLOSURE_TYPE			= 3,
 };
 /*
  * State flags stored in closure.remaining above the refcount bits. Each flag
  * is followed by an unused bit (see CLOSURE_GUARD_MASK).
  *
  * NOTE(review): CLOSURE_STACK is written as (1 << 31), a shift into the sign
  * bit of int, and CLOSURE_GUARD_MASK shifts it left once more - confirm this
  * is intended.
  */
 enum closure_state {
 	/*
 	 * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of
 	 * waiting asynchronously
 	 *
 	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
 	 * the thread that owns the closure, and cleared by the thread that's
 	 * waking up the closure.
 	 *
 	 * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
 	 * - indicates that cl->task is valid and closure_put() may wake it up.
 	 * Only set or cleared by the thread that owns the closure.
 	 *
 	 * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure
 	 * has an outstanding timer. Must be set by the thread that owns the
 	 * closure, and cleared by the timer function when the timer goes off.
 	 *
 	 * The rest are for debugging and don't affect behaviour:
 	 *
 	 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
 	 * closure_init() and when closure_put() runs the next function), and
 	 * must be cleared before remaining hits 0. Primarily to help guard
 	 * against incorrect usage and accidentally transferring references.
 	 * continue_at() and closure_return() clear it for you, if you're doing
 	 * something unusual you can use closure_set_dead() which also helps
 	 * annotate where references are being transferred.
 	 *
 	 * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
 	 * closure with this flag set
 	 */
 	CLOSURE_BITS_START	= (1 << 19),
 	CLOSURE_DESTRUCTOR	= (1 << 19),
 	CLOSURE_BLOCKING	= (1 << 21),
 	CLOSURE_WAITING		= (1 << 23),
 	CLOSURE_SLEEPING	= (1 << 25),
 	CLOSURE_TIMER		= (1 << 27),
 	CLOSURE_RUNNING		= (1 << 29),
 	CLOSURE_STACK		= (1 << 31),
 };
 /*
  * The bit directly above each state flag. closure_put_after_sub() BUGs if
  * any of them is ever set in closure.remaining.
  */
 #define CLOSURE_GUARD_MASK					\
 	((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING|	\
 	  CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1)
 /* Bits of closure.remaining that hold the actual reference count */
 #define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1)
 /* Initial value of remaining for a running closure: one ref, RUNNING set */
 #define CLOSURE_REMAINING_INITIALIZER	(1|CLOSURE_RUNNING)
 /*
  * The core closure object: a refcount plus the function to run when it
  * drops to zero.
  */
 struct closure {
 	/* the named fields share storage with the work item */
 	union {
 		struct {
 			/* workqueue to run fn on; NULL means call it directly */
 			struct workqueue_struct *wq;
 			/* task to wake when CLOSURE_SLEEPING is set */
 			struct task_struct	*task;
 			/* link on a closure_waitlist */
 			struct llist_node	list;
 			/* next function to run, or destructor, or NULL */
 			closure_fn		*fn;
 		};
 		struct work_struct	work;
 	};
 	/* closure that gets a closure_put() when this one finishes */
 	struct closure		*parent;
 	/* refcount in the low bits, closure_state flags above; -1 = unlocked */
 	atomic_t		remaining;
 	/* which wrapper struct this closure is embedded in */
 	enum closure_type	type;
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 #define CLOSURE_MAGIC_DEAD	0xc054dead
 #define CLOSURE_MAGIC_ALIVE	0xc054a11e
 	/* CLOSURE_MAGIC_ALIVE between debug create and destroy */
 	unsigned		magic;
 	/* link on the global debug list of closures */
 	struct list_head	all;
 	/* code address recorded by closure_set_ip()/closure_set_ret_ip() */
 	unsigned long		ip;
 	/* caller of closure_wait(), or 0 once woken */
 	unsigned long		waiting_on;
 #endif
 };
 /*
  * A closure plus a waitlist. cl must stay the first member:
  * __to_internal_closure() casts a pointer to this struct to struct closure *.
  */
 struct closure_with_waitlist {
 	struct closure		cl;
 	struct closure_waitlist	wait;
 };
 /*
  * A closure plus a timer. cl must stay the first member:
  * __to_internal_closure() casts a pointer to this struct to struct closure *.
  */
 struct closure_with_timer {
 	struct closure		cl;
 	struct timer_list	timer;
 };
 /*
  * A closure plus both a waitlist and a timer. cl must stay the first member:
  * __to_internal_closure() casts a pointer to this struct to struct closure *.
  */
 struct closure_with_waitlist_and_timer {
 	struct closure		cl;
 	struct closure_waitlist	wait;
 	struct timer_list	timer;
 };
 extern unsigned invalid_closure_type(void);
 /*
  * __CLOSURE_TYPE() - one arm of the conditional chain built by
  * __closure_type(): yields TYPE_<_t> when @cl has type struct <_t>, otherwise
  * the expression that follows the trailing ':'.
  *
  * The last line must not end in a backslash, or the next #define would be
  * spliced into this macro's body.
  */
 #define __CLOSURE_TYPE(cl, _t)						\
 	  __builtin_types_compatible_p(typeof(cl), struct _t)		\
 		? TYPE_ ## _t :
 /*
  * __closure_type() - map the static type of @cl to its enum closure_type
  * value. For any other type the chain ends in a call to
  * invalid_closure_type(), which is only declared in this header.
  */
 #define __closure_type(cl)						\
 (									\
 	__CLOSURE_TYPE(cl, closure)					\
 	__CLOSURE_TYPE(cl, closure_with_waitlist)			\
 	__CLOSURE_TYPE(cl, closure_with_timer)				\
 	__CLOSURE_TYPE(cl, closure_with_waitlist_and_timer)		\
 	invalid_closure_type()						\
 )
 void closure_sub(struct closure *cl, int v);
 void closure_put(struct closure *cl);
 void closure_queue(struct closure *cl);
 void __closure_wake_up(struct closure_waitlist *list);
 bool closure_wait(struct closure_waitlist *list, struct closure *cl);
 void closure_sync(struct closure *cl);
 bool closure_trylock(struct closure *cl, struct closure *parent);
 void __closure_lock(struct closure *cl, struct closure *parent,
 		    struct closure_waitlist *wait_list);
 void do_closure_timer_init(struct closure *cl);
 bool __closure_delay(struct closure *cl, unsigned long delay,
 		     struct timer_list *timer);
 void __closure_flush(struct closure *cl, struct timer_list *timer);
 void __closure_flush_sync(struct closure *cl, struct timer_list *timer);
 /*
  * Debug bookkeeping hooks: real functions when CONFIG_BCACHE_CLOSURES_DEBUG
  * is set, empty inline stubs otherwise.
  */
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 void closure_debug_create(struct closure *cl);
 void closure_debug_destroy(struct closure *cl);
 #else
 static inline void closure_debug_create(struct closure *cl) {}
 static inline void closure_debug_destroy(struct closure *cl) {}
 #endif
 /* Debug builds only: record the current code address (_THIS_IP_) in cl->ip */
 static inline void closure_set_ip(struct closure *cl)
 {
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 	cl->ip = _THIS_IP_;
 #endif
 }
 /* Debug builds only: record the caller's return address (_RET_IP_) in cl->ip */
 static inline void closure_set_ret_ip(struct closure *cl)
 {
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 	cl->ip = _RET_IP_;
 #endif
 }
 /*
  * closure_get() - take a reference on @cl
  *
  * Debug builds BUG if the count after the increment is <= 1, i.e. if the
  * closure had no reference held before the call.
  */
 static inline void closure_get(struct closure *cl)
 {
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 	BUG_ON((atomic_inc_return(&cl->remaining) &
 		CLOSURE_REMAINING_MASK) <= 1);
 #else
 	atomic_inc(&cl->remaining);
 #endif
 }
 /*
  * Clear CLOSURE_RUNNING by subtraction; assumes the flag is currently set,
  * since subtracting it when clear would borrow from the bits above.
  */
 static inline void closure_set_stopped(struct closure *cl)
 {
 	atomic_sub(CLOSURE_RUNNING, &cl->remaining);
 }
 /* Returns true when CLOSURE_RUNNING is not set on @cl */
 static inline bool closure_is_stopped(struct closure *cl)
 {
 	return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING);
 }
 /* Returns true when cl->remaining holds -1, the "unlocked" value */
 static inline bool closure_is_unlocked(struct closure *cl)
 {
 	return atomic_read(&cl->remaining) == -1;
 }
 /*
  * do_closure_init() - common initialisation behind the closure_init*() macros
  * @cl:	closure to initialise; cl->type must already be set
  * @parent:	parent closure, or NULL; a reference is taken on it
  * @running:	true to start running with one reference, false to leave the
  *		closure unlocked (remaining == -1)
  */
 static inline void do_closure_init(struct closure *cl, struct closure *parent,
 				   bool running)
 {
 	switch (cl->type) {
 	case TYPE_closure_with_timer:
 	case TYPE_closure_with_waitlist_and_timer:
 		do_closure_timer_init(cl);
 		/* falls through to the break below */
 	default:
 		break;
 	}
 	cl->parent = parent;
 	if (parent)
 		closure_get(parent);
 	if (running) {
 		closure_debug_create(cl);
 		atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
 	} else
 		atomic_set(&cl->remaining, -1);
 	closure_set_ip(cl);
 }
 /*
  * Hack to get at the embedded closure if there is one, by doing an unsafe cast:
  * the result of __closure_type() is thrown away, it's used merely for type
  * checking.
  *
  * The cast relies on struct closure being the first member of every wrapper
  * struct. @cl is parenthesized so that any pointer expression may be passed.
  */
 #define __to_internal_closure(cl)				\
 ({								\
 	BUILD_BUG_ON(__closure_type(*(cl)) > MAX_CLOSURE_TYPE);	\
 	(struct closure *) (cl);				\
 })
 /*
  * closure_init_type() - record the static type of @cl in its type field,
  * then run do_closure_init() on the embedded closure.
  */
 #define closure_init_type(cl, parent, running)			\
 do {								\
 	struct closure *_cl = __to_internal_closure(cl);	\
 	_cl->type = __closure_type(*(cl));			\
 	do_closure_init(_cl, parent, running);			\
 } while (0)
 /**
  * __closure_init() - Initialize a closure, skipping the memset()
  * @cl:		closure to initialize
  * @parent:	parent of the new closure; may be NULL
  *
  * May be used instead of closure_init() when memory has already been zeroed.
  */
 #define __closure_init(cl, parent)				\
 	closure_init_type(cl, parent, true)
 /**
  * closure_init() - Initialize a closure, setting the refcount to 1
  * @cl:		closure to initialize
  * @parent:	parent of the new closure. cl will take a refcount on it for its
  *		lifetime; may be NULL.
  *
  * Zeroes the whole object @cl points to (including any embedded waitlist or
  * timer) before initializing it.
  */
 #define closure_init(cl, parent)				\
 do {								\
 	memset((cl), 0, sizeof(*(cl)));				\
 	__closure_init(cl, parent);				\
 } while (0)
 /*
  * closure_init_stack() - initialise a plain closure for synchronous use
  *
  * Zeroes @cl and starts it with one reference and the RUNNING, BLOCKING and
  * STACK flags set. No parent is attached and closure_debug_create() is not
  * called.
  */
 static inline void closure_init_stack(struct closure *cl)
 {
 	memset(cl, 0, sizeof(struct closure));
 	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|
 		   CLOSURE_BLOCKING|CLOSURE_STACK);
 }
 /**
  * closure_init_unlocked() - Initialize a closure but leave it unlocked.
  * @cl:		closure to initialize
  *
  * For when the closure will be used as a lock. The closure may not be used
  * until after a closure_lock() or closure_trylock().
  *
  * The closure is zeroed, gets no parent, and its remaining count is set to
  * -1, the value closure_trylock() looks for.
  */
 #define closure_init_unlocked(cl)				\
 do {								\
 	memset((cl), 0, sizeof(*(cl)));				\
 	closure_init_type(cl, NULL, false);			\
 } while (0)
 /**
  * closure_lock() - lock and initialize a closure.
  * @cl:		the closure to lock
  * @parent:	the new parent for this closure
  *
  * The closure must be of one of the types that has a waitlist (otherwise we
  * wouldn't be able to sleep on contention).
  *
  * @parent has exactly the same meaning as in closure_init(); if non null, the
  * closure will take a reference on @parent which will be released when it is
  * unlocked.
  *
  * Expands to a use of (cl)->wait, so passing a type without a wait member
  * fails to compile.
  */
 #define closure_lock(cl, parent)				\
 	__closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
 /**
  * closure_delay() - delay some number of jiffies
  * @cl:		the closure that will sleep
  * @delay:	the delay in jiffies
  *
  * Takes a refcount on @cl which will be released after @delay jiffies; this may
  * be used to have a function run after a delay with continue_at(), or
  * closure_sync() may be used for a convoluted version of msleep().
  *
  * Evaluates to false if @cl already has a delay outstanding. @cl must be a
  * type with a timer member.
  */
 #define closure_delay(cl, delay)			\
 	__closure_delay(__to_internal_closure(cl), delay, &(cl)->timer)
 /* closure_flush() - cancel a pending closure_delay() using del_timer() */
 #define closure_flush(cl)				\
 	__closure_flush(__to_internal_closure(cl), &(cl)->timer)
 /* closure_flush_sync() - as closure_flush(), but using del_timer_sync() */
 #define closure_flush_sync(cl)				\
 	__closure_flush_sync(__to_internal_closure(cl), &(cl)->timer)
 /*
  * Undo __closure_start_sleep(): mark the task running again and clear
  * CLOSURE_SLEEPING if it is set.
  */
 static inline void __closure_end_sleep(struct closure *cl)
 {
 	__set_current_state(TASK_RUNNING);
 	if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
 		atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
 }
 /*
  * Prepare the current task to sleep on @cl: record it in cl->task, enter
  * TASK_UNINTERRUPTIBLE and set CLOSURE_SLEEPING if not already set. The
  * caller is expected to check its condition and then call schedule().
  */
 static inline void __closure_start_sleep(struct closure *cl)
 {
 	closure_set_ip(cl);
 	cl->task = current;
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
 		atomic_add(CLOSURE_SLEEPING, &cl->remaining);
 }
 /**
  * closure_blocking() - returns true if the closure is in blocking mode.
  * @cl:		closure to test
  *
  * If a closure is in blocking mode, closure_wait_event() will sleep until the
  * condition is true instead of waiting asynchronously.
  */
 static inline bool closure_blocking(struct closure *cl)
 {
 	return atomic_read(&cl->remaining) & CLOSURE_BLOCKING;
 }
 /**
  * set_closure_blocking() - put a closure in blocking mode.
  * @cl:		closure to modify
  *
  * If a closure is in blocking mode, closure_wait_event() will sleep until the
  * condition is true instead of waiting asynchronously.
  *
  * Not thread safe - can only be called by the thread running the closure.
  */
 static inline void set_closure_blocking(struct closure *cl)
 {
 	/* test and add are two separate steps, hence the restriction above */
 	if (!closure_blocking(cl))
 		atomic_add(CLOSURE_BLOCKING, &cl->remaining);
 }
 /*
  * clear_closure_blocking() - take a closure out of blocking mode.
  *
  * Not thread safe - can only be called by the thread running the closure.
  */
 static inline void clear_closure_blocking(struct closure *cl)
 {
 	if (closure_blocking(cl))
 		atomic_sub(CLOSURE_BLOCKING, &cl->remaining);
 }
 /**
  * closure_wake_up() - wake up all closures on a wait list.
  * @list:	the wait list to wake
  *
  * Issues a full memory barrier and then calls __closure_wake_up().
  */
 static inline void closure_wake_up(struct closure_waitlist *list)
 {
 	smp_mb();
 	__closure_wake_up(list);
 }
 /*
  * Wait on an event, synchronously or asynchronously - analogous to wait_event()
  * but for closures.
  *
  * @list:	wait list to wait on
  * @cl:		closure doing the waiting
  * @condition:	expression to wait for; it is re-evaluated on every pass
  * @_block:	true to sleep until @condition holds, false to return after
  *		queueing @cl on @list
  *
  * Evaluates to the last value of @condition.
  *
  * The loop is oddly structured so as to avoid a race; we must check the
  * condition again after we've added ourself to the waitlist. We know if we were
  * already on the waitlist because closure_wait() returns false; thus, we only
  * schedule or break if closure_wait() returns false. If it returns true, we
  * just loop again - rechecking the condition.
  *
  * The __closure_wake_up() is necessary because we may race with the event
  * becoming true; i.e. we see event false -> wait -> recheck condition, but the
  * thread that made the event true may have called closure_wake_up() before we
  * added ourself to the wait list.
  *
  * We have to call closure_sync() at the end instead of just
  * __closure_end_sleep() because a different thread might've called
  * closure_wake_up() before us and gotten preempted before they dropped the
  * refcount on our closure. If this was a stack allocated closure, that would be
  * bad.
  */
 #define __closure_wait_event(list, cl, condition, _block)		\
 ({									\
 	bool block = _block;						\
 	typeof(condition) ret;						\
 									\
 	while (1) {							\
 		ret = (condition);					\
 		if (ret) {						\
 			__closure_wake_up(list);			\
 			if (block)					\
 				closure_sync(cl);			\
 									\
 			break;						\
 		}							\
 									\
 		if (block)						\
 			__closure_start_sleep(cl);			\
 									\
 		if (!closure_wait(list, cl)) {				\
 			if (!block)					\
 				break;					\
 									\
 			schedule();					\
 		}							\
 	}								\
 									\
 	ret;								\
 })
 /**
  * closure_wait_event() - wait on a condition, synchronously or asynchronously.
  * @list:	the wait list to wait on
  * @cl:		the closure that is doing the waiting
  * @condition:	a C expression for the event to wait for
  *
  * If the closure is in blocking mode, sleeps until the @condition evaluates to
  * true - exactly like wait_event().
  *
  * If the closure is not in blocking mode, waits asynchronously; if the
  * condition is currently false the @cl is put onto @list and returns. @list
  * owns a refcount on @cl; closure_sync() or continue_at() may be used later to
  * wait for another thread to wake up @list, which drops the refcount on @cl.
  *
  * Returns the value of @condition; @cl will be on @list iff @condition was
  * false.
  *
  * closure_wake_up(@list) must be called after changing any variable that could
  * cause @condition to become true.
  */
 #define closure_wait_event(list, cl, condition)				\
 	__closure_wait_event(list, cl, condition, closure_blocking(cl))
 /* As closure_wait_event(), but never sleeps regardless of blocking mode */
 #define closure_wait_event_async(list, cl, condition)			\
 	__closure_wait_event(list, cl, condition, false)
 /* As closure_wait_event(), but always sleeps until @condition is true */
 #define closure_wait_event_sync(list, cl, condition)			\
 	__closure_wait_event(list, cl, condition, true)
 /*
  * set_closure_fn() - set the function @cl runs next and where it runs
  * @cl:	closure to update; BUGs if it lives on the stack
  * @fn:	function to run next, or NULL
  * @wq:	workqueue to run @fn on, or NULL
  */
 static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
 				  struct workqueue_struct *wq)
 {
 	BUG_ON(object_is_on_stack(cl));
 	closure_set_ip(cl);
 	cl->fn = fn;
 	cl->wq = wq;
 	/* order the stores above before the atomic_dec() in closure_put() */
 	smp_mb__before_atomic_dec();
 }
 /*
  * continue_at() - have @_cl run @_fn on @_wq once its refcount hits zero
  *
  * Clears CLOSURE_RUNNING and drops the caller's reference in one step, then
  * returns from the calling function, which must therefore return void.
  */
 #define continue_at(_cl, _fn, _wq)					\
 do {									\
 	set_closure_fn(_cl, _fn, _wq);					\
 	closure_sub(_cl, CLOSURE_RUNNING + 1);				\
 	return;								\
 } while (0)
 /* closure_return() - finish @_cl: continue_at() with no next function */
 #define closure_return(_cl)	continue_at((_cl), NULL, NULL)
 /*
  * continue_at_nobarrier() - run @_fn for @_cl right away
  *
  * Unlike continue_at() this drops no reference and does not wait for the
  * count to reach zero: the function is handed straight to closure_queue().
  * Returns from the calling function, which must therefore return void.
  *
  * Uses the macro parameter @_cl throughout, so it does not depend on the
  * caller having a variable named cl in scope.
  */
 #define continue_at_nobarrier(_cl, _fn, _wq)				\
 do {									\
 	set_closure_fn(_cl, _fn, _wq);					\
 	closure_queue(_cl);						\
 	return;								\
 } while (0)
 /*
  * closure_return_with_destructor() - finish @_cl, running @_destructor last
  *
  * A single subtraction clears CLOSURE_RUNNING, sets CLOSURE_DESTRUCTOR (it is
  * subtracted with a negative sign) and drops the caller's reference. Returns
  * from the calling function, which must therefore return void.
  */
 #define closure_return_with_destructor(_cl, _destructor)		\
 do {									\
 	set_closure_fn(_cl, _destructor, NULL);				\
 	closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1);	\
 	return;								\
 } while (0)
 /*
  * closure_call() - initialise @cl with @parent and start it running @fn
  * @cl:	closure to initialise (zeroed by closure_init())
  * @fn:	function to run
  * @wq:	workqueue to run @fn on, or NULL to call it directly
  * @parent:	parent closure, or NULL
  */
 static inline void closure_call(struct closure *cl, closure_fn fn,
 				struct workqueue_struct *wq,
 				struct closure *parent)
 {
 	closure_init(cl, parent);
 	continue_at_nobarrier(cl, fn, wq);
 }
 /*
  * closure_trylock_call() - start @cl running @fn if it can be locked
  *
  * Does nothing when closure_trylock() fails, i.e. when @cl is already in use.
  */
 static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
 					struct workqueue_struct *wq,
 					struct closure *parent)
 {
 	if (closure_trylock(cl, parent))
 		continue_at_nobarrier(cl, fn, wq);
 }
 #endif /* _LINUX_CLOSURE_H */
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@ -0,0 +1,563 @@
 /*
 * Assorted bcache debug code
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
 #include "request.h"
 #include <linux/console.h>
 #include <linux/debugfs.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/seq_file.h>
 static struct dentry *debug;
 /*
 * bch_ptr_status - describe the state of a key's pointers as a short string
 * @c:	cache set the key belongs to
 * @k:	key to inspect
 *
 * Returns a static string: the first problem found on any available pointer,
 * otherwise a problem with the key as a whole, otherwise "".
 */
 const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
 {
 	unsigned idx;
 	for (idx = 0; idx < KEY_PTRS(k); idx++) {
 		struct cache *dev;
 		size_t nr, rem;
 		/* Pointers that are not available are not checked at all */
 		if (!ptr_available(c, k, idx))
 			continue;
 		dev = PTR_CACHE(c, k, idx);
 		nr = PTR_BUCKET_NR(c, k, idx);
 		rem = bucket_remainder(c, PTR_OFFSET(k, idx));
 		if (KEY_SIZE(k) + rem > c->sb.bucket_size)
 			return "bad, length too big";
 		if (nr < dev->sb.first_bucket)
 			return "bad, short offset";
 		if (nr >= dev->sb.nbuckets)
 			return "bad, offset past end of device";
 		if (ptr_stale(c, k, idx))
 			return "stale";
 	}
 	/* Whole-key checks, performed only when no pointer was flagged above */
 	if (!bkey_cmp(k, &ZERO_KEY))
 		return "bad, null key";
 	if (!KEY_PTRS(k))
 		return "bad, no pointers";
 	if (!KEY_SIZE(k))
 		return "zeroed key";
 	return "";
 }
 /*
 * bch_pkey - format a key as text
 * @k:	key to format
 *
 * Produces "inode:offset len size -> [dev:offset gen g, ...]" followed by
 * " dirty" and checksum information when those fields are set. The text is
 * returned by value in a fixed KEYHACK_SIZE buffer and is truncated if it does
 * not fit.
 */
 struct keyprint_hack bch_pkey(const struct bkey *k)
 {
 	struct keyprint_hack r;
 	char *pos = r.s, *limit = r.s + KEYHACK_SIZE;
 	unsigned idx;
 #define append(...)	(pos += scnprintf(pos, limit - pos, __VA_ARGS__))
 	append("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
 	for (idx = 0; idx < KEY_PTRS(k); idx++) {
 		/* Separator goes between pointers, never before the first */
 		if (idx)
 			append(", ");
 		append("%llu:%llu gen %llu",
 		       PTR_DEV(k, idx), PTR_OFFSET(k, idx), PTR_GEN(k, idx));
 	}
 	append("]");
 	if (KEY_DIRTY(k))
 		append(" dirty");
 	if (KEY_CSUM(k))
 		append(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
 #undef append
 	return r;
 }
 /*
 * bch_pbtree - format a short description of a btree node
 * @b:	node to describe
 *
 * Produces "<bucket nr> level <node level>/<root level>", using -1 for the
 * root level when the cache set has no root. The text is returned by value.
 */
 struct keyprint_hack bch_pbtree(const struct btree *b)
 {
 	struct keyprint_hack r;
 	/*
 	 * The bucket number is a size_t, so print it with %zu, and bound the
 	 * output by the real size of the buffer rather than a magic number.
 	 */
 	snprintf(r.s, sizeof(r.s), "%zu level %i/%i",
 		 PTR_BUCKET_NR(b->c, &b->key, 0),
 		 b->level, b->c->root ? b->c->root->level : -1);
 	return r;
 }
 #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
 /*
 * Returns true when @k sorts after the key that follows it. In a leaf
 * (level 0) the comparison is against the START_KEY() of the following key;
 * at any other level it is against the following key itself.
 */
 static bool skipped_backwards(struct btree *b, struct bkey *k)
 {
 	struct bkey *next = bkey_next(k);
 	if (b->level)
 		return bkey_cmp(k, next) > 0;
 	return bkey_cmp(k, &START_KEY(next)) > 0;
 }
 /*
 * Log every key of bset @i (which belongs to node @b): one line per key with
 * its position, text form, the bucket of each pointer (plus that bucket's
 * prio when the bucket number is in range) and bch_ptr_status(). An extra
 * line is logged when a key sorts after its successor.
 */
 static void dump_bset(struct btree *b, struct bset *i)
 {
 	struct bkey *k = i->start;
 	while (k < end(i)) {
 		struct bkey *next = bkey_next(k);
 		unsigned idx = 0;
 		printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
 		       (uint64_t *) k - i->d, i->keys, pkey(k));
 		while (idx < KEY_PTRS(k)) {
 			size_t nr = PTR_BUCKET_NR(b->c, k, idx);
 			printk(" bucket %zu", nr);
 			if (nr >= b->c->sb.first_bucket &&
 			    nr < b->c->sb.nbuckets)
 				printk(" prio %i",
 				       PTR_BUCKET(b->c, k, idx)->prio);
 			idx++;
 		}
 		printk(" %s\n", bch_ptr_status(b->c, k));
 		/* Only compare against a successor that is still inside the set */
 		if (next < end(i) && skipped_backwards(b, k))
 			printk(KERN_ERR "Key skipped backwards\n");
 		k = next;
 	}
 }
 #endif
 #ifdef CONFIG_BCACHE_DEBUG
 /*
 * bch_btree_verify - compare a sorted in-memory set with a fresh read
 * @b:		btree node being verified
 * @new:	sorted bset to compare against the re-read copy
 *
 * Does nothing unless b->c->verify is set. Otherwise it waits until @b's IO
 * closure reports -1 remaining, re-reads the node into the scratch node
 * b->c->verify_data under verify_lock, and compares the first set that was
 * read with @new. On any difference the original sets, @new and the re-read
 * set are dumped and the kernel panics.
 */
 void bch_btree_verify(struct btree *b, struct bset *new)
 {
 	struct btree *v = b->c->verify_data;
 	struct closure cl;
 	closure_init_stack(&cl);
 	if (!b->c->verify)
 		return;
 	closure_wait_event(&b->io.wait, &cl,
 			   atomic_read(&b->io.cl.remaining) == -1);
 	mutex_lock(&b->c->verify_lock);
 	bkey_copy(&v->key, &b->key);
 	v->written = 0;
 	v->level = b->level;
 	bch_btree_read(v);
 	/*
 	 * Wait for the read just issued on the scratch node: the condition
 	 * must look at @v's closure, matching the wait list we sleep on.
 	 * Testing @b's closure here would not wait for @v's IO at all.
 	 */
 	closure_wait_event(&v->io.wait, &cl,
 			   atomic_read(&v->io.cl.remaining) == -1);
 	if (new->keys != v->sets[0].data->keys ||
 	    memcmp(new->start,
 		   v->sets[0].data->start,
 		   (void *) end(new) - (void *) new->start)) {
 		unsigned i, j;
 		console_lock();
 		printk(KERN_ERR "*** original memory node:\n");
 		for (i = 0; i <= b->nsets; i++)
 			dump_bset(b, b->sets[i].data);
 		printk(KERN_ERR "*** sorted memory node:\n");
 		dump_bset(b, new);
 		printk(KERN_ERR "*** on disk node:\n");
 		dump_bset(v, v->sets[0].data);
 		/* Find the first 64-bit word that differs, for the panic message */
 		for (j = 0; j < new->keys; j++)
 			if (new->d[j] != v->sets[0].data->d[j])
 				break;
 		console_unlock();
 		panic("verify failed at %u\n", j);
 	}
 	mutex_unlock(&b->c->verify_lock);
 }
 /* Completion handler for the verify read: drop the ref on the closure in bi_private */
 static void data_verify_endio(struct bio *bio, int error)
 {
 	closure_put(bio->bi_private);
 }
 /*
 * bch_data_verify - re-read the data of a request and compare it
 * @s:	search whose orig_bio is checked
 *
 * Clones s->orig_bio, gives the clone its own pages, reads it synchronously
 * through closure_bio_submit()/closure_sync() and compares every segment with
 * the corresponding segment of the original. A mismatch is logged; nothing is
 * returned. Silently does nothing if the clone or its pages cannot be
 * allocated.
 */
 void bch_data_verify(struct search *s)
 {
 	char name[BDEVNAME_SIZE];
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	struct closure *cl = &s->cl;
 	struct bio *check;
 	struct bio_vec *bv;
 	int i;
 	/* When the bvecs are not flagged unaligned, reset each to a full page */
 	if (!s->unaligned_bvec)
 		bio_for_each_segment(bv, s->orig_bio, i)
 			bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
 	check = bio_clone(s->orig_bio, GFP_NOIO);
 	if (!check)
 		return;
 	if (bio_alloc_pages(check, GFP_NOIO))
 		goto out_put;
 	check->bi_rw		= READ_SYNC;
 	check->bi_private	= cl;
 	check->bi_end_io	= data_verify_endio;
 	closure_bio_submit(check, cl, &dc->disk);
 	closure_sync(cl);
 	/* Compare segment by segment, using the original's offset and length */
 	bio_for_each_segment(bv, s->orig_bio, i) {
 		void *p1 = kmap(bv->bv_page);
 		void *p2 = kmap(check->bi_io_vec[i].bv_page);
 		if (memcmp(p1 + bv->bv_offset,
 			   p2 + bv->bv_offset,
 			   bv->bv_len))
 			printk(KERN_ERR "bcache (%s): verify failed"
 			       " at sector %llu\n",
 			       bdevname(dc->bdev, name),
 			       (uint64_t) s->orig_bio->bi_sector);
 		kunmap(bv->bv_page);
 		kunmap(check->bi_io_vec[i].bv_page);
 	}
 	/* Free every page of the clone, starting from index 0 */
 	__bio_for_each_segment(bv, check, i, 0)
 		__free_page(bv->bv_page);
 out_put:
 	bio_put(check);
 }
 #endif
 #ifdef CONFIG_BCACHE_EDEBUG
 unsigned bch_count_data(struct btree *b)
 {
 	unsigned ret = 0;
 	struct btree_iter iter;
 	struct bkey *k;
 	if (!b->level)
 		for_each_key(b, k, &iter)
 			ret += KEY_SIZE(k);
 	return ret;
 }
 /*
 * Dump every set of @b (indices 0..nsets inclusive), print the caller's
 * message with the console lock held, then panic naming the node.
 */
 static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
 				   va_list args)
 {
 	unsigned set = 0;
 	console_lock();
 	while (set <= b->nsets) {
 		dump_bset(b, b->sets[set].data);
 		set++;
 	}
 	vprintk(fmt, args);
 	console_unlock();
 	panic("at %s\n", pbtree(b));
 }
 /*
 * bch_check_key_order_msg - panic if the keys of a bset are out of order
 * @b:		node the set belongs to
 * @i:		set to check
 * @fmt:	printf-style message printed before the panic
 *
 * Empty sets are accepted without inspection.
 */
 void bch_check_key_order_msg(struct btree *b, struct bset *i,
 			     const char *fmt, ...)
 {
 	struct bkey *k, *next;
 	if (!i->keys)
 		return;
 	/* Visit each key that still has a successor inside the set */
 	for (k = i->start; (next = bkey_next(k)) < end(i); k = next) {
 		if (skipped_backwards(b, k)) {
 			va_list args;
 			va_start(args, fmt);
 			vdump_bucket_and_panic(b, fmt, args);
 			va_end(args);
 		}
 	}
 }
 /*
 * bch_check_keys - check the keys of a leaf node for ordering and overlap
 * @b:		node to check; nodes with a non-zero level are skipped
 * @fmt:	printf-style message printed before the panic
 *
 * Iterates over all keys of @b. Dumps the node and panics when a key starts
 * before the last key that passed the checks, or overlaps it.
 */
 void bch_check_keys(struct btree *b, const char *fmt, ...)
 {
 	va_list args;
 	struct bkey *k, *p = NULL;
 	struct btree_iter iter;
 	if (b->level)
 		return;
 	for_each_key(b, k, &iter) {
 		if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) {
 			printk(KERN_ERR "Keys out of order:\n");
 			goto bug;
 		}
 		/* Keys flagged invalid are neither checked for overlap nor remembered */
 		if (bch_ptr_invalid(b, k))
 			continue;
 		if (p && bkey_cmp(p, &START_KEY(k)) > 0) {
 			printk(KERN_ERR "Overlapping keys:\n");
 			goto bug;
 		}
 		p = k;
 	}
 	return;
 bug:
 	va_start(args, fmt);
 	vdump_bucket_and_panic(b, fmt, args);
 	va_end(args);
 }
 #endif
 #ifdef CONFIG_DEBUG_FS
 /* XXX: cache set refcounting */
 /* Per-open state of the debugfs key dump file */
 struct dump_iterator {
 	/* Formatted text not yet copied to userspace */
 	char			buf[PAGE_SIZE];
 	/* Number of valid bytes at the start of buf */
 	size_t			bytes;
 	/* Cache set being dumped */
 	struct cache_set	*c;
 	/* Key buffer used to scan the cache set's keys */
 	struct keybuf		keys;
 };
 /* Keybuf predicate for the dump file: accepts every key */
 static bool dump_pred(struct keybuf *buf, struct bkey *k)
 {
 	return true;
 }
 /*
 * bch_dump_read - read() handler of the debugfs key dump file
 * @file:	open file; private_data holds the struct dump_iterator
 * @buf:	userspace destination
 * @size:	number of bytes requested
 * @ppos:	file position (unused)
 *
 * Copies out whatever formatted text is still buffered and then formats one
 * key at a time until @size is satisfied or no more keys are returned.
 *
 * Returns the number of bytes copied, or -EFAULT if the userspace buffer
 * could not be written.
 */
 static ssize_t bch_dump_read(struct file *file, char __user *buf,
 			     size_t size, loff_t *ppos)
 {
 	struct dump_iterator *i = file->private_data;
 	ssize_t ret = 0;
 	while (size) {
 		struct keybuf_key *w;
 		unsigned bytes = min(i->bytes, size);
 		/*
 		 * copy_to_user() returns the number of bytes that could NOT
 		 * be copied, not an errno; translate failure to -EFAULT.
 		 */
 		if (copy_to_user(buf, i->buf, bytes))
 			return -EFAULT;
 		ret	 += bytes;
 		buf	 += bytes;
 		size	 -= bytes;
 		i->bytes -= bytes;
 		/* Shift any text that did not fit to the front of the buffer */
 		memmove(i->buf, i->buf + bytes, i->bytes);
 		if (i->bytes)
 			break;
 		w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY);
 		if (!w)
 			break;
 		i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key));
 		bch_keybuf_del(&i->keys, w);
 	}
 	return ret;
 }
 /*
 * open() handler of the debugfs key dump file: allocates a zeroed iterator
 * for the cache set stored in the inode and starts the scan at KEY(0, 0, 0).
 * Returns 0, or -ENOMEM if the iterator cannot be allocated.
 */
 static int bch_dump_open(struct inode *inode, struct file *file)
 {
 	struct dump_iterator *it = kzalloc(sizeof(struct dump_iterator),
 					   GFP_KERNEL);
 	if (!it)
 		return -ENOMEM;
 	it->c = inode->i_private;
 	bch_keybuf_init(&it->keys, dump_pred);
 	it->keys.last_scanned = KEY(0, 0, 0);
 	file->private_data = it;
 	return 0;
 }
 /* release() handler of the dump file: frees the iterator stored in private_data */
 static int bch_dump_release(struct inode *inode, struct file *file)
 {
 	kfree(file->private_data);
 	return 0;
 }
 /* File operations of the per-cache-set debugfs key dump file */
 static const struct file_operations cache_set_debug_ops = {
 	.owner		= THIS_MODULE,
 	.open		= bch_dump_open,
 	.read		= bch_dump_read,
 	.release	= bch_dump_release
 };
 /*
 * bch_debug_init_cache_set - create the debugfs dump file of a cache set
 * @c:	cache set to expose
 *
 * The file is named "bcache-<set uuid>" and created with mode 0400 inside
 * the bcache debugfs directory. Does nothing if that directory is missing.
 */
 void bch_debug_init_cache_set(struct cache_set *c)
 {
 	char name[50];
 	if (IS_ERR_OR_NULL(debug))
 		return;
 	snprintf(name, sizeof(name), "bcache-%pU", c->sb.set_uuid);
 	c->debug = debugfs_create_file(name, 0400, debug, c,
 				       &cache_set_debug_ops);
 }
 #endif
 #ifdef CONFIG_BCACHE_DEBUG
 /*
 * btree_fuzz - sysfs store handler that fuzzes btree insertion and sorting
 *
 * Allocates a private cache set, takes three btree nodes from its btree
 * cache and then loops forever: random keys are inserted into one node, each
 * completed set is copied into a second node, and the sorted result of the
 * first is compared against the second after bch_btree_read_done(). A
 * mismatch dumps all three nodes and panics. The only returns are the two
 * -ENOMEM paths at the top; the outer loop has no exit.
 *
 * NOTE(review): sb is not freed on the second -ENOMEM path - confirm whether
 * bch_cache_set_alloc() takes ownership of it.
 */
 static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
 			  const char *buffer, size_t size)
 {
 	/* Nested function (GNU C extension): dump each set sharing the first set's seq */
 	void dump(struct btree *b)
 	{
 		struct bset *i;
 		for (i = b->sets[0].data;
 		     index(i, b) < btree_blocks(b) &&
 		     i->seq == b->sets[0].data->seq;
 		     i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
 			dump_bset(b, i);
 	}
 	struct cache_sb *sb;
 	struct cache_set *c;
 	struct btree *all[3], *b, *fill, *orig;
 	int j;
 	struct btree_op op;
 	bch_btree_op_init_stack(&op);
 	sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
 	if (!sb)
 		return -ENOMEM;
 	sb->bucket_size = 128;
 	sb->block_size = 4;
 	c = bch_cache_set_alloc(sb);
 	if (!c)
 		return -ENOMEM;
 	/* Detach three nodes from the btree cache to use as scratch space */
 	for (j = 0; j < 3; j++) {
 		BUG_ON(list_empty(&c->btree_cache));
 		all[j] = list_first_entry(&c->btree_cache, struct btree, list);
 		list_del_init(&all[j]->list);
 		all[j]->key = KEY(0, 0, c->sb.bucket_size);
 		bkey_copy_key(&all[j]->key, &MAX_KEY);
 	}
 	b = all[0];
 	fill = all[1];
 	orig = all[2];
 	while (1) {
 		for (j = 0; j < 3; j++)
 			all[j]->written = all[j]->nsets = 0;
 		bch_bset_init_next(b);
 		/* Insert random keys until every block of the node is written */
 		while (1) {
 			struct bset *i = write_block(b);
 			struct bkey *k = op.keys.top;
 			unsigned rand;
 			bkey_init(k);
 			rand = get_random_int();
 			op.type = rand & 1
 				? BTREE_INSERT
 				: BTREE_REPLACE;
 			rand >>= 1;
 			SET_KEY_SIZE(k, bucket_remainder(c, rand));
 			rand >>= c->bucket_bits;
 			rand &= 1024 * 512 - 1;
 			rand += c->sb.bucket_size;
 			SET_KEY_OFFSET(k, rand);
 #if 0
 			SET_KEY_PTRS(k, 1);
 #endif
 			bch_keylist_push(&op.keys);
 			bch_btree_insert_keys(b, &op);
 			if (should_split(b) ||
 			    set_blocks(i, b->c) !=
 			    __set_blocks(i, i->keys + 15, b->c)) {
 				/* Checksum the set and copy it into fill */
 				i->csum = csum_set(i);
 				memcpy(write_block(fill),
 				       i, set_bytes(i));
 				b->written += set_blocks(i, b->c);
 				fill->written = b->written;
 				if (b->written == btree_blocks(b))
 					break;
 				bch_btree_sort_lazy(b);
 				bch_bset_init_next(b);
 			}
 		}
 		/* Keep an untouched copy of the written sets in orig */
 		memcpy(orig->sets[0].data,
 		       fill->sets[0].data,
 		       btree_bytes(c));
 		bch_btree_sort(b);
 		fill->written = 0;
 		bch_btree_read_done(&fill->io.cl);
 		if (b->sets[0].data->keys != fill->sets[0].data->keys ||
 		    memcmp(b->sets[0].data->start,
 			   fill->sets[0].data->start,
 			   b->sets[0].data->keys * sizeof(uint64_t))) {
 			struct bset *i = b->sets[0].data;
 			struct bkey *k, *l;
 			for (k = i->start,
 			     l = fill->sets[0].data->start;
 			     k < end(i);
 			     k = bkey_next(k), l = bkey_next(l))
 				if (bkey_cmp(k, l) ||
 				    KEY_SIZE(k) != KEY_SIZE(l))
 					pr_err("key %zi differs: %s "
 					       "!= %s", (uint64_t *) k - i->d,
 					       pkey(k), pkey(l));
 			for (j = 0; j < 3; j++) {
 				pr_err("**** Set %i ****", j);
 				dump(all[j]);
 			}
 			panic("\n");
 		}
 		pr_info("fuzz complete: %i keys", b->sets[0].data->keys);
 	}
 }
 /* Declares the write-only "fuzz" attribute (ksysfs_fuzz) backed by btree_fuzz() */
 kobj_attribute_write(fuzz, btree_fuzz);
 #endif
 /* Remove the bcache debugfs directory and its contents, if it was created */
 void bch_debug_exit(void)
 {
 	if (IS_ERR_OR_NULL(debug))
 		return;
 	debugfs_remove_recursive(debug);
 }
 /*
 * bch_debug_init - set up bcache's debug interfaces
 * @kobj:	kobject that receives the "fuzz" attribute (CONFIG_BCACHE_DEBUG)
 *
 * Creates the "bcache" debugfs directory; the result of that call is stored
 * but not checked here. Returns 0, or the error from sysfs_create_file().
 */
 int __init bch_debug_init(struct kobject *kobj)
 {
 #ifdef CONFIG_BCACHE_DEBUG
 	int err = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
 	if (err)
 		return err;
 #endif
 	debug = debugfs_create_dir("bcache", NULL);
 	return 0;
 }
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@ -0,0 +1,54 @@
 #ifndef _BCACHE_DEBUG_H
 #define _BCACHE_DEBUG_H
 /* Btree/bkey debug printing */
 /* Size in bytes of the text buffer embedded in struct keyprint_hack */
 #define KEYHACK_SIZE 80
 /*
 * Fixed-size string wrapped in a struct so that the formatting helpers below
 * can return it by value.
 */
 struct keyprint_hack {
 	char s[KEYHACK_SIZE];
 };
 struct keyprint_hack bch_pkey(const struct bkey *k);
 struct keyprint_hack bch_pbtree(const struct btree *b);
 /*
 * pkey()/pbtree() yield a pointer to the first character of the struct
 * returned by the call. That struct is a temporary, so the pointer should
 * only be used within the expression that contains the macro (e.g. as a
 * printk argument).
 */
 #define pkey(k)		(&bch_pkey(k).s[0])
 #define pbtree(b)	(&bch_pbtree(b).s[0])
 #ifdef CONFIG_BCACHE_EDEBUG
 unsigned bch_count_data(struct btree *);
 void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...);
 void bch_check_keys(struct btree *, const char *, ...);
 #define bch_check_key_order(b, i)			\
 	bch_check_key_order_msg(b, i, "keys out of order")
 #define EBUG_ON(cond)		BUG_ON(cond)
 #else /* EDEBUG */
 #define bch_count_data(b)				0
 #define bch_check_key_order(b, i)			do {} while (0)
 #define bch_check_key_order_msg(b, i, ...)		do {} while (0)
 #define bch_check_keys(b, ...)				do {} while (0)
 #define EBUG_ON(cond)					do {} while (0)
 #endif
 /*
 * Verification hooks: real functions when CONFIG_BCACHE_DEBUG is set,
 * otherwise empty inline stubs so callers need no #ifdefs.
 */
 #ifdef CONFIG_BCACHE_DEBUG
 void bch_btree_verify(struct btree *, struct bset *);
 void bch_data_verify(struct search *);
 #else /* DEBUG */
 static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
 static inline void bch_data_verify(struct search *s) {}
 #endif
 /* Per-cache-set debugfs setup; an empty stub when CONFIG_DEBUG_FS is not set */
 #ifdef CONFIG_DEBUG_FS
 void bch_debug_init_cache_set(struct cache_set *);
 #else
 static inline void bch_debug_init_cache_set(struct cache_set *c) {}
 #endif
 #endif
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@ -0,0 +1,390 @@
 /*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
 #include "bcache.h"
 #include "bset.h"
 #include "debug.h"
 static void bch_bi_idx_hack_endio(struct bio *bio, int error)
 {
 	struct bio *p = bio->bi_private;
 	bio_endio(p, error);
 	bio_put(bio);
 }
 /*
 * Submit @bio through generic_make_request(). A bio whose bi_idx is non-zero
 * is not submitted directly: a clone holding only the remaining segments
 * (and therefore starting at index 0) is submitted instead, and the original
 * is completed from the clone's end_io handler.
 */
 static void bch_generic_make_request_hack(struct bio *bio)
 {
 	struct bio *clone;
 	int nr_vecs;
 	if (!bio->bi_idx) {
 		generic_make_request(bio);
 		return;
 	}
 	nr_vecs = bio_segments(bio);
 	clone = bio_alloc(GFP_NOIO, nr_vecs);
 	memcpy(clone->bi_io_vec,
 	       bio_iovec(bio),
 	       nr_vecs * sizeof(struct bio_vec));
 	clone->bi_sector	= bio->bi_sector;
 	clone->bi_bdev		= bio->bi_bdev;
 	clone->bi_rw		= bio->bi_rw;
 	clone->bi_vcnt		= nr_vecs;
 	clone->bi_size		= bio->bi_size;
 	/* The original is completed by bch_bi_idx_hack_endio() */
 	clone->bi_private	= bio;
 	clone->bi_end_io	= bch_bi_idx_hack_endio;
 	generic_make_request(clone);
 }
 /**
 * bch_bio_split - split a bio
 * @bio:	bio to split
 * @sectors:	number of sectors to split from the front of @bio
 * @gfp:	gfp mask
 * @bs:		bio set to allocate from
 *
 * Allocates and returns a new bio which represents @sectors from the start of
 * @bio, and updates @bio to represent the remaining sectors.
 *
 * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio
 * unchanged.
 *
 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
 * bvec boundary; it is the caller's responsibility to ensure that @bio is not
 * freed before the split.
 *
 * If bch_bio_split() is running under generic_make_request(), it's not safe to
 * allocate more than one bio from the same bio set. Therefore, if it is running
 * under generic_make_request() it masks out __GFP_WAIT when doing the
 * allocation. The caller must check for failure if there's any possibility of
 * it being called from under generic_make_request(); it is then the caller's
 * responsibility to retry from a safe context (by e.g. punting to workqueue).
 *
 * Returns NULL if the new bio (or its integrity payload) could not be
 * allocated.
 */
 struct bio *bch_bio_split(struct bio *bio, int sectors,
 			  gfp_t gfp, struct bio_set *bs)
 {
 	unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9;
 	struct bio_vec *bv;
 	struct bio *ret = NULL;
 	BUG_ON(sectors <= 0);
 	/*
 	 * If we're being called from underneath generic_make_request() and we
 	 * already allocated any bios from this bio set, we risk deadlock if we
 	 * use the mempool. So instead, we possibly fail and let the caller punt
 	 * to workqueue or somesuch and retry in a safe context.
 	 */
 	if (current->bio_list)
 		gfp &= ~__GFP_WAIT;
 	if (sectors >= bio_sectors(bio))
 		return bio;
 	if (bio->bi_rw & REQ_DISCARD) {
 		ret = bio_alloc_bioset(gfp, 1, bs);
 		/*
 		 * With __GFP_WAIT masked out above the allocation may fail;
 		 * report that like the data paths below instead of
 		 * dereferencing NULL at the "out" label.
 		 */
 		if (!ret)
 			return NULL;
 		idx = 0;
 		goto out;
 	}
 	bio_for_each_segment(bv, bio, idx) {
 		vcnt = idx - bio->bi_idx;
 		if (!nbytes) {
 			/* Split falls exactly on a bvec boundary */
 			ret = bio_alloc_bioset(gfp, vcnt, bs);
 			if (!ret)
 				return NULL;
 			memcpy(ret->bi_io_vec, bio_iovec(bio),
 			       sizeof(struct bio_vec) * vcnt);
 			break;
 		} else if (nbytes < bv->bv_len) {
 			/* Split falls inside this bvec: share it between both bios */
 			ret = bio_alloc_bioset(gfp, ++vcnt, bs);
 			if (!ret)
 				return NULL;
 			memcpy(ret->bi_io_vec, bio_iovec(bio),
 			       sizeof(struct bio_vec) * vcnt);
 			ret->bi_io_vec[vcnt - 1].bv_len = nbytes;
 			bv->bv_offset	+= nbytes;
 			bv->bv_len	-= nbytes;
 			break;
 		}
 		nbytes -= bv->bv_len;
 	}
 out:
 	ret->bi_bdev	= bio->bi_bdev;
 	ret->bi_sector	= bio->bi_sector;
 	ret->bi_size	= sectors << 9;
 	ret->bi_rw	= bio->bi_rw;
 	ret->bi_vcnt	= vcnt;
 	ret->bi_max_vecs = vcnt;
 	/* Advance the original past the part that was split off */
 	bio->bi_sector	+= sectors;
 	bio->bi_size	-= sectors << 9;
 	bio->bi_idx	 = idx;
 	if (bio_integrity(bio)) {
 		if (bio_integrity_clone(ret, bio, gfp)) {
 			bio_put(ret);
 			return NULL;
 		}
 		bio_integrity_trim(ret, 0, bio_sectors(ret));
 		bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio));
 	}
 	return ret;
 }
 /*
 * Compute how many sectors from the front of @bio may be submitted to its
 * device's queue as a single bio.
 *
 * Discards are limited by max_discard_sectors only. Other bios are limited by
 * the queue's maximum segment count, its merge_bvec_fn (if any), a cap of
 * BIO_MAX_PAGES pages and queue_max_sectors(). The result is never smaller
 * than the first bvec's length in sectors.
 */
 static unsigned bch_bio_max_sectors(struct bio *bio)
 {
 	unsigned ret = bio_sectors(bio);
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	struct bio_vec *bv, *end = bio_iovec(bio) +
 		min_t(int, bio_segments(bio), queue_max_segments(q));
 	struct bvec_merge_data bvm = {
 		.bi_bdev	= bio->bi_bdev,
 		.bi_sector	= bio->bi_sector,
 		.bi_size	= 0,
 		.bi_rw		= bio->bi_rw,
 	};
 	if (bio->bi_rw & REQ_DISCARD)
 		return min(ret, q->limits.max_discard_sectors);
 	if (bio_segments(bio) > queue_max_segments(q) ||
 	    q->merge_bvec_fn) {
 		ret = 0;
 		/* Add up bvecs until the segment limit or merge_bvec_fn stops us */
 		for (bv = bio_iovec(bio); bv < end; bv++) {
 			if (q->merge_bvec_fn &&
 			    q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
 				break;
 			ret		+= bv->bv_len >> 9;
 			bvm.bi_size	+= bv->bv_len;
 		}
 		if (ret >= (BIO_MAX_PAGES * PAGE_SIZE) >> 9)
 			return (BIO_MAX_PAGES * PAGE_SIZE) >> 9;
 	}
 	ret = min(ret, queue_max_sectors(q));
 	WARN_ON(!ret);
 	/* Always allow at least the first bvec through */
 	ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9);
 	return ret;
 }
 /*
 * Final step of a split submission: restore the original bio's end_io and
 * private pointer, complete it, and return the hook to its mempool.
 */
 static void bch_bio_submit_split_done(struct closure *cl)
 {
 	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
 	s->bio->bi_end_io = s->bi_end_io;
 	s->bio->bi_private = s->bi_private;
 	bio_endio(s->bio, 0);
 	closure_debug_destroy(&s->cl);
 	mempool_free(s, s->p->bio_split_hook);
 }
 /*
 * Completion handler for each bio submitted by __bch_bio_submit_split(): an
 * error clears BIO_UPTODATE on the original bio; the completed bio and the
 * closure reference taken for it are then dropped.
 */
 static void bch_bio_submit_split_endio(struct bio *bio, int error)
 {
 	struct closure *cl = bio->bi_private;
 	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
 	if (error)
 		clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
 	bio_put(bio);
 	closure_put(cl);
 }
 /*
 * Repeatedly split pieces of at most bch_bio_max_sectors() off the front of
 * the hooked bio and submit them, until the remainder itself is submitted.
 * If a split cannot be allocated the function re-queues itself on system_wq
 * and returns. Once everything is submitted it continues at
 * bch_bio_submit_split_done().
 */
 static void __bch_bio_submit_split(struct closure *cl)
 {
 	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
 	struct bio *bio = s->bio, *n;
 	do {
 		n = bch_bio_split(bio, bch_bio_max_sectors(bio),
 				  GFP_NOIO, s->p->bio_split);
 		/* continue_at() returns from this function */
 		if (!n)
 			continue_at(cl, __bch_bio_submit_split, system_wq);
 		n->bi_end_io	= bch_bio_submit_split_endio;
 		n->bi_private	= cl;
 		closure_get(cl);
 		bch_generic_make_request_hack(n);
 	} while (n != bio);
 	continue_at(cl, bch_bio_submit_split_done, NULL);
 }
 /*
 * bch_generic_make_request - submit a bio, splitting it first if necessary
 * @bio:	bio to submit
 * @p:		pool providing split hooks and the bio set used for splits
 *
 * Bios without data (other than discards) and bios that already fit within
 * bch_bio_max_sectors() are submitted directly. Anything else is submitted in
 * pieces by __bch_bio_submit_split(), with the bio's original end_io and
 * private pointer saved in the hook.
 */
 void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
 {
 	struct bio_split_hook *s;
 	if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
 		goto submit;
 	if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
 		goto submit;
 	s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
 	s->bio		= bio;
 	s->p		= p;
 	s->bi_end_io	= bio->bi_end_io;
 	s->bi_private	= bio->bi_private;
 	bio_get(bio);
 	closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL);
 	return;
 submit:
 	bch_generic_make_request_hack(bio);
 }
 /* Bios with headers */
 /* Return the bbio that embeds @bio to @c's bio_meta mempool */
 void bch_bbio_free(struct bio *bio, struct cache_set *c)
 {
 	mempool_free(container_of(bio, struct bbio, bio), c->bio_meta);
 }
 /*
 * bch_bbio_alloc - allocate a bio with a bbio header from @c's bio_meta pool
 *
 * The embedded bio is initialised, marked BIO_POOL_NONE, and set up to use
 * its inline vecs with room for bucket_pages(c) entries. Release it with
 * bch_bbio_free().
 */
 struct bio *bch_bbio_alloc(struct cache_set *c)
 {
 	struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
 	struct bio *bio = &b->bio;
 	bio_init(bio);
 	bio->bi_flags		|= BIO_POOL_NONE << BIO_POOL_OFFSET;
 	bio->bi_max_vecs	 = bucket_pages(c);
 	bio->bi_io_vec		 = bio->bi_inline_vecs;
 	return bio;
 }
 void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
 	bio->bi_sector	= PTR_OFFSET(&b->key, 0);
 	bio->bi_bdev	= PTR_CACHE(c, &b->key, 0)->bdev;
 	b->submit_time_us = local_clock_us();
 	closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
 }
 /*
 * bch_submit_bbio - submit a bbio to the device named by one pointer of a key
 * @bio:	bio embedded in a struct bbio
 * @c:		cache set
 * @k:		key holding the pointer
 * @ptr:	index of the pointer within @k to use
 *
 * Copies that single pointer into the bbio's key, then submits.
 */
 void bch_submit_bbio(struct bio *bio, struct cache_set *c,
 		     struct bkey *k, unsigned ptr)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
 	bch_bkey_copy_single_ptr(&b->key, k, ptr);
 	__bch_submit_bbio(bio, c);
 }
 /* IO errors */
 /*
 * bch_count_io_errors - account one completed IO and, possibly, its error
 * @ca:		cache device the IO was issued to
 * @error:	non-zero if the IO failed
 * @m:		short description of the IO, used in log messages
 *
 * When error decay is enabled, every error_decay IOs the error counter is
 * scaled by 127/128. A failed IO adds 1 << IO_ERROR_SHIFT to the counter;
 * once the shifted count reaches error_limit the whole cache set is flagged
 * through bch_cache_set_error(), otherwise the error is only logged.
 */
 void bch_count_io_errors(struct cache *ca, int error, const char *m)
 {
 	/*
 	 * The halflife of an error is:
 	 * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
 	 */
 	if (ca->set->error_decay) {
 		unsigned count = atomic_inc_return(&ca->io_count);
 		while (count > ca->set->error_decay) {
 			unsigned errors;
 			unsigned old = count;
 			unsigned new = count - ca->set->error_decay;
 			/*
 			 * First we subtract refresh from count; each time we
 			 * successfully do so, we rescale the errors once:
 			 */
 			count = atomic_cmpxchg(&ca->io_count, old, new);
 			if (count == old) {
 				count = new;
 				errors = atomic_read(&ca->io_errors);
 				/* cmpxchg loop: retry until the rescaled value sticks */
 				do {
 					old = errors;
 					new = ((uint64_t) errors * 127) / 128;
 					errors = atomic_cmpxchg(&ca->io_errors,
 								old, new);
 				} while (old != errors);
 			}
 		}
 	}
 	if (error) {
 		char buf[BDEVNAME_SIZE];
 		unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
 						    &ca->io_errors);
 		errors >>= IO_ERROR_SHIFT;
 		if (errors < ca->set->error_limit)
 			pr_err("%s: IO error on %s, recovering",
 			       bdevname(ca->bdev, buf), m);
 		else
 			bch_cache_set_error(ca->set,
 					    "%s: too many IO errors %s",
 					    bdevname(ca->bdev, buf), m);
 	}
 }
 /*
 * bch_bbio_count_io_errors - latency and error accounting for a finished bbio
 * @c:		cache set
 * @bio:	bio embedded in a struct bbio
 * @error:	completion status of the IO
 * @m:		short description of the IO, used in log messages
 *
 * If a congestion threshold is configured for the IO's direction, the time
 * since submission is compared against it: slow IOs lower c->congested,
 * fast ones raise it back towards zero. The error is then passed on to
 * bch_count_io_errors() for the device behind pointer 0 of the bbio's key.
 */
 void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
 			      int error, const char *m)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
 	struct cache *ca = PTR_CACHE(c, &b->key, 0);
 	unsigned threshold = bio->bi_rw & REQ_WRITE
 		? c->congested_write_threshold_us
 		: c->congested_read_threshold_us;
 	if (threshold) {
 		unsigned t = local_clock_us();
 		int us = t - b->submit_time_us;
 		int congested = atomic_read(&c->congested);
 		if (us > (int) threshold) {
 			/* Divide by 1024 as a cheap approximation of us -> ms */
 			int ms = us / 1024;
 			c->congested_last_us = t;
 			ms = min(ms, CONGESTED_MAX + congested);
 			atomic_sub(ms, &c->congested);
 		} else if (congested < 0)
 			atomic_inc(&c->congested);
 	}
 	bch_count_io_errors(ca, error, m);
 }
 /*
 * bch_bbio_endio - common completion path for bbios
 *
 * Runs the latency/error accounting, then drops the bio and the reference on
 * the closure stored in its bi_private.
 */
 void bch_bbio_endio(struct cache_set *c, struct bio *bio,
 		    int error, const char *m)
 {
 	struct closure *cl = bio->bi_private;
 	bch_bbio_count_io_errors(c, bio, error, m);
 	bio_put(bio);
 	closure_put(cl);
 }
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@ -0,0 +1,785 @@
 /*
 * bcache journalling code, for btree insertions
 *
 * Copyright 2012 Google, Inc.
 */
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
 #include "request.h"
 /*
 * Journal replay/recovery:
 *
 * This code is all driven from run_cache_set(); we first read the journal
 * entries, do some other stuff, then we mark all the keys in the journal
 * entries (same as garbage collection would), then we replay them - reinserting
 * them into the cache in precisely the same order as they appear in the
 * journal.
 *
 * We only journal keys that go in leaf nodes, which simplifies things quite a
 * bit.
 */
 /* Completion handler for journal reads: drop the ref on the closure in bi_private */
 static void journal_read_endio(struct bio *bio, int error)
 {
 	closure_put(bio->bi_private);
 }
 /*
 * journal_read_bucket - read one journal bucket and collect its entries
 * @ca:			cache device to read from
 * @list:		list of struct journal_replay, kept sorted by sequence number
 * @op:			btree op whose closure is used to wait for the reads
 * @bucket_index:	index into ca->sb.d[] of the journal bucket to read
 *
 * The bucket is read in chunks of up to PAGE_SECTORS * 8 sectors into the
 * journal's first write buffer. Each valid jset found is copied and inserted
 * into @list in sequence order; entries on @list older than a jset's last_seq
 * are dropped, and jsets already present or too old are skipped.
 *
 * Returns 1 if at least one entry was added to @list, 0 if none was, or
 * -ENOMEM if an entry could not be allocated.
 */
 static int journal_read_bucket(struct cache *ca, struct list_head *list,
 			       struct btree_op *op, unsigned bucket_index)
 {
 	struct journal_device *ja = &ca->journal;
 	struct bio *bio = &ja->bio;
 	struct journal_replay *i;
 	struct jset *j, *data = ca->set->journal.w[0].data;
 	unsigned len, left, offset = 0;
 	int ret = 0;
 	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
 	pr_debug("reading %llu", (uint64_t) bucket);
 	while (offset < ca->sb.bucket_size) {
 reread:		left = ca->sb.bucket_size - offset;
 		len = min_t(unsigned, left, PAGE_SECTORS * 8);
 		bio_reset(bio);
 		bio->bi_sector	= bucket + offset;
 		bio->bi_bdev	= ca->bdev;
 		bio->bi_rw	= READ;
 		bio->bi_size	= len << 9;
 		bio->bi_end_io	= journal_read_endio;
 		bio->bi_private = &op->cl;
 		bio_map(bio, data);
 		closure_bio_submit(bio, &op->cl, ca);
 		closure_sync(&op->cl);
 		/* This function could be simpler now since we no longer write
 		 * journal entries that overlap bucket boundaries; this means
 		 * the start of a bucket will always have a valid journal entry
 		 * if it has any journal entries at all.
 		 */
 		j = data;
 		while (len) {
 			struct list_head *where;
 			size_t blocks, bytes = set_bytes(j);
 			/* Stop at the first thing that is not a valid jset */
 			if (j->magic != jset_magic(ca->set))
 				return ret;
 			if (bytes > left << 9)
 				return ret;
 			/* Entry extends past what was read: read again from here */
 			if (bytes > len << 9)
 				goto reread;
 			if (j->csum != csum_set(j))
 				return ret;
 			blocks = set_blocks(j, ca->set);
 			/* Drop entries that this jset says are no longer needed */
 			while (!list_empty(list)) {
 				i = list_first_entry(list,
 					struct journal_replay, list);
 				if (i->j.seq >= j->last_seq)
 					break;
 				list_del(&i->list);
 				kfree(i);
 			}
 			/* Find the insertion point, scanning from the newest entry */
 			list_for_each_entry_reverse(i, list, list) {
 				if (j->seq == i->j.seq)
 					goto next_set;
 				if (j->seq < i->j.last_seq)
 					goto next_set;
 				if (j->seq > i->j.seq) {
 					where = &i->list;
 					goto add;
 				}
 			}
 			where = list;
 add:
 			i = kmalloc(offsetof(struct journal_replay, j) +
 				    bytes, GFP_KERNEL);
 			if (!i)
 				return -ENOMEM;
 			memcpy(&i->j, j, bytes);
 			list_add(&i->list, where);
 			ret = 1;
 			ja->seq[bucket_index] = j->seq;
 next_set:
 			offset	+= blocks * ca->sb.block_size;
 			len	-= blocks * ca->sb.block_size;
 			j = ((void *) j) + blocks * block_bytes(ca);
 		}
 	}
 	return ret;
 }
 /*
 * bch_journal_read - read the journal entries of every cache in a set
 * @c:		cache set
 * @list:	list that receives struct journal_replay entries, in sequence order
 * @op:		btree op whose closure is used to wait for the reads
 *
 * For each cache device: locate a bucket holding journal entries, binary
 * search for the end of the run of such buckets, then read backwards until
 * no more entries are found. Afterwards the device's cur/discard/last
 * indices are pointed at the bucket with the highest sequence number.
 *
 * A device on which no bucket holds a journal entry is skipped, and
 * c->journal.seq is only updated when @list ends up non-empty.
 *
 * Returns 0 on success or a negative error from journal_read_bucket().
 */
 int bch_journal_read(struct cache_set *c, struct list_head *list,
 			struct btree_op *op)
 {
 #define read_bucket(b)							\
 	({								\
 		int ret = journal_read_bucket(ca, list, op, b);		\
 		__set_bit(b, bitmap);					\
 		if (ret < 0)						\
 			return ret;					\
 		ret;							\
 	})
 	struct cache *ca;
 	unsigned iter;
 	for_each_cache(ca, c, iter) {
 		struct journal_device *ja = &ca->journal;
 		unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
 		unsigned i, l, r, m;
 		uint64_t seq;
 		bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
 		pr_debug("%u journal buckets", ca->sb.njournal_buckets);
 		/* Read journal buckets ordered by golden ratio hash to quickly
 		 * find a sequence of buckets with valid journal entries
 		 */
 		for (i = 0; i < ca->sb.njournal_buckets; i++) {
 			l = (i * 2654435769U) % ca->sb.njournal_buckets;
 			if (test_bit(l, bitmap))
 				break;
 			if (read_bucket(l))
 				goto bsearch;
 		}
 		/* If that fails, check all the buckets we haven't checked
 		 * already
 		 */
 		pr_debug("falling back to linear search");
 		for (l = 0; l < ca->sb.njournal_buckets; l++) {
 			if (test_bit(l, bitmap))
 				continue;
 			if (read_bucket(l))
 				goto bsearch;
 		}
 		/*
 		 * Every bucket has now been read and none held a journal
 		 * entry. Nothing to search on this device - and with every
 		 * bit set and l past the last bucket, the reverse scan below
 		 * would never terminate - so move on to the next cache.
 		 */
 		continue;
 bsearch:
 		/* Binary search */
 		m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
 		pr_debug("starting binary search, l %u r %u", l, r);
 		while (l + 1 < r) {
 			m = (l + r) >> 1;
 			if (read_bucket(m))
 				l = m;
 			else
 				r = m;
 		}
 		/* Read buckets in reverse order until we stop finding more
 		 * journal entries
 		 */
 		pr_debug("finishing up");
 		l = m;
 		while (1) {
 			if (!l--)
 				l = ca->sb.njournal_buckets - 1;
 			if (l == m)
 				break;
 			if (test_bit(l, bitmap))
 				continue;
 			if (!read_bucket(l))
 				break;
 		}
 		seq = 0;
 		for (i = 0; i < ca->sb.njournal_buckets; i++)
 			if (ja->seq[i] > seq) {
 				seq = ja->seq[i];
 				ja->cur_idx = ja->discard_idx =
 					ja->last_idx = i;
 			}
 	}
 	/* list->prev is only a real entry when the list is not empty */
 	if (!list_empty(list))
 		c->journal.seq = list_entry(list->prev,
 					    struct journal_replay,
 					    list)->j.seq;
 	return 0;
 #undef read_bucket
 }
 /*
 * bch_journal_mark - pin and mark everything referenced by the journal
 * @c:		cache set
 * @list:	journal entries collected by bch_journal_read()
 *
 * Walks @list from newest to oldest. For each entry a slot is pushed on the
 * journal's pin fifo (plus empty slots for any missing sequence numbers in
 * between) and the entry's pin is set to 1. Every bucket pointed to by the
 * entry's keys has its pin count raised, and each key is marked through
 * __bch_btree_mark_key().
 */
 void bch_journal_mark(struct cache_set *c, struct list_head *list)
 {
 	atomic_t p = { 0 };
 	struct bkey *k;
 	struct journal_replay *i;
 	struct journal *j = &c->journal;
 	uint64_t last = j->seq;
 	/*
 	 * journal.pin should never fill up - we never write a journal
 	 * entry when it would fill up. But if for some reason it does, we
 	 * iterate over the list in reverse order so that we can just skip that
 	 * refcount instead of bugging.
 	 */
 	list_for_each_entry_reverse(i, list, list) {
 		BUG_ON(last < i->j.seq);
 		i->pin = NULL;
 		/* Push zeroed pins for sequence numbers with no entry on the list */
 		while (last-- != i->j.seq)
 			if (fifo_free(&j->pin) > 1) {
 				fifo_push_front(&j->pin, p);
 				atomic_set(&fifo_front(&j->pin), 0);
 			}
 		/* i->pin stays NULL if the fifo has no room left */
 		if (fifo_free(&j->pin) > 1) {
 			fifo_push_front(&j->pin, p);
 			i->pin = &fifo_front(&j->pin);
 			atomic_set(i->pin, 1);
 		}
 		for (k = i->j.start;
 		     k < end(&i->j);
 		     k = bkey_next(k)) {
 			/* Note: this j shadows the struct journal pointer above */
 			unsigned j;
 			for (j = 0; j < KEY_PTRS(k); j++) {
 				struct bucket *g = PTR_BUCKET(c, k, j);
 				atomic_inc(&g->pin);
 				if (g->prio == BTREE_PRIO &&
 				    !ptr_stale(c, k, j))
 					g->prio = INITIAL_PRIO;
 			}
 			__bch_btree_mark_key(c, 0, k);
 		}
 	}
 }
 /*
 * bch_journal_replay - reinsert every journalled key into the btree
 * @s:		cache set
 * @list:	journal entries, oldest first
 * @op:		btree op used for the insertions
 *
 * Keys are inserted one at a time in list order. Gaps in the sequence
 * numbers are reported but do not stop the replay. On success every entry
 * on @list is freed; on an insertion error the remaining list is left
 * untouched.
 *
 * Returns 0 on success or the error from bch_btree_insert().
 */
 int bch_journal_replay(struct cache_set *s, struct list_head *list,
 			  struct btree_op *op)
 {
 	int ret = 0, keys = 0, entries = 0;
 	struct bkey *k;
 	struct journal_replay *i =
 		list_entry(list->prev, struct journal_replay, list);
 	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
 	list_for_each_entry(i, list, list) {
 		BUG_ON(i->pin && atomic_read(i->pin) != 1);
 		if (n != i->j.seq)
 			pr_err("journal entries %llu-%llu "
 			       "missing! (replaying %llu-%llu)\n",
 			       n, i->j.seq - 1, start, end);
 		for (k = i->j.start;
 		     k < end(&i->j);
 		     k = bkey_next(k)) {
 			pr_debug("%s", pkey(k));
 			bkey_copy(op->keys.top, k);
 			bch_keylist_push(&op->keys);
 			/*
 			 * NOTE(review): i->pin is tested for NULL elsewhere in
 			 * this function but is incremented unconditionally
 			 * here - confirm it cannot be NULL at this point.
 			 */
 			op->journal = i->pin;
 			atomic_inc(op->journal);
 			ret = bch_btree_insert(op, s);
 			if (ret)
 				goto err;
 			BUG_ON(!bch_keylist_empty(&op->keys));
 			keys++;
 			cond_resched();
 		}
 		/* Drop the initial reference set up by bch_journal_mark() */
 		if (i->pin)
 			atomic_dec(i->pin);
 		n = i->j.seq + 1;
 		entries++;
 	}
 	pr_info("journal replay done, %i keys in %i entries, seq %llu",
 		keys, entries, end);
 	while (!list_empty(list)) {
 		i = list_first_entry(list, struct journal_replay, list);
 		list_del(&i->list);
 		kfree(i);
 	}
 err:
 	closure_sync(&op->cl);
 	return ret;
 }
 /* Journalling */
 /*
 * Write out one dirty btree node so that the journal entry it pins can be
 * released. Prefers the node chosen by journal_pin_cmp() among those that
 * can be write-locked without blocking; if none qualifies, falls back to the
 * first dirty leaf on the btree cache list.
 */
 static void btree_flush_write(struct cache_set *c)
 {
 	/*
 	 * Try to find the btree node that references the oldest journal
 	 * entry; best is our current candidate and is locked if non NULL:
 	 */
 	struct btree *b, *best = NULL;
 	unsigned iter;
 	for_each_cached_btree(b, c, iter) {
 		if (!down_write_trylock(&b->lock))
 			continue;
 		if (!btree_node_dirty(b) ||
 		    !btree_current_write(b)->journal) {
 			rw_unlock(true, b);
 			continue;
 		}
 		if (!best)
 			best = b;
 		else if (journal_pin_cmp(c,
 					 btree_current_write(best),
 					 btree_current_write(b))) {
 			/* b is the better candidate: swap which node stays locked */
 			rw_unlock(true, best);
 			best = b;
 		} else
 			rw_unlock(true, b);
 	}
 	if (best)
 		goto out;
 	/* We can't find the best btree node, just pick the first */
 	list_for_each_entry(b, &c->btree_cache, list)
 		if (!b->level && btree_node_dirty(b)) {
 			best = b;
 			rw_lock(true, best, best->level);
 			goto found;
 		}
 out:
 	if (!best)
 		return;
 found:
 	/* Re-check: the fallback node was chosen before its lock was taken */
 	if (btree_node_dirty(best))
 		bch_btree_write(best, true, NULL);
 	rw_unlock(true, best);
 }
 /*
 * Sequence number of the oldest journal entry that is still open: the pin
 * fifo holds one refcount per open entry, the newest of which is j->seq.
 */
 #define last_seq(j)	((j)->seq - fifo_used(&(j)->pin) + 1)
 static void journal_discard_endio(struct bio *bio, int error)
 {
 	struct journal_device *ja =
 		container_of(bio, struct journal_device, discard_bio);
 	struct cache *ca = container_of(ja, struct cache, journal);
 	atomic_set(&ja->discard_in_flight, DISCARD_DONE);
 	closure_wake_up(&ca->set->journal.wait);
 	closure_put(&ca->set->cl);
 }
 /* Work item body: submit the discard bio prepared by do_journal_discard() */
 static void journal_discard_work(struct work_struct *work)
 {
 	struct journal_device *jdev;
 	jdev = container_of(work, struct journal_device, discard_work);
 	submit_bio(0, &jdev->discard_bio);
 }
 static void do_journal_discard(struct cache *ca)
 {
 	struct journal_device *ja = &ca->journal;
 	struct bio *bio = &ja->discard_bio;
 	if (!ca->discard) {
 		ja->discard_idx = ja->last_idx;
 		return;
 	}
 	switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) {
 	case DISCARD_IN_FLIGHT:
 		return;
 	case DISCARD_DONE:
 		ja->discard_idx = (ja->discard_idx + 1) %
 			ca->sb.njournal_buckets;
 		atomic_set(&ja->discard_in_flight, DISCARD_READY);
 		/* fallthrough */
 	case DISCARD_READY:
 		if (ja->discard_idx == ja->last_idx)
 			return;
 		atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
 		bio_init(bio);
 		bio->bi_sector		= bucket_to_sector(ca->set,
 							   ca->sb.d[ja->discard_idx]);
 		bio->bi_bdev		= ca->bdev;
 		bio->bi_rw		= REQ_WRITE|REQ_DISCARD;
 		bio->bi_max_vecs	= 1;
 		bio->bi_io_vec		= bio->bi_inline_vecs;
 		bio->bi_size		= bucket_bytes(ca);
 		bio->bi_end_io		= journal_discard_endio;
 		closure_get(&ca->set->cl);
 		INIT_WORK(&ja->discard_work, journal_discard_work);
 		schedule_work(&ja->discard_work);
 	}
 }
 /*
 * Release journal space that is no longer needed and, when the current
 * journal bucket(s) have no blocks left, move on to the next bucket on each
 * cache device that has one available.
 *
 * The callers in this file invoke it with c->journal.lock held.
 */
 static void journal_reclaim(struct cache_set *c)
 {
 	struct bkey *k = &c->journal.key;
 	struct cache *ca;
 	uint64_t last_seq;
 	unsigned iter, n = 0;
 	atomic_t p;
 	/* Pop entries whose refcount dropped to zero off the front of the fifo */
 	while (!atomic_read(&fifo_front(&c->journal.pin)))
 		fifo_pop(&c->journal.pin, p);
 	/* Must be computed after the pops above, it depends on fifo_used() */
 	last_seq = last_seq(&c->journal);
 	/* Update last_idx */
 	for_each_cache(ca, c, iter) {
 		struct journal_device *ja = &ca->journal;
 		/* Skip over buckets whose newest entry is older than last_seq */
 		while (ja->last_idx != ja->cur_idx &&
 		       ja->seq[ja->last_idx] < last_seq)
 			ja->last_idx = (ja->last_idx + 1) %
 				ca->sb.njournal_buckets;
 	}
 	for_each_cache(ca, c, iter)
 		do_journal_discard(ca);
 	/* Still room in the current bucket(s): no need to allocate new ones */
 	if (c->journal.blocks_free)
 		return;
 	/*
 	 * Allocate:
 	 * XXX: Sort by free journal space
 	 */
 	for_each_cache(ca, c, iter) {
 		struct journal_device *ja = &ca->journal;
 		unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
 		/* No space available on this device */
 		if (next == ja->discard_idx)
 			continue;
 		ja->cur_idx = next;
 		k->ptr[n++] = PTR(0,
 				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
 				  ca->sb.nr_this_dev);
 	}
 	/* The journal key ends up with one pointer per device we advanced on */
 	bkey_init(k);
 	SET_KEY_PTRS(k, n);
 	if (n)
 		c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
 	if (!journal_full(&c->journal))
 		__closure_wake_up(&c->journal.wait);
 }
 void bch_journal_next(struct journal *j)
 {
 	atomic_t p = { 1 };
 	j->cur = (j->cur == j->w)
 		? &j->w[1]
 		: &j->w[0];
 	/*
 	 * The fifo_push() needs to happen at the same time as j->seq is
 	 * incremented for last_seq() to be calculated correctly
 	 */
 	BUG_ON(!fifo_push(&j->pin, p));
 	atomic_set(&fifo_back(&j->pin), 1);
 	j->cur->data->seq	= ++j->seq;
 	j->cur->need_write	= false;
 	j->cur->data->keys	= 0;
 	if (fifo_full(&j->pin))
 		pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
 }
 static void journal_write_endio(struct bio *bio, int error)
 {
 	struct journal_write *w = bio->bi_private;
 	cache_set_err_on(error, w->c, "journal io error");
 	closure_put(&w->c->journal.io.cl);
 }
 static void journal_write(struct closure *);
 /*
 * Continuation run after journal_write_unlocked() has submitted its bios:
 * wake the waiters of the journal write that is not the current one,
 * optionally arm the journal delay, then loop back into journal_write().
 */
 static void journal_write_done(struct closure *cl)
 {
 	struct journal *j = container_of(cl, struct journal, io.cl);
 	struct cache_set *c = container_of(j, struct cache_set, journal);
 	struct journal_write *done;
 	/* j->cur is the entry being filled now; we want the other one */
 	if (j->cur == &j->w[0])
 		done = &j->w[1];
 	else
 		done = &j->w[0];
 	__closure_wake_up(&done->wait);
 	if (c->journal_delay_ms)
 		closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));
 	continue_at(cl, journal_write, system_wq);
 }
 /*
 * Write out the current journal entry.
 *
 * Entered with c->journal.lock held (see journal_write() and
 * __journal_try_write()); every path through here drops that lock.
 *
 * NOTE(review): the code after each continue_at() only makes sense if
 * continue_at() returns from this function - confirm against its definition.
 */
 static void journal_write_unlocked(struct closure *cl)
 {
 	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
 	struct cache *ca;
 	struct journal_write *w = c->journal.cur;
 	struct bkey *k = &c->journal.key;
 	unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
 	struct bio *bio;
 	struct bio_list list;
 	bio_list_init(&list);
 	if (!w->need_write) {
 		/*
 		 * XXX: have to unlock closure before we unlock journal lock,
 		 * else we race with bch_journal(). But this way we race
 		 * against cache set unregister. Doh.
 		 */
 		set_closure_fn(cl, NULL, NULL);
 		closure_sub(cl, CLOSURE_RUNNING + 1);
 		spin_unlock(&c->journal.lock);
 		return;
 	} else if (journal_full(&c->journal)) {
 		/* No room: try to free some up, flush a btree node and retry */
 		journal_reclaim(c);
 		spin_unlock(&c->journal.lock);
 		btree_flush_write(c);
 		continue_at(cl, journal_write, system_wq);
 	}
 	c->journal.blocks_free -= set_blocks(w->data, c);
 	/* Fill in the header fields that are only known at write time */
 	w->data->btree_level = c->root->level;
 	bkey_copy(&w->data->btree_root, &c->root->key);
 	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
 	for_each_cache(ca, c, i)
 		w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
 	w->data->magic		= jset_magic(c);
 	w->data->version	= BCACHE_JSET_VERSION;
 	w->data->last_seq	= last_seq(&c->journal);
 	/* Checksum goes last, once the rest of the entry is final */
 	w->data->csum		= csum_set(w->data);
 	/* One bio per pointer in the journal key, all writing the same data */
 	for (i = 0; i < KEY_PTRS(k); i++) {
 		ca = PTR_CACHE(c, k, i);
 		bio = &ca->journal.bio;
 		atomic_long_add(sectors, &ca->meta_sectors_written);
 		bio_reset(bio);
 		bio->bi_sector	= PTR_OFFSET(k, i);
 		bio->bi_bdev	= ca->bdev;
 		bio->bi_rw	= REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
 		bio->bi_size	= sectors << 9;
 		bio->bi_end_io	= journal_write_endio;
 		bio->bi_private = w;
 		bio_map(bio, w->data);
 		trace_bcache_journal_write(bio);
 		bio_list_add(&list, bio);
 		/* The next write to this bucket starts where this one ends */
 		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
 		/* Remember the newest sequence number stored in this bucket */
 		ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
 	}
 	/* Drop the initial pin on this entry, then open the next one */
 	atomic_dec_bug(&fifo_back(&c->journal.pin));
 	bch_journal_next(&c->journal);
 	journal_reclaim(c);
 	spin_unlock(&c->journal.lock);
 	/* Submit only after the journal lock has been released */
 	while ((bio = bio_list_pop(&list)))
 		closure_bio_submit(bio, cl, c->cache[0]);
 	continue_at(cl, journal_write_done, NULL);
 }
 static void journal_write(struct closure *cl)
 {
 	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
 	spin_lock(&c->journal.lock);
 	journal_write_unlocked(cl);
 }
 /*
 * Start a journal write if one is not already running.
 *
 * Called with c->journal.lock held by the callers in this file; the lock is
 * released on every path, either here or inside journal_write_unlocked().
 *
 * @noflush: if set and the journal is full, defer to journal_write() on
 *	     system_wq instead of running journal_write_unlocked() (and with it
 *	     btree_flush_write()) from the caller's context.
 */
 static void __journal_try_write(struct cache_set *c, bool noflush)
 {
 	struct closure *cl = &c->journal.io.cl;
 	/* The io closure could not be taken: nothing to start here */
 	if (!closure_trylock(cl, &c->cl))
 		spin_unlock(&c->journal.lock);
 	else if (noflush && journal_full(&c->journal)) {
 		spin_unlock(&c->journal.lock);
 		continue_at(cl, journal_write, system_wq);
 	} else
 		journal_write_unlocked(cl);
 }
 /* __journal_try_write() with noflush == false */
 #define journal_try_write(c)	__journal_try_write(c, false)
 void bch_journal_meta(struct cache_set *c, struct closure *cl)
 {
 	struct journal_write *w;
 	if (CACHE_SYNC(&c->sb)) {
 		spin_lock(&c->journal.lock);
 		w = c->journal.cur;
 		w->need_write = true;
 		if (cl)
 			BUG_ON(!closure_wait(&w->wait, cl));
 		__journal_try_write(c, true);
 	}
 }
 /*
 * Entry point to the journalling code - bio_insert() and btree_invalidate()
 * pass bch_journal() a list of keys to be journalled, and then
 * bch_journal() hands those same keys off to bch_btree_insert_async()
 *
 * Only BTREE_INSERT operations on a cache set with CACHE_SYNC() set are
 * journalled; everything else goes straight to bch_btree_insert_async().
 *
 * NOTE(review): the code after each continue_at() only makes sense if
 * continue_at() returns from this function - confirm against its definition.
 */
 void bch_journal(struct closure *cl)
 {
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
 	struct cache_set *c = op->c;
 	struct journal_write *w;
 	/* n is the size of the key list in uint64_t words */
 	size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;
 	if (op->type != BTREE_INSERT ||
 	    !CACHE_SYNC(&c->sb))
 		goto out;
 	/*
 	 * If we're looping because we errored, might already be waiting on
 	 * another journal write:
 	 */
 	while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
 		closure_sync(cl->parent);
 	spin_lock(&c->journal.lock);
 	if (journal_full(&c->journal)) {
 		/* XXX: tracepoint */
 		closure_wait(&c->journal.wait, cl);
 		journal_reclaim(c);
 		spin_unlock(&c->journal.lock);
 		btree_flush_write(c);
 		continue_at(cl, bch_journal, bcache_wq);
 	}
 	w = c->journal.cur;
 	w->need_write = true;
 	/* Blocks the entry would occupy with our keys appended */
 	b = __set_blocks(w->data, w->data->keys + n, c);
 	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
 	    b > c->journal.blocks_free) {
 		/* XXX: If we were inserting so many keys that they won't fit in
 		 * an _empty_ journal write, we'll deadlock. For now, handle
 		 * this in bch_keylist_realloc() - but something to think about.
 		 */
 		BUG_ON(!w->data->keys);
 		/* XXX: tracepoint */
 		BUG_ON(!closure_wait(&w->wait, cl));
 		closure_flush(&c->journal.io);
 		/* journal_try_write() releases the journal lock */
 		journal_try_write(c);
 		continue_at(cl, bch_journal, bcache_wq);
 	}
 	/* Append our keys to the current journal entry */
 	memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
 	w->data->keys += n;
 	/* Pin the journal entry the keys went into */
 	op->journal = &fifo_back(&c->journal.pin);
 	atomic_inc(op->journal);
 	if (op->flush_journal) {
 		closure_flush(&c->journal.io);
 		closure_wait(&w->wait, cl->parent);
 	}
 	/* journal_try_write() releases the journal lock */
 	journal_try_write(c);
 out:
 	bch_btree_insert_async(cl);
 }
 void bch_journal_free(struct cache_set *c)
 {
 	free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
 	free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
 	free_fifo(&c->journal.pin);
 }
 /*
 * Initialise the journal embedded in @c and allocate the pin fifo and the two
 * journal write buffers (2^JSET_BITS pages each).
 *
 * Returns 0 on success or -ENOMEM. Nothing is freed here on failure;
 * presumably the caller cleans up with bch_journal_free() - TODO confirm.
 */
 int bch_journal_alloc(struct cache_set *c)
 {
 	struct journal *j = &c->journal;
 	unsigned i;
 	closure_init_unlocked(&j->io);
 	spin_lock_init(&j->lock);
 	c->journal_delay_ms = 100;
 	j->w[0].c = c;
 	j->w[1].c = c;
 	if (!init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))
 		return -ENOMEM;
 	for (i = 0; i < 2; i++) {
 		j->w[i].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS);
 		if (!j->w[i].data)
 			return -ENOMEM;
 	}
 	return 0;
 }
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@ -0,0 +1,215 @@
 #ifndef _BCACHE_JOURNAL_H
 #define _BCACHE_JOURNAL_H
 /*
 * THE JOURNAL:
 *
 * The journal is treated as a circular buffer of buckets - a journal entry
 * never spans two buckets. This means (not implemented yet) we can resize the
 * journal at runtime, and will be needed for bcache on raw flash support.
 *
 * Journal entries contain a list of keys, ordered by the time they were
 * inserted; thus journal replay just has to reinsert the keys.
 *
 * We also keep some things in the journal header that are logically part of the
 * superblock - all the things that are frequently updated. This is for future
 * bcache on raw flash support; the superblock (which will become another
 * journal) can't be moved or wear leveled, so it contains just enough
 * information to find the main journal, and the superblock only has to be
 * rewritten when we want to move/wear level the main journal.
 *
 * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be
 * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions
 * from cache misses, which don't have to be journaled, and for writeback and
 * moving gc we work around it by flushing the btree to disk before updating the
 * gc information. But it is a potential issue with incremental garbage
 * collection, and it's fragile.
 *
 * OPEN JOURNAL ENTRIES:
 *
 * Each journal entry contains, in the header, the sequence number of the last
 * journal entry still open - i.e. that has keys that haven't been flushed to
 * disk in the btree.
 *
 * We track this by maintaining a refcount for every open journal entry, in a
 * fifo; each entry in the fifo corresponds to a particular journal
 * entry/sequence number. When the refcount at the tail of the fifo goes to
 * zero, we pop it off - thus, the size of the fifo tells us the number of open
 * journal entries
 *
 * We take a refcount on a journal entry when we add some keys to a journal
 * entry that we're going to insert (held by struct btree_op), and then when we
 * insert those keys into the btree the btree write we're setting up takes a
 * copy of that refcount (held by struct btree_write). That refcount is dropped
 * when the btree write completes.
 *
 * A struct btree_write can only hold a refcount on a single journal entry, but
 * might contain keys for many journal entries - we handle this by making sure
 * it always has a refcount on the _oldest_ journal entry of all the journal
 * entries it has keys for.
 *
 * JOURNAL RECLAIM:
 *
 * As mentioned previously, our fifo of refcounts tells us the number of open
 * journal entries; from that and the current journal sequence number we compute
 * last_seq - the oldest journal entry we still need. We write last_seq in each
 * journal entry, and we also have to keep track of where it exists on disk so
 * we don't overwrite it when we loop around the journal.
 *
 * To do that we track, for each journal bucket, the sequence number of the
 * newest journal entry it contains - if we don't need that journal entry we
 * don't need anything in that bucket anymore. From that we track the last
 * journal bucket we still need; all this is tracked in struct journal_device
 * and updated by journal_reclaim().
 *
 * JOURNAL FILLING UP:
 *
 * There are two ways the journal could fill up; either we could run out of
 * space to write to, or we could have too many open journal entries and run out
 * of room in the fifo of refcounts. Since those refcounts are decremented
 * without any locking we can't safely resize that fifo, so we handle it the
 * same way.
 *
 * If the journal fills up, we start flushing dirty btree nodes until we can
 * allocate space for a journal write again - preferentially flushing btree
 * nodes that are pinning the oldest journal entries first.
 */
 /* Journal entry format versions; BCACHE_JSET_VERSION is what gets written */
 #define BCACHE_JSET_VERSION_UUIDv1	1
 /* Always latest UUID format */
 #define BCACHE_JSET_VERSION_UUID	1
 #define BCACHE_JSET_VERSION		1
 /*
 * On disk format for a journal entry:
 * seq is monotonically increasing; every journal entry has its own unique
 * sequence number.
 *
 * last_seq is the oldest journal entry that still has keys the btree hasn't
 * flushed to disk yet.
 *
 * version is for on disk format changes.
 *
 * keys counts the key data following the header in uint64_t words (that is
 * the unit bch_journal() adds to it).
 */
 struct jset {
 	uint64_t		csum;
 	uint64_t		magic;
 	uint64_t		seq;
 	uint32_t		version;
 	uint32_t		keys;
 	uint64_t		last_seq;
 	/* Filled in from the cache set right before the entry is written */
 	BKEY_PADDED(uuid_bucket);
 	BKEY_PADDED(btree_root);
 	uint16_t		btree_level;
 	uint16_t		pad[3];
 	/* Indexed by the cache's sb.nr_this_dev */
 	uint64_t		prio_bucket[MAX_CACHES_PER_SET];
 	/* Start of the journalled keys, viewed as keys or as raw words */
 	union {
 		struct bkey	start[0];
 		uint64_t	d[0];
 	};
 };
 /*
 * Only used for holding the journal entries we read in btree_journal_read()
 * during cache_registration
 *
 * NOTE(review): the read function declared in this header is
 * bch_journal_read(); btree_journal_read() looks like an older name - confirm.
 */
 struct journal_replay {
 	struct list_head	list;
 	/* Pin for this entry; may be NULL, journal replay checks before use */
 	atomic_t		*pin;
 	/* The journal entry itself; its keys follow the header */
 	struct jset		j;
 };
 /*
 * We put two of these in struct journal; we used them for writes to the
 * journal that are being staged or in flight.
 */
 struct journal_write {
 	/* Buffer of 2^JSET_BITS pages, allocated in bch_journal_alloc() */
 	struct jset		*data;
 #define JSET_BITS		3
 	/* Back pointer to the owning cache set */
 	struct cache_set	*c;
 	/* Closures waiting for this journal write */
 	struct closure_waitlist	wait;
 	/* Set once someone wants this entry written; reset for each new entry */
 	bool			need_write;
 };
 /* Embedded in struct cache_set */
 struct journal {
 	spinlock_t		lock;
 	/* used when waiting because the journal was full */
 	struct closure_waitlist	wait;
 	/* Closure that journal writes run in; its timer delays the next write */
 	struct closure_with_timer io;
 	/* Number of blocks free in the bucket(s) we're currently writing to */
 	unsigned		blocks_free;
 	/* Sequence number of the journal entry currently being filled */
 	uint64_t		seq;
 	/* One refcount per open journal entry, oldest at the front */
 	DECLARE_FIFO(atomic_t, pin);
 	/* Pointers to where the next journal write goes on each device */
 	BKEY_PADDED(key);
 	/* The two write buffers; cur points at the one being filled */
 	struct journal_write	w[2], *cur;
 };
 /*
 * Embedded in struct cache. First three fields refer to the array of journal
 * buckets, in cache_sb.
 */
 struct journal_device {
 	/*
 	 * For each journal bucket, contains the max sequence number of the
 	 * journal writes it contains - so we know when a bucket can be reused.
 	 */
 	uint64_t		seq[SB_JOURNAL_BUCKETS];
 	/* Journal bucket we're currently writing to */
 	unsigned		cur_idx;
 	/* Last journal bucket that still contains an open journal entry */
 	unsigned		last_idx;
 	/* Next journal bucket to be discarded */
 	unsigned		discard_idx;
 #define DISCARD_READY		0
 #define DISCARD_IN_FLIGHT	1
 #define DISCARD_DONE		2
 	/* Discard state: one of the DISCARD_* values above */
 	atomic_t		discard_in_flight;
 	/* Work item used to submit discard_bio */
 	struct work_struct	discard_work;
 	struct bio		discard_bio;
 	struct bio_vec		discard_bv;
 	/* Bio for journal reads/writes to this device */
 	struct bio		bio;
 	struct bio_vec		bv[8];
 };
 /* True if l's journal pin sits at a higher index in the pin fifo than r's */
 #define journal_pin_cmp(c, l, r)				\
 	(fifo_idx(&(c)->journal.pin, (l)->journal) >		\
 	 fifo_idx(&(c)->journal.pin, (r)->journal))
 /* Size passed to init_fifo() for the journal pin fifo */
 #define JOURNAL_PIN	20000
 /* Full means: no blocks left to write to, or (almost) no room for more pins */
 #define journal_full(j)						\
 	(!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
 struct closure;
 struct cache_set;
 struct btree_op;
 void bch_journal(struct closure *);
 void bch_journal_next(struct journal *);
 void bch_journal_mark(struct cache_set *, struct list_head *);
 void bch_journal_meta(struct cache_set *, struct closure *);
 int bch_journal_read(struct cache_set *, struct list_head *,
 			struct btree_op *);
 int bch_journal_replay(struct cache_set *, struct list_head *,
 			  struct btree_op *);
 void bch_journal_free(struct cache_set *);
 int bch_journal_alloc(struct cache_set *);
 #endif /* _BCACHE_JOURNAL_H */
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@ -0,0 +1,254 @@
 /*
 * Moving/copying garbage collector
 *
 * Copyright 2012 Google, Inc.
 */
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
 #include "request.h"
 /*
 * State for moving one key's data: allocated per key in read_moving() with
 * room for the bio's vecs after the struct, freed in moving_io_destructor().
 */
 struct moving_io {
 	/* The keybuf entry being moved */
 	struct keybuf_key	*w;
 	struct search		s;
 	/* Must stay last: the bio_vecs are allocated directly behind it */
 	struct bbio		bio;
 };
 /*
 * Keybuf predicate for moving gc: pick @k if any of its pointers lands in a
 * bucket holding fewer sectors than the gc_move_threshold of its cache.
 */
 static bool moving_pred(struct keybuf *buf, struct bkey *k)
 {
 	struct cache_set *c = container_of(buf, struct cache_set,
 					   moving_gc_keys);
 	unsigned ptr = 0;
 	while (ptr < KEY_PTRS(k)) {
 		struct cache *ca = PTR_CACHE(c, k, ptr);
 		struct bucket *g = PTR_BUCKET(c, k, ptr);
 		if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
 			return true;
 		ptr++;
 	}
 	return false;
 }
 /* Moving GC - IO loop */
 /* Closure destructor: free the moving_io that embeds @cl */
 static void moving_io_destructor(struct closure *cl)
 {
 	kfree(container_of(cl, struct moving_io, s.cl));
 }
 static void write_moving_finish(struct closure *cl)
 {
 	struct moving_io *io = container_of(cl, struct moving_io, s.cl);
 	struct bio *bio = &io->bio.bio;
 	struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt);
 	while (bv-- != bio->bi_io_vec)
 		__free_page(bv->bv_page);
 	pr_debug("%s %s", io->s.op.insert_collision
 		 ? "collision moving" : "moved",
 		 pkey(&io->w->key));
 	bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
 	atomic_dec_bug(&io->s.op.c->in_flight);
 	closure_wake_up(&io->s.op.c->moving_gc_wait);
 	closure_return_with_destructor(cl, moving_io_destructor);
 }
 /*
 * Completion of the read half of a move: remember a read error in the search
 * so write_moving() skips the write, then do the common bbio completion.
 */
 static void read_moving_endio(struct bio *bio, int error)
 {
 	struct closure *cl = bio->bi_private;
 	struct moving_io *io = container_of(cl, struct moving_io, s.cl);
 	if (error)
 		io->s.error = error;
 	bch_bbio_endio(io->s.op.c, bio, error, "reading data to move");
 }
 /*
 * (Re)initialise the bio embedded in @io for the key in io->w. Used once for
 * the read in read_moving() and again for the write in write_moving().
 */
 static void moving_init(struct moving_io *io)
 {
 	struct bio *bio = &io->bio.bio;
 	bio_init(bio);
 	bio_get(bio);
 	bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
 	/* KEY_SIZE() is shifted by 9 here, i.e. treated as 512 byte sectors */
 	bio->bi_size		= KEY_SIZE(&io->w->key) << 9;
 	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&io->w->key),
 					       PAGE_SECTORS);
 	bio->bi_private		= &io->s.cl;
 	/* The vecs live right behind the bio, see the kzalloc() in read_moving() */
 	bio->bi_io_vec		= bio->bi_inline_vecs;
 	bio_map(bio, NULL);
 }
 static void write_moving(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
 	struct moving_io *io = container_of(s, struct moving_io, s);
 	if (!s->error) {
 		trace_bcache_write_moving(&io->bio.bio);
 		moving_init(io);
 		io->bio.bio.bi_sector	= KEY_START(&io->w->key);
 		s->op.lock		= -1;
 		s->op.write_prio	= 1;
 		s->op.cache_bio		= &io->bio.bio;
 		s->writeback		= KEY_DIRTY(&io->w->key);
 		s->op.csum		= KEY_CSUM(&io->w->key);
 		s->op.type = BTREE_REPLACE;
 		bkey_copy(&s->op.replace, &io->w->key);
 		closure_init(&s->op.cl, cl);
 		bch_insert_data(&s->op.cl);
 	}
 	continue_at(cl, write_moving_finish, NULL);
 }
 /* Submit the read for one key being moved, then continue in write_moving() */
 static void read_moving_submit(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
 	struct moving_io *io = container_of(s, struct moving_io, s);
 	struct bio *read_bio = &io->bio.bio;
 	trace_bcache_read_moving(read_bio);
 	bch_submit_bbio(read_bio, s->op.c, &io->w->key, 0);
 	continue_at(cl, write_moving, bch_gc_wq);
 }
 /*
 * Main loop of moving gc: pull keys selected by moving_pred() out of the
 * keybuf and start an asynchronous read + rewrite for each, keeping the
 * number of moves in flight below 64.
 */
 static void read_moving(struct closure *cl)
 {
 	struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
 	struct keybuf_key *w;
 	struct moving_io *io;
 	struct bio *bio;
 	/* XXX: if we error, background writeback could stall indefinitely */
 	while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
 		w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
 		if (!w)
 			break;
 		/* The bio_vecs for the data pages are allocated behind the struct */
 		io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
 			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
 			     GFP_KERNEL);
 		if (!io)
 			goto err;
 		w->private	= io;
 		io->w		= w;
 		io->s.op.inode	= KEY_INODE(&w->key);
 		io->s.op.c	= c;
 		moving_init(io);
 		bio = &io->bio.bio;
 		bio->bi_rw	= READ;
 		bio->bi_end_io	= read_moving_endio;
 		if (bio_alloc_pages(bio, GFP_KERNEL))
 			goto err;
 		pr_debug("%s", pkey(&w->key));
 		closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
 		/* Throttle: wait for some moves to finish, then resume scanning */
 		if (atomic_inc_return(&c->in_flight) >= 64) {
 			closure_wait_event(&c->moving_gc_wait, cl,
 					   atomic_read(&c->in_flight) < 64);
 			continue_at(cl, read_moving, bch_gc_wq);
 		}
 	}
 	/* Only reachable via the err label: undo the setup for the failed key */
 	if (0) {
 err:		if (!IS_ERR_OR_NULL(w->private))
 			kfree(w->private);
 		bch_keybuf_del(&c->moving_gc_keys, w);
 	}
 	closure_return(cl);
 }
 /*
 * Moving gc entry point: for every cache pick a gc_move_threshold (in sectors
 * used per bucket) such that the data in the buckets selected fits into
 * reserve_sectors, then rescan the keys from ZERO_KEY and move those that
 * moving_pred() selects.
 *
 * NOTE(review): the early exit below relies on closure_return() returning
 * from this function - confirm against its definition.
 */
 void bch_moving_gc(struct closure *cl)
 {
 	struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
 	struct cache *ca;
 	struct bucket *b;
 	unsigned i;
 	/* Nested functions (GCC extension) used as heap callbacks below */
 	bool bucket_cmp(struct bucket *l, struct bucket *r)
 	{
 		return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
 	}
 	/* Sectors used by the bucket currently at the top of ca's heap */
 	unsigned top(struct cache *ca)
 	{
 		return GC_SECTORS_USED(heap_peek(&ca->heap));
 	}
 	if (!c->copy_gc_enabled)
 		closure_return(cl);
 	mutex_lock(&c->bucket_lock);
 	for_each_cache(ca, c, i) {
 		unsigned sectors_to_move = 0;
 		/* Budget: what is on the free fifo, capped at half its size */
 		unsigned reserve_sectors = ca->sb.bucket_size *
 			min(fifo_used(&ca->free), ca->free.size / 2);
 		ca->heap.used = 0;
 		for_each_bucket(b, ca) {
 			/* Buckets with nothing in use have nothing to move */
 			if (!GC_SECTORS_USED(b))
 				continue;
 			if (!heap_full(&ca->heap)) {
 				sectors_to_move += GC_SECTORS_USED(b);
 				heap_add(&ca->heap, b, bucket_cmp);
 			} else if (bucket_cmp(b, heap_peek(&ca->heap))) {
 				/* b holds less than the heap's top: replace it */
 				sectors_to_move -= top(ca);
 				sectors_to_move += GC_SECTORS_USED(b);
 				ca->heap.data[0] = b;
 				heap_sift(&ca->heap, 0, bucket_cmp);
 			}
 		}
 		/* Drop buckets from the heap until the rest fits the budget */
 		while (sectors_to_move > reserve_sectors) {
 			heap_pop(&ca->heap, b, bucket_cmp);
 			sectors_to_move -= GC_SECTORS_USED(b);
 		}
 		ca->gc_move_threshold = top(ca);
 		pr_debug("threshold %u", ca->gc_move_threshold);
 	}
 	mutex_unlock(&c->bucket_lock);
 	/* Restart the key scan from the beginning */
 	c->moving_gc_keys.last_scanned = ZERO_KEY;
 	closure_init(&c->moving_gc, cl);
 	read_moving(&c->moving_gc);
 	closure_return(cl);
 }
 /* Set up the moving gc keybuf of @c with moving_pred() as its predicate */
 void bch_moving_init_cache_set(struct cache_set *c)
 {
 	struct keybuf *buf = &c->moving_gc_keys;
 	bch_keybuf_init(buf, moving_pred);
 }
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@ -0,0 +1,62 @@
 #ifndef _BCACHE_REQUEST_H_
 #define _BCACHE_REQUEST_H_
 #include <linux/cgroup.h>
 /* Per request state; also embedded in struct moving_io for moving gc */
 struct search {
 	/* Stack frame for bio_complete */
 	struct closure		cl;
 	struct bcache_device	*d;
 	struct task_struct	*task;
 	struct bbio		bio;
 	struct bio		*orig_bio;
 	struct bio		*cache_miss;
 	unsigned		cache_bio_sectors;
 	unsigned		recoverable:1;
 	unsigned		unaligned_bvec:1;
 	unsigned		write:1;
 	unsigned		writeback:1;
 	/* IO error returned to s->bio */
 	short			error;
 	unsigned long		start_time;
 	/* Anything past op->keys won't get zeroed in do_bio_hook */
 	struct btree_op		op;
 };
 void bch_cache_read_endio(struct bio *, int);
 int bch_get_congested(struct cache_set *);
 void bch_insert_data(struct closure *cl);
 void bch_btree_insert_async(struct closure *);
 void bch_cache_read_endio(struct bio *, int);
 void bch_open_buckets_free(struct cache_set *);
 int bch_open_buckets_alloc(struct cache_set *);
 void bch_cached_dev_request_init(struct cached_dev *dc);
 void bch_flash_dev_request_init(struct bcache_device *d);
 extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
 /* Per cgroup bcache settings and statistics */
 struct bch_cgroup {
 	/* Only present when bcache cgroup support is configured in */
 #ifdef CONFIG_CGROUP_BCACHE
 	struct cgroup_subsys_state	css;
 #endif
 	/*
 	 * We subtract one from the index into bch_cache_modes[], so that
 	 * default == -1; this makes it so the rest match up with d->cache_mode,
 	 * and we use d->cache_mode if cgrp->cache_mode < 0
 	 */
 	short				cache_mode;
 	bool				verify;
 	/* Raw counters, bumped from bch_mark_cache_accounting() */
 	struct cache_stat_collector	stats;
 };
 struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio);
 #endif /* _BCACHE_REQUEST_H_ */
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@ -0,0 +1,245 @@
 /*
 * bcache stats code
 *
 * Copyright 2012 Google, Inc.
 */
 #include "bcache.h"
 #include "stats.h"
 #include "btree.h"
 #include "request.h"
 #include "sysfs.h"
 /*
 * We keep absolute totals of various statistics, and additionally a set of three
 * rolling averages.
 *
 * Every so often, a timer goes off and rescales the rolling averages.
 * The *_RESCALE constants are how many times the timer has to go off before we
 * rescale each set of numbers; that gets us half lives of 5 minutes, one hour,
 * and one day.
 *
 * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
 * and accounting_weight is what we use to rescale:
 *
 * pow(31 / 32, 22) ~= 1/2
 *
 * So that we don't have to increment each set of numbers every time we (say)
 * get a cache hit, we increment a single atomic_t in acc->collector, and when
 * the rescale function runs it resets the atomic counter to 0 and adds its
 * old value to each of the exported numbers.
 *
 * To reduce rounding error, the numbers in struct cache_stats are all
 * stored left shifted by 16, and scaled back in the sysfs show() function.
 */
 /* Number of timer ticks between rescales of each rolling average */
 static const unsigned DAY_RESCALE		= 288;
 static const unsigned HOUR_RESCALE		= 12;
 static const unsigned FIVE_MINUTE_RESCALE	= 1;
 /* Timer period in jiffies: 22 ticks every 300 seconds */
 static const unsigned accounting_delay		= (HZ * 300) / 22;
 /* Weight handed to ewma_add() when decaying a counter */
 static const unsigned accounting_weight		= 32;
 /* sysfs reading/writing */
 read_attribute(cache_hits);
 read_attribute(cache_misses);
 read_attribute(cache_bypass_hits);
 read_attribute(cache_bypass_misses);
 read_attribute(cache_hit_ratio);
 read_attribute(cache_readaheads);
 read_attribute(cache_miss_collisions);
 read_attribute(bypassed);
 /* sysfs show() for one set of statistics (total or one rolling average) */
 SHOW(bch_stats)
 {
 	struct cache_stats *s =
 		container_of(kobj, struct cache_stats, kobj);
 	/* Counters are kept left shifted by 16; scale them back for display */
 #define var(stat)		(s->stat >> 16)
 	var_print(cache_hits);
 	var_print(cache_misses);
 	var_print(cache_bypass_hits);
 	var_print(cache_bypass_misses);
 	/* Hit ratio as a percentage of hits + misses */
 	sysfs_print(cache_hit_ratio,
 		    DIV_SAFE(var(cache_hits) * 100,
 			     var(cache_hits) + var(cache_misses)));
 	var_print(cache_readaheads);
 	var_print(cache_miss_collisions);
 	/* Sectors are shifted by 9 to report bytes */
 	sysfs_hprint(bypassed,	var(sectors_bypassed) << 9);
 #undef var
 	return 0;
 }
 /* sysfs store(): no writable attributes, the input is accepted and ignored */
 STORE(bch_stats)
 {
 	return size;
 }
 /*
 * kobject release: intentionally empty. The stats kobjects are embedded in
 * struct cache_accounting, so there is nothing to free here.
 */
 static void bch_stats_release(struct kobject *k)
 {
 }
 /* NULL terminated list of the attributes handled by SHOW(bch_stats) */
 static struct attribute *bch_stats_files[] = {
 	&sysfs_cache_hits,
 	&sysfs_cache_misses,
 	&sysfs_cache_bypass_hits,
 	&sysfs_cache_bypass_misses,
 	&sysfs_cache_hit_ratio,
 	&sysfs_cache_readaheads,
 	&sysfs_cache_miss_collisions,
 	&sysfs_bypassed,
 	NULL
 };
 static KTYPE(bch_stats);
 static void scale_accounting(unsigned long data);
 void bch_cache_accounting_init(struct cache_accounting *acc, struct closure *parent)
 {
 	kobject_init(&acc->total.kobj,		&bch_stats_ktype);
 	kobject_init(&acc->five_minute.kobj,	&bch_stats_ktype);
 	kobject_init(&acc->hour.kobj,		&bch_stats_ktype);
 	kobject_init(&acc->day.kobj,		&bch_stats_ktype);
 	closure_init(&acc->cl, parent);
 	init_timer(&acc->timer);
 	acc->timer.expires	= jiffies + accounting_delay;
 	acc->timer.data		= (unsigned long) acc;
 	acc->timer.function	= scale_accounting;
 	add_timer(&acc->timer);
 }
 /*
 * Add the four stats directories below @parent in sysfs. Stops at the first
 * failure and returns its error code; returns 0 if all were added.
 */
 int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
 				   struct kobject *parent)
 {
 	int ret;
 	ret = kobject_add(&acc->total.kobj, parent, "stats_total");
 	if (ret)
 		return ret;
 	ret = kobject_add(&acc->five_minute.kobj, parent, "stats_five_minute");
 	if (ret)
 		return ret;
 	ret = kobject_add(&acc->hour.kobj, parent, "stats_hour");
 	if (ret)
 		return ret;
 	return kobject_add(&acc->day.kobj, parent, "stats_day");
 }
 /*
 * Zero the seven counters of the "total" statistics. Relies on them being
 * consecutive unsigned longs starting at cache_hits in struct cache_stats.
 */
 void bch_cache_accounting_clear(struct cache_accounting *acc)
 {
 	unsigned long *first = &acc->total.cache_hits;
 	memset(first, 0, 7 * sizeof(*first));
 }
 /*
 * Tear down accounting: drop the kobject references and stop the timer.
 * Exactly one side finishes the closure: if the timer was still pending we
 * do it here, otherwise scale_accounting() does once it sees ->closing.
 */
 void bch_cache_accounting_destroy(struct cache_accounting *acc)
 {
 	kobject_put(&acc->total.kobj);
 	kobject_put(&acc->five_minute.kobj);
 	kobject_put(&acc->hour.kobj);
 	kobject_put(&acc->day.kobj);
 	/* Tell scale_accounting() not to rearm the timer */
 	atomic_set(&acc->closing, 1);
 	if (del_timer_sync(&acc->timer))
 		closure_return(&acc->cl);
 }
 /* EWMA scaling */
 /* Decay one counter: fold a zero sample into it with accounting_weight */
 static void scale_stat(unsigned long *stat)
 {
 	unsigned long v = *stat;
 	v = ewma_add(v, 0, accounting_weight, 0);
 	*stat = v;
 }
 /*
 * Called on every timer tick for each set of statistics; every @rescale_at
 * ticks all seven counters of @stats are decayed with scale_stat().
 */
 static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
 {
 	if (++stats->rescale != rescale_at)
 		return;
 	stats->rescale = 0;
 	scale_stat(&stats->cache_hits);
 	scale_stat(&stats->cache_misses);
 	scale_stat(&stats->cache_bypass_hits);
 	scale_stat(&stats->cache_bypass_misses);
 	scale_stat(&stats->cache_readaheads);
 	scale_stat(&stats->cache_miss_collisions);
 	scale_stat(&stats->sectors_bypassed);
 }
 /*
 * Timer callback: move what the collector gathered since the last tick into
 * all four sets of statistics, decay the rolling averages that are due, and
 * rearm the timer unless accounting is being torn down.
 */
 static void scale_accounting(unsigned long data)
 {
 	struct cache_accounting *acc = (struct cache_accounting *) data;
 	/* Grab and reset one collector counter, add it (shifted by 16) everywhere */
 #define move_stat(name) do {						\
 	unsigned t = atomic_xchg(&acc->collector.name, 0);		\
 	t <<= 16;							\
 	acc->five_minute.name += t;					\
 	acc->hour.name += t;						\
 	acc->day.name += t;						\
 	acc->total.name += t;						\
 } while (0)
 	move_stat(cache_hits);
 	move_stat(cache_misses);
 	move_stat(cache_bypass_hits);
 	move_stat(cache_bypass_misses);
 	move_stat(cache_readaheads);
 	move_stat(cache_miss_collisions);
 	move_stat(sectors_bypassed);
 	/* A rescale_at of 0 is only hit when the rescale counter wraps around */
 	scale_stats(&acc->total, 0);
 	scale_stats(&acc->day, DAY_RESCALE);
 	scale_stats(&acc->hour, HOUR_RESCALE);
 	scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
 	acc->timer.expires += accounting_delay;
 	/* On teardown, finish the closure instead of rearming */
 	if (!atomic_read(&acc->closing))
 		add_timer(&acc->timer);
 	else
 		closure_return(&acc->cl);
 }
 /*
 * Bump exactly one of the four hit/miss counters in @stats, chosen by whether
 * the request was a @hit and whether it was a @bypass.
 */
 static void mark_cache_stats(struct cache_stat_collector *stats,
 			     bool hit, bool bypass)
 {
 	atomic_t *counter;
 	if (bypass)
 		counter = hit
 			? &stats->cache_bypass_hits
 			: &stats->cache_bypass_misses;
 	else
 		counter = hit
 			? &stats->cache_hits
 			: &stats->cache_misses;
 	atomic_inc(counter);
 }
 /*
 * Account one request as hit/miss (optionally bypassed) against the cached
 * device, the cache set and, with CONFIG_CGROUP_BCACHE, the bio's cgroup.
 */
 void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass)
 {
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	struct cache_stat_collector *dev_stats = &dc->accounting.collector;
 	struct cache_stat_collector *set_stats = &s->op.c->accounting.collector;
 	mark_cache_stats(dev_stats, hit, bypass);
 	mark_cache_stats(set_stats, hit, bypass);
 #ifdef CONFIG_CGROUP_BCACHE
 	mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
 #endif
 }
 void bch_mark_cache_readahead(struct search *s)
 {
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	atomic_inc(&dc->accounting.collector.cache_readaheads);
 	atomic_inc(&s->op.c->accounting.collector.cache_readaheads);
 }
 void bch_mark_cache_miss_collision(struct search *s)
 {
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	atomic_inc(&dc->accounting.collector.cache_miss_collisions);
 	atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions);
 }
 void bch_mark_sectors_bypassed(struct search *s, int sectors)
 {
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
 	atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed);
 }
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@ -0,0 +1,58 @@
 #ifndef _BCACHE_STATS_H_
 #define _BCACHE_STATS_H_
 /*
  * Raw event counters. They are bumped with atomic_inc()/atomic_add() by the
  * bch_mark_*() helpers, once for the backing device and once for the cache
  * set a request belongs to.
  */
 struct cache_stat_collector {
 	atomic_t cache_hits;
 	atomic_t cache_misses;
 	atomic_t cache_bypass_hits;
 	atomic_t cache_bypass_misses;
 	atomic_t cache_readaheads;
 	atomic_t cache_miss_collisions;
 	atomic_t sectors_bypassed;
 };
 /*
  * One accumulated set of statistics with an embedded kobject. The counter
  * fields mirror struct cache_stat_collector one for one.
  * NOTE(review): 'rescale' is presumably bookkeeping for the periodic decay
  * applied by scale_stats() -- confirm against stats.c.
  */
 struct cache_stats {
 	struct kobject		kobj;
 	unsigned long cache_hits;
 	unsigned long cache_misses;
 	unsigned long cache_bypass_hits;
 	unsigned long cache_bypass_misses;
 	unsigned long cache_readaheads;
 	unsigned long cache_miss_collisions;
 	unsigned long sectors_bypassed;
 	unsigned		rescale;
 };
 /*
  * Accounting state for one device or cache set: the live collector plus four
  * accumulated windows (total, five_minute, hour, day).
  * The timer handler in stats.c re-arms 'timer' unless 'closing' is set, in
  * which case it finishes the closure 'cl' instead.
  */
 struct cache_accounting {
 	struct closure		cl;
 	struct timer_list	timer;
 	atomic_t		closing;
 	struct cache_stat_collector collector;
 	struct cache_stats total;
 	struct cache_stats five_minute;
 	struct cache_stats hour;
 	struct cache_stats day;
 };
 /* Only pointers to struct search are used here, so a forward declaration suffices. */
 struct search;
 /* Lifetime management of a struct cache_accounting. */
 void bch_cache_accounting_init(struct cache_accounting *acc,
 			       struct closure *parent);
 int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
 				   struct kobject *parent);
 void bch_cache_accounting_clear(struct cache_accounting *acc);
 void bch_cache_accounting_destroy(struct cache_accounting *acc);
 /* Per-request event hooks; each updates the backing device and cache set counters. */
 void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass);
 void bch_mark_cache_readahead(struct search *s);
 void bch_mark_cache_miss_collision(struct search *s);
 void bch_mark_sectors_bypassed(struct search *s, int sectors);
 #endif /* _BCACHE_STATS_H_ */
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@ -0,0 +1,817 @@
 /*
 * bcache sysfs interfaces
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
 #include "bcache.h"
 #include "sysfs.h"
 #include "btree.h"
 #include "request.h"
 #include <linux/sort.h>
 /*
  * NULL-terminated list of replacement policy names. The array index is the
  * value stored in / read from the superblock via CACHE_REPLACEMENT(), so the
  * order of the entries must not change.
  */
 static const char * const cache_replacement_policies[] = {
 	"lru",
 	"fifo",
 	"random",
 	NULL
 };
 /*
  * Attribute objects shared by every kobject type in this file. Each macro
  * defines a 'static struct attribute sysfs_<name>'; the show/store handlers
  * below dispatch by comparing the attribute pointer against these objects.
  */
 /* Write-only triggers. */
 write_attribute(attach);
 write_attribute(detach);
 write_attribute(unregister);
 write_attribute(stop);
 write_attribute(clear_stats);
 write_attribute(trigger_gc);
 write_attribute(prune_cache);
 write_attribute(flash_vol_create);
 /* Read-only values. */
 read_attribute(bucket_size);
 read_attribute(block_size);
 read_attribute(nbuckets);
 read_attribute(tree_depth);
 read_attribute(root_usage_percent);
 read_attribute(priority_stats);
 read_attribute(btree_cache_size);
 read_attribute(btree_cache_max_chain);
 read_attribute(cache_available_percent);
 read_attribute(written);
 read_attribute(btree_written);
 read_attribute(metadata_written);
 read_attribute(active_journal_entries);
 sysfs_time_stats_attribute(btree_gc,	sec, ms);
 sysfs_time_stats_attribute(btree_split, sec, us);
 sysfs_time_stats_attribute(btree_sort,	ms,  us);
 sysfs_time_stats_attribute(btree_read,	ms,  us);
 sysfs_time_stats_attribute(try_harder,	ms,  us);
 read_attribute(btree_nodes);
 read_attribute(btree_used_percent);
 read_attribute(average_key_size);
 read_attribute(dirty_data);
 read_attribute(bset_tree_stats);
 read_attribute(state);
 read_attribute(cache_read_races);
 read_attribute(writeback_keys_done);
 read_attribute(writeback_keys_failed);
 read_attribute(io_errors);
 read_attribute(congested);
 /* Read-write tunables. */
 rw_attribute(congested_read_threshold_us);
 rw_attribute(congested_write_threshold_us);
 rw_attribute(sequential_cutoff);
 rw_attribute(sequential_merge);
 rw_attribute(data_csum);
 rw_attribute(cache_mode);
 rw_attribute(writeback_metadata);
 rw_attribute(writeback_running);
 rw_attribute(writeback_percent);
 rw_attribute(writeback_delay);
 rw_attribute(writeback_rate);
 rw_attribute(writeback_rate_update_seconds);
 rw_attribute(writeback_rate_d_term);
 rw_attribute(writeback_rate_p_term_inverse);
 rw_attribute(writeback_rate_d_smooth);
 read_attribute(writeback_rate_debug);
 rw_attribute(synchronous);
 rw_attribute(journal_delay_ms);
 rw_attribute(discard);
 rw_attribute(running);
 rw_attribute(label);
 rw_attribute(readahead);
 rw_attribute(io_error_limit);
 rw_attribute(io_error_halflife);
 rw_attribute(verify);
 rw_attribute(key_merging_disabled);
 rw_attribute(gc_always_rewrite);
 rw_attribute(freelist_percent);
 rw_attribute(cache_replacement_policy);
 rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
 rw_attribute(size);
 /*
  * Unlocked show handler for a cached (backing) device.
  *
  * Every sysfs_print*()/var_print*() line below is a macro that returns from
  * this function when @attr is the matching attribute, so the body reads as
  * a dispatch table. Returns the number of bytes written to @buf, or 0 if
  * @attr is not handled here.
  */
 SHOW(__bch_cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
 					     disk.kobj);
 	const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
 #define var(stat)		(dc->stat)
 	if (attr == &sysfs_cache_mode)
 		return snprint_string_list(buf, PAGE_SIZE,
 					   bch_cache_modes + 1,
 					   BDEV_CACHE_MODE(&dc->sb));
 	sysfs_printf(data_csum,		"%i", dc->disk.data_csum);
 	var_printf(verify,		"%i");
 	var_printf(writeback_metadata,	"%i");
 	var_printf(writeback_running,	"%i");
 	var_print(writeback_delay);
 	var_print(writeback_percent);
 	sysfs_print(writeback_rate,	dc->writeback_rate.rate);
 	var_print(writeback_rate_update_seconds);
 	var_print(writeback_rate_d_term);
 	var_print(writeback_rate_p_term_inverse);
 	var_print(writeback_rate_d_smooth);
 	if (attr == &sysfs_writeback_rate_debug) {
 		char dirty[20];
 		char derivative[20];
 		char target[20];
 		/* Sector counts are converted to bytes (<< 9) before formatting. */
 		hprint(dirty,
 		       atomic_long_read(&dc->disk.sectors_dirty) << 9);
 		hprint(derivative,	dc->writeback_rate_derivative << 9);
 		hprint(target,		dc->writeback_rate_target << 9);
 		return sprintf(buf,
 			       "rate:\t\t%u\n"
 			       "change:\t\t%i\n"
 			       "dirty:\t\t%s\n"
 			       "derivative:\t%s\n"
 			       "target:\t\t%s\n",
 			       dc->writeback_rate.rate,
 			       dc->writeback_rate_change,
 			       dirty, derivative, target);
 	}
 	sysfs_hprint(dirty_data,
 		     atomic_long_read(&dc->disk.sectors_dirty) << 9);
 	var_printf(sequential_merge,	"%i");
 	var_hprint(sequential_cutoff);
 	var_hprint(readahead);
 	sysfs_print(running,		atomic_read(&dc->running));
 	sysfs_print(state,		states[BDEV_STATE(&dc->sb)]);
 	if (attr == &sysfs_label) {
 		memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
 		/*
 		 * The label may fill all SB_LABEL_SIZE bytes without a NUL, so
 		 * terminate directly after the copied bytes; strcat() below
 		 * needs a defined end of string.
 		 */
 		buf[SB_LABEL_SIZE] = '\0';
 		strcat(buf, "\n");
 		return strlen(buf);
 	}
 #undef var
 	return 0;
 }
 SHOW_LOCKED(bch_cached_dev)
 STORE(__cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
 					     disk.kobj);
 	unsigned v = size;
 	struct cache_set *c;
 #define d_strtoul(var)		sysfs_strtoul(var, dc->var)
 #define d_strtoi_h(var)		sysfs_hatoi(var, dc->var)
 	sysfs_strtoul(data_csum,	dc->disk.data_csum);
 	d_strtoul(verify);
 	d_strtoul(writeback_metadata);
 	d_strtoul(writeback_running);
 	d_strtoul(writeback_delay);
 	sysfs_strtoul_clamp(writeback_rate,
 			    dc->writeback_rate.rate, 1, 1000000);
 	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
 	d_strtoul(writeback_rate_update_seconds);
 	d_strtoul(writeback_rate_d_term);
 	d_strtoul(writeback_rate_p_term_inverse);
 	sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
 			    dc->writeback_rate_p_term_inverse, 1, INT_MAX);
 	d_strtoul(writeback_rate_d_smooth);
 	d_strtoul(sequential_merge);
 	d_strtoi_h(sequential_cutoff);
 	d_strtoi_h(readahead);
 	if (attr == &sysfs_clear_stats)
 		bch_cache_accounting_clear(&dc->accounting);
 	if (attr == &sysfs_running &&
 	    strtoul_or_return(buf))
 		bch_cached_dev_run(dc);
 	if (attr == &sysfs_cache_mode) {
 		ssize_t v = read_string_list(buf, bch_cache_modes + 1);
 		if (v < 0)
 			return v;
 		if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) {
 			SET_BDEV_CACHE_MODE(&dc->sb, v);
 			bch_write_bdev_super(dc, NULL);
 		}
 	}
 	if (attr == &sysfs_label) {
 		memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
 		bch_write_bdev_super(dc, NULL);
 		if (dc->disk.c) {
 			memcpy(dc->disk.c->uuids[dc->disk.id].label,
 			       buf, SB_LABEL_SIZE);
 			bch_uuid_write(dc->disk.c);
 		}
 	}
 	if (attr == &sysfs_attach) {
 		if (parse_uuid(buf, dc->sb.set_uuid) < 16)
 			return -EINVAL;
 		list_for_each_entry(c, &bch_cache_sets, list) {
 			v = bch_cached_dev_attach(dc, c);
 			if (!v)
 				return size;
 		}
 		pr_err("Can't attach %s: cache set not found", buf);
 		size = v;
 	}
 	if (attr == &sysfs_detach && dc->disk.c)
 		bch_cached_dev_detach(dc);
 	if (attr == &sysfs_stop)
 		bcache_device_stop(&dc->disk);
 	return size;
 }
 STORE(bch_cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
 					     disk.kobj);
 	mutex_lock(&bch_register_lock);
 	size = __cached_dev_store(kobj, attr, buf, size);
 	if (attr == &sysfs_writeback_running)
 		bch_writeback_queue(dc);
 	if (attr == &sysfs_writeback_percent)
 		schedule_delayed_work(&dc->writeback_rate_update,
 				      dc->writeback_rate_update_seconds * HZ);
 	mutex_unlock(&bch_register_lock);
 	return size;
 }
 /*
  * NULL-terminated default attribute list for the cached device kobject type.
  * data_csum is compiled out; verify is only listed with CONFIG_BCACHE_DEBUG.
  */
 static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_attach,
 	&sysfs_detach,
 	&sysfs_stop,
 #if 0
 	&sysfs_data_csum,
 #endif
 	&sysfs_cache_mode,
 	&sysfs_writeback_metadata,
 	&sysfs_writeback_running,
 	&sysfs_writeback_delay,
 	&sysfs_writeback_percent,
 	&sysfs_writeback_rate,
 	&sysfs_writeback_rate_update_seconds,
 	&sysfs_writeback_rate_d_term,
 	&sysfs_writeback_rate_p_term_inverse,
 	&sysfs_writeback_rate_d_smooth,
 	&sysfs_writeback_rate_debug,
 	&sysfs_dirty_data,
 	&sysfs_sequential_cutoff,
 	&sysfs_sequential_merge,
 	&sysfs_clear_stats,
 	&sysfs_running,
 	&sysfs_state,
 	&sysfs_label,
 	&sysfs_readahead,
 #ifdef CONFIG_BCACHE_DEBUG
 	&sysfs_verify,
 #endif
 	NULL
 };
 KTYPE(bch_cached_dev);
 /*
  * Show handler for a flash-only volume: data_csum, size (sectors converted
  * to bytes) and label. Returns bytes written to @buf, or 0 if @attr is not
  * handled here.
  */
 SHOW(bch_flash_dev)
 {
 	struct bcache_device *d = container_of(kobj, struct bcache_device,
 					       kobj);
 	struct uuid_entry *u = &d->c->uuids[d->id];
 	sysfs_printf(data_csum,	"%i", d->data_csum);
 	sysfs_hprint(size,	u->sectors << 9);
 	if (attr == &sysfs_label) {
 		memcpy(buf, u->label, SB_LABEL_SIZE);
 		/*
 		 * Terminate directly after the copied bytes: the label may use
 		 * all SB_LABEL_SIZE bytes, and strcat() needs a defined end.
 		 */
 		buf[SB_LABEL_SIZE] = '\0';
 		strcat(buf, "\n");
 		return strlen(buf);
 	}
 	return 0;
 }
 /*
  * Unlocked store handler for a flash-only volume. Returns @size on success
  * or a negative errno from one of the parsing macros.
  */
 STORE(__bch_flash_dev)
 {
 	struct bcache_device *d = container_of(kobj, struct bcache_device,
 					       kobj);
 	struct uuid_entry *u = &d->c->uuids[d->id];
 	/* Macro: parses and returns if attr is data_csum. */
 	sysfs_strtoul(data_csum,	d->data_csum);
 	if (attr == &sysfs_size) {
 		uint64_t v;
 		/* Accepts human-readable suffixes; returns on parse error. */
 		strtoi_h_or_return(buf, v);
 		/* Input is in bytes; the uuid entry stores 512-byte sectors. */
 		u->sectors = v >> 9;
 		bch_uuid_write(d->c);
 		set_capacity(d->disk, u->sectors);
 	}
 	if (attr == &sysfs_label) {
 		memcpy(u->label, buf, SB_LABEL_SIZE);
 		bch_uuid_write(d->c);
 	}
 	if (attr == &sysfs_unregister) {
 		atomic_set(&d->detaching, 1);
 		bcache_device_stop(d);
 	}
 	return size;
 }
 STORE_LOCKED(bch_flash_dev)
 /*
  * NULL-terminated default attribute list for flash-only volumes; data_csum
  * is compiled out.
  */
 static struct attribute *bch_flash_dev_files[] = {
 	&sysfs_unregister,
 #if 0
 	&sysfs_data_csum,
 #endif
 	&sysfs_label,
 	&sysfs_size,
 	NULL
 };
 KTYPE(bch_flash_dev);
 /*
  * Unlocked show handler for a cache set. The nested helper functions (a GNU
  * C extension) compute derived values; the sysfs_print*() lines after them
  * are macros that return when @attr matches. Returns 0 if @attr is not
  * handled here.
  */
 SHOW(__bch_cache_set)
 {
 	/* Percentage of one btree node's bytes used by the keys iterated in the root. */
 	unsigned root_usage(struct cache_set *c)
 	{
 		unsigned bytes = 0;
 		struct bkey *k;
 		struct btree *b;
 		struct btree_iter iter;
 		/*
 		 * Take the read lock on the root, retrying if the root
 		 * pointer changed while we were acquiring the lock.
 		 */
 		goto lock_root;
 		do {
 			rw_unlock(false, b);
 lock_root:
 			b = c->root;
 			rw_lock(false, b, b->level);
 		} while (b != c->root);
 		for_each_key_filter(b, k, &iter, bch_ptr_bad)
 			bytes += bkey_bytes(k);
 		rw_unlock(false, b);
 		return (bytes * 100) / btree_bytes(c);
 	}
 	/* Total bytes of all nodes on c->btree_cache, summed under bucket_lock. */
 	size_t cache_size(struct cache_set *c)
 	{
 		size_t ret = 0;
 		struct btree *b;
 		mutex_lock(&c->bucket_lock);
 		list_for_each_entry(b, &c->btree_cache, list)
 			ret += 1 << (b->page_order + PAGE_SHIFT);
 		mutex_unlock(&c->bucket_lock);
 		return ret;
 	}
 	/* Length of the longest chain in the bucket hash table. */
 	unsigned cache_max_chain(struct cache_set *c)
 	{
 		unsigned ret = 0;
 		struct hlist_head *h;
 		mutex_lock(&c->bucket_lock);
 		for (h = c->bucket_hash;
 		     h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
 		     h++) {
 			unsigned i = 0;
 			struct hlist_node *p;
 			hlist_for_each(p, h)
 				i++;
 			ret = max(ret, i);
 		}
 		mutex_unlock(&c->bucket_lock);
 		return ret;
 	}
 	/* key_bytes as a percentage of (nodes * node size); nodes == 0 is treated as 1. */
 	unsigned btree_used(struct cache_set *c)
 	{
 		return div64_u64(c->gc_stats.key_bytes * 100,
 				 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
 	}
 	/* gc_stats.data / gc_stats.nkeys, or 0 when there are no keys. */
 	unsigned average_key_size(struct cache_set *c)
 	{
 		return c->gc_stats.nkeys
 			? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
 			: 0;
 	}
 	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
 	sysfs_print(synchronous,		CACHE_SYNC(&c->sb));
 	sysfs_print(journal_delay_ms,		c->journal_delay_ms);
 	sysfs_hprint(bucket_size,		bucket_bytes(c));
 	sysfs_hprint(block_size,		block_bytes(c));
 	sysfs_print(tree_depth,			c->root->level);
 	sysfs_print(root_usage_percent,		root_usage(c));
 	sysfs_hprint(btree_cache_size,		cache_size(c));
 	sysfs_print(btree_cache_max_chain,	cache_max_chain(c));
 	sysfs_print(cache_available_percent,	100 - c->gc_stats.in_use);
 	sysfs_print_time_stats(&c->btree_gc_time,	btree_gc, sec, ms);
 	sysfs_print_time_stats(&c->btree_split_time,	btree_split, sec, us);
 	sysfs_print_time_stats(&c->sort_time,		btree_sort, ms, us);
 	sysfs_print_time_stats(&c->btree_read_time,	btree_read, ms, us);
 	sysfs_print_time_stats(&c->try_harder_time,	try_harder, ms, us);
 	sysfs_print(btree_used_percent,	btree_used(c));
 	sysfs_print(btree_nodes,	c->gc_stats.nodes);
 	sysfs_hprint(dirty_data,	c->gc_stats.dirty);
 	sysfs_hprint(average_key_size,	average_key_size(c));
 	sysfs_print(cache_read_races,
 		    atomic_long_read(&c->cache_read_races));
 	sysfs_print(writeback_keys_done,
 		    atomic_long_read(&c->writeback_keys_done));
 	sysfs_print(writeback_keys_failed,
 		    atomic_long_read(&c->writeback_keys_failed));
 	/* See count_io_errors for why 88 */
 	sysfs_print(io_error_halflife,	c->error_decay * 88);
 	sysfs_print(io_error_limit,	c->error_limit >> IO_ERROR_SHIFT);
 	sysfs_hprint(congested,
 		     ((uint64_t) bch_get_congested(c)) << 9);
 	sysfs_print(congested_read_threshold_us,
 		    c->congested_read_threshold_us);
 	sysfs_print(congested_write_threshold_us,
 		    c->congested_write_threshold_us);
 	sysfs_print(active_journal_entries,	fifo_used(&c->journal.pin));
 	sysfs_printf(verify,			"%i", c->verify);
 	sysfs_printf(key_merging_disabled,	"%i", c->key_merging_disabled);
 	sysfs_printf(gc_always_rewrite,		"%i", c->gc_always_rewrite);
 	sysfs_printf(btree_shrinker_disabled,	"%i", c->shrinker_disabled);
 	sysfs_printf(copy_gc_enabled,		"%i", c->copy_gc_enabled);
 	if (attr == &sysfs_bset_tree_stats)
 		return bch_bset_print_stats(c, buf);
 	return 0;
 }
 SHOW_LOCKED(bch_cache_set)
 /*
  * Unlocked store handler for a cache set. Explicit 'if' blocks fall through
  * to the final 'return size'; the sysfs_strtoul() lines and the
  * *_or_return() helpers are macros that may return from this function
  * themselves (with @size or a negative errno).
  */
 STORE(__bch_cache_set)
 {
 	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
 	if (attr == &sysfs_unregister)
 		bch_cache_set_unregister(c);
 	if (attr == &sysfs_stop)
 		bch_cache_set_stop(c);
 	if (attr == &sysfs_synchronous) {
 		bool sync = strtoul_or_return(buf);
 		/* Only rewrite the superblock when the flag really changes. */
 		if (sync != CACHE_SYNC(&c->sb)) {
 			SET_CACHE_SYNC(&c->sb, sync);
 			bcache_write_super(c);
 		}
 	}
 	if (attr == &sysfs_flash_vol_create) {
 		int r;
 		uint64_t v;
 		strtoi_h_or_return(buf, v);
 		r = bch_flash_dev_create(c, v);
 		if (r)
 			return r;
 	}
 	if (attr == &sysfs_clear_stats) {
 		atomic_long_set(&c->writeback_keys_done,	0);
 		atomic_long_set(&c->writeback_keys_failed,	0);
 		memset(&c->gc_stats, 0, sizeof(struct gc_stat));
 		bch_cache_accounting_clear(&c->accounting);
 	}
 	if (attr == &sysfs_trigger_gc)
 		bch_queue_gc(c);
 	if (attr == &sysfs_prune_cache) {
 		/* Invoke the set's shrinker callback directly with the requested scan count. */
 		struct shrink_control sc;
 		sc.gfp_mask = GFP_KERNEL;
 		sc.nr_to_scan = strtoul_or_return(buf);
 		c->shrink.shrink(&c->shrink, &sc);
 	}
 	sysfs_strtoul(congested_read_threshold_us,
 		      c->congested_read_threshold_us);
 	sysfs_strtoul(congested_write_threshold_us,
 		      c->congested_write_threshold_us);
 	/* Stored shifted; the show side shifts back by IO_ERROR_SHIFT. */
 	if (attr == &sysfs_io_error_limit)
 		c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
 	/* See count_io_errors() for why 88 */
 	if (attr == &sysfs_io_error_halflife)
 		c->error_decay = strtoul_or_return(buf) / 88;
 	sysfs_strtoul(journal_delay_ms,		c->journal_delay_ms);
 	sysfs_strtoul(verify,			c->verify);
 	sysfs_strtoul(key_merging_disabled,	c->key_merging_disabled);
 	sysfs_strtoul(gc_always_rewrite,	c->gc_always_rewrite);
 	sysfs_strtoul(btree_shrinker_disabled,	c->shrinker_disabled);
 	sysfs_strtoul(copy_gc_enabled,		c->copy_gc_enabled);
 	return size;
 }
 STORE_LOCKED(bch_cache_set)
 /*
  * Show handler for the 'internal' kobject: map it back to the owning cache
  * set and forward to the locked cache set show handler.
  */
 SHOW(bch_cache_set_internal)
 {
 	struct cache_set *owner;
 	owner = container_of(kobj, struct cache_set, internal);
 	return bch_cache_set_show(&owner->kobj, attr, buf);
 }
 /*
  * Store handler for the 'internal' kobject: map it back to the owning cache
  * set and forward to the locked cache set store handler.
  */
 STORE(bch_cache_set_internal)
 {
 	struct cache_set *owner;
 	owner = container_of(kobj, struct cache_set, internal);
 	return bch_cache_set_store(&owner->kobj, attr, buf, size);
 }
 /*
  * Intentionally empty release callback for the 'internal' kobject, which is
  * embedded in struct cache_set (see the container_of() users above).
  * NOTE(review): presumably the memory is freed with the cache set itself --
  * confirm against the cache set release path.
  */
 static void bch_cache_set_internal_release(struct kobject *k)
 {
 }
 /* NULL-terminated default attribute list for the cache set kobject type. */
 static struct attribute *bch_cache_set_files[] = {
 	&sysfs_unregister,
 	&sysfs_stop,
 	&sysfs_synchronous,
 	&sysfs_journal_delay_ms,
 	&sysfs_flash_vol_create,
 	&sysfs_bucket_size,
 	&sysfs_block_size,
 	&sysfs_tree_depth,
 	&sysfs_root_usage_percent,
 	&sysfs_btree_cache_size,
 	&sysfs_cache_available_percent,
 	&sysfs_average_key_size,
 	&sysfs_dirty_data,
 	&sysfs_io_error_limit,
 	&sysfs_io_error_halflife,
 	&sysfs_congested,
 	&sysfs_congested_read_threshold_us,
 	&sysfs_congested_write_threshold_us,
 	&sysfs_clear_stats,
 	NULL
 };
 KTYPE(bch_cache_set);
 /*
  * NULL-terminated default attribute list for the cache set's 'internal'
  * kobject. The sysfs_time_stats_attribute_list() entries carry no trailing
  * comma here, so the macro must supply its own separators. verify and
  * key_merging_disabled are only listed with CONFIG_BCACHE_DEBUG.
  */
 static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_active_journal_entries,
 	sysfs_time_stats_attribute_list(btree_gc, sec, ms)
 	sysfs_time_stats_attribute_list(btree_split, sec, us)
 	sysfs_time_stats_attribute_list(btree_sort, ms, us)
 	sysfs_time_stats_attribute_list(btree_read, ms, us)
 	sysfs_time_stats_attribute_list(try_harder, ms, us)
 	&sysfs_btree_nodes,
 	&sysfs_btree_used_percent,
 	&sysfs_btree_cache_max_chain,
 	&sysfs_bset_tree_stats,
 	&sysfs_cache_read_races,
 	&sysfs_writeback_keys_done,
 	&sysfs_writeback_keys_failed,
 	&sysfs_trigger_gc,
 	&sysfs_prune_cache,
 #ifdef CONFIG_BCACHE_DEBUG
 	&sysfs_verify,
 	&sysfs_key_merging_disabled,
 #endif
 	&sysfs_gc_always_rewrite,
 	&sysfs_btree_shrinker_disabled,
 	&sysfs_copy_gc_enabled,
 	NULL
 };
 KTYPE(bch_cache_set_internal);
 /*
  * Unlocked show handler for a single cache device. The sysfs_print*() lines
  * are macros that return when @attr matches. priority_stats is computed
  * inline from a sorted snapshot of the bucket priorities. Returns 0 if @attr
  * is not handled here.
  */
 SHOW(__bch_cache)
 {
 	struct cache *ca = container_of(kobj, struct cache, kobj);
 	sysfs_hprint(bucket_size,	bucket_bytes(ca));
 	sysfs_hprint(block_size,	block_bytes(ca));
 	sysfs_print(nbuckets,		ca->sb.nbuckets);
 	sysfs_print(discard,		ca->discard);
 	sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9);
 	sysfs_hprint(btree_written,
 		     atomic_long_read(&ca->btree_sectors_written) << 9);
 	sysfs_hprint(metadata_written,
 		     (atomic_long_read(&ca->meta_sectors_written) +
 		      atomic_long_read(&ca->btree_sectors_written)) << 9);
 	sysfs_print(io_errors,
 		    atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
 	sysfs_print(freelist_percent, ca->free.size * 100 /
 		    ((size_t) ca->sb.nbuckets));
 	if (attr == &sysfs_cache_replacement_policy)
 		return snprint_string_list(buf, PAGE_SIZE,
 					   cache_replacement_policies,
 					   CACHE_REPLACEMENT(&ca->sb));
 	if (attr == &sysfs_priority_stats) {
 		/* Descending order: highest priority first, zeros last. */
 		int cmp(const void *l, const void *r)
 		{	return *((uint16_t *) r) - *((uint16_t *) l); }
 		/* Number of quantiles we compute */
 		const unsigned nq = 31;
 		size_t n = ca->sb.nbuckets, i, unused, btree;
 		uint64_t sum = 0;
 		uint16_t q[nq], *p, *cached;
 		ssize_t ret;
 		/*
 		 * Zeroed allocation: the copy loop below starts at
 		 * first_bucket, so the leading slots would otherwise be
 		 * sorted and reported as uninitialised memory. As zeros
 		 * they sort to the end and are counted as unused.
 		 */
 		cached = p = vzalloc(ca->sb.nbuckets * sizeof(uint16_t));
 		if (!p)
 			return -ENOMEM;
 		mutex_lock(&ca->set->bucket_lock);
 		for (i = ca->sb.first_bucket; i < n; i++)
 			p[i] = ca->buckets[i].prio;
 		mutex_unlock(&ca->set->bucket_lock);
 		sort(p, n, sizeof(uint16_t), cmp, NULL);
 		/* Trailing zero priorities are the unused buckets. */
 		while (n &&
 		       !cached[n - 1])
 			--n;
 		unused = ca->sb.nbuckets - n;
 		/* Leading BTREE_PRIO entries are btree (metadata) buckets. */
 		while (cached < p + n &&
 		       *cached == BTREE_PRIO)
 			cached++;
 		btree = cached - p;
 		n -= btree;
 		/* 'cached' now points at the n remaining data buckets. */
 		for (i = 0; i < n; i++)
 			sum += INITIAL_PRIO - cached[i];
 		if (n)
 			do_div(sum, n);
 		/* NOTE(review): with n == 0 this still reads cached[0] -- confirm that is in bounds. */
 		for (i = 0; i < nq; i++)
 			q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)];
 		vfree(p);
 		ret = snprintf(buf, PAGE_SIZE,
 			       "Unused:		%zu%%\n"
 			       "Metadata:	%zu%%\n"
 			       "Average:	%llu\n"
 			       "Sectors per Q:	%zu\n"
 			       "Quantiles:	[",
 			       unused * 100 / (size_t) ca->sb.nbuckets,
 			       btree * 100 / (size_t) ca->sb.nbuckets, sum,
 			       n * ca->sb.bucket_size / (nq + 1));
 		for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++)
 			ret += snprintf(buf + ret, PAGE_SIZE - ret,
 					i < nq - 1 ? "%u " : "%u]\n", q[i]);
 		buf[PAGE_SIZE - 1] = '\0';
 		return ret;
 	}
 	return 0;
 }
 SHOW_LOCKED(bch_cache)
 /*
  * Unlocked store handler for a single cache device. Returns @size on
  * success or a negative errno (strtoul_or_return() returns on parse error).
  */
 STORE(__bch_cache)
 {
 	struct cache *ca = container_of(kobj, struct cache, kobj);
 	if (attr == &sysfs_discard) {
 		bool v = strtoul_or_return(buf);
 		/* The runtime flag is only changed if the queue supports discard... */
 		if (blk_queue_discard(bdev_get_queue(ca->bdev)))
 			ca->discard = v;
 		/* ...but the superblock setting is updated regardless. */
 		if (v != CACHE_DISCARD(&ca->sb)) {
 			SET_CACHE_DISCARD(&ca->sb, v);
 			bcache_write_super(ca->set);
 		}
 	}
 	if (attr == &sysfs_cache_replacement_policy) {
 		ssize_t v = read_string_list(buf, cache_replacement_policies);
 		if (v < 0)
 			return v;
 		if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) {
 			mutex_lock(&ca->set->bucket_lock);
 			SET_CACHE_REPLACEMENT(&ca->sb, v);
 			mutex_unlock(&ca->set->bucket_lock);
 			bcache_write_super(ca->set);
 		}
 	}
 	if (attr == &sysfs_freelist_percent) {
 		DECLARE_FIFO(long, free);
 		long i;
 		size_t p = strtoul_or_return(buf);
 		/*
 		 * Convert the percentage to a bucket count, bounded below by
 		 * roundup_pow_of_two(nbuckets) >> 9 and above by nbuckets / 2.
 		 */
 		p = clamp_t(size_t,
 			    ((size_t) ca->sb.nbuckets * p) / 100,
 			    roundup_pow_of_two(ca->sb.nbuckets) >> 9,
 			    ca->sb.nbuckets / 2);
 		if (!init_fifo_exact(&free, p, GFP_KERNEL))
 			return -ENOMEM;
 		/*
 		 * Replace ca->free with the newly sized fifo under bucket_lock.
 		 * NOTE(review): presumably fifo_move() fills the new fifo from
 		 * the old one and fifo_swap() then exchanges them, leaving any
 		 * entries that did not fit in the local 'free' -- confirm
 		 * against the fifo helpers.
 		 */
 		mutex_lock(&ca->set->bucket_lock);
 		fifo_move(&free, &ca->free);
 		fifo_swap(&free, &ca->free);
 		mutex_unlock(&ca->set->bucket_lock);
 		/* Drop the pin on every bucket still in the local fifo. */
 		while (fifo_pop(&free, i))
 			atomic_dec(&ca->buckets[i].pin);
 		free_fifo(&free);
 	}
 	if (attr == &sysfs_clear_stats) {
 		atomic_long_set(&ca->sectors_written, 0);
 		atomic_long_set(&ca->btree_sectors_written, 0);
 		atomic_long_set(&ca->meta_sectors_written, 0);
 		atomic_set(&ca->io_count, 0);
 		atomic_set(&ca->io_errors, 0);
 	}
 	return size;
 }
 STORE_LOCKED(bch_cache)
 /* NULL-terminated default attribute list for the cache device kobject type. */
 static struct attribute *bch_cache_files[] = {
 	&sysfs_bucket_size,
 	&sysfs_block_size,
 	&sysfs_nbuckets,
 	&sysfs_priority_stats,
 	&sysfs_discard,
 	&sysfs_written,
 	&sysfs_btree_written,
 	&sysfs_metadata_written,
 	&sysfs_io_errors,
 	&sysfs_clear_stats,
 	&sysfs_freelist_percent,
 	&sysfs_cache_replacement_policy,
 	NULL
 };
 KTYPE(bch_cache);
--- a/drivers/md/bcache/sysfs.h
+++ b/drivers/md/bcache/sysfs.h
@ -0,0 +1,110 @@
 #ifndef _BCACHE_SYSFS_H_
 #define _BCACHE_SYSFS_H_
 /*
  * Define 'struct kobj_type <type>_ktype' wired to <type>_release,
  * <type>_show, <type>_store and the <type>_files attribute list, all of
  * which must already be visible at the point of use.
  */
 #define KTYPE(type)							\
 struct kobj_type type ## _ktype = {					\
 	.release	= type ## _release,				\
 	.sysfs_ops	= &((const struct sysfs_ops) {			\
 		.show	= type ## _show,				\
 		.store	= type ## _store				\
 	}),								\
 	.default_attrs	= type ## _files				\
 }
 /*
  * Emit the signature of a static sysfs show handler named <fn>_show; the
  * caller supplies the function body. The last line of the macro carries no
  * line-continuation backslash, so the definition ends here and cannot splice
  * in whatever line follows it.
  */
 #define SHOW(fn)							\
 static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
 			   char *buf)
 /*
  * Emit the signature of a static sysfs store handler named <fn>_store; the
  * caller supplies the function body.
  */
 #define STORE(fn)							\
 static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
 			    const char *buf, size_t size)
 /*
  * Define <fn>_show as a wrapper that calls __<fn>_show with
  * bch_register_lock held.
  */
 #define SHOW_LOCKED(fn)							\
 SHOW(fn)								\
 {									\
 	ssize_t ret;							\
 	mutex_lock(&bch_register_lock);					\
 	ret = __ ## fn ## _show(kobj, attr, buf);			\
 	mutex_unlock(&bch_register_lock);				\
 	return ret;							\
 }
 /*
  * Define <fn>_store as a wrapper that calls __<fn>_store with
  * bch_register_lock held.
  */
 #define STORE_LOCKED(fn)						\
 STORE(fn)								\
 {									\
 	ssize_t ret;							\
 	mutex_lock(&bch_register_lock);					\
 	ret = __ ## fn ## _store(kobj, attr, buf, size);		\
 	mutex_unlock(&bch_register_lock);				\
 	return ret;							\
 }
 /*
  * Define 'static struct attribute sysfs_<_name>' whose file name is the
  * stringified _name and whose permissions are _mode.
  */
 #define __sysfs_attribute(_name, _mode)					\
 	static struct attribute sysfs_##_name =				\
 		{ .name = #_name, .mode = _mode }
 /* Convenience wrappers: owner-writable, world-readable, or both. */
 #define write_attribute(n)	__sysfs_attribute(n, S_IWUSR)
 #define read_attribute(n)	__sysfs_attribute(n, S_IRUGO)
 #define rw_attribute(n)		__sysfs_attribute(n, S_IRUGO|S_IWUSR)
 /*
  * Show-side helpers. Each expects 'attr' and 'buf' in scope and RETURNS from
  * the enclosing function when attr is &sysfs_<file>; otherwise it does
  * nothing.
  */
 /* Format with an explicit printf format; a newline is appended to fmt. */
 #define sysfs_printf(file, fmt, ...)					\
 do {									\
 	if (attr == &sysfs_ ## file)					\
 		return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);	\
 } while (0)
 /* Format via snprint(), which is defined outside this header. */
 #define sysfs_print(file, var)						\
 do {									\
 	if (attr == &sysfs_ ## file)					\
 		return snprint(buf, PAGE_SIZE, var);			\
 } while (0)
 /* Format via hprint() and append a newline (hence ret + 1). */
 #define sysfs_hprint(file, val)						\
 do {									\
 	if (attr == &sysfs_ ## file) {					\
 		ssize_t ret = hprint(buf, val);				\
 		strcat(buf, "\n");					\
 		return ret + 1;						\
 	}								\
 } while (0)
 /* Variants that fetch the value through a caller-defined var() macro. */
 #define var_printf(_var, fmt)	sysfs_printf(_var, fmt, var(_var))
 #define var_print(_var)		sysfs_print(_var, var(_var))
 #define var_hprint(_var)	sysfs_hprint(_var, var(_var))
 /*
  * Store-side helpers. They expect 'attr', 'buf' and 'size' in scope and may
  * RETURN from the enclosing function.
  */
 /* If attr matches: parse buf into var, return the parser's error or size. */
 #define sysfs_strtoul(file, var)					\
 do {									\
 	if (attr == &sysfs_ ## file)					\
 		return strtoul_safe(buf, var) ?: (ssize_t) size;	\
 } while (0)
 /* As sysfs_strtoul(), using the clamping parser with [min, max]. */
 #define sysfs_strtoul_clamp(file, var, min, max)			\
 do {									\
 	if (attr == &sysfs_ ## file)					\
 		return strtoul_safe_clamp(buf, var, min, max)		\
 			?: (ssize_t) size;				\
 } while (0)
 /*
  * Expression: evaluates to the base-10 unsigned long parsed from cp, or
  * returns kstrtoul()'s error from the enclosing function.
  */
 #define strtoul_or_return(cp)						\
 ({									\
 	unsigned long _v;						\
 	int _r = kstrtoul(cp, 10, &_v);					\
 	if (_r)								\
 		return _r;						\
 	_v;								\
 })
 /* Parse cp into v with strtoi_h(); return its error from the enclosing function. */
 #define strtoi_h_or_return(cp, v)					\
 do {									\
 	int _r = strtoi_h(cp, &v);					\
 	if (_r)								\
 		return _r;						\
 } while (0)
 /* If attr matches: parse buf into var with strtoi_h(), return its error or size. */
 #define sysfs_hatoi(file, var)						\
 do {									\
 	if (attr == &sysfs_ ## file)					\
 		return strtoi_h(buf, &var) ?: (ssize_t) size;		\
 } while (0)
 #endif  /* _BCACHE_SYSFS_H_ */
--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@ -0,0 +1,26 @@
 #include "bcache.h"
 #include "btree.h"
 #include "request.h"
 #include <linux/module.h>
 /*
  * CREATE_TRACE_POINTS is defined before the trace header is included.
  * NOTE(review): presumably this makes the header emit the tracepoint
  * definitions in this translation unit -- confirm against the tracepoint
  * documentation.
  */
 #define CREATE_TRACE_POINTS
 #include <trace/events/bcache.h>
 /* Export every bcache tracepoint symbol to GPL modules. */
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@ -0,0 +1,389 @@
 /*
 * random utility code, for bcache but in theory not specific to bcache
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/types.h>
 #include "util.h"
 /* Aliases so STRTO_H() can paste 'simple_' onto the int/uint variant names. */
 #define simple_strtoint(c, end, base)	simple_strtol(c, end, base)
 #define simple_strtouint(c, end, base)	simple_strtoul(c, end, base)
 /*
  * STRTO_H(name, type) defines and exports 'int name_h(const char *cp,
  * type *res)': parse a base-10 number with an optional, case-insensitive
  * unit suffix and an optional trailing newline.
  *
  * The switch relies on deliberate fallthrough: each suffix case increments
  * 'u' and falls into the next smaller unit, so 'k' yields u == 1, 'm' 2,
  * 'g' 3, 't' 4, 'p' 5, 'e' 6. 'z' and 'y' share a label and both yield 7.
  * The value is then multiplied by 1024 'u' times, with an overflow check
  * before every multiplication.
  *
  * Returns 0 and stores the result in *res, or -EINVAL when:
  *  - the character after the digits is neither a known suffix, '\n' nor NUL;
  *  - a suffix appears with no digits in front of it (e == cp);
  *  - anything other than one optional newline follows;
  *  - scaling would overflow 'type'.
  */
 #define STRTO_H(name, type)					\
 int name ## _h(const char *cp, type *res)		        \
 {								\
 	int u = 0;						\
 	char *e;						\
 	type i = simple_ ## name(cp, &e, 10);			\
 								\
 	switch (tolower(*e)) {					\
 	default:						\
 		return -EINVAL;					\
 	case 'y':						\
 	case 'z':						\
 		u++;						\
 	case 'e':						\
 		u++;						\
 	case 'p':						\
 		u++;						\
 	case 't':						\
 		u++;						\
 	case 'g':						\
 		u++;						\
 	case 'm':						\
 		u++;						\
 	case 'k':						\
 		u++;						\
 		if (e++ == cp)					\
 			return -EINVAL;				\
 	case '\n':						\
 	case '\0':						\
 		if (*e == '\n')					\
 			e++;					\
 	}							\
 								\
 	if (*e)							\
 		return -EINVAL;					\
 								\
 	while (u--) {						\
 		if ((type) ~0 > 0 &&				\
 		    (type) ~0 / 1024 <= i)			\
 			return -EINVAL;				\
 		if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) ||	\
 		    (i < 0 && -ANYSINT_MAX(type) / 1024 > i))	\
 			return -EINVAL;				\
 		i *= 1024;					\
 	}							\
 								\
 	*res = i;						\
 	return 0;						\
 }								\
 EXPORT_SYMBOL_GPL(name ## _h);
 /* Instantiations: strtoint_h, strtouint_h, strtoll_h, strtoull_h. */
 STRTO_H(strtoint, int)
 STRTO_H(strtouint, unsigned int)
 STRTO_H(strtoll, long long)
 STRTO_H(strtoull, unsigned long long)
 /*
  * Format @v into @buf in human-readable form using powers of 1024.
  *
  * Values with magnitude below 1024 are printed as a plain signed integer.
  * Larger values are scaled down and suffixed with k, M, G, T, P or E; when
  * the scaled magnitude is below 100, one decimal digit (tenths, truncated)
  * is added, e.g. 1536 -> "1.5k", 102400 -> "100k".
  *
  * Scaling is done on the magnitude so that negative values get the same
  * digits as their positive counterpart with a leading '-'.
  *
  * @buf must have room for the result (at most 20 characters plus NUL).
  * Returns the number of characters written, excluding the NUL.
  */
 ssize_t hprint(char *buf, int64_t v)
 {
 	static const char units[] = "?kMGTPEZY";
 	/* ".d" plus NUL; the fraction below is always a single digit. */
 	char dec[3] = "";
 	uint64_t mag = v < 0 ? -(uint64_t) v : (uint64_t) v;
 	unsigned frac = 0;
 	int u;
 	for (u = 0; mag >= 1024; u++) {
 		/* Remember the bits shifted out last for the decimal digit. */
 		frac = mag & 1023;
 		mag >>= 10;
 	}
 	if (!u)
 		return sprintf(buf, "%lli", (long long) v);
 	if (mag < 100)
 		/* frac is 0..1023, so frac * 10 / 1024 is 0..9. */
 		sprintf(dec, ".%u", frac * 10 / 1024);
 	return sprintf(buf, "%s%llu%s%c", v < 0 ? "-" : "",
 		       (unsigned long long) mag, dec, units[u]);
 }
 EXPORT_SYMBOL_GPL(hprint);
 /*
  * Print the NULL-terminated string @list into @buf, separated by spaces and
  * terminated by a newline, with entry number @selected wrapped in brackets:
  * "lru [fifo] random\n".
  *
  * At most @size bytes (including the NUL) are written; output that does not
  * fit is truncated. An empty list produces an empty string. Returns the
  * number of characters stored in @buf, excluding the NUL.
  */
 ssize_t snprint_string_list(char *buf, size_t size, const char * const list[],
 			    size_t selected)
 {
 	size_t len = 0;
 	size_t i;
 	if (!size)
 		return 0;
 	buf[0] = '\0';
 	for (i = 0; list[i] && len + 1 < size; i++) {
 		int n = snprintf(buf + len, size - len,
 				 i == selected ? "[%s] " : "%s ", list[i]);
 		if (n < 0)
 			break;
 		/* snprintf() reports the untruncated length; clamp it. */
 		if ((size_t) n >= size - len)
 			n = size - len - 1;
 		len += n;
 	}
 	/* Turn the trailing separator into the newline, if anything was written. */
 	if (len)
 		buf[len - 1] = '\n';
 	return len;
 }
 EXPORT_SYMBOL_GPL(snprint_string_list);
 /*
  * Look up the whitespace-trimmed contents of @buf in the NULL-terminated
  * string @list. Returns the index of the exact match, -EINVAL if there is
  * none, or -ENOMEM if the temporary copy cannot be allocated.
  */
 ssize_t read_string_list(const char *buf, const char * const list[])
 {
 	ssize_t idx = -EINVAL;
 	size_t i;
 	char *copy, *word;
 	copy = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
 	if (!copy)
 		return -ENOMEM;
 	word = strim(copy);
 	for (i = 0; list[i]; i++) {
 		if (strcmp(list[i], word) == 0) {
 			idx = i;
 			break;
 		}
 	}
 	kfree(copy);
 	return idx;
 }
 EXPORT_SYMBOL_GPL(read_string_list);
 /* Return true when all @n bytes starting at @p are zero (also for n == 0). */
 bool is_zero(const char *p, size_t n)
 {
 	const char *end = p + n;
 	while (p < end)
 		if (*p++)
 			return false;
 	return true;
 }
 EXPORT_SYMBOL_GPL(is_zero);
 /*
  * Parse a textual UUID from @s into the 16-byte buffer @uuid.
  *
  * Only the leading run of hex digits, '-' and ':' is considered; separators
  * are skipped and parsing stops after 32 hex digits. @uuid is zeroed first,
  * so a short input leaves the remaining bytes zero.
  *
  * Returns the number of input characters consumed (not the number of bytes
  * produced).
  */
 int parse_uuid(const char *s, char *uuid)
 {
 	size_t i, j, x;
 	/* Length of the accepted prefix, computed once rather than per iteration. */
 	const size_t span = strspn(s, "-0123456789:ABCDEFabcdef");
 	memset(uuid, 0, 16);
 	for (i = 0, j = 0; i < span && j < 32; i++) {
 		/* Setting bit 5 lower-cases letters and leaves digits unchanged. */
 		x = s[i] | 32;
 		if (x >= '0' && x <= '9')
 			x -= '0';
 		else if (x >= 'a' && x <= 'f')
 			x -= 'a' - 10;
 		else
 			continue;
 		/* Even digits form the high nibble, odd digits the low nibble. */
 		if (!(j & 1))
 			x <<= 4;
 		uuid[j++ >> 1] |= x;
 	}
 	return i;
 }
 EXPORT_SYMBOL_GPL(parse_uuid);
 /*
  * Fold one completed operation that began at @start_time into @stats:
  * update the maximum duration and the moving averages of duration and of the
  * interval since the previous update, then remember the current time.
  * Averages are seeded with the first sample shifted left by 8.
  */
 void time_stats_update(struct time_stats *stats, uint64_t start_time)
 {
 	uint64_t now = local_clock();
 	uint64_t duration = 0;
 	uint64_t since_last = 0;
 	/* Clamp to 0 if the clock appears to have gone backwards. */
 	if (time_after64(now, start_time))
 		duration = now - start_time;
 	if (time_after64(now, stats->last))
 		since_last = now - stats->last;
 	stats->max_duration = max(stats->max_duration, duration);
 	if (!stats->last) {
 		/* First sample ever: seed the duration average. */
 		stats->average_duration = duration << 8;
 	} else {
 		ewma_add(stats->average_duration, duration, 8, 8);
 		if (!stats->average_frequency)
 			stats->average_frequency = since_last << 8;
 		else
 			ewma_add(stats->average_frequency, since_last, 8, 8);
 	}
 	/* 0 means "never updated", so never store it as a timestamp. */
 	stats->last = now ? now : 1;
 }
 EXPORT_SYMBOL_GPL(time_stats_update);
 /*
  * Advance the rate limiter @d by @done units of work and return how long the
  * caller should wait: the distance from now to d->next divided by
  * NSEC_PER_SEC / HZ, or 0 if d->next is not in the future.
  */
 unsigned next_delay(struct ratelimit *d, uint64_t done)
 {
 	const uint64_t now = local_clock();
 	d->next += div_u64(done, d->rate);
 	if (!time_after64(d->next, now))
 		return 0;
 	return div_u64(d->next - now, NSEC_PER_SEC / HZ);
 }
 EXPORT_SYMBOL_GPL(next_delay);
 /*
  * Fill in the bio_vec array of @bio so that it covers bio->bi_size bytes.
  *
  * The bio must have a non-zero bi_size and no vecs in use yet (both enforced
  * with BUG_ON). If @base is non-NULL each vec's page is looked up from that
  * address (vmalloc or direct-mapped memory); if it is NULL only offsets and
  * lengths are set and bv_page is left untouched.
  */
 void bio_map(struct bio *bio, void *base)
 {
 	size_t size = bio->bi_size;
 	struct bio_vec *bv = bio->bi_io_vec;
 	BUG_ON(!bio->bi_size);
 	BUG_ON(bio->bi_vcnt);
 	/* Only the first vec may start in the middle of a page. */
 	bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
 	/*
 	 * Jump into the loop body so the first iteration keeps the offset
 	 * computed above and does not evaluate the 'size' condition first.
 	 */
 	goto start;
 	for (; size; bio->bi_vcnt++, bv++) {
 		bv->bv_offset	= 0;
 start:		bv->bv_len	= min_t(size_t, PAGE_SIZE - bv->bv_offset,
 					size);
 		if (base) {
 			bv->bv_page = is_vmalloc_addr(base)
 				? vmalloc_to_page(base)
 				: virt_to_page(base);
 			base += bv->bv_len;
 		}
 		size -= bv->bv_len;
 	}
 }
 EXPORT_SYMBOL_GPL(bio_map);
 /*
  * Allocate one page with @gfp for every segment of @bio from bi_idx onwards.
  * On failure, every page allocated by this call is freed again and -ENOMEM
  * is returned; on success returns 0.
  */
 int bio_alloc_pages(struct bio *bio, gfp_t gfp)
 {
 	int i;
 	struct bio_vec *bv;
 	bio_for_each_segment(bv, bio, i) {
 		struct bio_vec *first;
 		bv->bv_page = alloc_page(gfp);
 		if (bv->bv_page)
 			continue;
 		/* Unwind: free the pages of all segments before the failed one. */
 		first = bio->bi_io_vec + bio->bi_idx;
 		while (bv > first) {
 			bv--;
 			__free_page(bv->bv_page);
 		}
 		return -ENOMEM;
 	}
 	return 0;
 }
 EXPORT_SYMBOL_GPL(bio_alloc_pages);
 /*
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
 * use permitted, subject to terms of PostgreSQL license; see.)
 * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
 * usual sort of implementation. (See Ross Williams' excellent introduction
 * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
 * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
 * If we have no working 64-bit type, then fake it with two 32-bit registers.
 *
 * The present implementation is a normal (not "reflected", in Williams'
 * terms) 64-bit CRC, using initial all-ones register contents and a final
 * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
 * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
 *
 * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
 * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
 * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
 * x^7 + x^4 + x + 1
 */
 /*
  * Byte-indexed lookup table used by crc64_update(): the entry selected by
  * (top byte of the running CRC) ^ (next input byte) is XORed into the CRC
  * after it has been shifted left by 8 bits.
  */
 static const uint64_t crc_table[256] = {
 	0x0000000000000000, 0x42F0E1EBA9EA3693, 0x85E1C3D753D46D26,
 	0xC711223CFA3E5BB5, 0x493366450E42ECDF, 0x0BC387AEA7A8DA4C,
 	0xCCD2A5925D9681F9, 0x8E224479F47CB76A, 0x9266CC8A1C85D9BE,
 	0xD0962D61B56FEF2D, 0x17870F5D4F51B498, 0x5577EEB6E6BB820B,
 	0xDB55AACF12C73561, 0x99A54B24BB2D03F2, 0x5EB4691841135847,
 	0x1C4488F3E8F96ED4, 0x663D78FF90E185EF, 0x24CD9914390BB37C,
 	0xE3DCBB28C335E8C9, 0xA12C5AC36ADFDE5A, 0x2F0E1EBA9EA36930,
 	0x6DFEFF5137495FA3, 0xAAEFDD6DCD770416, 0xE81F3C86649D3285,
 	0xF45BB4758C645C51, 0xB6AB559E258E6AC2, 0x71BA77A2DFB03177,
 	0x334A9649765A07E4, 0xBD68D2308226B08E, 0xFF9833DB2BCC861D,
 	0x388911E7D1F2DDA8, 0x7A79F00C7818EB3B, 0xCC7AF1FF21C30BDE,
 	0x8E8A101488293D4D, 0x499B3228721766F8, 0x0B6BD3C3DBFD506B,
 	0x854997BA2F81E701, 0xC7B97651866BD192, 0x00A8546D7C558A27,
 	0x4258B586D5BFBCB4, 0x5E1C3D753D46D260, 0x1CECDC9E94ACE4F3,
 	0xDBFDFEA26E92BF46, 0x990D1F49C77889D5, 0x172F5B3033043EBF,
 	0x55DFBADB9AEE082C, 0x92CE98E760D05399, 0xD03E790CC93A650A,
 	0xAA478900B1228E31, 0xE8B768EB18C8B8A2, 0x2FA64AD7E2F6E317,
 	0x6D56AB3C4B1CD584, 0xE374EF45BF6062EE, 0xA1840EAE168A547D,
 	0x66952C92ECB40FC8, 0x2465CD79455E395B, 0x3821458AADA7578F,
 	0x7AD1A461044D611C, 0xBDC0865DFE733AA9, 0xFF3067B657990C3A,
 	0x711223CFA3E5BB50, 0x33E2C2240A0F8DC3, 0xF4F3E018F031D676,
 	0xB60301F359DBE0E5, 0xDA050215EA6C212F, 0x98F5E3FE438617BC,
 	0x5FE4C1C2B9B84C09, 0x1D14202910527A9A, 0x93366450E42ECDF0,
 	0xD1C685BB4DC4FB63, 0x16D7A787B7FAA0D6, 0x5427466C1E109645,
 	0x4863CE9FF6E9F891, 0x0A932F745F03CE02, 0xCD820D48A53D95B7,
 	0x8F72ECA30CD7A324, 0x0150A8DAF8AB144E, 0x43A04931514122DD,
 	0x84B16B0DAB7F7968, 0xC6418AE602954FFB, 0xBC387AEA7A8DA4C0,
 	0xFEC89B01D3679253, 0x39D9B93D2959C9E6, 0x7B2958D680B3FF75,
 	0xF50B1CAF74CF481F, 0xB7FBFD44DD257E8C, 0x70EADF78271B2539,
 	0x321A3E938EF113AA, 0x2E5EB66066087D7E, 0x6CAE578BCFE24BED,
 	0xABBF75B735DC1058, 0xE94F945C9C3626CB, 0x676DD025684A91A1,
 	0x259D31CEC1A0A732, 0xE28C13F23B9EFC87, 0xA07CF2199274CA14,
 	0x167FF3EACBAF2AF1, 0x548F120162451C62, 0x939E303D987B47D7,
 	0xD16ED1D631917144, 0x5F4C95AFC5EDC62E, 0x1DBC74446C07F0BD,
 	0xDAAD56789639AB08, 0x985DB7933FD39D9B, 0x84193F60D72AF34F,
 	0xC6E9DE8B7EC0C5DC, 0x01F8FCB784FE9E69, 0x43081D5C2D14A8FA,
 	0xCD2A5925D9681F90, 0x8FDAB8CE70822903, 0x48CB9AF28ABC72B6,
 	0x0A3B7B1923564425, 0x70428B155B4EAF1E, 0x32B26AFEF2A4998D,
 	0xF5A348C2089AC238, 0xB753A929A170F4AB, 0x3971ED50550C43C1,
 	0x7B810CBBFCE67552, 0xBC902E8706D82EE7, 0xFE60CF6CAF321874,
 	0xE224479F47CB76A0, 0xA0D4A674EE214033, 0x67C58448141F1B86,
 	0x253565A3BDF52D15, 0xAB1721DA49899A7F, 0xE9E7C031E063ACEC,
 	0x2EF6E20D1A5DF759, 0x6C0603E6B3B7C1CA, 0xF6FAE5C07D3274CD,
 	0xB40A042BD4D8425E, 0x731B26172EE619EB, 0x31EBC7FC870C2F78,
 	0xBFC9838573709812, 0xFD39626EDA9AAE81, 0x3A28405220A4F534,
 	0x78D8A1B9894EC3A7, 0x649C294A61B7AD73, 0x266CC8A1C85D9BE0,
 	0xE17DEA9D3263C055, 0xA38D0B769B89F6C6, 0x2DAF4F0F6FF541AC,
 	0x6F5FAEE4C61F773F, 0xA84E8CD83C212C8A, 0xEABE6D3395CB1A19,
 	0x90C79D3FEDD3F122, 0xD2377CD44439C7B1, 0x15265EE8BE079C04,
 	0x57D6BF0317EDAA97, 0xD9F4FB7AE3911DFD, 0x9B041A914A7B2B6E,
 	0x5C1538ADB04570DB, 0x1EE5D94619AF4648, 0x02A151B5F156289C,
 	0x4051B05E58BC1E0F, 0x87409262A28245BA, 0xC5B073890B687329,
 	0x4B9237F0FF14C443, 0x0962D61B56FEF2D0, 0xCE73F427ACC0A965,
 	0x8C8315CC052A9FF6, 0x3A80143F5CF17F13, 0x7870F5D4F51B4980,
 	0xBF61D7E80F251235, 0xFD913603A6CF24A6, 0x73B3727A52B393CC,
 	0x31439391FB59A55F, 0xF652B1AD0167FEEA, 0xB4A25046A88DC879,
 	0xA8E6D8B54074A6AD, 0xEA16395EE99E903E, 0x2D071B6213A0CB8B,
 	0x6FF7FA89BA4AFD18, 0xE1D5BEF04E364A72, 0xA3255F1BE7DC7CE1,
 	0x64347D271DE22754, 0x26C49CCCB40811C7, 0x5CBD6CC0CC10FAFC,
 	0x1E4D8D2B65FACC6F, 0xD95CAF179FC497DA, 0x9BAC4EFC362EA149,
 	0x158E0A85C2521623, 0x577EEB6E6BB820B0, 0x906FC95291867B05,
 	0xD29F28B9386C4D96, 0xCEDBA04AD0952342, 0x8C2B41A1797F15D1,
 	0x4B3A639D83414E64, 0x09CA82762AAB78F7, 0x87E8C60FDED7CF9D,
 	0xC51827E4773DF90E, 0x020905D88D03A2BB, 0x40F9E43324E99428,
 	0x2CFFE7D5975E55E2, 0x6E0F063E3EB46371, 0xA91E2402C48A38C4,
 	0xEBEEC5E96D600E57, 0x65CC8190991CB93D, 0x273C607B30F68FAE,
 	0xE02D4247CAC8D41B, 0xA2DDA3AC6322E288, 0xBE992B5F8BDB8C5C,
 	0xFC69CAB42231BACF, 0x3B78E888D80FE17A, 0x7988096371E5D7E9,
 	0xF7AA4D1A85996083, 0xB55AACF12C735610, 0x724B8ECDD64D0DA5,
 	0x30BB6F267FA73B36, 0x4AC29F2A07BFD00D, 0x08327EC1AE55E69E,
 	0xCF235CFD546BBD2B, 0x8DD3BD16FD818BB8, 0x03F1F96F09FD3CD2,
 	0x41011884A0170A41, 0x86103AB85A2951F4, 0xC4E0DB53F3C36767,
 	0xD8A453A01B3A09B3, 0x9A54B24BB2D03F20, 0x5D45907748EE6495,
 	0x1FB5719CE1045206, 0x919735E51578E56C, 0xD367D40EBC92D3FF,
 	0x1476F63246AC884A, 0x568617D9EF46BED9, 0xE085162AB69D5E3C,
 	0xA275F7C11F7768AF, 0x6564D5FDE549331A, 0x279434164CA30589,
 	0xA9B6706FB8DFB2E3, 0xEB46918411358470, 0x2C57B3B8EB0BDFC5,
 	0x6EA7525342E1E956, 0x72E3DAA0AA188782, 0x30133B4B03F2B111,
 	0xF7021977F9CCEAA4, 0xB5F2F89C5026DC37, 0x3BD0BCE5A45A6B5D,
 	0x79205D0E0DB05DCE, 0xBE317F32F78E067B, 0xFCC19ED95E6430E8,
 	0x86B86ED5267CDBD3, 0xC4488F3E8F96ED40, 0x0359AD0275A8B6F5,
 	0x41A94CE9DC428066, 0xCF8B0890283E370C, 0x8D7BE97B81D4019F,
 	0x4A6ACB477BEA5A2A, 0x089A2AACD2006CB9, 0x14DEA25F3AF9026D,
 	0x562E43B4931334FE, 0x913F6188692D6F4B, 0xD3CF8063C0C759D8,
 	0x5DEDC41A34BBEEB2, 0x1F1D25F19D51D821, 0xD80C07CD676F8394,
 	0x9AFCE626CE85B507
 };
 /*
  * crc64_update() - fold @len bytes at @_data into a running CRC
  * @crc:	CRC state from a previous call (or the initial value)
  * @_data:	bytes to process
  * @len:	number of bytes
  *
  * Returns the updated CRC state. No initial or final inversion is applied
  * here; crc64() does that.
  */
 uint64_t crc64_update(uint64_t crc, const void *_data, size_t len)
 {
 	const unsigned char *p = _data;
 	const unsigned char *end = p + len;
 	for (; p != end; p++) {
 		/* Table index: top byte of the CRC mixed with the input byte */
 		unsigned idx = ((int) (crc >> 56) ^ *p) & 0xFF;
 		crc = crc_table[idx] ^ (crc << 8);
 	}
 	return crc;
 }
 EXPORT_SYMBOL(crc64_update);
 /*
  * crc64() - one-shot CRC of a buffer
  *
  * Starts from an all-ones register, runs crc64_update() over @len bytes at
  * @data and returns the bitwise inverse of the result.
  */
 uint64_t crc64(const void *data, size_t len)
 {
 	return ~crc64_update(~(uint64_t) 0, data, len);
 }
 EXPORT_SYMBOL(crc64);
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@ -0,0 +1,589 @@
 #ifndef _BCACHE_UTIL_H
 #define _BCACHE_UTIL_H
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/llist.h>
 #include <linux/ratelimit.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include "closure.h"
 /* Number of 512-byte sectors that fit in one page */
 #define PAGE_SECTORS		(PAGE_SIZE / 512)
 struct closure;
 #include <trace/events/bcache.h>
 /*
  * Counter helpers with optional sanity checking. With CONFIG_BCACHE_EDEBUG,
  * atomic_dec_bug() BUGs if the decremented value is negative and
  * atomic_inc_bug() BUGs if the incremented value is not greater than @i.
  * Without it they are plain atomic_dec()/atomic_inc() and @i is ignored.
  */
 #ifdef CONFIG_BCACHE_EDEBUG
 #define atomic_dec_bug(v)	BUG_ON(atomic_dec_return(v) < 0)
 #define atomic_inc_bug(v, i)	BUG_ON(atomic_inc_return(v) <= i)
 #else /* EDEBUG */
 #define atomic_dec_bug(v)	atomic_dec(v)
 #define atomic_inc_bug(v, i)	atomic_inc(v)
 #endif
 /*
  * BITMASK() - generate accessors for a bitfield packed into an integer member
  * @name:	name of the getter; the setter is called SET_<name>
  * @type:	struct type that contains @field
  * @field:	integer member holding the packed bits
  * @offset:	bit position of the least significant bit of the bitfield
  * @size:	width of the bitfield in bits
  *
  * name(k) returns the bitfield's value. SET_name(k, v) stores @v; bits of @v
  * above @size are discarded so that an out-of-range value cannot spill into
  * the neighbouring bitfields of the same word.
  */
 #define BITMASK(name, type, field, offset, size)		\
 static inline uint64_t name(const type *k)			\
 { return (k->field >> offset) & ~(((uint64_t) ~0) << size); }	\
 								\
 static inline void SET_##name(type *k, uint64_t v)		\
 {								\
 	k->field &= ~(~((uint64_t) ~0 << size) << offset);	\
 	k->field |= (v & ~((uint64_t) ~0 << size)) << offset;	\
 }
 /*
  * DECLARE_HEAP() - declare an array-backed heap called @name
  *
  * The anonymous struct records the capacity (size), the number of elements
  * currently stored (used) and the backing array of @type (data).
  */
 #define DECLARE_HEAP(type, name)					\
 	struct {							\
 		size_t size, used;					\
 		type *data;						\
 	} name
 /*
  * init_heap() - set up an empty heap with room for @_size elements
  *
  * Uses kmalloc() when the array is smaller than KMALLOC_MAX_SIZE and falls
  * back to vmalloc() when no memory was obtained and @gfp has GFP_KERNEL bits
  * set. Evaluates to the data pointer, i.e. NULL on failure.
  * NOTE(review): the size multiplication is not checked for overflow.
  */
 #define init_heap(heap, _size, gfp)					\
 ({									\
 	size_t _bytes;							\
 	(heap)->used = 0;						\
 	(heap)->size = (_size);						\
 	_bytes = (heap)->size * sizeof(*(heap)->data);			\
 	(heap)->data = NULL;						\
 	if (_bytes < KMALLOC_MAX_SIZE)					\
 		(heap)->data = kmalloc(_bytes, (gfp));			\
 	if ((!(heap)->data) && ((gfp) & GFP_KERNEL))			\
 		(heap)->data = vmalloc(_bytes);				\
 	(heap)->data;							\
 })
 /*
  * free_heap() - release the backing array with vfree() or kfree(), whichever
  * matches how it was allocated, and reset the data pointer to NULL.
  */
 #define free_heap(heap)							\
 do {									\
 	if (is_vmalloc_addr((heap)->data))				\
 		vfree((heap)->data);					\
 	else								\
 		kfree((heap)->data);					\
 	(heap)->data = NULL;						\
 } while (0)
 /* Exchange the elements at indices @i and @j */
 #define heap_swap(h, i, j)	swap((h)->data[i], (h)->data[j])
 /*
  * heap_sift() - move the element at index @i towards the leaves
  *
  * At each level the child _r is chosen (the right child when
  * cmp(left, right) is true); the walk stops once cmp(child, current) is true,
  * otherwise the two are swapped and the walk continues from the child.
  */
 #define heap_sift(h, i, cmp)						\
 do {									\
 	size_t _r, _j = i;						\
 									\
 	for (; _j * 2 + 1 < (h)->used; _j = _r) {			\
 		_r = _j * 2 + 1;					\
 		if (_r + 1 < (h)->used &&				\
 		    cmp((h)->data[_r], (h)->data[_r + 1]))		\
 			_r++;						\
 									\
 		if (cmp((h)->data[_r], (h)->data[_j]))			\
 			break;						\
 		heap_swap(h, _r, _j);					\
 	}								\
 } while (0)
 /*
  * heap_sift_down() - move the element at index @i towards the root
  *
  * Swaps the element with its parent until cmp(element, parent) is true or
  * index 0 is reached. @i must be an lvalue: it is updated to the element's
  * final index.
  */
 #define heap_sift_down(h, i, cmp)					\
 do {									\
 	while (i) {							\
 		size_t p = (i - 1) / 2;					\
 		if (cmp((h)->data[i], (h)->data[p]))			\
 			break;						\
 		heap_swap(h, i, p);					\
 		i = p;							\
 	}								\
 } while (0)
 /*
  * heap_add() - insert @d into the heap
  *
  * Evaluates to false, without modifying the heap, when it is full; otherwise
  * appends @d, restores heap order around the new slot and evaluates to true.
  */
 #define heap_add(h, d, cmp)						\
 ({									\
 	bool _r = !heap_full(h);					\
 	if (_r) {							\
 		size_t _i = (h)->used++;				\
 		(h)->data[_i] = d;					\
 									\
 		heap_sift_down(h, _i, cmp);				\
 		heap_sift(h, _i, cmp);					\
 	}								\
 	_r;								\
 })
 /*
  * heap_pop() - remove the top element into @d
  *
  * Evaluates to false when the heap is empty (and @d is left untouched);
  * otherwise copies data[0] to @d, moves the last element to the top, restores
  * heap order and evaluates to true.
  */
 #define heap_pop(h, d, cmp)						\
 ({									\
 	bool _r = (h)->used;						\
 	if (_r) {							\
 		(d) = (h)->data[0];					\
 		(h)->used--;						\
 		heap_swap(h, 0, (h)->used);				\
 		heap_sift(h, 0, cmp);					\
 	}								\
 	_r;								\
 })
 /*
  * heap_peek() - look at the top element without removing it
  *
  * Evaluates to data[0] when the heap holds at least one element and to NULL
  * when it is empty. Emptiness is judged by ->used (elements present), not by
  * ->size (capacity): testing the capacity would return a stale slot from an
  * empty heap.
  */
 #define heap_peek(h)	((h)->used ? (h)->data[0] : NULL)
 /* True when the number of stored elements has reached the capacity */
 #define heap_full(h)	((h)->used == (h)->size)
 /*
  * DECLARE_FIFO() - declare a ring-buffer FIFO called @name
  *
  * front and back are indices into data; size is the usable capacity and mask
  * is the allocated element count minus one (see __init_fifo()).
  */
 #define DECLARE_FIFO(type, name)					\
 	struct {							\
 		size_t front, back, size, mask;				\
 		type *data;						\
 	} name
 /*
  * fifo_for_each() - visit every queued element from front to back
  *
  * @c receives the element and @iter the ring index. @c is assigned before the
  * end test, so the slot at ->back is read once when the loop terminates.
  */
 #define fifo_for_each(c, fifo, iter)					\
 	for (iter = (fifo)->front;					\
 	     c = (fifo)->data[iter], iter != (fifo)->back;		\
 	     iter = (iter + 1) & (fifo)->mask)
 /*
  * __init_fifo() - allocate the ring for a fifo whose ->size is already set
  *
  * BUGs on a zero size. The ring gets roundup_pow_of_two(size + 1) slots so
  * that index arithmetic can use ->mask. Allocation uses kmalloc() below
  * KMALLOC_MAX_SIZE, with vmalloc() as fallback when @gfp has GFP_KERNEL bits
  * set. Evaluates to the data pointer, i.e. NULL on failure.
  */
 #define __init_fifo(fifo, gfp)						\
 ({									\
 	size_t _allocated_size, _bytes;					\
 	BUG_ON(!(fifo)->size);						\
 									\
 	_allocated_size = roundup_pow_of_two((fifo)->size + 1);		\
 	_bytes = _allocated_size * sizeof(*(fifo)->data);		\
 									\
 	(fifo)->mask = _allocated_size - 1;				\
 	(fifo)->front = (fifo)->back = 0;				\
 	(fifo)->data = NULL;						\
 									\
 	if (_bytes < KMALLOC_MAX_SIZE)					\
 		(fifo)->data = kmalloc(_bytes, (gfp));			\
 	if ((!(fifo)->data) && ((gfp) & GFP_KERNEL))			\
 		(fifo)->data = vmalloc(_bytes);				\
 	(fifo)->data;							\
 })
 /* Initialise a fifo whose capacity is exactly @_size elements */
 #define init_fifo_exact(fifo, _size, gfp)				\
 ({									\
 	(fifo)->size = (_size);						\
 	__init_fifo(fifo, gfp);						\
 })
 /*
  * Initialise a fifo with room for at least @_size elements: sizes above 4 are
  * rounded up to one less than a power of two.
  */
 #define init_fifo(fifo, _size, gfp)					\
 ({									\
 	(fifo)->size = (_size);						\
 	if ((fifo)->size > 4)						\
 		(fifo)->size = roundup_pow_of_two((fifo)->size) - 1;	\
 	__init_fifo(fifo, gfp);						\
 })
 /*
  * free_fifo() - release the ring with vfree() or kfree(), whichever matches
  * how it was allocated, and reset the data pointer to NULL.
  */
 #define free_fifo(fifo)							\
 do {									\
 	if (is_vmalloc_addr((fifo)->data))				\
 		vfree((fifo)->data);					\
 	else								\
 		kfree((fifo)->data);					\
 	(fifo)->data = NULL;						\
 } while (0)
 /* Number of queued elements, and the remaining room, respectively */
 #define fifo_used(fifo)		(((fifo)->back - (fifo)->front) & (fifo)->mask)
 #define fifo_free(fifo)		((fifo)->size - fifo_used(fifo))
 #define fifo_empty(fifo)	(!fifo_used(fifo))
 #define fifo_full(fifo)		(!fifo_free(fifo))
 /* Lvalues for the oldest and the newest queued element; no emptiness check */
 #define fifo_front(fifo)	((fifo)->data[(fifo)->front])
 #define fifo_back(fifo)							\
 	((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
 /* Position of the element pointed to by @p, counted from the front */
 #define fifo_idx(fifo, p)	(((p) - &fifo_front(fifo)) & (fifo)->mask)
 /*
  * fifo_push_back() - append @i at the back
  *
  * Evaluates to false, without storing anything, when the fifo is full.
  */
 #define fifo_push_back(fifo, i)						\
 ({									\
 	bool _r = !fifo_full((fifo));					\
 	if (_r) {							\
 		(fifo)->data[(fifo)->back++] = (i);			\
 		(fifo)->back &= (fifo)->mask;				\
 	}								\
 	_r;								\
 })
 /*
  * fifo_pop_front() - remove the oldest element into @i
  *
  * Evaluates to false, leaving @i untouched, when the fifo is empty.
  */
 #define fifo_pop_front(fifo, i)						\
 ({									\
 	bool _r = !fifo_empty((fifo));					\
 	if (_r) {							\
 		(i) = (fifo)->data[(fifo)->front++];			\
 		(fifo)->front &= (fifo)->mask;				\
 	}								\
 	_r;								\
 })
 /*
  * fifo_push_front() - insert @i ahead of the current front element
  *
  * Evaluates to false, without storing anything, when the fifo is full.
  */
 #define fifo_push_front(fifo, i)					\
 ({									\
 	bool _r = !fifo_full((fifo));					\
 	if (_r) {							\
 		--(fifo)->front;					\
 		(fifo)->front &= (fifo)->mask;				\
 		(fifo)->data[(fifo)->front] = (i);			\
 	}								\
 	_r;								\
 })
 /*
  * fifo_pop_back() - remove the most recently appended element into @i
  *
  * Steps ->back one slot backwards (wrapping with ->mask) and copies that slot
  * into @i. Evaluates to false, leaving @i untouched, when the fifo is empty.
  */
 #define fifo_pop_back(fifo, i)						\
 ({									\
 	bool _r = !fifo_empty((fifo));					\
 	if (_r) {							\
 		--(fifo)->back;						\
 		(fifo)->back &= (fifo)->mask;				\
 		(i) = (fifo)->data[(fifo)->back];			\
 	}								\
 	_r;								\
 })
 /* Default queue operations: push at the back, pop from the front */
 #define fifo_push(fifo, i)	fifo_push_back(fifo, (i))
 #define fifo_pop(fifo, i)	fifo_pop_front(fifo, (i))
 /* Exchange the complete state (indices, capacity and ring) of two fifos */
 #define fifo_swap(l, r)							\
 do {									\
 	swap((l)->front, (r)->front);					\
 	swap((l)->back, (r)->back);					\
 	swap((l)->size, (r)->size);					\
 	swap((l)->mask, (r)->mask);					\
 	swap((l)->data, (r)->data);					\
 } while (0)
 /*
  * fifo_move() - transfer elements from @src to @dest, oldest first, until
  * @dest is full or @src is empty.
  */
 #define fifo_move(dest, src)						\
 do {									\
 	typeof(*((dest)->data)) _t;					\
 	while (!fifo_full(dest) &&					\
 	       fifo_pop(src, _t))					\
 		fifo_push(dest, _t);					\
 } while (0)
 /*
 * Simple array based allocator - preallocates a number of elements and you can
 * never allocate more than that, also has no locking.
 *
 * Handy because if you know you only need a fixed number of elements you don't
 * have to worry about memory allocation failure, and sometimes a mempool isn't
 * what you want.
 *
 * We treat the free elements as entries in a singly linked list, and the
 * freelist as a stack - allocating and freeing push and pop off the freelist.
 */
 /* Declare an allocator called @name holding @size preallocated @type slots */
 #define DECLARE_ARRAY_ALLOCATOR(type, name, size)			\
 	struct {							\
 		type	*freelist;					\
 		type	data[size];					\
 	} name
 /*
  * array_alloc() - pop one element off the freelist
  *
  * Evaluates to the element, or NULL when none is free. The first
  * pointer-sized bytes of a free element hold the link to the next free one.
  */
 #define array_alloc(array)						\
 ({									\
 	typeof((array)->freelist) _ret = (array)->freelist;		\
 									\
 	if (_ret)							\
 		(array)->freelist = *((typeof((array)->freelist) *) _ret);\
 									\
 	_ret;								\
 })
 /*
  * array_free() - push @ptr back onto the freelist; this overwrites the start
  * of the element with the freelist link.
  */
 #define array_free(array, ptr)						\
 do {									\
 	typeof((array)->freelist) _ptr = ptr;				\
 									\
 	*((typeof((array)->freelist) *) _ptr) = (array)->freelist;	\
 	(array)->freelist = _ptr;					\
 } while (0)
 /*
  * array_allocator_init() - put every slot of the array on the freelist
  *
  * Fails the build if an element is too small to hold the freelist pointer.
  */
 #define array_allocator_init(array)					\
 do {									\
 	typeof((array)->freelist) _i;					\
 									\
 	BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *));	\
 	(array)->freelist = NULL;					\
 									\
 	for (_i = (array)->data;					\
 	     _i < (array)->data + ARRAY_SIZE((array)->data);		\
 	     _i++)							\
 		array_free(array, _i);					\
 } while (0)
 /* True when no free element is left */
 #define array_freelist_empty(array)	((array)->freelist == NULL)
 /*
  * Largest value of integer type @t, computed as (2^(bits - 2) - 1) * 2 + 1 so
  * that no intermediate step exceeds the result (presumably intended for
  * signed types, as the name suggests).
  */
 #define ANYSINT_MAX(t)							\
 	((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
 /*
  * String-to-integer parsers, one per result type; defined elsewhere.
  * NOTE(review): the _h suffix presumably stands for "human readable" input
  * with unit suffixes - confirm against the implementation.
  */
 int strtoint_h(const char *, int *);
 int strtouint_h(const char *, unsigned int *);
 int strtoll_h(const char *, long long *);
 int strtoull_h(const char *, unsigned long long *);
 /*
  * strtol_h() - parse into a long by forwarding to the parser whose result
  * type has the same width as long on this architecture.
  */
 static inline int strtol_h(const char *cp, long *res)
 {
 	int err;
 #if BITS_PER_LONG == 32
 	err = strtoint_h(cp, (int *) res);
 #else
 	err = strtoll_h(cp, (long long *) res);
 #endif
 	return err;
 }
 /*
  * strtoul_h() - parse into an unsigned long-sized result by forwarding to the
  * unsigned parser whose result type has the same width as long.
  * NOTE(review): @res is declared as long * although the value written is
  * unsigned; callers reach this through strtoi_h(), which casts to void *.
  */
 static inline int strtoul_h(const char *cp, long *res)
 {
 	int err;
 #if BITS_PER_LONG == 32
 	err = strtouint_h(cp, (unsigned int *) res);
 #else
 	err = strtoull_h(cp, (unsigned long long *) res);
 #endif
 	return err;
 }
 /*
  * strtoi_h() - type-generic front end for the strto*_h() parsers
  *
  * Selects the parser from the type @res points to; evaluates to -EINVAL for
  * any type that is not listed.
  */
 #define strtoi_h(cp, res)						\
 	(__builtin_types_compatible_p(typeof(*res), int)		\
 	? strtoint_h(cp, (void *) res)					\
 	: __builtin_types_compatible_p(typeof(*res), long)		\
 	? strtol_h(cp, (void *) res)					\
 	: __builtin_types_compatible_p(typeof(*res), long long)		\
 	? strtoll_h(cp, (void *) res)					\
 	: __builtin_types_compatible_p(typeof(*res), unsigned int)	\
 	? strtouint_h(cp, (void *) res)					\
 	: __builtin_types_compatible_p(typeof(*res), unsigned long)	\
 	? strtoul_h(cp, (void *) res)					\
 	: __builtin_types_compatible_p(typeof(*res), unsigned long long)\
 	? strtoull_h(cp, (void *) res) : -EINVAL)
 /*
  * strtoul_safe() - parse a base-10 unsigned long with kstrtoul() and assign
  * it to @var only on success. Evaluates to the kstrtoul() return code.
  */
 #define strtoul_safe(cp, var)						\
 ({									\
 	unsigned long _v;						\
 	int _r = kstrtoul(cp, 10, &_v);					\
 	if (!_r)							\
 		var = _v;						\
 	_r;								\
 })
 /*
  * strtoul_safe_clamp() - like strtoul_safe(), but the parsed value is clamped
  * to [@min, @max] (in the type of @var) before it is assigned.
  */
 #define strtoul_safe_clamp(cp, var, min, max)				\
 ({									\
 	unsigned long _v;						\
 	int _r = kstrtoul(cp, 10, &_v);					\
 	if (!_r)							\
 		var = clamp_t(typeof(var), _v, min, max);		\
 	_r;								\
 })
 /*
  * snprint() - snprintf() @var followed by a newline, choosing the conversion
  * from the static type of @var. Types that are not listed fall back to "%i".
  */
 #define snprint(buf, size, var)						\
 	snprintf(buf, size,						\
 		__builtin_types_compatible_p(typeof(var), int)		\
 		     ? "%i\n" :						\
 		__builtin_types_compatible_p(typeof(var), unsigned)	\
 		     ? "%u\n" :						\
 		__builtin_types_compatible_p(typeof(var), long)		\
 		     ? "%li\n" :					\
 		__builtin_types_compatible_p(typeof(var), unsigned long)\
 		     ? "%lu\n" :					\
 		__builtin_types_compatible_p(typeof(var), int64_t)	\
 		     ? "%lli\n" :					\
 		__builtin_types_compatible_p(typeof(var), uint64_t)	\
 		     ? "%llu\n" :					\
 		__builtin_types_compatible_p(typeof(var), const char *)	\
 		     ? "%s\n" : "%i\n", var)
 /* Formatting and parsing helpers; defined elsewhere */
 ssize_t hprint(char *buf, int64_t v);
 bool is_zero(const char *p, size_t n);
 int parse_uuid(const char *s, char *uuid);
 ssize_t snprint_string_list(char *buf, size_t size, const char * const list[],
 			    size_t selected);
 ssize_t read_string_list(const char *buf, const char * const list[]);
 /* Duration and frequency statistics maintained by time_stats_update() */
 struct time_stats {
 	/*
 	 * all fields are in nanoseconds, averages are ewmas stored left shifted
 	 * by 8
 	 */
 	uint64_t	max_duration;
 	uint64_t	average_duration;
 	uint64_t	average_frequency;
 	/* local_clock() of the latest update; 0 while never updated */
 	uint64_t	last;
 };
 void time_stats_update(struct time_stats *stats, uint64_t time);
 /* Unit names usable as the units argument of the macros below */
 #define NSEC_PER_ns			1L
 #define NSEC_PER_us			NSEC_PER_USEC
 #define NSEC_PER_ms			NSEC_PER_MSEC
 #define NSEC_PER_sec			NSEC_PER_SEC
 /*
  * Print one field of @stats as sysfs attribute <name>_<stat>_<units>: the
  * value is shifted right by 8 and then converted from nanoseconds to @units.
  */
 #define __print_time_stat(stats, name, stat, units)			\
 	sysfs_print(name ## _ ## stat ## _ ## units,			\
 		    div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
 /*
  * sysfs_print_time_stats() - emit the sysfs attributes of one time_stats
  * @stats:		statistics to report
  * @name:		attribute name prefix
  * @frequency_units:	unit (ns, us, ms, sec) for the frequency and "last" values
  * @duration_units:	unit for the duration values
  *
  * The two averages are stored left shifted by 8 and are printed through
  * __print_time_stat(), which shifts them back. max_duration is stored as plain
  * nanoseconds, so it is printed without that shift. The "last" attribute is
  * the time elapsed since the latest update, or -1 if there has been none.
  */
 #define sysfs_print_time_stats(stats, name,				\
 			       frequency_units,				\
 			       duration_units)				\
 do {									\
 	__print_time_stat(stats, name,					\
 			  average_frequency,	frequency_units);	\
 	__print_time_stat(stats, name,					\
 			  average_duration,	duration_units);	\
 	sysfs_print(name ## _max_duration_ ## duration_units,		\
 		    div_u64((stats)->max_duration,			\
 			    NSEC_PER_ ## duration_units));		\
 									\
 	sysfs_print(name ## _last_ ## frequency_units, (stats)->last	\
 		    ? div_s64(local_clock() - (stats)->last,		\
 			      NSEC_PER_ ## frequency_units)		\
 		    : -1LL);						\
 } while (0)
 /*
  * Declare the four read-only attributes that sysfs_print_time_stats() fills
  * in for @name.
  */
 #define sysfs_time_stats_attribute(name,				\
 				   frequency_units,			\
 				   duration_units)			\
 read_attribute(name ## _average_frequency_ ## frequency_units);		\
 read_attribute(name ## _average_duration_ ## duration_units);		\
 read_attribute(name ## _max_duration_ ## duration_units);		\
 read_attribute(name ## _last_ ## frequency_units)
 /*
  * Expand to the addresses of those four attributes, each followed by a comma,
  * for use inside an attribute array initialiser.
  */
 #define sysfs_time_stats_attribute_list(name,				\
 					frequency_units,		\
 					duration_units)			\
 &sysfs_ ## name ## _average_frequency_ ## frequency_units,		\
 &sysfs_ ## name ## _average_duration_ ## duration_units,		\
 &sysfs_ ## name ## _max_duration_ ## duration_units,			\
 &sysfs_ ## name ## _last_ ## frequency_units,
 /*
  * ewma_add() - fold @val into an exponentially weighted moving average
  *
  * Updates @ewma in place to (ewma * (weight - 1) + (val << factor)) / weight,
  * i.e. @ewma is kept left shifted by @factor, and evaluates to the new
  * average shifted back down. @ewma must be an lvalue and is evaluated several
  * times.
  */
 #define ewma_add(ewma, val, weight, factor)				\
 ({									\
 	(ewma) *= (weight) - 1;						\
 	(ewma) += (val) << factor;					\
 	(ewma) /= (weight);						\
 	(ewma) >> factor;						\
 })
 /* State of a simple rate limiter driven by next_delay() */
 struct ratelimit {
 	/* local_clock() time before which no further work should be issued */
 	uint64_t		next;
 	/* Divisor applied to the amount of work done in next_delay() */
 	unsigned		rate;
 };
 /* Restart the limiter from the current time */
 static inline void ratelimit_reset(struct ratelimit *d)
 {
 	d->next = local_clock();
 }
 unsigned next_delay(struct ratelimit *d, uint64_t done);
 /*
  * Division that evaluates to @zero instead of dividing when @d is 0. Both
  * operands are evaluated exactly once.
  */
 #define __DIV_SAFE(n, d, zero)						\
 ({									\
 	typeof(n) _n = (n);						\
 	typeof(d) _d = (d);						\
 	_d ? _n / _d : zero;						\
 })
 /* n / d, or 0 when d is 0 */
 #define DIV_SAFE(n, d)	__DIV_SAFE(n, d, 0)
 /* container_of() that passes a NULL @ptr through as NULL */
 #define container_of_or_null(ptr, type, member)				\
 ({									\
 	typeof(ptr) _ptr = ptr;						\
 	_ptr ? container_of(_ptr, type, member) : NULL;			\
 })
 /*
  * RB_INSERT() - insert @new into the rbtree at @root
  * @root:	struct rb_root pointer
  * @new:	pointer to the element to insert
  * @member:	name of the struct rb_node member inside the element
  * @cmp:	cmp(a, b) returning <0, 0 or >0 for two element pointers
  *
  * Evaluates to 0 after inserting. If cmp() returns 0 against an existing
  * element the tree is left unchanged and the result is -1.
  */
 #define RB_INSERT(root, new, member, cmp)				\
 ({									\
 	__label__ dup;							\
 	struct rb_node **n = &(root)->rb_node, *parent = NULL;		\
 	typeof(new) this;						\
 	int res, ret = -1;						\
 									\
 	while (*n) {							\
 		parent = *n;						\
 		this = container_of(*n, typeof(*(new)), member);	\
 		res = cmp(new, this);					\
 		if (!res)						\
 			goto dup;					\
 		n = res < 0						\
 			? &(*n)->rb_left				\
 			: &(*n)->rb_right;				\
 	}								\
 									\
 	rb_link_node(&(new)->member, parent, n);			\
 	rb_insert_color(&(new)->member, root);				\
 	ret = 0;							\
 dup:									\
 	ret;								\
 })
 /*
  * RB_SEARCH() - find the element for which cmp(&search, element) is 0
  *
  * @search is an element object (not a pointer) used as the key. Evaluates to
  * a pointer to the matching element, or NULL if there is none.
  */
 #define RB_SEARCH(root, search, member, cmp)				\
 ({									\
 	struct rb_node *n = (root)->rb_node;				\
 	typeof(&(search)) this, ret = NULL;				\
 	int res;							\
 									\
 	while (n) {							\
 		this = container_of(n, typeof(search), member);		\
 		res = cmp(&(search), this);				\
 		if (!res) {						\
 			ret = this;					\
 			break;						\
 		}							\
 		n = res < 0						\
 			? n->rb_left					\
 			: n->rb_right;					\
 	}								\
 	ret;								\
 })
 /*
  * RB_GREATER() - find the first element ordered after @search
  *
  * Remembers the last visited element for which cmp(&search, element) < 0
  * while descending; in a tree ordered by @cmp that is the smallest element
  * greater than @search. Evaluates to NULL if there is no such element.
  */
 #define RB_GREATER(root, search, member, cmp)				\
 ({									\
 	struct rb_node *n = (root)->rb_node;				\
 	typeof(&(search)) this, ret = NULL;				\
 	int res;							\
 									\
 	while (n) {							\
 		this = container_of(n, typeof(search), member);		\
 		res = cmp(&(search), this);				\
 		if (res < 0) {						\
 			ret = this;					\
 			n = n->rb_left;					\
 		} else							\
 			n = n->rb_right;				\
 	}								\
 	ret;								\
 })
 /*
  * Typed wrappers around rb_first()/rb_last()/rb_next()/rb_prev(): each
  * evaluates to a pointer to the containing element, or NULL when the
  * underlying call returns NULL.
  */
 #define RB_FIRST(root, type, member)					\
 	container_of_or_null(rb_first(root), type, member)
 #define RB_LAST(root, type, member)					\
 	container_of_or_null(rb_last(root), type, member)
 #define RB_NEXT(ptr, member)						\
 	container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
 #define RB_PREV(ptr, member)						\
 	container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
 /*
  * fract_exp_two() - approximate 2^x for a fixed-point exponent
  * @x:		exponent with @fract_bits fractional bits
  * @fract_bits:	number of low bits of @x that form the fraction
  *
  * Does linear interpolation between powers of two: the integer part of @x
  * selects the power of two, and the fraction adds the matching share of the
  * distance to the next one. All shifts are done on unsigned operands so that
  * an integer part of 31 does not shift into the sign bit.
  */
 static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
 {
 	unsigned fract = x & ~(~0U << fract_bits);
 	x >>= fract_bits;
 	x   = 1U << x;
 	x  += (x * fract) >> fract_bits;
 	return x;
 }
 /* First sector past the end of @bio */
 #define bio_end(bio)	((bio)->bi_sector + bio_sectors(bio))
 void bio_map(struct bio *bio, void *base);
 int bio_alloc_pages(struct bio *bio, gfp_t gfp);
 /* Size of a block device in 512-byte sectors, from its inode size */
 static inline sector_t bdev_sectors(struct block_device *bdev)
 {
 	return bdev->bd_inode->i_size >> 9;
 }
 /*
  * closure_bio_submit() - take a reference on @cl, then submit @bio through
  * @dev's bio_split_hook. The bio's completion path is expected to drop that
  * reference (dirty_endio() in writeback.c does so with closure_put()).
  */
 #define closure_bio_submit(bio, cl, dev)				\
 do {									\
 	closure_get(cl);						\
 	bch_generic_make_request(bio, &(dev)->bio_split_hook);		\
 } while (0)
 /* 64-bit CRC: incremental update and one-shot helper */
 uint64_t crc64_update(uint64_t, const void *, size_t);
 uint64_t crc64(const void *, size_t);
 #endif /* _BCACHE_UTIL_H */
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@ -0,0 +1,414 @@
 /*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
 /* Workqueue on which the writeback closures in this file are continued */
 static struct workqueue_struct *dirty_wq;
 static void read_dirty(struct closure *);
 /* State for writing back one dirty key: a closure plus an embedded bio */
 struct dirty_io {
 	struct closure		cl;
 	struct cached_dev	*dc;
 	/*
 	 * read_dirty() allocates the bio_vecs directly behind this struct and
 	 * dirty_init() uses them through bi_inline_vecs, so this must remain
 	 * the last member.
 	 */
 	struct bio		bio;
 };
 /* Rate limiting */
 /*
  * Recompute dc->writeback_rate.rate with a proportional-derivative controller
  * that steers this device's dirty sector count towards its share of the
  * cache-wide dirty target, record the intermediate values in @dc and re-arm
  * the periodic update work.
  */
 static void __update_writeback_rate(struct cached_dev *dc)
 {
 	struct cache_set *c = dc->disk.c;
 	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
 	/* writeback_percent of the whole cache is allowed to be dirty */
 	uint64_t cache_dirty_target =
 		div_u64(cache_sectors * dc->writeback_percent, 100);
 	/*
 	 * Scale by this backing device's size relative to cached_dev_sectors.
 	 * NOTE(review): cached_dev_sectors is not checked for zero here.
 	 */
 	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
 				   c->cached_dev_sectors);
 	/* PD controller */
 	int change = 0;
 	int64_t error;
 	int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
 	/* Change in dirty sectors since the previous update */
 	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
 	dc->disk.sectors_dirty_last = dirty;
 	derivative *= dc->writeback_rate_d_term;
 	derivative = clamp(derivative, -dirty, dirty);
 	/* Smooth the derivative; factor 0 means the ewma is kept unshifted */
 	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
 			      dc->writeback_rate_d_smooth, 0);
 	/* Avoid divide by zero */
 	if (!target)
 		goto out;
 	/* Relative error as a fixed-point value with 8 fractional bits */
 	error = div64_s64((dirty + derivative - target) << 8, target);
 	change = div_s64((dc->writeback_rate.rate * error) >> 8,
 			 dc->writeback_rate_p_term_inverse);
 	/* Don't increase writeback rate if the device isn't keeping up */
 	if (change > 0 &&
 	    time_after64(local_clock(),
 			 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
 		change = 0;
 	dc->writeback_rate.rate =
 		clamp_t(int64_t, dc->writeback_rate.rate + change,
 			1, NSEC_PER_MSEC);
 out:
 	dc->writeback_rate_derivative = derivative;
 	dc->writeback_rate_change = change;
 	dc->writeback_rate_target = target;
 	schedule_delayed_work(&dc->writeback_rate_update,
 			      dc->writeback_rate_update_seconds * HZ);
 }
 /*
  * Delayed-work handler for writeback_rate_update: recomputes the writeback
  * rate under writeback_lock (read side), but only while the device has dirty
  * data and writeback_percent is non-zero.
  */
 static void update_writeback_rate(struct work_struct *work)
 {
 	struct delayed_work *dwork = to_delayed_work(work);
 	struct cached_dev *dc = container_of(dwork, struct cached_dev,
 					     writeback_rate_update);
 	down_read(&dc->writeback_lock);
 	if (atomic_read(&dc->has_dirty) && dc->writeback_percent)
 		__update_writeback_rate(dc);
 	up_read(&dc->writeback_lock);
 }
 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 {
 	if (atomic_read(&dc->disk.detaching) ||
 	    !dc->writeback_percent)
 		return 0;
 	return next_delay(&dc->writeback_rate, sectors * 10000000ULL);
 }
 /* Background writeback */
 /* Keybuf predicate: select only keys that have the dirty bit set */
 static bool dirty_pred(struct keybuf *buf, struct bkey *k)
 {
 	return KEY_DIRTY(k) != 0;
 }
 /*
  * (Re)initialise the bio embedded in the dirty_io attached to @w so that it
  * covers the key's size, using the inline bio_vecs. Page pointers are not set
  * here (bio_map() is called without a buffer). When writeback_percent is 0
  * the bio is given idle I/O priority.
  */
 static void dirty_init(struct keybuf_key *w)
 {
 	struct dirty_io *dio = w->private;
 	struct bio *b = &dio->bio;
 	bio_init(b);
 	b->bi_private	= w;
 	b->bi_io_vec	= b->bi_inline_vecs;
 	b->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
 	b->bi_size	= KEY_SIZE(&w->key) << 9;
 	if (dio->dc->writeback_percent == 0)
 		bio_set_prio(b, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
 	bio_map(b, NULL);
 }
 /*
  * Refill dc->writeback_keys with dirty keys of this device from the btree,
  * then continue with read_dirty() on dirty_wq.
  *
  * NOTE(review): the code relies on closure_return() and continue_at() not
  * falling through to the statements that follow them - confirm in closure.h.
  */
 static void refill_dirty(struct closure *cl)
 {
 	struct cached_dev *dc = container_of(cl, struct cached_dev,
 					     writeback.cl);
 	struct keybuf *buf = &dc->writeback_keys;
 	bool searched_from_start = false;
 	/* Largest possible key belonging to this device's inode */
 	struct bkey end = MAX_KEY;
 	SET_KEY_INODE(&end, dc->disk.id);
 	/* Writeback switched off: stop, unless the device is detaching */
 	if (!atomic_read(&dc->disk.detaching) &&
 	    !dc->writeback_running)
 		closure_return(cl);
 	down_write(&dc->writeback_lock);
 	/* Nothing dirty any more: mark the backing device clean and stop */
 	if (!atomic_read(&dc->has_dirty)) {
 		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
 		bch_write_bdev_super(dc, NULL);
 		up_write(&dc->writeback_lock);
 		closure_return(cl);
 	}
 	/* The previous scan reached the end: wrap around to the first key */
 	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
 		buf->last_scanned = KEY(dc->disk.id, 0, 0);
 		searched_from_start = true;
 	}
 	bch_refill_keybuf(dc->disk.c, buf, &end);
 	if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
 		/* Searched the entire btree  - delay awhile */
 		if (RB_EMPTY_ROOT(&buf->keys)) {
 			/* No dirty keys found: clear has_dirty and drop a ref */
 			atomic_set(&dc->has_dirty, 0);
 			cached_dev_put(dc);
 		}
 		if (!atomic_read(&dc->disk.detaching))
 			closure_delay(&dc->writeback, dc->writeback_delay * HZ);
 	}
 	up_write(&dc->writeback_lock);
 	ratelimit_reset(&dc->writeback_rate);
 	/* Punt to workqueue only so we don't recurse and blow the stack */
 	continue_at(cl, read_dirty, dirty_wq);
 }
 /*
  * Start background writeback for @dc unless it is already running: proceeds
  * only if closure_trylock() on the writeback closure succeeds, then
  * schedules refill_dirty() on dirty_wq. Unless the device is detaching,
  * closure_delay() is first called with writeback_delay seconds.
  */
 void bch_writeback_queue(struct cached_dev *dc)
 {
 	if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
 		if (!atomic_read(&dc->disk.detaching))
 			closure_delay(&dc->writeback, dc->writeback_delay * HZ);
 		continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
 	}
 }
 /*
  * Account @sectors newly dirtied sectors for @dc. On the transition from
  * clean to dirty this also takes a reference on the device, marks the
  * superblock dirty if it is not already, starts background writeback and,
  * when writeback_percent is set, schedules the rate update work.
  */
 void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
 {
 	atomic_long_add(sectors, &dc->disk.sectors_dirty);
 	/* Only the caller that flips has_dirty from 0 to 1 continues */
 	if (atomic_read(&dc->has_dirty) ||
 	    atomic_xchg(&dc->has_dirty, 1))
 		return;
 	atomic_inc(&dc->count);
 	if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
 		SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
 		/* XXX: should do this synchronously */
 		bch_write_bdev_super(dc, NULL);
 	}
 	bch_writeback_queue(dc);
 	if (dc->writeback_percent)
 		schedule_delayed_work(&dc->writeback_rate_update,
 			      dc->writeback_rate_update_seconds * HZ);
 }
 /* Background writeback - IO loop */
 /* Closure destructor: free the dirty_io that embeds @cl */
 static void dirty_io_destructor(struct closure *cl)
 {
 	kfree(container_of(cl, struct dirty_io, cl));
 }
 /*
  * Final step for one key: free the data pages, clear the key's dirty bit in
  * the btree if the I/O succeeded, remove the key from the writeback keybuf
  * and release the dirty_io.
  */
 static void write_dirty_finish(struct closure *cl)
 {
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
 	struct keybuf_key *w = io->bio.bi_private;
 	struct cached_dev *dc = io->dc;
 	/* Walk backwards from one past the last segment, freeing each page */
 	struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);
 	while (bv-- != io->bio.bi_io_vec)
 		__free_page(bv->bv_page);
 	/* This is kind of a dumb way of signalling errors. */
 	if (KEY_DIRTY(&w->key)) {
 		unsigned i;
 		struct btree_op op;
 		bch_btree_op_init_stack(&op);
 		/* Replace the dirty version of the key with a clean copy */
 		op.type = BTREE_REPLACE;
 		bkey_copy(&op.replace, &w->key);
 		SET_KEY_DIRTY(&w->key, false);
 		bch_keylist_add(&op.keys, &w->key);
 		for (i = 0; i < KEY_PTRS(&w->key); i++)
 			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
 		pr_debug("clearing %s", pkey(&w->key));
 		bch_btree_insert(&op, dc->disk.c);
 		closure_sync(&op.cl);
 		/* Count the key as failed when the insert reported a collision */
 		atomic_long_inc(op.insert_collision
 				? &dc->disk.c->writeback_keys_failed
 				: &dc->disk.c->writeback_keys_done);
 	}
 	bch_keybuf_del(&dc->writeback_keys, w);
 	atomic_dec_bug(&dc->in_flight);
 	/* Wake read_dirty(), which waits for in_flight to drop below its limit */
 	closure_wake_up(&dc->writeback_wait);
 	closure_return_with_destructor(cl, dirty_io_destructor);
 }
 /*
  * Completion handler for writeback bios. A failed I/O is signalled to
  * write_dirty_finish() by clearing the dirty bit in the in-memory copy of the
  * key, which makes it skip the btree update. Drops the closure reference
  * taken when the bio was submitted.
  */
 static void dirty_endio(struct bio *bio, int error)
 {
 	struct keybuf_key *key = bio->bi_private;
 	struct dirty_io *dio = key->private;
 	if (error != 0)
 		SET_KEY_DIRTY(&key->key, false);
 	closure_put(&dio->cl);
 }
 /*
  * Second half of writing back one key: reuse the bio (and the pages filled by
  * the preceding read) to write the data to the backing device at the key's
  * start sector, then continue with write_dirty_finish() on dirty_wq.
  */
 static void write_dirty(struct closure *cl)
 {
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
 	struct keybuf_key *w = io->bio.bi_private;
 	/* Reset the bio; bio_map(bio, NULL) leaves the page pointers alone */
 	dirty_init(w);
 	io->bio.bi_rw		= WRITE;
 	io->bio.bi_sector	= KEY_START(&w->key);
 	io->bio.bi_bdev		= io->dc->bdev;
 	io->bio.bi_end_io	= dirty_endio;
 	trace_bcache_write_dirty(&io->bio);
 	closure_bio_submit(&io->bio, cl, &io->dc->disk);
 	continue_at(cl, write_dirty_finish, dirty_wq);
 }
 static void read_dirty_endio(struct bio *bio, int error)
 {
 	struct keybuf_key *w = bio->bi_private;
 	struct dirty_io *io = w->private;
 	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
 			    error, "reading dirty data from cache");
 	dirty_endio(bio, error);
 }
 /*
  * First half of writing back one key: submit the read bio prepared by
  * read_dirty(), then continue with write_dirty() on dirty_wq.
  */
 static void read_dirty_submit(struct closure *cl)
 {
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
 	trace_bcache_read_dirty(&io->bio);
 	closure_bio_submit(&io->bio, cl, &io->dc->disk);
 	continue_at(cl, write_dirty, dirty_wq);
 }
 /*
  * Main writeback loop: take keys from dc->writeback_keys one at a time and
  * start a read-then-write for each (read_dirty_submit -> write_dirty ->
  * write_dirty_finish), throttled by the writeback rate limiter and by a limit
  * on in-flight I/Os. When the keybuf is exhausted, or an allocation fails,
  * control passes back to refill_dirty().
  *
  * NOTE(review): the code relies on continue_at() not falling through to the
  * statements that follow it - confirm in closure.h.
  */
 static void read_dirty(struct closure *cl)
 {
 	struct cached_dev *dc = container_of(cl, struct cached_dev,
 					     writeback.cl);
 	unsigned delay = writeback_delay(dc, 0);
 	struct keybuf_key *w;
 	struct dirty_io *io;
 	/*
 	 * XXX: if we error, background writeback just spins. Should use some
 	 * mempools.
 	 */
 	while (1) {
 		w = bch_keybuf_next(&dc->writeback_keys);
 		if (!w)
 			break;
 		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
 		/*
 		 * Honour the rate limit, except for a short (<= 50 ms) delay
 		 * when this key starts where the previous one ended: give the
 		 * key back (private = NULL) and come back here after the delay.
 		 */
 		if (delay > 0 &&
 		    (KEY_START(&w->key) != dc->last_read ||
 		     jiffies_to_msecs(delay) > 50)) {
 			w->private = NULL;
 			closure_delay(&dc->writeback, delay);
 			continue_at(cl, read_dirty, dirty_wq);
 		}
 		dc->last_read	= KEY_OFFSET(&w->key);
 		/* dirty_io followed by one bio_vec per page of the key */
 		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
 			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
 			     GFP_KERNEL);
 		if (!io)
 			goto err;
 		w->private	= io;
 		io->dc		= dc;
 		dirty_init(w);
 		/* Read from the cache device the key's first pointer refers to */
 		io->bio.bi_sector	= PTR_OFFSET(&w->key, 0);
 		io->bio.bi_bdev		= PTR_CACHE(dc->disk.c,
 						    &w->key, 0)->bdev;
 		io->bio.bi_rw		= READ;
 		io->bio.bi_end_io	= read_dirty_endio;
 		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
 			goto err_free;
 		pr_debug("%s", pkey(&w->key));
 		closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
 		delay = writeback_delay(dc, KEY_SIZE(&w->key));
 		atomic_inc(&dc->in_flight);
 		/* Allow at most 64 writeback I/Os in flight */
 		if (!closure_wait_event(&dc->writeback_wait, cl,
 					atomic_read(&dc->in_flight) < 64))
 			continue_at(cl, read_dirty, dirty_wq);
 	}
 	/* Error labels only; never entered by falling out of the loop */
 	if (0) {
 err_free:
 		kfree(w->private);
 err:
 		bch_keybuf_del(&dc->writeback_keys, w);
 	}
 	refill_dirty(cl);
 }
 /*
  * Initialise the writeback state of a cached device: closure, lock, keybuf,
  * default tunables and the periodic rate update work, which is scheduled
  * right away.
  */
 void bch_writeback_init_cached_dev(struct cached_dev *dc)
 {
 	closure_init_unlocked(&dc->writeback);
 	init_rwsem(&dc->writeback_lock);
 	bch_keybuf_init(&dc->writeback_keys, dirty_pred);
 	/* General writeback behaviour */
 	dc->writeback_running		= true;
 	dc->writeback_metadata		= true;
 	dc->writeback_delay		= 30;
 	dc->writeback_percent		= 10;
 	/* Rate controller defaults */
 	dc->writeback_rate.rate		= 1024;
 	dc->writeback_rate_p_term_inverse = 64;
 	dc->writeback_rate_d_term	= 16;
 	dc->writeback_rate_d_smooth	= 8;
 	dc->writeback_rate_update_seconds = 30;
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 	schedule_delayed_work(&dc->writeback_rate_update,
 			      dc->writeback_rate_update_seconds * HZ);
 }
 /* Destroy the writeback workqueue if it was created */
 void bch_writeback_exit(void)
 {
 	if (!dirty_wq)
 		return;
 	destroy_workqueue(dirty_wq);
 }
 /*
  * Create the single-threaded writeback workqueue.
  * Returns 0 on success or -ENOMEM if the workqueue could not be created.
  */
 int __init bch_writeback_init(void)
 {
 	dirty_wq = create_singlethread_workqueue("bcache_writeback");
 	return dirty_wq ? 0 : -ENOMEM;
 }
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@ -78,3 +78,9 @@ SUBSYS(hugetlb)
 #endif
 /* */
 #ifdef CONFIG_CGROUP_BCACHE
 SUBSYS(bcache)
 #endif
 /* */
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@ -133,10 +133,20 @@ do {								\
 	_down_write_nest_lock(sem, &(nest_lock)->dep_map);	\
 } while (0);
 /*
 * Take/release a lock when a task other than the owner will release it.
 *
 * [ This API should be avoided as much as possible - the
 *   proper abstraction for this case is completions. ]
 */
 extern void down_read_non_owner(struct rw_semaphore *sem);
 extern void up_read_non_owner(struct rw_semaphore *sem);
 #else
 # define down_read_nested(sem, subclass)		down_read(sem)
 # define down_write_nest_lock(sem, nest_lock)	down_write(sem)
 # define down_write_nested(sem, subclass)	down_write(sem)
 # define down_read_non_owner(sem)		down_read(sem)
 # define up_read_non_owner(sem)			up_read(sem)
 #endif
 #endif /* _LINUX_RWSEM_H */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@ -1576,6 +1576,10 @@ struct task_struct {
 #ifdef CONFIG_UPROBES
 	struct uprobe_task *utask;
 #endif
 #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
 	unsigned int	sequential_io;
 	unsigned int	sequential_io_avg;
 #endif
 };
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@ -0,0 +1,271 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM bcache
 #if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_BCACHE_H
 #include <linux/tracepoint.h>
 struct search;
 /*
  * Event class for request-level tracing.
  *
  * Records the device, start sector and size of @bio, the major/first_minor
  * of the gendisk reached through @s (s->d->disk), the rwbs flag string and
  * the name of the current task.
  *
  * NOTE(review): orig_sector is derived as bi_sector - 16; the 16-sector
  * offset is hard-coded here (presumably the data offset on the backing
  * device -- confirm against the superblock layout).
  * NOTE(review): rwbs is 6 bytes; confirm blk_fill_rwbs() can never write
  * more than that including the terminating NUL.
  */
 DECLARE_EVENT_CLASS(bcache_request,
 	TP_PROTO(struct search *s, struct bio *bio),
 	TP_ARGS(s, bio),
 	TP_STRUCT__entry(
 		__field(dev_t,		dev			)
 		__field(unsigned int,	orig_major		)
 		__field(unsigned int,	orig_minor		)
 		__field(sector_t,	sector			)
 		/* A sector number, so sector_t: dev_t would truncate it. */
 		__field(sector_t,	orig_sector		)
 		__field(unsigned int,	nr_sector		)
 		__array(char,		rwbs,	6		)
 		__array(char,		comm,	TASK_COMM_LEN	)
 	),
 	TP_fast_assign(
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->orig_major	= s->d->disk->major;
 		__entry->orig_minor	= s->d->disk->first_minor;
 		__entry->sector		= bio->bi_sector;
 		__entry->orig_sector	= bio->bi_sector - 16;
 		__entry->nr_sector	= bio->bi_size >> 9;
 		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 	TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs,
 		  (unsigned long long)__entry->sector,
 		  __entry->nr_sector, __entry->comm,
 		  __entry->orig_major, __entry->orig_minor,
 		  (unsigned long long)__entry->orig_sector)
 );
 /* bcache_request_start: instance of the bcache_request class; takes (s, bio). */
 DEFINE_EVENT(bcache_request, bcache_request_start,
 	TP_PROTO(struct search *s, struct bio *bio),
 	TP_ARGS(s, bio)
 );
 /* bcache_request_end: instance of the bcache_request class; takes (s, bio). */
 DEFINE_EVENT(bcache_request, bcache_request_end,
 	TP_PROTO(struct search *s, struct bio *bio),
 	TP_ARGS(s, bio)
 );
 /*
  * Event class for tracing a single bio.
  *
  * Records the bio's target device, start sector, size in 512-byte sectors
  * (bi_size >> 9), the rwbs flag string produced by blk_fill_rwbs(), and the
  * name of the current task. Printed as "maj,min  rwbs sector + nr [comm]".
  *
  * NOTE(review): rwbs is 6 bytes; confirm blk_fill_rwbs() can never write
  * more than that including the terminating NUL.
  */
 DECLARE_EVENT_CLASS(bcache_bio,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio),
 	TP_STRUCT__entry(
 		__field(dev_t,		dev			)
 		__field(sector_t,	sector			)
 		__field(unsigned int,	nr_sector		)
 		__array(char,		rwbs,	6		)
 		__array(char,		comm,	TASK_COMM_LEN	)
 	),
 	TP_fast_assign(
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_sector;
 		__entry->nr_sector	= bio->bi_size >> 9;
 		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 	TP_printk("%d,%d  %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs,
 		  (unsigned long long)__entry->sector,
 		  __entry->nr_sector, __entry->comm)
 );
 /*
  * Instances of the bcache_bio event class. Each event below takes the bio
  * being traced as its only argument and shares the class's record layout
  * and output format; only the event name differs.
  */
 DEFINE_EVENT(bcache_bio, bcache_passthrough,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_cache_hit,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_cache_miss,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_read_retry,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_writethrough,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_writeback,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_write_skip,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_btree_read,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_btree_write,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_write_dirty,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_read_dirty,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_write_moving,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_read_moving,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 DEFINE_EVENT(bcache_bio, bcache_journal_write,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
 );
 /*
  * Event class for tracing a bio together with the location it originated
  * from.
  *
  * Records the device, start sector and size (in 512-byte sectors) of @bio,
  * the device number of @orig_bdev, the caller-supplied @orig_sector, the
  * rwbs flag string and the name of the current task.
  */
 DECLARE_EVENT_CLASS(bcache_cache_bio,
 	TP_PROTO(struct bio *bio,
 		 sector_t orig_sector,
 		 struct block_device *orig_bdev),
 	TP_ARGS(bio, orig_sector, orig_bdev),
 	TP_STRUCT__entry(
 		__field(dev_t,		dev			)
 		__field(dev_t,		orig_dev		)
 		__field(sector_t,	sector			)
 		__field(sector_t,	orig_sector		)
 		__field(unsigned int,	nr_sector		)
 		__array(char,		rwbs,	6		)
 		__array(char,		comm,	TASK_COMM_LEN	)
 	),
 	TP_fast_assign(
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->orig_dev	= orig_bdev->bd_dev;
 		__entry->sector		= bio->bi_sector;
 		__entry->orig_sector	= orig_sector;
 		__entry->nr_sector	= bio->bi_size >> 9;
 		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 	TP_printk("%d,%d  %s %llu + %u [%s] (from %d,%d %llu)",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs,
 		  (unsigned long long)__entry->sector,
 		  __entry->nr_sector, __entry->comm,
 		  MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev),
 		  (unsigned long long)__entry->orig_sector)
 );
 /*
  * bcache_cache_insert: instance of the bcache_cache_bio class; takes the bio
  * plus the originating sector and block device.
  */
 DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert,
 	TP_PROTO(struct bio *bio,
 		 sector_t orig_sector,
 		 struct block_device *orig_bdev),
 	TP_ARGS(bio, orig_sector, orig_bdev)
 );
 /*
  * Event class for garbage-collection tracing, keyed by a UUID.
  *
  * @uuid must point to 16 bytes (the amount "%pU" formats). The bytes are
  * copied into the trace record when the event fires: the record is formatted
  * later, when the trace buffer is read, so keeping only the caller's pointer
  * would dereference memory that may no longer be valid by then.
  */
 DECLARE_EVENT_CLASS(bcache_gc,
 	TP_PROTO(uint8_t *uuid),
 	TP_ARGS(uuid),
 	TP_STRUCT__entry(
 		__array(uint8_t,	uuid,	16		)
 	),
 	TP_fast_assign(
 		memcpy(__entry->uuid, uuid, 16);
 	),
 	TP_printk("%pU", __entry->uuid)
 );
 /* bcache_gc_start: instance of the bcache_gc class; takes the UUID pointer. */
 DEFINE_EVENT(bcache_gc, bcache_gc_start,
 	     TP_PROTO(uint8_t *uuid),
 	     TP_ARGS(uuid)
 );
 /* bcache_gc_end: instance of the bcache_gc class; takes the UUID pointer. */
 DEFINE_EVENT(bcache_gc, bcache_gc_end,
 	     TP_PROTO(uint8_t *uuid),
 	     TP_ARGS(uuid)
 );
 #endif /* _TRACE_BCACHE_H */
 /* This part must be outside protection */
 #include <trace/define_trace.h>
--- a/kernel/fork.c
+++ b/kernel/fork.c
@ -1303,6 +1303,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->memcg_batch.do_batch = 0;
 	p->memcg_batch.memcg = NULL;
 #endif
 #ifdef CONFIG_BCACHE
 	p->sequential_io	= 0;
 	p->sequential_io_avg	= 0;
 #endif
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p);
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@ -2997,6 +2997,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 EXPORT_SYMBOL_GPL(lockdep_init_map);
 struct lock_class_key __lockdep_no_validate__;
 EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
 static int
 print_lock_nested_lock_not_held(struct task_struct *curr,
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
 EXPORT_SYMBOL(_down_write_nest_lock);
 /*
  * Acquire @sem for reading when a context other than the caller will
  * release it.
  *
  * May sleep (annotated with might_sleep()). The lock is taken directly
  * through __down_read(); this function itself performs no lockdep
  * acquire annotation.
  */
 void down_read_non_owner(struct rw_semaphore *sem)
 {
 	might_sleep();
 	__down_read(sem);
 }
 EXPORT_SYMBOL(down_read_non_owner);
 void down_write_nested(struct rw_semaphore *sem, int subclass)
 {
 	might_sleep();
@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
 EXPORT_SYMBOL(down_write_nested);
 /*
  * Release a read hold on @sem from a context other than the one that
  * acquired it.
  *
  * Drops the lock directly through __up_read(); this function itself
  * performs no lockdep release annotation.
  */
 void up_read_non_owner(struct rw_semaphore *sem)
 {
 	__up_read(sem);
 }
 EXPORT_SYMBOL(up_read_non_owner);
 #endif
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@ -1828,6 +1828,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 	rwbs[i] = '\0';
 }
 EXPORT_SYMBOL_GPL(blk_fill_rwbs);
 #endif /* CONFIG_EVENT_TRACING */