md update for 3.13.
Mostly optimisations and obscure bug fixes.
  - raid5 gets less lock contention
  - raid1 gets less contention between normal-io and resync-io
    during resync.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v2.0.19 (GNU/Linux)
 
 iQIVAwUAUovzDznsnt1WYoG5AQJ1pQ//bDuXadoJ5dwjWjVxFOKoQ9j/9joEI0yH
 XTApD3ADKckdBc4TSLOIbCNLW1Pbe23HlOI/GjCiJ/7mePL3OwHd7Fx8Rfq3BubV
 f7NgjVwu8nwYD0OXEZsshImptEtrbYwQdy+qlKcHXcZz1MUfR+Egih3r/ouTEfEt
 FNq/6MpyN0IKSY82xP/jFZgesBucgKz/YOUIbwClxm7UiyISKvWQLBIAfLB3dyI3
 HoEdEzQX6I56Rw0mkSUG4Mk+8xx/8twxL+yqEUqfdJREWuB56Km8kl8y/e465Nk0
 ZZg6j/TrslVEwbEeVMx0syvYcaAWFZ4X2jdKfo1lI0g9beZp7H1GRF8yR1s2t/h4
 g/vb55MEN++4LPaE9ut4z7SG2yLyGkZgFTzTjyq5of+DFL0cayO7wXxbgpcD7JYf
 Doef/OSa6csKiGiJI48iQa08Bolmz9ZWzZQXhAthKfFQ9Rv+GEtIAi4kLR8EZPbu
 0/FL1ylYNUY9O7p0g+iy9Kcoc+xW36I95pPZf8pO8GFcXTjyuCCBVh/SNvFZZHPl
 3xk3aZJknAEID8VrVG2IJPkeDI8WK8YxmpU/nARCoytn07Df6Ye8jGvLdR8pL3lB
 TIZV6eRY4yciB8LtoK9Kg4XTmOMhBtjt4c3znkljp98vhOQQb/oHN+BXMGcwqvr9
 fk0KGrg31VA=
 =8RCg
 -----END PGP SIGNATURE-----
Merge tag 'md/3.13' of git://neil.brown.name/md
Pull md update from Neil Brown:
 "Mostly optimisations and obscure bug fixes.
   - raid5 gets less lock contention
   - raid1 gets less contention between normal-io and resync-io during
     resync"
* tag 'md/3.13' of git://neil.brown.name/md:
  md/raid5: Use conf->device_lock protect changing of multi-thread resources.
  md/raid5: Before freeing old multi-thread worker, it should flush them.
  md/raid5: For stripe with R5_ReadNoMerge, we replace REQ_FLUSH with REQ_NOMERGE.
  UAPI: include <asm/byteorder.h> in linux/raid/md_p.h
  raid1: Rewrite the implementation of iobarrier.
  raid1: Add some macros to make code clearly.
  raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array.
  raid1: Add a field array_frozen to indicate whether raid in freeze state.
  md: Convert use of typedef ctl_table to struct ctl_table
  md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
  md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread.
  md: fix some places where mddev_lock return value is not checked.
  raid5: Retry R5_ReadNoMerge flag when hit a read error.
  raid5: relieve lock contention in get_active_stripe()
  raid5: relieve lock contention in get_active_stripe()
  wait: add wait_event_cmd()
  md/raid5.c: add proper locking to error path of raid5_start_reshape.
  md: fix calculation of stacking limits on level change.
  raid5: Use slow_path to release stripe when mddev->thread is null
			
			
This commit is contained in commit 6d6e352c80 (diffstat shown on the original page: drivers/md/md.c, 133 lines).
							| @ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev) | ||||
| 
 | ||||
| static struct ctl_table_header *raid_table_header; | ||||
| 
 | ||||
| static ctl_table raid_table[] = { | ||||
| static struct ctl_table raid_table[] = { | ||||
| 	{ | ||||
| 		.procname	= "speed_limit_min", | ||||
| 		.data		= &sysctl_speed_limit_min, | ||||
| @ -130,7 +130,7 @@ static ctl_table raid_table[] = { | ||||
| 	{ } | ||||
| }; | ||||
| 
 | ||||
| static ctl_table raid_dir_table[] = { | ||||
| static struct ctl_table raid_dir_table[] = { | ||||
| 	{ | ||||
| 		.procname	= "raid", | ||||
| 		.maxlen		= 0, | ||||
| @ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = { | ||||
| 	{ } | ||||
| }; | ||||
| 
 | ||||
| static ctl_table raid_root_table[] = { | ||||
| static struct ctl_table raid_root_table[] = { | ||||
| 	{ | ||||
| 		.procname	= "dev", | ||||
| 		.maxlen		= 0, | ||||
| @ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit) | ||||
| 	goto retry; | ||||
| } | ||||
| 
 | ||||
| static inline int mddev_lock(struct mddev * mddev) | ||||
| static inline int __must_check mddev_lock(struct mddev * mddev) | ||||
| { | ||||
| 	return mutex_lock_interruptible(&mddev->reconfig_mutex); | ||||
| } | ||||
| 
 | ||||
| /* Sometimes we need to take the lock in a situation where
 | ||||
|  * failure due to interrupts is not acceptable. | ||||
|  */ | ||||
| static inline void mddev_lock_nointr(struct mddev * mddev) | ||||
| { | ||||
| 	mutex_lock(&mddev->reconfig_mutex); | ||||
| } | ||||
| 
 | ||||
| static inline int mddev_is_locked(struct mddev *mddev) | ||||
| { | ||||
| 	return mutex_is_locked(&mddev->reconfig_mutex); | ||||
| @ -2978,7 +2986,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) | ||||
| 		for_each_mddev(mddev, tmp) { | ||||
| 			struct md_rdev *rdev2; | ||||
| 
 | ||||
| 			mddev_lock(mddev); | ||||
| 			mddev_lock_nointr(mddev); | ||||
| 			rdev_for_each(rdev2, mddev) | ||||
| 				if (rdev->bdev == rdev2->bdev && | ||||
| 				    rdev != rdev2 && | ||||
| @ -2994,7 +3002,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) | ||||
| 				break; | ||||
| 			} | ||||
| 		} | ||||
| 		mddev_lock(my_mddev); | ||||
| 		mddev_lock_nointr(my_mddev); | ||||
| 		if (overlap) { | ||||
| 			/* Someone else could have slipped in a size
 | ||||
| 			 * change here, but doing so is just silly. | ||||
| @ -3580,6 +3588,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | ||||
| 		mddev->in_sync = 1; | ||||
| 		del_timer_sync(&mddev->safemode_timer); | ||||
| 	} | ||||
| 	blk_set_stacking_limits(&mddev->queue->limits); | ||||
| 	pers->run(mddev); | ||||
| 	set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||||
| 	mddev_resume(mddev); | ||||
| @ -5258,7 +5267,7 @@ static void __md_stop_writes(struct mddev *mddev) | ||||
| 
 | ||||
| void md_stop_writes(struct mddev *mddev) | ||||
| { | ||||
| 	mddev_lock(mddev); | ||||
| 	mddev_lock_nointr(mddev); | ||||
| 	__md_stop_writes(mddev); | ||||
| 	mddev_unlock(mddev); | ||||
| } | ||||
| @ -5291,20 +5300,35 @@ EXPORT_SYMBOL_GPL(md_stop); | ||||
| static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) | ||||
| { | ||||
| 	int err = 0; | ||||
| 	int did_freeze = 0; | ||||
| 
 | ||||
| 	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { | ||||
| 		did_freeze = 1; | ||||
| 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||||
| 		md_wakeup_thread(mddev->thread); | ||||
| 	} | ||||
| 	if (mddev->sync_thread) { | ||||
| 		set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||||
| 		/* Thread might be blocked waiting for metadata update
 | ||||
| 		 * which will now never happen */ | ||||
| 		wake_up_process(mddev->sync_thread->tsk); | ||||
| 	} | ||||
| 	mddev_unlock(mddev); | ||||
| 	wait_event(resync_wait, mddev->sync_thread == NULL); | ||||
| 	mddev_lock_nointr(mddev); | ||||
| 
 | ||||
| 	mutex_lock(&mddev->open_mutex); | ||||
| 	if (atomic_read(&mddev->openers) > !!bdev) { | ||||
| 	if (atomic_read(&mddev->openers) > !!bdev || | ||||
| 	    mddev->sync_thread || | ||||
| 	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { | ||||
| 		printk("md: %s still in use.\n",mdname(mddev)); | ||||
| 		if (did_freeze) { | ||||
| 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||||
| 			md_wakeup_thread(mddev->thread); | ||||
| 		} | ||||
| 		err = -EBUSY; | ||||
| 		goto out; | ||||
| 	} | ||||
| 	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { | ||||
| 		/* Someone opened the device since we flushed it
 | ||||
| 		 * so page cache could be dirty and it is too late | ||||
| 		 * to flush.  So abort | ||||
| 		 */ | ||||
| 		mutex_unlock(&mddev->open_mutex); | ||||
| 		return -EBUSY; | ||||
| 	} | ||||
| 	if (mddev->pers) { | ||||
| 		__md_stop_writes(mddev); | ||||
| 
 | ||||
| @ -5315,7 +5339,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) | ||||
| 		set_disk_ro(mddev->gendisk, 1); | ||||
| 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||||
| 		sysfs_notify_dirent_safe(mddev->sysfs_state); | ||||
| 		err = 0;	 | ||||
| 		err = 0; | ||||
| 	} | ||||
| out: | ||||
| 	mutex_unlock(&mddev->open_mutex); | ||||
| @ -5331,20 +5355,34 @@ static int do_md_stop(struct mddev * mddev, int mode, | ||||
| { | ||||
| 	struct gendisk *disk = mddev->gendisk; | ||||
| 	struct md_rdev *rdev; | ||||
| 	int did_freeze = 0; | ||||
| 
 | ||||
| 	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { | ||||
| 		did_freeze = 1; | ||||
| 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||||
| 		md_wakeup_thread(mddev->thread); | ||||
| 	} | ||||
| 	if (mddev->sync_thread) { | ||||
| 		set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||||
| 		/* Thread might be blocked waiting for metadata update
 | ||||
| 		 * which will now never happen */ | ||||
| 		wake_up_process(mddev->sync_thread->tsk); | ||||
| 	} | ||||
| 	mddev_unlock(mddev); | ||||
| 	wait_event(resync_wait, mddev->sync_thread == NULL); | ||||
| 	mddev_lock_nointr(mddev); | ||||
| 
 | ||||
| 	mutex_lock(&mddev->open_mutex); | ||||
| 	if (atomic_read(&mddev->openers) > !!bdev || | ||||
| 	    mddev->sysfs_active) { | ||||
| 	    mddev->sysfs_active || | ||||
| 	    mddev->sync_thread || | ||||
| 	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { | ||||
| 		printk("md: %s still in use.\n",mdname(mddev)); | ||||
| 		mutex_unlock(&mddev->open_mutex); | ||||
| 		return -EBUSY; | ||||
| 	} | ||||
| 	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { | ||||
| 		/* Someone opened the device since we flushed it
 | ||||
| 		 * so page cache could be dirty and it is too late | ||||
| 		 * to flush.  So abort | ||||
| 		 */ | ||||
| 		mutex_unlock(&mddev->open_mutex); | ||||
| 		if (did_freeze) { | ||||
| 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||||
| 			md_wakeup_thread(mddev->thread); | ||||
| 		} | ||||
| 		return -EBUSY; | ||||
| 	} | ||||
| 	if (mddev->pers) { | ||||
| @ -6551,7 +6589,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | ||||
| 				wait_event(mddev->sb_wait, | ||||
| 					   !test_bit(MD_CHANGE_DEVS, &mddev->flags) && | ||||
| 					   !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | ||||
| 				mddev_lock(mddev); | ||||
| 				mddev_lock_nointr(mddev); | ||||
| 			} | ||||
| 		} else { | ||||
| 			err = -EROFS; | ||||
| @ -7361,9 +7399,6 @@ void md_do_sync(struct md_thread *thread) | ||||
| 		mddev->curr_resync = 2; | ||||
| 
 | ||||
| 	try_again: | ||||
| 		if (kthread_should_stop()) | ||||
| 			set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||||
| 
 | ||||
| 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||||
| 			goto skip; | ||||
| 		for_each_mddev(mddev2, tmp) { | ||||
| @ -7388,7 +7423,7 @@ void md_do_sync(struct md_thread *thread) | ||||
| 				 * be caught by 'softlockup' | ||||
| 				 */ | ||||
| 				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); | ||||
| 				if (!kthread_should_stop() && | ||||
| 				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && | ||||
| 				    mddev2->curr_resync >= mddev->curr_resync) { | ||||
| 					printk(KERN_INFO "md: delaying %s of %s" | ||||
| 					       " until %s has finished (they" | ||||
| @ -7464,7 +7499,7 @@ void md_do_sync(struct md_thread *thread) | ||||
| 	last_check = 0; | ||||
| 
 | ||||
| 	if (j>2) { | ||||
| 		printk(KERN_INFO  | ||||
| 		printk(KERN_INFO | ||||
| 		       "md: resuming %s of %s from checkpoint.\n", | ||||
| 		       desc, mdname(mddev)); | ||||
| 		mddev->curr_resync = j; | ||||
| @ -7501,7 +7536,8 @@ void md_do_sync(struct md_thread *thread) | ||||
| 			sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | ||||
| 		} | ||||
| 
 | ||||
| 		while (j >= mddev->resync_max && !kthread_should_stop()) { | ||||
| 		while (j >= mddev->resync_max && | ||||
| 		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||||
| 			/* As this condition is controlled by user-space,
 | ||||
| 			 * we can block indefinitely, so use '_interruptible' | ||||
| 			 * to avoid triggering warnings. | ||||
| @ -7509,17 +7545,18 @@ void md_do_sync(struct md_thread *thread) | ||||
| 			flush_signals(current); /* just in case */ | ||||
| 			wait_event_interruptible(mddev->recovery_wait, | ||||
| 						 mddev->resync_max > j | ||||
| 						 || kthread_should_stop()); | ||||
| 						 || test_bit(MD_RECOVERY_INTR, | ||||
| 							     &mddev->recovery)); | ||||
| 		} | ||||
| 
 | ||||
| 		if (kthread_should_stop()) | ||||
| 			goto interrupted; | ||||
| 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||||
| 			break; | ||||
| 
 | ||||
| 		sectors = mddev->pers->sync_request(mddev, j, &skipped, | ||||
| 						  currspeed < speed_min(mddev)); | ||||
| 		if (sectors == 0) { | ||||
| 			set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||||
| 			goto out; | ||||
| 			break; | ||||
| 		} | ||||
| 
 | ||||
| 		if (!skipped) { /* actual IO requested */ | ||||
| @ -7556,10 +7593,8 @@ void md_do_sync(struct md_thread *thread) | ||||
| 			last_mark = next; | ||||
| 		} | ||||
| 
 | ||||
| 
 | ||||
| 		if (kthread_should_stop()) | ||||
| 			goto interrupted; | ||||
| 
 | ||||
| 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||||
| 			break; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * this loop exits only if either when we are slower than | ||||
| @ -7582,11 +7617,12 @@ void md_do_sync(struct md_thread *thread) | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); | ||||
| 	printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, | ||||
| 	       test_bit(MD_RECOVERY_INTR, &mddev->recovery) | ||||
| 	       ? "interrupted" : "done"); | ||||
| 	/*
 | ||||
| 	 * this also signals 'finished resyncing' to md_stop | ||||
| 	 */ | ||||
|  out: | ||||
| 	blk_finish_plug(&plug); | ||||
| 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); | ||||
| 
 | ||||
| @ -7640,16 +7676,6 @@ void md_do_sync(struct md_thread *thread) | ||||
| 	set_bit(MD_RECOVERY_DONE, &mddev->recovery); | ||||
| 	md_wakeup_thread(mddev->thread); | ||||
| 	return; | ||||
| 
 | ||||
|  interrupted: | ||||
| 	/*
 | ||||
| 	 * got a signal, exit. | ||||
| 	 */ | ||||
| 	printk(KERN_INFO | ||||
| 	       "md: md_do_sync() got signal ... exiting\n"); | ||||
| 	set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||||
| 	goto out; | ||||
| 
 | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(md_do_sync); | ||||
| 
 | ||||
| @ -7894,6 +7920,7 @@ void md_reap_sync_thread(struct mddev *mddev) | ||||
| 
 | ||||
| 	/* resync has finished, collect result */ | ||||
| 	md_unregister_thread(&mddev->sync_thread); | ||||
| 	wake_up(&resync_wait); | ||||
| 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && | ||||
| 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | ||||
| 		/* success...*/ | ||||
|  | ||||
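The md.c hunks above replace the kthread_should_stop() checks in md_do_sync() with tests of the MD_RECOVERY_INTR bit, and do_md_stop()/md_set_readonly() now set that bit and wait on resync_wait until the resync thread exits on its own. The fragment below is only a minimal userspace sketch of that "set an abort flag, then wait for the worker to notice" pattern; the names, timings and use of pthreads are invented for illustration and are not kernel API.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool recovery_intr;       /* stands in for MD_RECOVERY_INTR */

static void *resync_worker(void *arg)
{
	(void)arg;
	for (int chunk = 0; chunk < 100; chunk++) {
		if (atomic_load(&recovery_intr))
			break;          /* abort point, like the new tests */
		usleep(10 * 1000);      /* pretend to resync one chunk */
	}
	printf("md: resync %s.\n",
	       atomic_load(&recovery_intr) ? "interrupted" : "done");
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, resync_worker, NULL);
	usleep(50 * 1000);
	/* Like md_set_readonly() above: request an abort, then wait for
	 * the worker to observe the flag and finish cleanly. */
	atomic_store(&recovery_intr, true);
	pthread_join(tid, NULL);
	return 0;
}
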
| @ -66,7 +66,8 @@ | ||||
|  */ | ||||
| static int max_queued_requests = 1024; | ||||
| 
 | ||||
| static void allow_barrier(struct r1conf *conf); | ||||
| static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | ||||
| 			  sector_t bi_sector); | ||||
| static void lower_barrier(struct r1conf *conf); | ||||
| 
 | ||||
| static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) | ||||
| @ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data) | ||||
| } | ||||
| 
 | ||||
| #define RESYNC_BLOCK_SIZE (64*1024) | ||||
| //#define RESYNC_BLOCK_SIZE PAGE_SIZE
 | ||||
| #define RESYNC_DEPTH 32 | ||||
| #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) | ||||
| #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) | ||||
| #define RESYNC_WINDOW (2048*1024) | ||||
| #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) | ||||
| #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) | ||||
| #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) | ||||
| 
 | ||||
| static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | ||||
| { | ||||
| @ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio) | ||||
| 	struct bio *bio = r1_bio->master_bio; | ||||
| 	int done; | ||||
| 	struct r1conf *conf = r1_bio->mddev->private; | ||||
| 	sector_t start_next_window = r1_bio->start_next_window; | ||||
| 	sector_t bi_sector = bio->bi_sector; | ||||
| 
 | ||||
| 	if (bio->bi_phys_segments) { | ||||
| 		unsigned long flags; | ||||
| @ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio) | ||||
| 		bio->bi_phys_segments--; | ||||
| 		done = (bio->bi_phys_segments == 0); | ||||
| 		spin_unlock_irqrestore(&conf->device_lock, flags); | ||||
| 		/*
 | ||||
| 		 * make_request() might be waiting for | ||||
| 		 * bi_phys_segments to decrease | ||||
| 		 */ | ||||
| 		wake_up(&conf->wait_barrier); | ||||
| 	} else | ||||
| 		done = 1; | ||||
| 
 | ||||
| @ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio) | ||||
| 		 * Wake up any possible resync thread that waits for the device | ||||
| 		 * to go idle. | ||||
| 		 */ | ||||
| 		allow_barrier(conf); | ||||
| 		allow_barrier(conf, start_next_window, bi_sector); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| @ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf) | ||||
|  *    there is no normal IO happeing.  It must arrange to call | ||||
|  *    lower_barrier when the particular background IO completes. | ||||
|  */ | ||||
| #define RESYNC_DEPTH 32 | ||||
| 
 | ||||
| static void raise_barrier(struct r1conf *conf) | ||||
| { | ||||
| 	spin_lock_irq(&conf->resync_lock); | ||||
| @ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf) | ||||
| 	/* block any new IO from starting */ | ||||
| 	conf->barrier++; | ||||
| 
 | ||||
| 	/* Now wait for all pending IO to complete */ | ||||
| 	/* For these conditions we must wait:
 | ||||
| 	 * A: while the array is in frozen state | ||||
| 	 * B: while barrier >= RESYNC_DEPTH, meaning resync reach | ||||
| 	 *    the max count which allowed. | ||||
| 	 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning | ||||
| 	 *    next resync will reach to the window which normal bios are | ||||
| 	 *    handling. | ||||
| 	 */ | ||||
| 	wait_event_lock_irq(conf->wait_barrier, | ||||
| 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH, | ||||
| 			    !conf->array_frozen && | ||||
| 			    conf->barrier < RESYNC_DEPTH && | ||||
| 			    (conf->start_next_window >= | ||||
| 			     conf->next_resync + RESYNC_SECTORS), | ||||
| 			    conf->resync_lock); | ||||
| 
 | ||||
| 	spin_unlock_irq(&conf->resync_lock); | ||||
| @ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf) | ||||
| 	wake_up(&conf->wait_barrier); | ||||
| } | ||||
| 
 | ||||
| static void wait_barrier(struct r1conf *conf) | ||||
| static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) | ||||
| { | ||||
| 	bool wait = false; | ||||
| 
 | ||||
| 	if (conf->array_frozen || !bio) | ||||
| 		wait = true; | ||||
| 	else if (conf->barrier && bio_data_dir(bio) == WRITE) { | ||||
| 		if (conf->next_resync < RESYNC_WINDOW_SECTORS) | ||||
| 			wait = true; | ||||
| 		else if ((conf->next_resync - RESYNC_WINDOW_SECTORS | ||||
| 				>= bio_end_sector(bio)) || | ||||
| 			 (conf->next_resync + NEXT_NORMALIO_DISTANCE | ||||
| 				<= bio->bi_sector)) | ||||
| 			wait = false; | ||||
| 		else | ||||
| 			wait = true; | ||||
| 	} | ||||
| 
 | ||||
| 	return wait; | ||||
| } | ||||
| 
 | ||||
| static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) | ||||
| { | ||||
| 	sector_t sector = 0; | ||||
| 
 | ||||
| 	spin_lock_irq(&conf->resync_lock); | ||||
| 	if (conf->barrier) { | ||||
| 	if (need_to_wait_for_sync(conf, bio)) { | ||||
| 		conf->nr_waiting++; | ||||
| 		/* Wait for the barrier to drop.
 | ||||
| 		 * However if there are already pending | ||||
| @ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf) | ||||
| 		 * count down. | ||||
| 		 */ | ||||
| 		wait_event_lock_irq(conf->wait_barrier, | ||||
| 				    !conf->barrier || | ||||
| 				    (conf->nr_pending && | ||||
| 				    !conf->array_frozen && | ||||
| 				    (!conf->barrier || | ||||
| 				    ((conf->start_next_window < | ||||
| 				      conf->next_resync + RESYNC_SECTORS) && | ||||
| 				     current->bio_list && | ||||
| 				     !bio_list_empty(current->bio_list)), | ||||
| 				     !bio_list_empty(current->bio_list))), | ||||
| 				    conf->resync_lock); | ||||
| 		conf->nr_waiting--; | ||||
| 	} | ||||
| 
 | ||||
| 	if (bio && bio_data_dir(bio) == WRITE) { | ||||
| 		if (conf->next_resync + NEXT_NORMALIO_DISTANCE | ||||
| 		    <= bio->bi_sector) { | ||||
| 			if (conf->start_next_window == MaxSector) | ||||
| 				conf->start_next_window = | ||||
| 					conf->next_resync + | ||||
| 					NEXT_NORMALIO_DISTANCE; | ||||
| 
 | ||||
| 			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) | ||||
| 			    <= bio->bi_sector) | ||||
| 				conf->next_window_requests++; | ||||
| 			else | ||||
| 				conf->current_window_requests++; | ||||
| 		} | ||||
| 		if (bio->bi_sector >= conf->start_next_window) | ||||
| 			sector = conf->start_next_window; | ||||
| 	} | ||||
| 
 | ||||
| 	conf->nr_pending++; | ||||
| 	spin_unlock_irq(&conf->resync_lock); | ||||
| 	return sector; | ||||
| } | ||||
| 
 | ||||
| static void allow_barrier(struct r1conf *conf) | ||||
| static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | ||||
| 			  sector_t bi_sector) | ||||
| { | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 	spin_lock_irqsave(&conf->resync_lock, flags); | ||||
| 	conf->nr_pending--; | ||||
| 	if (start_next_window) { | ||||
| 		if (start_next_window == conf->start_next_window) { | ||||
| 			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE | ||||
| 			    <= bi_sector) | ||||
| 				conf->next_window_requests--; | ||||
| 			else | ||||
| 				conf->current_window_requests--; | ||||
| 		} else | ||||
| 			conf->current_window_requests--; | ||||
| 
 | ||||
| 		if (!conf->current_window_requests) { | ||||
| 			if (conf->next_window_requests) { | ||||
| 				conf->current_window_requests = | ||||
| 					conf->next_window_requests; | ||||
| 				conf->next_window_requests = 0; | ||||
| 				conf->start_next_window += | ||||
| 					NEXT_NORMALIO_DISTANCE; | ||||
| 			} else | ||||
| 				conf->start_next_window = MaxSector; | ||||
| 		} | ||||
| 	} | ||||
| 	spin_unlock_irqrestore(&conf->resync_lock, flags); | ||||
| 	wake_up(&conf->wait_barrier); | ||||
| } | ||||
| @ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra) | ||||
| { | ||||
| 	/* stop syncio and normal IO and wait for everything to
 | ||||
| 	 * go quite. | ||||
| 	 * We increment barrier and nr_waiting, and then | ||||
| 	 * wait until nr_pending match nr_queued+extra | ||||
| 	 * We wait until nr_pending match nr_queued+extra | ||||
| 	 * This is called in the context of one normal IO request | ||||
| 	 * that has failed. Thus any sync request that might be pending | ||||
| 	 * will be blocked by nr_pending, and we need to wait for | ||||
| @ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra) | ||||
| 	 * we continue. | ||||
| 	 */ | ||||
| 	spin_lock_irq(&conf->resync_lock); | ||||
| 	conf->barrier++; | ||||
| 	conf->nr_waiting++; | ||||
| 	conf->array_frozen = 1; | ||||
| 	wait_event_lock_irq_cmd(conf->wait_barrier, | ||||
| 				conf->nr_pending == conf->nr_queued+extra, | ||||
| 				conf->resync_lock, | ||||
| @ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf) | ||||
| { | ||||
| 	/* reverse the effect of the freeze */ | ||||
| 	spin_lock_irq(&conf->resync_lock); | ||||
| 	conf->barrier--; | ||||
| 	conf->nr_waiting--; | ||||
| 	conf->array_frozen = 0; | ||||
| 	wake_up(&conf->wait_barrier); | ||||
| 	spin_unlock_irq(&conf->resync_lock); | ||||
| } | ||||
| @ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | ||||
| 	int first_clone; | ||||
| 	int sectors_handled; | ||||
| 	int max_sectors; | ||||
| 	sector_t start_next_window; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Register the new request and wait if the reconstruction | ||||
| @ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | ||||
| 		finish_wait(&conf->wait_barrier, &w); | ||||
| 	} | ||||
| 
 | ||||
| 	wait_barrier(conf); | ||||
| 	start_next_window = wait_barrier(conf, bio); | ||||
| 
 | ||||
| 	bitmap = mddev->bitmap; | ||||
| 
 | ||||
| @ -1163,6 +1247,7 @@ read_again: | ||||
| 
 | ||||
| 	disks = conf->raid_disks * 2; | ||||
|  retry_write: | ||||
| 	r1_bio->start_next_window = start_next_window; | ||||
| 	blocked_rdev = NULL; | ||||
| 	rcu_read_lock(); | ||||
| 	max_sectors = r1_bio->sectors; | ||||
| @ -1231,14 +1316,24 @@ read_again: | ||||
| 	if (unlikely(blocked_rdev)) { | ||||
| 		/* Wait for this device to become unblocked */ | ||||
| 		int j; | ||||
| 		sector_t old = start_next_window; | ||||
| 
 | ||||
| 		for (j = 0; j < i; j++) | ||||
| 			if (r1_bio->bios[j]) | ||||
| 				rdev_dec_pending(conf->mirrors[j].rdev, mddev); | ||||
| 		r1_bio->state = 0; | ||||
| 		allow_barrier(conf); | ||||
| 		allow_barrier(conf, start_next_window, bio->bi_sector); | ||||
| 		md_wait_for_blocked_rdev(blocked_rdev, mddev); | ||||
| 		wait_barrier(conf); | ||||
| 		start_next_window = wait_barrier(conf, bio); | ||||
| 		/*
 | ||||
| 		 * We must make sure the multi r1bios of bio have | ||||
| 		 * the same value of bi_phys_segments | ||||
| 		 */ | ||||
| 		if (bio->bi_phys_segments && old && | ||||
| 		    old != start_next_window) | ||||
| 			/* Wait for the former r1bio(s) to complete */ | ||||
| 			wait_event(conf->wait_barrier, | ||||
| 				   bio->bi_phys_segments == 1); | ||||
| 		goto retry_write; | ||||
| 	} | ||||
| 
 | ||||
| @ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf) | ||||
| 
 | ||||
| static void close_sync(struct r1conf *conf) | ||||
| { | ||||
| 	wait_barrier(conf); | ||||
| 	allow_barrier(conf); | ||||
| 	wait_barrier(conf, NULL); | ||||
| 	allow_barrier(conf, 0, 0); | ||||
| 
 | ||||
| 	mempool_destroy(conf->r1buf_pool); | ||||
| 	conf->r1buf_pool = NULL; | ||||
| 
 | ||||
| 	conf->next_resync = 0; | ||||
| 	conf->start_next_window = MaxSector; | ||||
| } | ||||
| 
 | ||||
| static int raid1_spare_active(struct mddev *mddev) | ||||
| @ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) | ||||
| 	conf->pending_count = 0; | ||||
| 	conf->recovery_disabled = mddev->recovery_disabled - 1; | ||||
| 
 | ||||
| 	conf->start_next_window = MaxSector; | ||||
| 	conf->current_window_requests = conf->next_window_requests = 0; | ||||
| 
 | ||||
| 	err = -EIO; | ||||
| 	for (i = 0; i < conf->raid_disks * 2; i++) { | ||||
| 
 | ||||
| @ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev) | ||||
| 			   atomic_read(&bitmap->behind_writes) == 0); | ||||
| 	} | ||||
| 
 | ||||
| 	raise_barrier(conf); | ||||
| 	lower_barrier(conf); | ||||
| 	freeze_array(conf, 0); | ||||
| 	unfreeze_array(conf); | ||||
| 
 | ||||
| 	md_unregister_thread(&mddev->thread); | ||||
| 	if (conf->r1bio_pool) | ||||
| @ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state) | ||||
| 		wake_up(&conf->wait_barrier); | ||||
| 		break; | ||||
| 	case 1: | ||||
| 		raise_barrier(conf); | ||||
| 		freeze_array(conf, 0); | ||||
| 		break; | ||||
| 	case 0: | ||||
| 		lower_barrier(conf); | ||||
| 		unfreeze_array(conf); | ||||
| 		break; | ||||
| 	} | ||||
| } | ||||
| @ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev) | ||||
| 		mddev->new_chunk_sectors = 0; | ||||
| 		conf = setup_conf(mddev); | ||||
| 		if (!IS_ERR(conf)) | ||||
| 			conf->barrier = 1; | ||||
| 			/* Array must appear to be quiesced */ | ||||
| 			conf->array_frozen = 1; | ||||
| 		return conf; | ||||
| 	} | ||||
| 	return ERR_PTR(-EINVAL); | ||||
|  | ||||
| @ -41,6 +41,19 @@ struct r1conf { | ||||
| 	 */ | ||||
| 	sector_t		next_resync; | ||||
| 
 | ||||
| 	/* When raid1 starts resync, we divide array into four partitions
 | ||||
| 	 * |---------|--------------|---------------------|-------------| | ||||
| 	 *        next_resync   start_next_window       end_window | ||||
| 	 * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE | ||||
| 	 * end_window = start_next_window + NEXT_NORMALIO_DISTANCE | ||||
| 	 * current_window_requests means the count of normalIO between | ||||
| 	 *   start_next_window and end_window. | ||||
| 	 * next_window_requests means the count of normalIO after end_window. | ||||
| 	 * */ | ||||
| 	sector_t		start_next_window; | ||||
| 	int			current_window_requests; | ||||
| 	int			next_window_requests; | ||||
| 
 | ||||
| 	spinlock_t		device_lock; | ||||
| 
 | ||||
| 	/* list of 'struct r1bio' that need to be processed by raid1d,
 | ||||
| @ -65,6 +78,7 @@ struct r1conf { | ||||
| 	int			nr_waiting; | ||||
| 	int			nr_queued; | ||||
| 	int			barrier; | ||||
| 	int			array_frozen; | ||||
| 
 | ||||
| 	/* Set to 1 if a full sync is needed, (fresh device added).
 | ||||
| 	 * Cleared when a sync completes. | ||||
| @ -111,6 +125,7 @@ struct r1bio { | ||||
| 						 * in this BehindIO request | ||||
| 						 */ | ||||
| 	sector_t		sector; | ||||
| 	sector_t		start_next_window; | ||||
| 	int			sectors; | ||||
| 	unsigned long		state; | ||||
| 	struct mddev		*mddev; | ||||
|  | ||||
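The new r1conf fields above implement the window described in that comment: while resync is active, writes that are well behind next_resync or far enough ahead of it may bypass the barrier. The function below is a plain-C restatement of the gate in need_to_wait_for_sync() from the raid1.c hunk earlier, using the same constants but standalone types and no locking, so it is an illustration rather than the driver code.

#include <stdbool.h>
#include <stdint.h>

#define RESYNC_BLOCK_SIZE	(64 * 1024)
#define RESYNC_DEPTH		32
#define RESYNC_WINDOW		(RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS	(RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE	(3 * RESYNC_WINDOW_SECTORS)

/* Must a WRITE wait on the resync barrier?  Mirrors the conditions in
 * need_to_wait_for_sync(): only writes inside the contested region
 * around next_resync have to wait. */
bool write_must_wait(bool array_frozen, int barrier,
		     uint64_t next_resync,
		     uint64_t bio_sector, uint64_t bio_end_sector)
{
	if (array_frozen)
		return true;		/* array is frozen: everything waits */
	if (!barrier)
		return false;		/* no resync in flight */
	if (next_resync < RESYNC_WINDOW_SECTORS)
		return true;		/* resync has only just started */
	if (next_resync - RESYNC_WINDOW_SECTORS >= bio_end_sector)
		return false;		/* write is safely behind resync */
	if (next_resync + NEXT_NORMALIO_DISTANCE <= bio_sector)
		return false;		/* write is safely ahead of resync */
	return true;			/* inside the window: must wait */
}
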
| @ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | ||||
| 		set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||||
| 		md_wakeup_thread(mddev->thread); | ||||
| 		wait_event(mddev->sb_wait, mddev->flags == 0 || | ||||
| 			   kthread_should_stop()); | ||||
| 			   test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | ||||
| 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||||
| 			allow_barrier(conf); | ||||
| 			return sectors_done; | ||||
| 		} | ||||
| 		conf->reshape_safe = mddev->reshape_position; | ||||
| 		allow_barrier(conf); | ||||
| 	} | ||||
|  | ||||
| @ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) | ||||
| 	return &conf->stripe_hashtbl[hash]; | ||||
| } | ||||
| 
 | ||||
| static inline int stripe_hash_locks_hash(sector_t sect) | ||||
| { | ||||
| 	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; | ||||
| } | ||||
| 
 | ||||
| static inline void lock_device_hash_lock(struct r5conf *conf, int hash) | ||||
| { | ||||
| 	spin_lock_irq(conf->hash_locks + hash); | ||||
| 	spin_lock(&conf->device_lock); | ||||
| } | ||||
| 
 | ||||
| static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) | ||||
| { | ||||
| 	spin_unlock(&conf->device_lock); | ||||
| 	spin_unlock_irq(conf->hash_locks + hash); | ||||
| } | ||||
| 
 | ||||
| static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) | ||||
| { | ||||
| 	int i; | ||||
| 	local_irq_disable(); | ||||
| 	spin_lock(conf->hash_locks); | ||||
| 	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) | ||||
| 		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); | ||||
| 	spin_lock(&conf->device_lock); | ||||
| } | ||||
| 
 | ||||
| static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) | ||||
| { | ||||
| 	int i; | ||||
| 	spin_unlock(&conf->device_lock); | ||||
| 	for (i = NR_STRIPE_HASH_LOCKS; i; i--) | ||||
| 		spin_unlock(conf->hash_locks + i - 1); | ||||
| 	local_irq_enable(); | ||||
| } | ||||
| 
 | ||||
| /* bio's attached to a stripe+device for I/O are linked together in bi_sector
 | ||||
|  * order without overlap.  There may be several bio's per stripe+device, and | ||||
|  * a bio could span several devices. | ||||
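The helpers added in the hunk above hash a stripe's sector into one of NR_STRIPE_HASH_LOCKS buckets, so the common get/release paths contend on a single hash lock (and a per-bucket inactive list) instead of on the global device_lock, which is now nested inside only for the rare cache-wide operations. Below is a hedged userspace sketch of that sharded-lock idea using pthread mutexes; the constant values are invented and none of this is the raid5 code itself.

#include <pthread.h>
#include <stdint.h>

#define STRIPE_SHIFT		3	/* illustrative stripe granularity */
#define NR_STRIPE_HASH_LOCKS	8
#define STRIPE_HASH_LOCKS_MASK	(NR_STRIPE_HASH_LOCKS - 1)

/* One lock per bucket; the real code also keeps one inactive list per
 * bucket so a free stripe can be found under just that bucket's lock. */
static pthread_mutex_t hash_locks[NR_STRIPE_HASH_LOCKS];

void init_buckets(void)
{
	for (int i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
		pthread_mutex_init(&hash_locks[i], NULL);
}

int stripe_hash_bucket(uint64_t sector)
{
	return (sector >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

/* Fast path: work on one stripe under its bucket lock only. */
void lock_stripe_bucket(uint64_t sector)
{
	pthread_mutex_lock(&hash_locks[stripe_hash_bucket(sector)]);
}

void unlock_stripe_bucket(uint64_t sector)
{
	pthread_mutex_unlock(&hash_locks[stripe_hash_bucket(sector)]);
}

/* Slow path: cache-wide operations take every bucket lock in order,
 * like lock_all_device_hash_locks_irq() above. */
void lock_all_buckets(void)
{
	for (int i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
		pthread_mutex_lock(&hash_locks[i]);
}

void unlock_all_buckets(void)
{
	for (int i = NR_STRIPE_HASH_LOCKS; i; i--)
		pthread_mutex_unlock(&hash_locks[i - 1]);
}
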
| @ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) | ||||
| static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, | ||||
| 			      struct list_head *temp_inactive_list) | ||||
| { | ||||
| 	BUG_ON(!list_empty(&sh->lru)); | ||||
| 	BUG_ON(atomic_read(&conf->active_stripes)==0); | ||||
| @ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) | ||||
| 			    < IO_THRESHOLD) | ||||
| 				md_wakeup_thread(conf->mddev->thread); | ||||
| 		atomic_dec(&conf->active_stripes); | ||||
| 		if (!test_bit(STRIPE_EXPANDING, &sh->state)) { | ||||
| 			list_add_tail(&sh->lru, &conf->inactive_list); | ||||
| 			wake_up(&conf->wait_for_stripe); | ||||
| 			if (conf->retry_read_aligned) | ||||
| 				md_wakeup_thread(conf->mddev->thread); | ||||
| 		} | ||||
| 		if (!test_bit(STRIPE_EXPANDING, &sh->state)) | ||||
| 			list_add_tail(&sh->lru, temp_inactive_list); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) | ||||
| static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, | ||||
| 			     struct list_head *temp_inactive_list) | ||||
| { | ||||
| 	if (atomic_dec_and_test(&sh->count)) | ||||
| 		do_release_stripe(conf, sh); | ||||
| 		do_release_stripe(conf, sh, temp_inactive_list); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list | ||||
|  * | ||||
|  * Be careful: Only one task can add/delete stripes from temp_inactive_list at | ||||
|  * given time. Adding stripes only takes device lock, while deleting stripes | ||||
|  * only takes hash lock. | ||||
|  */ | ||||
| static void release_inactive_stripe_list(struct r5conf *conf, | ||||
| 					 struct list_head *temp_inactive_list, | ||||
| 					 int hash) | ||||
| { | ||||
| 	int size; | ||||
| 	bool do_wakeup = false; | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 	if (hash == NR_STRIPE_HASH_LOCKS) { | ||||
| 		size = NR_STRIPE_HASH_LOCKS; | ||||
| 		hash = NR_STRIPE_HASH_LOCKS - 1; | ||||
| 	} else | ||||
| 		size = 1; | ||||
| 	while (size) { | ||||
| 		struct list_head *list = &temp_inactive_list[size - 1]; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * We don't hold any lock here yet, get_active_stripe() might | ||||
| 		 * remove stripes from the list | ||||
| 		 */ | ||||
| 		if (!list_empty_careful(list)) { | ||||
| 			spin_lock_irqsave(conf->hash_locks + hash, flags); | ||||
| 			if (list_empty(conf->inactive_list + hash) && | ||||
| 			    !list_empty(list)) | ||||
| 				atomic_dec(&conf->empty_inactive_list_nr); | ||||
| 			list_splice_tail_init(list, conf->inactive_list + hash); | ||||
| 			do_wakeup = true; | ||||
| 			spin_unlock_irqrestore(conf->hash_locks + hash, flags); | ||||
| 		} | ||||
| 		size--; | ||||
| 		hash--; | ||||
| 	} | ||||
| 
 | ||||
| 	if (do_wakeup) { | ||||
| 		wake_up(&conf->wait_for_stripe); | ||||
| 		if (conf->retry_read_aligned) | ||||
| 			md_wakeup_thread(conf->mddev->thread); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /* should hold conf->device_lock already */ | ||||
| static int release_stripe_list(struct r5conf *conf) | ||||
| static int release_stripe_list(struct r5conf *conf, | ||||
| 			       struct list_head *temp_inactive_list) | ||||
| { | ||||
| 	struct stripe_head *sh; | ||||
| 	int count = 0; | ||||
| @ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf) | ||||
| 	head = llist_del_all(&conf->released_stripes); | ||||
| 	head = llist_reverse_order(head); | ||||
| 	while (head) { | ||||
| 		int hash; | ||||
| 
 | ||||
| 		sh = llist_entry(head, struct stripe_head, release_list); | ||||
| 		head = llist_next(head); | ||||
| 		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ | ||||
| @ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf) | ||||
| 		 * again, the count is always > 1. This is true for | ||||
| 		 * STRIPE_ON_UNPLUG_LIST bit too. | ||||
| 		 */ | ||||
| 		__release_stripe(conf, sh); | ||||
| 		hash = sh->hash_lock_index; | ||||
| 		__release_stripe(conf, sh, &temp_inactive_list[hash]); | ||||
| 		count++; | ||||
| 	} | ||||
| 
 | ||||
| @ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh) | ||||
| { | ||||
| 	struct r5conf *conf = sh->raid_conf; | ||||
| 	unsigned long flags; | ||||
| 	struct list_head list; | ||||
| 	int hash; | ||||
| 	bool wakeup; | ||||
| 
 | ||||
| 	if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) | ||||
| 	if (unlikely(!conf->mddev->thread) || | ||||
| 		test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) | ||||
| 		goto slow_path; | ||||
| 	wakeup = llist_add(&sh->release_list, &conf->released_stripes); | ||||
| 	if (wakeup) | ||||
| @ -336,8 +424,11 @@ slow_path: | ||||
| 	local_irq_save(flags); | ||||
| 	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ | ||||
| 	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { | ||||
| 		do_release_stripe(conf, sh); | ||||
| 		INIT_LIST_HEAD(&list); | ||||
| 		hash = sh->hash_lock_index; | ||||
| 		do_release_stripe(conf, sh, &list); | ||||
| 		spin_unlock(&conf->device_lock); | ||||
| 		release_inactive_stripe_list(conf, &list, hash); | ||||
| 	} | ||||
| 	local_irq_restore(flags); | ||||
| } | ||||
| @ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) | ||||
| 
 | ||||
| 
 | ||||
| /* find an idle stripe, make sure it is unhashed, and return it. */ | ||||
| static struct stripe_head *get_free_stripe(struct r5conf *conf) | ||||
| static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) | ||||
| { | ||||
| 	struct stripe_head *sh = NULL; | ||||
| 	struct list_head *first; | ||||
| 
 | ||||
| 	if (list_empty(&conf->inactive_list)) | ||||
| 	if (list_empty(conf->inactive_list + hash)) | ||||
| 		goto out; | ||||
| 	first = conf->inactive_list.next; | ||||
| 	first = (conf->inactive_list + hash)->next; | ||||
| 	sh = list_entry(first, struct stripe_head, lru); | ||||
| 	list_del_init(first); | ||||
| 	remove_hash(sh); | ||||
| 	atomic_inc(&conf->active_stripes); | ||||
| 	BUG_ON(hash != sh->hash_lock_index); | ||||
| 	if (list_empty(conf->inactive_list + hash)) | ||||
| 		atomic_inc(&conf->empty_inactive_list_nr); | ||||
| out: | ||||
| 	return sh; | ||||
| } | ||||
| @ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, | ||||
| static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | ||||
| { | ||||
| 	struct r5conf *conf = sh->raid_conf; | ||||
| 	int i; | ||||
| 	int i, seq; | ||||
| 
 | ||||
| 	BUG_ON(atomic_read(&sh->count) != 0); | ||||
| 	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); | ||||
| @ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | ||||
| 		(unsigned long long)sh->sector); | ||||
| 
 | ||||
| 	remove_hash(sh); | ||||
| 
 | ||||
| retry: | ||||
| 	seq = read_seqcount_begin(&conf->gen_lock); | ||||
| 	sh->generation = conf->generation - previous; | ||||
| 	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; | ||||
| 	sh->sector = sector; | ||||
| @ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | ||||
| 		dev->flags = 0; | ||||
| 		raid5_build_block(sh, i, previous); | ||||
| 	} | ||||
| 	if (read_seqcount_retry(&conf->gen_lock, seq)) | ||||
| 		goto retry; | ||||
| 	insert_hash(conf, sh); | ||||
| 	sh->cpu = smp_processor_id(); | ||||
| } | ||||
| @ -552,29 +649,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | ||||
| 		  int previous, int noblock, int noquiesce) | ||||
| { | ||||
| 	struct stripe_head *sh; | ||||
| 	int hash = stripe_hash_locks_hash(sector); | ||||
| 
 | ||||
| 	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); | ||||
| 
 | ||||
| 	spin_lock_irq(&conf->device_lock); | ||||
| 	spin_lock_irq(conf->hash_locks + hash); | ||||
| 
 | ||||
| 	do { | ||||
| 		wait_event_lock_irq(conf->wait_for_stripe, | ||||
| 				    conf->quiesce == 0 || noquiesce, | ||||
| 				    conf->device_lock); | ||||
| 				    *(conf->hash_locks + hash)); | ||||
| 		sh = __find_stripe(conf, sector, conf->generation - previous); | ||||
| 		if (!sh) { | ||||
| 			if (!conf->inactive_blocked) | ||||
| 				sh = get_free_stripe(conf); | ||||
| 				sh = get_free_stripe(conf, hash); | ||||
| 			if (noblock && sh == NULL) | ||||
| 				break; | ||||
| 			if (!sh) { | ||||
| 				conf->inactive_blocked = 1; | ||||
| 				wait_event_lock_irq(conf->wait_for_stripe, | ||||
| 						    !list_empty(&conf->inactive_list) && | ||||
| 						    (atomic_read(&conf->active_stripes) | ||||
| 						     < (conf->max_nr_stripes *3/4) | ||||
| 						     || !conf->inactive_blocked), | ||||
| 						    conf->device_lock); | ||||
| 				wait_event_lock_irq( | ||||
| 					conf->wait_for_stripe, | ||||
| 					!list_empty(conf->inactive_list + hash) && | ||||
| 					(atomic_read(&conf->active_stripes) | ||||
| 					 < (conf->max_nr_stripes * 3 / 4) | ||||
| 					 || !conf->inactive_blocked), | ||||
| 					*(conf->hash_locks + hash)); | ||||
| 				conf->inactive_blocked = 0; | ||||
| 			} else | ||||
| 				init_stripe(sh, sector, previous); | ||||
| @ -585,9 +684,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | ||||
| 				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) | ||||
| 				    && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); | ||||
| 			} else { | ||||
| 				spin_lock(&conf->device_lock); | ||||
| 				if (!test_bit(STRIPE_HANDLE, &sh->state)) | ||||
| 					atomic_inc(&conf->active_stripes); | ||||
| 				if (list_empty(&sh->lru) && | ||||
| 				    !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) && | ||||
| 				    !test_bit(STRIPE_EXPANDING, &sh->state)) | ||||
| 					BUG(); | ||||
| 				list_del_init(&sh->lru); | ||||
| @ -595,6 +696,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | ||||
| 					sh->group->stripes_cnt--; | ||||
| 					sh->group = NULL; | ||||
| 				} | ||||
| 				spin_unlock(&conf->device_lock); | ||||
| 			} | ||||
| 		} | ||||
| 	} while (sh == NULL); | ||||
| @ -602,7 +704,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | ||||
| 	if (sh) | ||||
| 		atomic_inc(&sh->count); | ||||
| 
 | ||||
| 	spin_unlock_irq(&conf->device_lock); | ||||
| 	spin_unlock_irq(conf->hash_locks + hash); | ||||
| 	return sh; | ||||
| } | ||||
| 
 | ||||
| @ -758,7 +860,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | ||||
| 				bi->bi_sector = (sh->sector | ||||
| 						 + rdev->data_offset); | ||||
| 			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | ||||
| 				bi->bi_rw |= REQ_FLUSH; | ||||
| 				bi->bi_rw |= REQ_NOMERGE; | ||||
| 
 | ||||
| 			bi->bi_vcnt = 1; | ||||
| 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||||
| @ -1582,7 +1684,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | ||||
| 	put_cpu(); | ||||
| } | ||||
| 
 | ||||
| static int grow_one_stripe(struct r5conf *conf) | ||||
| static int grow_one_stripe(struct r5conf *conf, int hash) | ||||
| { | ||||
| 	struct stripe_head *sh; | ||||
| 	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); | ||||
| @ -1598,6 +1700,7 @@ static int grow_one_stripe(struct r5conf *conf) | ||||
| 		kmem_cache_free(conf->slab_cache, sh); | ||||
| 		return 0; | ||||
| 	} | ||||
| 	sh->hash_lock_index = hash; | ||||
| 	/* we just created an active stripe so... */ | ||||
| 	atomic_set(&sh->count, 1); | ||||
| 	atomic_inc(&conf->active_stripes); | ||||
| @ -1610,6 +1713,7 @@ static int grow_stripes(struct r5conf *conf, int num) | ||||
| { | ||||
| 	struct kmem_cache *sc; | ||||
| 	int devs = max(conf->raid_disks, conf->previous_raid_disks); | ||||
| 	int hash; | ||||
| 
 | ||||
| 	if (conf->mddev->gendisk) | ||||
| 		sprintf(conf->cache_name[0], | ||||
| @ -1627,9 +1731,13 @@ static int grow_stripes(struct r5conf *conf, int num) | ||||
| 		return 1; | ||||
| 	conf->slab_cache = sc; | ||||
| 	conf->pool_size = devs; | ||||
| 	while (num--) | ||||
| 		if (!grow_one_stripe(conf)) | ||||
| 	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; | ||||
| 	while (num--) { | ||||
| 		if (!grow_one_stripe(conf, hash)) | ||||
| 			return 1; | ||||
| 		conf->max_nr_stripes++; | ||||
| 		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||||
| 	} | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| @ -1687,6 +1795,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) | ||||
| 	int err; | ||||
| 	struct kmem_cache *sc; | ||||
| 	int i; | ||||
| 	int hash, cnt; | ||||
| 
 | ||||
| 	if (newsize <= conf->pool_size) | ||||
| 		return 0; /* never bother to shrink */ | ||||
| @ -1726,19 +1835,29 @@ static int resize_stripes(struct r5conf *conf, int newsize) | ||||
| 	 * OK, we have enough stripes, start collecting inactive | ||||
| 	 * stripes and copying them over | ||||
| 	 */ | ||||
| 	hash = 0; | ||||
| 	cnt = 0; | ||||
| 	list_for_each_entry(nsh, &newstripes, lru) { | ||||
| 		spin_lock_irq(&conf->device_lock); | ||||
| 		wait_event_lock_irq(conf->wait_for_stripe, | ||||
| 				    !list_empty(&conf->inactive_list), | ||||
| 				    conf->device_lock); | ||||
| 		osh = get_free_stripe(conf); | ||||
| 		spin_unlock_irq(&conf->device_lock); | ||||
| 		lock_device_hash_lock(conf, hash); | ||||
| 		wait_event_cmd(conf->wait_for_stripe, | ||||
| 				    !list_empty(conf->inactive_list + hash), | ||||
| 				    unlock_device_hash_lock(conf, hash), | ||||
| 				    lock_device_hash_lock(conf, hash)); | ||||
| 		osh = get_free_stripe(conf, hash); | ||||
| 		unlock_device_hash_lock(conf, hash); | ||||
| 		atomic_set(&nsh->count, 1); | ||||
| 		for(i=0; i<conf->pool_size; i++) | ||||
| 			nsh->dev[i].page = osh->dev[i].page; | ||||
| 		for( ; i<newsize; i++) | ||||
| 			nsh->dev[i].page = NULL; | ||||
| 		nsh->hash_lock_index = hash; | ||||
| 		kmem_cache_free(conf->slab_cache, osh); | ||||
| 		cnt++; | ||||
| 		if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + | ||||
| 		    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { | ||||
| 			hash++; | ||||
| 			cnt = 0; | ||||
| 		} | ||||
| 	} | ||||
| 	kmem_cache_destroy(conf->slab_cache); | ||||
| 
 | ||||
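resize_stripes() above is one user of the new wait_event_cmd() helper from this series: its cmd arguments let the caller drop the device hash lock while sleeping and retake it before the condition is rechecked. In userspace the same "release the lock across the sleep" behaviour falls out of a condition variable; the sketch below uses invented names and is only an analogy, not the kernel interface.

#include <pthread.h>
#include <stdbool.h>

/* Stand-ins for one device hash lock and its inactive-stripe state. */
static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wait_for_stripe = PTHREAD_COND_INITIALIZER;
static bool inactive_available;

/* Waiter: call with hash_lock held.  pthread_cond_wait() drops the lock
 * while sleeping and retakes it before rechecking the condition, the
 * same dance the unlock/lock cmd arguments perform by hand above. */
void wait_for_inactive(void)
{
	while (!inactive_available)
		pthread_cond_wait(&wait_for_stripe, &hash_lock);
}

/* Waker: whoever returns a stripe to this bucket's inactive list. */
void release_inactive(void)
{
	pthread_mutex_lock(&hash_lock);
	inactive_available = true;
	pthread_cond_signal(&wait_for_stripe);
	pthread_mutex_unlock(&hash_lock);
}
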
| @ -1797,13 +1916,13 @@ static int resize_stripes(struct r5conf *conf, int newsize) | ||||
| 	return err; | ||||
| } | ||||
| 
 | ||||
| static int drop_one_stripe(struct r5conf *conf) | ||||
| static int drop_one_stripe(struct r5conf *conf, int hash) | ||||
| { | ||||
| 	struct stripe_head *sh; | ||||
| 
 | ||||
| 	spin_lock_irq(&conf->device_lock); | ||||
| 	sh = get_free_stripe(conf); | ||||
| 	spin_unlock_irq(&conf->device_lock); | ||||
| 	spin_lock_irq(conf->hash_locks + hash); | ||||
| 	sh = get_free_stripe(conf, hash); | ||||
| 	spin_unlock_irq(conf->hash_locks + hash); | ||||
| 	if (!sh) | ||||
| 		return 0; | ||||
| 	BUG_ON(atomic_read(&sh->count)); | ||||
| @ -1815,8 +1934,10 @@ static int drop_one_stripe(struct r5conf *conf) | ||||
| 
 | ||||
| static void shrink_stripes(struct r5conf *conf) | ||||
| { | ||||
| 	while (drop_one_stripe(conf)) | ||||
| 		; | ||||
| 	int hash; | ||||
| 	for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) | ||||
| 		while (drop_one_stripe(conf, hash)) | ||||
| 			; | ||||
| 
 | ||||
| 	if (conf->slab_cache) | ||||
| 		kmem_cache_destroy(conf->slab_cache); | ||||
| @ -1921,6 +2042,9 @@ static void raid5_end_read_request(struct bio * bi, int error) | ||||
| 			       mdname(conf->mddev), bdn); | ||||
| 		else | ||||
| 			retry = 1; | ||||
| 		if (set_bad && test_bit(In_sync, &rdev->flags) | ||||
| 		    && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | ||||
| 			retry = 1; | ||||
| 		if (retry) | ||||
| 			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { | ||||
| 				set_bit(R5_ReadError, &sh->dev[i].flags); | ||||
| @ -3900,7 +4024,8 @@ static void raid5_activate_delayed(struct r5conf *conf) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void activate_bit_delay(struct r5conf *conf) | ||||
| static void activate_bit_delay(struct r5conf *conf, | ||||
| 	struct list_head *temp_inactive_list) | ||||
| { | ||||
| 	/* device_lock is held */ | ||||
| 	struct list_head head; | ||||
| @ -3908,9 +4033,11 @@ static void activate_bit_delay(struct r5conf *conf) | ||||
| 	list_del_init(&conf->bitmap_list); | ||||
| 	while (!list_empty(&head)) { | ||||
| 		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); | ||||
| 		int hash; | ||||
| 		list_del_init(&sh->lru); | ||||
| 		atomic_inc(&sh->count); | ||||
| 		__release_stripe(conf, sh); | ||||
| 		hash = sh->hash_lock_index; | ||||
| 		__release_stripe(conf, sh, &temp_inactive_list[hash]); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| @ -3926,7 +4053,7 @@ int md_raid5_congested(struct mddev *mddev, int bits) | ||||
| 		return 1; | ||||
| 	if (conf->quiesce) | ||||
| 		return 1; | ||||
| 	if (list_empty_careful(&conf->inactive_list)) | ||||
| 	if (atomic_read(&conf->empty_inactive_list_nr)) | ||||
| 		return 1; | ||||
| 
 | ||||
| 	return 0; | ||||
| @ -4256,6 +4383,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) | ||||
| struct raid5_plug_cb { | ||||
| 	struct blk_plug_cb	cb; | ||||
| 	struct list_head	list; | ||||
| 	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS]; | ||||
| }; | ||||
| 
 | ||||
| static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | ||||
| @ -4266,6 +4394,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | ||||
| 	struct mddev *mddev = cb->cb.data; | ||||
| 	struct r5conf *conf = mddev->private; | ||||
| 	int cnt = 0; | ||||
| 	int hash; | ||||
| 
 | ||||
| 	if (cb->list.next && !list_empty(&cb->list)) { | ||||
| 		spin_lock_irq(&conf->device_lock); | ||||
| @ -4283,11 +4412,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | ||||
| 			 * STRIPE_ON_RELEASE_LIST could be set here. In that | ||||
| 			 * case, the count is always > 1 here | ||||
| 			 */ | ||||
| 			__release_stripe(conf, sh); | ||||
| 			hash = sh->hash_lock_index; | ||||
| 			__release_stripe(conf, sh, &cb->temp_inactive_list[hash]); | ||||
| 			cnt++; | ||||
| 		} | ||||
| 		spin_unlock_irq(&conf->device_lock); | ||||
| 	} | ||||
| 	release_inactive_stripe_list(conf, cb->temp_inactive_list, | ||||
| 				     NR_STRIPE_HASH_LOCKS); | ||||
| 	if (mddev->queue) | ||||
| 		trace_block_unplug(mddev->queue, cnt, !from_schedule); | ||||
| 	kfree(cb); | ||||
| @ -4308,8 +4440,12 @@ static void release_stripe_plug(struct mddev *mddev, | ||||
| 
 | ||||
| 	cb = container_of(blk_cb, struct raid5_plug_cb, cb); | ||||
| 
 | ||||
| 	if (cb->list.next == NULL) | ||||
| 	if (cb->list.next == NULL) { | ||||
| 		int i; | ||||
| 		INIT_LIST_HEAD(&cb->list); | ||||
| 		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||||
| 			INIT_LIST_HEAD(cb->temp_inactive_list + i); | ||||
| 	} | ||||
| 
 | ||||
| 	if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) | ||||
| 		list_add_tail(&sh->lru, &cb->list); | ||||
| @ -4692,14 +4828,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | ||||
| 	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { | ||||
| 		/* Cannot proceed until we've updated the superblock... */ | ||||
| 		wait_event(conf->wait_for_overlap, | ||||
| 			   atomic_read(&conf->reshape_stripes)==0); | ||||
| 			   atomic_read(&conf->reshape_stripes)==0 | ||||
| 			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | ||||
| 		if (atomic_read(&conf->reshape_stripes) != 0) | ||||
| 			return 0; | ||||
| 		mddev->reshape_position = conf->reshape_progress; | ||||
| 		mddev->curr_resync_completed = sector_nr; | ||||
| 		conf->reshape_checkpoint = jiffies; | ||||
| 		set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||||
| 		md_wakeup_thread(mddev->thread); | ||||
| 		wait_event(mddev->sb_wait, mddev->flags == 0 || | ||||
| 			   kthread_should_stop()); | ||||
| 			   test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | ||||
| 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||||
| 			return 0; | ||||
| 		spin_lock_irq(&conf->device_lock); | ||||
| 		conf->reshape_safe = mddev->reshape_position; | ||||
| 		spin_unlock_irq(&conf->device_lock); | ||||
| @ -4782,7 +4923,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | ||||
| 	    >= mddev->resync_max - mddev->curr_resync_completed) { | ||||
| 		/* Cannot proceed until we've updated the superblock... */ | ||||
| 		wait_event(conf->wait_for_overlap, | ||||
| 			   atomic_read(&conf->reshape_stripes) == 0); | ||||
| 			   atomic_read(&conf->reshape_stripes) == 0 | ||||
| 			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | ||||
| 		if (atomic_read(&conf->reshape_stripes) != 0) | ||||
| 			goto ret; | ||||
| 		mddev->reshape_position = conf->reshape_progress; | ||||
| 		mddev->curr_resync_completed = sector_nr; | ||||
| 		conf->reshape_checkpoint = jiffies; | ||||
| @ -4790,13 +4934,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | ||||
| 		md_wakeup_thread(mddev->thread); | ||||
| 		wait_event(mddev->sb_wait, | ||||
| 			   !test_bit(MD_CHANGE_DEVS, &mddev->flags) | ||||
| 			   || kthread_should_stop()); | ||||
| 			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | ||||
| 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||||
| 			goto ret; | ||||
| 		spin_lock_irq(&conf->device_lock); | ||||
| 		conf->reshape_safe = mddev->reshape_position; | ||||
| 		spin_unlock_irq(&conf->device_lock); | ||||
| 		wake_up(&conf->wait_for_overlap); | ||||
| 		sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | ||||
| 	} | ||||
| ret: | ||||
| 	return reshape_sectors; | ||||
| } | ||||
| 
 | ||||
| @ -4954,27 +5101,45 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | ||||
| } | ||||
| 
 | ||||
| static int handle_active_stripes(struct r5conf *conf, int group, | ||||
| 				 struct r5worker *worker) | ||||
| 				 struct r5worker *worker, | ||||
| 				 struct list_head *temp_inactive_list) | ||||
| { | ||||
| 	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; | ||||
| 	int i, batch_size = 0; | ||||
| 	int i, batch_size = 0, hash; | ||||
| 	bool release_inactive = false; | ||||
| 
 | ||||
| 	while (batch_size < MAX_STRIPE_BATCH && | ||||
| 			(sh = __get_priority_stripe(conf, group)) != NULL) | ||||
| 		batch[batch_size++] = sh; | ||||
| 
 | ||||
| 	if (batch_size == 0) | ||||
| 		return batch_size; | ||||
| 	if (batch_size == 0) { | ||||
| 		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||||
| 			if (!list_empty(temp_inactive_list + i)) | ||||
| 				break; | ||||
| 		if (i == NR_STRIPE_HASH_LOCKS) | ||||
| 			return batch_size; | ||||
| 		release_inactive = true; | ||||
| 	} | ||||
| 	spin_unlock_irq(&conf->device_lock); | ||||
| 
 | ||||
| 	release_inactive_stripe_list(conf, temp_inactive_list, | ||||
| 				     NR_STRIPE_HASH_LOCKS); | ||||
| 
 | ||||
| 	if (release_inactive) { | ||||
| 		spin_lock_irq(&conf->device_lock); | ||||
| 		return 0; | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < batch_size; i++) | ||||
| 		handle_stripe(batch[i]); | ||||
| 
 | ||||
| 	cond_resched(); | ||||
| 
 | ||||
| 	spin_lock_irq(&conf->device_lock); | ||||
| 	for (i = 0; i < batch_size; i++) | ||||
| 		__release_stripe(conf, batch[i]); | ||||
| 	for (i = 0; i < batch_size; i++) { | ||||
| 		hash = batch[i]->hash_lock_index; | ||||
| 		__release_stripe(conf, batch[i], &temp_inactive_list[hash]); | ||||
| 	} | ||||
| 	return batch_size; | ||||
| } | ||||
| 
 | ||||
| @ -4995,9 +5160,10 @@ static void raid5_do_work(struct work_struct *work) | ||||
| 	while (1) { | ||||
| 		int batch_size, released; | ||||
| 
 | ||||
| 		released = release_stripe_list(conf); | ||||
| 		released = release_stripe_list(conf, worker->temp_inactive_list); | ||||
| 
 | ||||
| 		batch_size = handle_active_stripes(conf, group_id, worker); | ||||
| 		batch_size = handle_active_stripes(conf, group_id, worker, | ||||
| 						   worker->temp_inactive_list); | ||||
| 		worker->working = false; | ||||
| 		if (!batch_size && !released) | ||||
| 			break; | ||||
| @ -5036,7 +5202,7 @@ static void raid5d(struct md_thread *thread) | ||||
| 		struct bio *bio; | ||||
| 		int batch_size, released; | ||||
| 
 | ||||
| 		released = release_stripe_list(conf); | ||||
| 		released = release_stripe_list(conf, conf->temp_inactive_list); | ||||
| 
 | ||||
| 		if ( | ||||
| 		    !list_empty(&conf->bitmap_list)) { | ||||
| @ -5046,7 +5212,7 @@ static void raid5d(struct md_thread *thread) | ||||
| 			bitmap_unplug(mddev->bitmap); | ||||
| 			spin_lock_irq(&conf->device_lock); | ||||
| 			conf->seq_write = conf->seq_flush; | ||||
| 			activate_bit_delay(conf); | ||||
| 			activate_bit_delay(conf, conf->temp_inactive_list); | ||||
| 		} | ||||
| 		raid5_activate_delayed(conf); | ||||
| 
 | ||||
| @ -5060,7 +5226,8 @@ static void raid5d(struct md_thread *thread) | ||||
| 			handled++; | ||||
| 		} | ||||
| 
 | ||||
| 		batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); | ||||
| 		batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, | ||||
| 						   conf->temp_inactive_list); | ||||
| 		if (!batch_size && !released) | ||||
| 			break; | ||||
| 		handled += batch_size; | ||||
| @ -5096,22 +5263,29 @@ raid5_set_cache_size(struct mddev *mddev, int size) | ||||
| { | ||||
| 	struct r5conf *conf = mddev->private; | ||||
| 	int err; | ||||
| 	int hash; | ||||
| 
 | ||||
| 	if (size <= 16 || size > 32768) | ||||
| 		return -EINVAL; | ||||
| 	hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; | ||||
| 	while (size < conf->max_nr_stripes) { | ||||
| 		if (drop_one_stripe(conf)) | ||||
| 		if (drop_one_stripe(conf, hash)) | ||||
| 			conf->max_nr_stripes--; | ||||
| 		else | ||||
| 			break; | ||||
| 		hash--; | ||||
| 		if (hash < 0) | ||||
| 			hash = NR_STRIPE_HASH_LOCKS - 1; | ||||
| 	} | ||||
| 	err = md_allow_write(mddev); | ||||
| 	if (err) | ||||
| 		return err; | ||||
| 	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; | ||||
| 	while (size > conf->max_nr_stripes) { | ||||
| 		if (grow_one_stripe(conf)) | ||||
| 		if (grow_one_stripe(conf, hash)) | ||||
| 			conf->max_nr_stripes++; | ||||
| 		else break; | ||||
| 		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||||
| 	} | ||||
| 	return 0; | ||||
| } | ||||
| @ -5199,15 +5373,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) | ||||
| 		return 0; | ||||
| } | ||||
| 
 | ||||
| static int alloc_thread_groups(struct r5conf *conf, int cnt); | ||||
| static int alloc_thread_groups(struct r5conf *conf, int cnt, | ||||
| 			       int *group_cnt, | ||||
| 			       int *worker_cnt_per_group, | ||||
| 			       struct r5worker_group **worker_groups); | ||||
| static ssize_t | ||||
| raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) | ||||
| { | ||||
| 	struct r5conf *conf = mddev->private; | ||||
| 	unsigned long new; | ||||
| 	int err; | ||||
| 	struct r5worker_group *old_groups; | ||||
| 	int old_group_cnt; | ||||
| 	struct r5worker_group *new_groups, *old_groups; | ||||
| 	int group_cnt, worker_cnt_per_group; | ||||
| 
 | ||||
| 	if (len >= PAGE_SIZE) | ||||
| 		return -EINVAL; | ||||
| @ -5223,14 +5400,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) | ||||
| 	mddev_suspend(mddev); | ||||
| 
 | ||||
| 	old_groups = conf->worker_groups; | ||||
| 	old_group_cnt = conf->worker_cnt_per_group; | ||||
| 	if (old_groups) | ||||
| 		flush_workqueue(raid5_wq); | ||||
| 
 | ||||
| 	err = alloc_thread_groups(conf, new, | ||||
| 				  &group_cnt, &worker_cnt_per_group, | ||||
| 				  &new_groups); | ||||
| 	if (!err) { | ||||
| 		spin_lock_irq(&conf->device_lock); | ||||
| 		conf->group_cnt = group_cnt; | ||||
| 		conf->worker_cnt_per_group = worker_cnt_per_group; | ||||
| 		conf->worker_groups = new_groups; | ||||
| 		spin_unlock_irq(&conf->device_lock); | ||||
| 
 | ||||
| 	conf->worker_groups = NULL; | ||||
| 	err = alloc_thread_groups(conf, new); | ||||
| 	if (err) { | ||||
| 		conf->worker_groups = old_groups; | ||||
| 		conf->worker_cnt_per_group = old_group_cnt; | ||||
| 	} else { | ||||
| 		if (old_groups) | ||||
| 			kfree(old_groups[0].workers); | ||||
| 		kfree(old_groups); | ||||
| @ -5260,40 +5442,47 @@ static struct attribute_group raid5_attrs_group = { | ||||
| 	.attrs = raid5_attrs, | ||||
| }; | ||||
| 
 | ||||
| static int alloc_thread_groups(struct r5conf *conf, int cnt) | ||||
| static int alloc_thread_groups(struct r5conf *conf, int cnt, | ||||
| 			       int *group_cnt, | ||||
| 			       int *worker_cnt_per_group, | ||||
| 			       struct r5worker_group **worker_groups) | ||||
| { | ||||
| 	int i, j; | ||||
| 	int i, j, k; | ||||
| 	ssize_t size; | ||||
| 	struct r5worker *workers; | ||||
| 
 | ||||
| 	conf->worker_cnt_per_group = cnt; | ||||
| 	*worker_cnt_per_group = cnt; | ||||
| 	if (cnt == 0) { | ||||
| 		conf->worker_groups = NULL; | ||||
| 		*group_cnt = 0; | ||||
| 		*worker_groups = NULL; | ||||
| 		return 0; | ||||
| 	} | ||||
| 	conf->group_cnt = num_possible_nodes(); | ||||
| 	*group_cnt = num_possible_nodes(); | ||||
| 	size = sizeof(struct r5worker) * cnt; | ||||
| 	workers = kzalloc(size * conf->group_cnt, GFP_NOIO); | ||||
| 	conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * | ||||
| 				conf->group_cnt, GFP_NOIO); | ||||
| 	if (!conf->worker_groups || !workers) { | ||||
| 	workers = kzalloc(size * *group_cnt, GFP_NOIO); | ||||
| 	*worker_groups = kzalloc(sizeof(struct r5worker_group) * | ||||
| 				*group_cnt, GFP_NOIO); | ||||
| 	if (!*worker_groups || !workers) { | ||||
| 		kfree(workers); | ||||
| 		kfree(conf->worker_groups); | ||||
| 		conf->worker_groups = NULL; | ||||
| 		kfree(*worker_groups); | ||||
| 		return -ENOMEM; | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < conf->group_cnt; i++) { | ||||
| 	for (i = 0; i < *group_cnt; i++) { | ||||
| 		struct r5worker_group *group; | ||||
| 
 | ||||
| 		group = &conf->worker_groups[i]; | ||||
| 		group = &(*worker_groups)[i]; | ||||
| 		INIT_LIST_HEAD(&group->handle_list); | ||||
| 		group->conf = conf; | ||||
| 		group->workers = workers + i * cnt; | ||||
| 
 | ||||
| 		for (j = 0; j < cnt; j++) { | ||||
| 			group->workers[j].group = group; | ||||
| 			INIT_WORK(&group->workers[j].work, raid5_do_work); | ||||
| 			struct r5worker *worker = group->workers + j; | ||||
| 			worker->group = group; | ||||
| 			INIT_WORK(&worker->work, raid5_do_work); | ||||
| 
 | ||||
| 			for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) | ||||
| 				INIT_LIST_HEAD(worker->temp_inactive_list + k); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| @ -5444,6 +5633,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) | ||||
| 	struct md_rdev *rdev; | ||||
| 	struct disk_info *disk; | ||||
| 	char pers_name[6]; | ||||
| 	int i; | ||||
| 	int group_cnt, worker_cnt_per_group; | ||||
| 	struct r5worker_group *new_group; | ||||
| 
 | ||||
| 	if (mddev->new_level != 5 | ||||
| 	    && mddev->new_level != 4 | ||||
| @ -5478,7 +5670,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) | ||||
| 	if (conf == NULL) | ||||
| 		goto abort; | ||||
| 	/* Don't enable multi-threading by default*/ | ||||
| 	if (alloc_thread_groups(conf, 0)) | ||||
| 	if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, | ||||
| 				 &new_group)) { | ||||
| 		conf->group_cnt = group_cnt; | ||||
| 		conf->worker_cnt_per_group = worker_cnt_per_group; | ||||
| 		conf->worker_groups = new_group; | ||||
| 	} else | ||||
| 		goto abort; | ||||
| 	spin_lock_init(&conf->device_lock); | ||||
| 	seqcount_init(&conf->gen_lock); | ||||
| @ -5488,7 +5685,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) | ||||
| 	INIT_LIST_HEAD(&conf->hold_list); | ||||
| 	INIT_LIST_HEAD(&conf->delayed_list); | ||||
| 	INIT_LIST_HEAD(&conf->bitmap_list); | ||||
| 	INIT_LIST_HEAD(&conf->inactive_list); | ||||
| 	init_llist_head(&conf->released_stripes); | ||||
| 	atomic_set(&conf->active_stripes, 0); | ||||
| 	atomic_set(&conf->preread_active_stripes, 0); | ||||
| @ -5514,6 +5710,21 @@ static struct r5conf *setup_conf(struct mddev *mddev) | ||||
| 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) | ||||
| 		goto abort; | ||||
| 
 | ||||
| 	/* We init hash_locks[0] separately so that it can be used
 | ||||
| 	 * as the reference lock in the spin_lock_nest_lock() call | ||||
| 	 * in lock_all_device_hash_locks_irq in order to convince | ||||
| 	 * lockdep that we know what we are doing. | ||||
| 	 */ | ||||
| 	spin_lock_init(conf->hash_locks); | ||||
| 	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) | ||||
| 		spin_lock_init(conf->hash_locks + i); | ||||
| 
 | ||||
| 	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||||
| 		INIT_LIST_HEAD(conf->inactive_list + i); | ||||
| 
 | ||||
| 	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||||
| 		INIT_LIST_HEAD(conf->temp_inactive_list + i); | ||||
| 
 | ||||
| 	conf->level = mddev->new_level; | ||||
| 	if (raid5_alloc_percpu(conf) != 0) | ||||
| 		goto abort; | ||||
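The comment above refers to lock_all_device_hash_locks_irq(), whose body is not shown in this excerpt. Below is a minimal sketch of how such a helper can take every hash lock without tripping lockdep: hash_locks[0] serves as the reference (nest) lock for the other seven, which is why it is initialised separately above. The in-tree implementation may differ in detail.

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;

	local_irq_disable();
	/* hash_locks[0] is the reference lock the others nest under */
	spin_lock(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;

	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS - 1; i >= 0; i--)
		spin_unlock(conf->hash_locks + i);
	local_irq_enable();
}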
| @ -5554,7 +5765,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) | ||||
| 	else | ||||
| 		conf->max_degraded = 1; | ||||
| 	conf->algorithm = mddev->new_layout; | ||||
| 	conf->max_nr_stripes = NR_STRIPES; | ||||
| 	conf->reshape_progress = mddev->reshape_position; | ||||
| 	if (conf->reshape_progress != MaxSector) { | ||||
| 		conf->prev_chunk_sectors = mddev->chunk_sectors; | ||||
| @ -5563,7 +5773,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) | ||||
| 
 | ||||
| 	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | ||||
| 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | ||||
| 	if (grow_stripes(conf, conf->max_nr_stripes)) { | ||||
| 	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); | ||||
| 	if (grow_stripes(conf, NR_STRIPES)) { | ||||
| 		printk(KERN_ERR | ||||
| 		       "md/raid:%s: couldn't allocate %dkB for buffers\n", | ||||
| 		       mdname(mddev), memory); | ||||
| @ -6369,12 +6580,18 @@ static int raid5_start_reshape(struct mddev *mddev) | ||||
| 	if (!mddev->sync_thread) { | ||||
| 		mddev->recovery = 0; | ||||
| 		spin_lock_irq(&conf->device_lock); | ||||
| 		write_seqcount_begin(&conf->gen_lock); | ||||
| 		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; | ||||
| 		mddev->new_chunk_sectors = | ||||
| 			conf->chunk_sectors = conf->prev_chunk_sectors; | ||||
| 		mddev->new_layout = conf->algorithm = conf->prev_algo; | ||||
| 		rdev_for_each(rdev, mddev) | ||||
| 			rdev->new_data_offset = rdev->data_offset; | ||||
| 		smp_wmb(); | ||||
| 		conf->generation --; | ||||
| 		conf->reshape_progress = MaxSector; | ||||
| 		mddev->reshape_position = MaxSector; | ||||
| 		write_seqcount_end(&conf->gen_lock); | ||||
| 		spin_unlock_irq(&conf->device_lock); | ||||
| 		return -EAGAIN; | ||||
| 	} | ||||
| @ -6462,27 +6679,28 @@ static void raid5_quiesce(struct mddev *mddev, int state) | ||||
| 		break; | ||||
| 
 | ||||
| 	case 1: /* stop all writes */ | ||||
| 		spin_lock_irq(&conf->device_lock); | ||||
| 		lock_all_device_hash_locks_irq(conf); | ||||
| 		/* '2' tells resync/reshape to pause so that all
 | ||||
| 		 * active stripes can drain | ||||
| 		 */ | ||||
| 		conf->quiesce = 2; | ||||
| 		wait_event_lock_irq(conf->wait_for_stripe, | ||||
| 		wait_event_cmd(conf->wait_for_stripe, | ||||
| 				    atomic_read(&conf->active_stripes) == 0 && | ||||
| 				    atomic_read(&conf->active_aligned_reads) == 0, | ||||
| 				    conf->device_lock); | ||||
| 				    unlock_all_device_hash_locks_irq(conf), | ||||
| 				    lock_all_device_hash_locks_irq(conf)); | ||||
| 		conf->quiesce = 1; | ||||
| 		spin_unlock_irq(&conf->device_lock); | ||||
| 		unlock_all_device_hash_locks_irq(conf); | ||||
| 		/* allow reshape to continue */ | ||||
| 		wake_up(&conf->wait_for_overlap); | ||||
| 		break; | ||||
| 
 | ||||
| 	case 0: /* re-enable writes */ | ||||
| 		spin_lock_irq(&conf->device_lock); | ||||
| 		lock_all_device_hash_locks_irq(conf); | ||||
| 		conf->quiesce = 0; | ||||
| 		wake_up(&conf->wait_for_stripe); | ||||
| 		wake_up(&conf->wait_for_overlap); | ||||
| 		spin_unlock_irq(&conf->device_lock); | ||||
| 		unlock_all_device_hash_locks_irq(conf); | ||||
| 		break; | ||||
| 	} | ||||
| } | ||||
|  | ||||
| @ -205,6 +205,7 @@ struct stripe_head { | ||||
| 	short			pd_idx;		/* parity disk index */ | ||||
| 	short			qd_idx;		/* 'Q' disk index for raid6 */ | ||||
| 	short			ddf_layout;/* use DDF ordering to calculate Q */ | ||||
| 	short			hash_lock_index; | ||||
| 	unsigned long		state;		/* state flags */ | ||||
| 	atomic_t		count;	      /* nr of active thread/requests */ | ||||
| 	int			bm_seq;	/* sequence number for bitmap flushes */ | ||||
| @ -367,9 +368,18 @@ struct disk_info { | ||||
| 	struct md_rdev	*rdev, *replacement; | ||||
| }; | ||||
| 
 | ||||
| /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
 | ||||
|  * This is because we sometimes take all the spinlocks | ||||
|  * and creating that much locking depth can cause | ||||
|  * problems. | ||||
|  */ | ||||
| #define NR_STRIPE_HASH_LOCKS 8 | ||||
| #define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1) | ||||
| 
 | ||||
| struct r5worker { | ||||
| 	struct work_struct work; | ||||
| 	struct r5worker_group *group; | ||||
| 	struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; | ||||
| 	bool working; | ||||
| }; | ||||
| 
 | ||||
| @ -382,6 +392,8 @@ struct r5worker_group { | ||||
| 
 | ||||
| struct r5conf { | ||||
| 	struct hlist_head	*stripe_hashtbl; | ||||
| 	/* only protect corresponding hash list and inactive_list */ | ||||
| 	spinlock_t		hash_locks[NR_STRIPE_HASH_LOCKS]; | ||||
| 	struct mddev		*mddev; | ||||
| 	int			chunk_sectors; | ||||
| 	int			level, algorithm; | ||||
| @ -462,7 +474,8 @@ struct r5conf { | ||||
| 	 * Free stripes pool | ||||
| 	 */ | ||||
| 	atomic_t		active_stripes; | ||||
| 	struct list_head	inactive_list; | ||||
| 	struct list_head	inactive_list[NR_STRIPE_HASH_LOCKS]; | ||||
| 	atomic_t		empty_inactive_list_nr; | ||||
| 	struct llist_head	released_stripes; | ||||
| 	wait_queue_head_t	wait_for_stripe; | ||||
| 	wait_queue_head_t	wait_for_overlap; | ||||
| @ -477,6 +490,7 @@ struct r5conf { | ||||
| 	 * the new thread here until we fully activate the array. | ||||
| 	 */ | ||||
| 	struct md_thread	*thread; | ||||
| 	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS]; | ||||
| 	struct r5worker_group	*worker_groups; | ||||
| 	int			group_cnt; | ||||
| 	int			worker_cnt_per_group; | ||||
|  | ||||
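The new hash_lock_index field records which of the NR_STRIPE_HASH_LOCKS locks (and inactive lists) covers a given stripe, so a stripe can be pulled off conf->inactive_list[hash] under conf->hash_locks[hash] alone rather than under device_lock. A sketch of how that index can be derived from a stripe's starting sector follows; the helper name and its definition are assumptions, as neither appears in this excerpt (STRIPE_SHIFT is defined elsewhere in raid5.h).

/* Map a stripe's starting sector to one of the NR_STRIPE_HASH_LOCKS
 * locks/lists.  Because the count is a power of two, masking suffices,
 * and stripes in the same hash bucket always share the same lock.
 */
#define stripe_hash_locks_hash(sect) \
	(((sect) >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK)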
| @ -278,6 +278,31 @@ do {									\ | ||||
| 	__ret;								\ | ||||
| }) | ||||
| 
 | ||||
| #define __wait_event_cmd(wq, condition, cmd1, cmd2)			\ | ||||
| 	(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\ | ||||
| 			    cmd1; schedule(); cmd2) | ||||
| 
 | ||||
| /**
 | ||||
|  * wait_event_cmd - sleep until a condition gets true | ||||
|  * @wq: the waitqueue to wait on | ||||
|  * @condition: a C expression for the event to wait for | ||||
|  * @cmd1: the command to be executed before sleeping | ||||
|  * @cmd2: the command to be executed after sleeping | ||||
|  * | ||||
|  * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the | ||||
|  * @condition evaluates to true. The @condition is checked each time | ||||
|  * the waitqueue @wq is woken up. | ||||
|  * | ||||
|  * wake_up() has to be called after changing any variable that could | ||||
|  * change the result of the wait condition. | ||||
|  */ | ||||
| #define wait_event_cmd(wq, condition, cmd1, cmd2)			\ | ||||
| do {									\ | ||||
| 	if (condition)							\ | ||||
| 		break;							\ | ||||
| 	__wait_event_cmd(wq, condition, cmd1, cmd2);			\ | ||||
| } while (0) | ||||
| 
 | ||||
| #define __wait_event_interruptible(wq, condition)			\ | ||||
| 	___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0,		\ | ||||
| 		      schedule()) | ||||
|  | ||||
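wait_event_cmd() exists so that raid5_quiesce() (above) can drop all of the hash locks before sleeping and retake them after waking, something wait_event_lock_irq(), which is tied to a single spinlock, cannot express. A small self-contained usage sketch follows; the names are illustrative, not taken from the diff.

#include <linux/spinlock.h>
#include <linux/wait.h>

static int resource_ready;
static DEFINE_SPINLOCK(resource_lock);
static DECLARE_WAIT_QUEUE_HEAD(resource_wq);

static void wait_for_resource(void)
{
	spin_lock_irq(&resource_lock);
	wait_event_cmd(resource_wq, resource_ready,
		       spin_unlock_irq(&resource_lock),	/* run before schedule() */
		       spin_lock_irq(&resource_lock));	/* run once woken up */
	/* resource_lock is held again here and resource_ready is true */
	spin_unlock_irq(&resource_lock);
}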
| @ -16,6 +16,7 @@ | ||||
| #define _MD_P_H | ||||
| 
 | ||||
| #include <linux/types.h> | ||||
| #include <asm/byteorder.h> | ||||
| 
 | ||||
| /*
 | ||||
|  * RAID superblock. | ||||
|  | ||||