Merge branch 'for-3.14/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver changes from Jens Axboe:

 - bcache update from Kent Overstreet.

 - two bcache fixes from Nicholas Swenson.

 - cciss pci init error fix from Andrew.

 - underflow fix in the parallel IDE pg_write code from Dan Carpenter.
   I'm sure the 1 (or 0) users of that are now happy.

 - two PCI related fixes for sx8 from Jingoo Han.

 - floppy init fix for first block read from Jiri Kosina.

 - pktcdvd error return miss fix from Julia Lawall.

 - removal of IRQF_DISABLED from the SEGA Dreamcast CD-ROM code from
   Michael Opdenacker.

 - comment typo fix for the loop driver from Olaf Hering.

 - potential oops fix for null_blk from Raghavendra K T.

 - two fixes from Sam Bradshaw (Micron) for the mtip32xx driver, fixing
   an OOM problem and a problem with handling security locked conditions.

* 'for-3.14/drivers' of git://git.kernel.dk/linux-block: (47 commits)
  mg_disk: Spelling s/finised/finished/
  null_blk: Null pointer deference problem in alloc_page_buffers
  mtip32xx: Correctly handle security locked condition
  mtip32xx: Make SGL container per-command to eliminate high order dma allocation
  drivers/block/loop.c: fix comment typo in loop_config_discard
  drivers/block/cciss.c:cciss_init_one(): use proper errnos
  drivers/block/paride/pg.c: underflow bug in pg_write()
  drivers/block/sx8.c: remove unnecessary pci_set_drvdata()
  drivers/block/sx8.c: use module_pci_driver()
  floppy: bail out in open() if drive is not responding to block0 read
  bcache: Fix auxiliary search trees for key size > cacheline size
  bcache: Don't return -EINTR when insert finished
  bcache: Improve bucket_prio() calculation
  bcache: Add bch_bkey_equal_header()
  bcache: update bch_bkey_try_merge
  bcache: Move insert_fixup() to btree_keys_ops
  bcache: Convert sorting to btree_keys
  bcache: Convert debug code to btree_keys
  bcache: Convert btree_iter to struct btree_keys
  bcache: Refactor bset_tree sysfs stats
  ...
commit 53d8ab29f8
				| @ -592,6 +592,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | ||||
| 		ret = -1; | ||||
| 	} | ||||
| 
 | ||||
| 	t->raid_partial_stripes_expensive = | ||||
| 		max(t->raid_partial_stripes_expensive, | ||||
| 		    b->raid_partial_stripes_expensive); | ||||
| 
 | ||||
| 	/* Find lowest common alignment_offset */ | ||||
| 	t->alignment_offset = lcm(t->alignment_offset, alignment) | ||||
| 		& (max(t->physical_block_size, t->io_min) - 1); | ||||
|  | ||||
| @ -5004,7 +5004,7 @@ reinit_after_soft_reset: | ||||
| 
 | ||||
| 	i = alloc_cciss_hba(pdev); | ||||
| 	if (i < 0) | ||||
| 		return -1; | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	h = hba[i]; | ||||
| 	h->pdev = pdev; | ||||
| @ -5205,7 +5205,7 @@ clean_no_release_regions: | ||||
| 	 */ | ||||
| 	pci_set_drvdata(pdev, NULL); | ||||
| 	free_hba(h); | ||||
| 	return -1; | ||||
| 	return -ENODEV; | ||||
| } | ||||
| 
 | ||||
| static void cciss_shutdown(struct pci_dev *pdev) | ||||
|  | ||||
| @ -3691,9 +3691,12 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) | ||||
| 	if (!(mode & FMODE_NDELAY)) { | ||||
| 		if (mode & (FMODE_READ|FMODE_WRITE)) { | ||||
| 			UDRS->last_checked = 0; | ||||
| 			clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); | ||||
| 			check_disk_change(bdev); | ||||
| 			if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags)) | ||||
| 				goto out; | ||||
| 			if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags)) | ||||
| 				goto out; | ||||
| 		} | ||||
| 		res = -EROFS; | ||||
| 		if ((mode & FMODE_WRITE) && | ||||
| @ -3746,17 +3749,29 @@ static unsigned int floppy_check_events(struct gendisk *disk, | ||||
|  * a disk in the drive, and whether that disk is writable. | ||||
|  */ | ||||
| 
 | ||||
| static void floppy_rb0_complete(struct bio *bio, int err) | ||||
| struct rb0_cbdata { | ||||
| 	int drive; | ||||
| 	struct completion complete; | ||||
| }; | ||||
| 
 | ||||
| static void floppy_rb0_cb(struct bio *bio, int err) | ||||
| { | ||||
| 	complete((struct completion *)bio->bi_private); | ||||
| 	struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private; | ||||
| 	int drive = cbdata->drive; | ||||
| 
 | ||||
| 	if (err) { | ||||
| 		pr_info("floppy: error %d while reading block 0", err); | ||||
| 		set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); | ||||
| 	} | ||||
| 	complete(&cbdata->complete); | ||||
| } | ||||
| 
 | ||||
| static int __floppy_read_block_0(struct block_device *bdev) | ||||
| static int __floppy_read_block_0(struct block_device *bdev, int drive) | ||||
| { | ||||
| 	struct bio bio; | ||||
| 	struct bio_vec bio_vec; | ||||
| 	struct completion complete; | ||||
| 	struct page *page; | ||||
| 	struct rb0_cbdata cbdata; | ||||
| 	size_t size; | ||||
| 
 | ||||
| 	page = alloc_page(GFP_NOIO); | ||||
| @ -3769,6 +3784,8 @@ static int __floppy_read_block_0(struct block_device *bdev) | ||||
| 	if (!size) | ||||
| 		size = 1024; | ||||
| 
 | ||||
| 	cbdata.drive = drive; | ||||
| 
 | ||||
| 	bio_init(&bio); | ||||
| 	bio.bi_io_vec = &bio_vec; | ||||
| 	bio_vec.bv_page = page; | ||||
| @ -3779,13 +3796,14 @@ static int __floppy_read_block_0(struct block_device *bdev) | ||||
| 	bio.bi_bdev = bdev; | ||||
| 	bio.bi_iter.bi_sector = 0; | ||||
| 	bio.bi_flags = (1 << BIO_QUIET); | ||||
| 	init_completion(&complete); | ||||
| 	bio.bi_private = &complete; | ||||
| 	bio.bi_end_io = floppy_rb0_complete; | ||||
| 	bio.bi_private = &cbdata; | ||||
| 	bio.bi_end_io = floppy_rb0_cb; | ||||
| 
 | ||||
| 	submit_bio(READ, &bio); | ||||
| 	process_fd_request(); | ||||
| 	wait_for_completion(&complete); | ||||
| 
 | ||||
| 	init_completion(&cbdata.complete); | ||||
| 	wait_for_completion(&cbdata.complete); | ||||
| 
 | ||||
| 	__free_page(page); | ||||
| 
 | ||||
| @ -3827,7 +3845,7 @@ static int floppy_revalidate(struct gendisk *disk) | ||||
| 			UDRS->generation++; | ||||
| 		if (drive_no_geom(drive)) { | ||||
| 			/* auto-sensing */ | ||||
| 			res = __floppy_read_block_0(opened_bdev[drive]); | ||||
| 			res = __floppy_read_block_0(opened_bdev[drive], drive); | ||||
| 		} else { | ||||
| 			if (cf) | ||||
| 				poll_drive(false, FD_RAW_NEED_DISK); | ||||
|  | ||||
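The floppy hunks above hang a small rb0_cbdata structure off bio->bi_private so the end_io callback knows which drive to flag before signalling completion. The following user-space sketch is not kernel code; fake_bio is a made-up stand-in for struct bio, shown only to illustrate that callback-data pattern:

/* Illustrative only: the bi_private callback-data pattern, reduced to plain C. */
#include <stdio.h>

struct fake_bio {
	void *bi_private;
	void (*bi_end_io)(struct fake_bio *, int err);
};

struct rb0_cbdata {
	int drive;
	int should_fail;	/* stands in for FD_OPEN_SHOULD_FAIL_BIT */
};

static void floppy_rb0_cb(struct fake_bio *bio, int err)
{
	struct rb0_cbdata *cbdata = bio->bi_private;

	if (err) {
		printf("floppy: error %d while reading block 0 on drive %d\n",
		       err, cbdata->drive);
		cbdata->should_fail = 1;
	}
}

int main(void)
{
	struct rb0_cbdata cbdata = { .drive = 0, .should_fail = 0 };
	struct fake_bio bio = { .bi_private = &cbdata, .bi_end_io = floppy_rb0_cb };

	bio.bi_end_io(&bio, -5);	/* simulate the read completing with -EIO */
	printf("open() should fail: %d\n", cbdata.should_fail);
	return 0;
}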
| @ -799,7 +799,7 @@ static void loop_config_discard(struct loop_device *lo) | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We use punch hole to reclaim the free space used by the | ||||
| 	 * image a.k.a. discard. However we do support discard if | ||||
| 	 * image a.k.a. discard. However we do not support discard if | ||||
| 	 * encryption is enabled, because it may give an attacker | ||||
| 	 * useful information. | ||||
| 	 */ | ||||
|  | ||||
| @ -915,7 +915,7 @@ static int mg_probe(struct platform_device *plat_dev) | ||||
| 
 | ||||
| 	/* disk reset */ | ||||
| 	if (prv_data->dev_attr == MG_STORAGE_DEV) { | ||||
| 		/* If POR seq. not yet finised, wait */ | ||||
| 		/* If POR seq. not yet finished, wait */ | ||||
| 		err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT); | ||||
| 		if (err) | ||||
| 			goto probe_err_3b; | ||||
|  | ||||
| @ -41,10 +41,31 @@ | ||||
| #include "mtip32xx.h" | ||||
| 
 | ||||
| #define HW_CMD_SLOT_SZ		(MTIP_MAX_COMMAND_SLOTS * 32) | ||||
| #define HW_CMD_TBL_SZ		(AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16)) | ||||
| #define HW_CMD_TBL_AR_SZ	(HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS) | ||||
| #define HW_PORT_PRIV_DMA_SZ \ | ||||
| 		(HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ) | ||||
| 
 | ||||
| /* DMA region containing RX Fis, Identify, RLE10, and SMART buffers */ | ||||
| #define AHCI_RX_FIS_SZ          0x100 | ||||
| #define AHCI_RX_FIS_OFFSET      0x0 | ||||
| #define AHCI_IDFY_SZ            ATA_SECT_SIZE | ||||
| #define AHCI_IDFY_OFFSET        0x400 | ||||
| #define AHCI_SECTBUF_SZ         ATA_SECT_SIZE | ||||
| #define AHCI_SECTBUF_OFFSET     0x800 | ||||
| #define AHCI_SMARTBUF_SZ        ATA_SECT_SIZE | ||||
| #define AHCI_SMARTBUF_OFFSET    0xC00 | ||||
| /* 0x100 + 0x200 + 0x200 + 0x200 is smaller than 4k but we pad it out */ | ||||
| #define BLOCK_DMA_ALLOC_SZ      4096 | ||||
| 
 | ||||
| /* DMA region containing command table (should be 8192 bytes) */ | ||||
| #define AHCI_CMD_SLOT_SZ        sizeof(struct mtip_cmd_hdr) | ||||
| #define AHCI_CMD_TBL_SZ         (MTIP_MAX_COMMAND_SLOTS * AHCI_CMD_SLOT_SZ) | ||||
| #define AHCI_CMD_TBL_OFFSET     0x0 | ||||
| 
 | ||||
| /* DMA region per command (contains header and SGL) */ | ||||
| #define AHCI_CMD_TBL_HDR_SZ     0x80 | ||||
| #define AHCI_CMD_TBL_HDR_OFFSET 0x0 | ||||
| #define AHCI_CMD_TBL_SGL_SZ     (MTIP_MAX_SG * sizeof(struct mtip_cmd_sg)) | ||||
| #define AHCI_CMD_TBL_SGL_OFFSET AHCI_CMD_TBL_HDR_SZ | ||||
| #define CMD_DMA_ALLOC_SZ        (AHCI_CMD_TBL_SGL_SZ + AHCI_CMD_TBL_HDR_SZ) | ||||
| 
 | ||||
| 
 | ||||
| #define HOST_CAP_NZDMA		(1 << 19) | ||||
| #define HOST_HSORG		0xFC | ||||
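As a sanity check on the new block1 layout, here is a small stand-alone sketch (not part of the patch) that re-declares the offsets and sizes from the hunk above and asserts that every buffer fits inside the single 4 KiB allocation; ATA_SECT_SIZE is assumed to be 512.

/* Stand-alone check of the block1 DMA layout; constants copied from the hunk. */
#include <assert.h>
#include <stdio.h>

#define ATA_SECT_SIZE        512	/* assumed */
#define AHCI_RX_FIS_SZ       0x100
#define AHCI_RX_FIS_OFFSET   0x0
#define AHCI_IDFY_SZ         ATA_SECT_SIZE
#define AHCI_IDFY_OFFSET     0x400
#define AHCI_SECTBUF_SZ      ATA_SECT_SIZE
#define AHCI_SECTBUF_OFFSET  0x800
#define AHCI_SMARTBUF_SZ     ATA_SECT_SIZE
#define AHCI_SMARTBUF_OFFSET 0xC00
#define BLOCK_DMA_ALLOC_SZ   4096

int main(void)
{
	/* Every buffer must end within the single 4 KiB block1 allocation. */
	assert(AHCI_RX_FIS_OFFSET   + AHCI_RX_FIS_SZ   <= BLOCK_DMA_ALLOC_SZ);
	assert(AHCI_IDFY_OFFSET     + AHCI_IDFY_SZ     <= BLOCK_DMA_ALLOC_SZ);
	assert(AHCI_SECTBUF_OFFSET  + AHCI_SECTBUF_SZ  <= BLOCK_DMA_ALLOC_SZ);
	assert(AHCI_SMARTBUF_OFFSET + AHCI_SMARTBUF_SZ <= BLOCK_DMA_ALLOC_SZ);
	printf("block1 layout fits in %d bytes\n", BLOCK_DMA_ALLOC_SZ);
	return 0;
}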
| @ -899,8 +920,9 @@ static void mtip_handle_tfe(struct driver_data *dd) | ||||
| 			fail_reason = "thermal shutdown"; | ||||
| 		} | ||||
| 		if (buf[288] == 0xBF) { | ||||
| 			set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); | ||||
| 			dev_info(&dd->pdev->dev, | ||||
| 				"Drive indicates rebuild has failed.\n"); | ||||
| 				"Drive indicates rebuild has failed. Secure erase required.\n"); | ||||
| 			fail_all_ncq_cmds = 1; | ||||
| 			fail_reason = "rebuild failed"; | ||||
| 		} | ||||
| @ -1566,6 +1588,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) | ||||
| 	} | ||||
| #endif | ||||
| 
 | ||||
| 	/* Check security locked state */ | ||||
| 	if (port->identify[128] & 0x4) | ||||
| 		set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); | ||||
| 	else | ||||
| 		clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); | ||||
| 
 | ||||
| #ifdef MTIP_TRIM /* Disabling TRIM support temporarily */ | ||||
| 	/* Demux ID.DRAT & ID.RZAT to determine trim support */ | ||||
| 	if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5)) | ||||
| @ -1887,6 +1915,10 @@ static void mtip_dump_identify(struct mtip_port *port) | ||||
| 	strlcpy(cbuf, (char *)(port->identify+27), 41); | ||||
| 	dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf); | ||||
| 
 | ||||
| 	dev_info(&port->dd->pdev->dev, "Security: %04x %s\n", | ||||
| 		port->identify[128], | ||||
| 		port->identify[128] & 0x4 ? "(LOCKED)" : ""); | ||||
| 
 | ||||
| 	if (mtip_hw_get_capacity(port->dd, &sectors)) | ||||
| 		dev_info(&port->dd->pdev->dev, | ||||
| 			"Capacity: %llu sectors (%llu MB)\n", | ||||
| @ -3312,6 +3344,118 @@ st_out: | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * DMA region teardown | ||||
|  * | ||||
|  * @dd Pointer to driver_data structure | ||||
|  * | ||||
|  * return value | ||||
|  *      None | ||||
|  */ | ||||
| static void mtip_dma_free(struct driver_data *dd) | ||||
| { | ||||
| 	int i; | ||||
| 	struct mtip_port *port = dd->port; | ||||
| 
 | ||||
| 	if (port->block1) | ||||
| 		dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, | ||||
| 					port->block1, port->block1_dma); | ||||
| 
 | ||||
| 	if (port->command_list) { | ||||
| 		dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, | ||||
| 				port->command_list, port->command_list_dma); | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { | ||||
| 		if (port->commands[i].command) | ||||
| 			dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, | ||||
| 				port->commands[i].command, | ||||
| 				port->commands[i].command_dma); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * DMA region setup | ||||
|  * | ||||
|  * @dd Pointer to driver_data structure | ||||
|  * | ||||
|  * return value | ||||
|  *      -ENOMEM Not enough free DMA region space to initialize driver | ||||
|  */ | ||||
| static int mtip_dma_alloc(struct driver_data *dd) | ||||
| { | ||||
| 	struct mtip_port *port = dd->port; | ||||
| 	int i, rv = 0; | ||||
| 	u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; | ||||
| 
 | ||||
| 	/* Allocate dma memory for RX Fis, Identify, and Sector Buffer */ | ||||
| 	port->block1 = | ||||
| 		dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, | ||||
| 					&port->block1_dma, GFP_KERNEL); | ||||
| 	if (!port->block1) | ||||
| 		return -ENOMEM; | ||||
| 	memset(port->block1, 0, BLOCK_DMA_ALLOC_SZ); | ||||
| 
 | ||||
| 	/* Allocate dma memory for command list */ | ||||
| 	port->command_list = | ||||
| 		dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, | ||||
| 					&port->command_list_dma, GFP_KERNEL); | ||||
| 	if (!port->command_list) { | ||||
| 		dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, | ||||
| 					port->block1, port->block1_dma); | ||||
| 		port->block1 = NULL; | ||||
| 		port->block1_dma = 0; | ||||
| 		return -ENOMEM; | ||||
| 	} | ||||
| 	memset(port->command_list, 0, AHCI_CMD_TBL_SZ); | ||||
| 
 | ||||
| 	/* Setup all pointers into first DMA region */ | ||||
| 	port->rxfis         = port->block1 + AHCI_RX_FIS_OFFSET; | ||||
| 	port->rxfis_dma     = port->block1_dma + AHCI_RX_FIS_OFFSET; | ||||
| 	port->identify      = port->block1 + AHCI_IDFY_OFFSET; | ||||
| 	port->identify_dma  = port->block1_dma + AHCI_IDFY_OFFSET; | ||||
| 	port->log_buf       = port->block1 + AHCI_SECTBUF_OFFSET; | ||||
| 	port->log_buf_dma   = port->block1_dma + AHCI_SECTBUF_OFFSET; | ||||
| 	port->smart_buf     = port->block1 + AHCI_SMARTBUF_OFFSET; | ||||
| 	port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET; | ||||
| 
 | ||||
| 	/* Setup per command SGL DMA region */ | ||||
| 
 | ||||
| 	/* Point the command headers at the command tables */ | ||||
| 	for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { | ||||
| 		port->commands[i].command = | ||||
| 			dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, | ||||
| 				&port->commands[i].command_dma, GFP_KERNEL); | ||||
| 		if (!port->commands[i].command) { | ||||
| 			rv = -ENOMEM; | ||||
| 			mtip_dma_free(dd); | ||||
| 			return rv; | ||||
| 		} | ||||
| 		memset(port->commands[i].command, 0, CMD_DMA_ALLOC_SZ); | ||||
| 
 | ||||
| 		port->commands[i].command_header = port->command_list + | ||||
| 					(sizeof(struct mtip_cmd_hdr) * i); | ||||
| 		port->commands[i].command_header_dma = | ||||
| 					dd->port->command_list_dma + | ||||
| 					(sizeof(struct mtip_cmd_hdr) * i); | ||||
| 
 | ||||
| 		if (host_cap_64) | ||||
| 			port->commands[i].command_header->ctbau = | ||||
| 				__force_bit2int cpu_to_le32( | ||||
| 				(port->commands[i].command_dma >> 16) >> 16); | ||||
| 
 | ||||
| 		port->commands[i].command_header->ctba = | ||||
| 				__force_bit2int cpu_to_le32( | ||||
| 				port->commands[i].command_dma & 0xFFFFFFFF); | ||||
| 
 | ||||
| 		sg_init_table(port->commands[i].sg, MTIP_MAX_SG); | ||||
| 
 | ||||
| 		/* Mark command as currently inactive */ | ||||
| 		atomic_set(&dd->port->commands[i].active, 0); | ||||
| 	} | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Called once for each card. | ||||
|  * | ||||
| @ -3370,83 +3514,10 @@ static int mtip_hw_init(struct driver_data *dd) | ||||
| 	dd->port->mmio	= dd->mmio + PORT_OFFSET; | ||||
| 	dd->port->dd	= dd; | ||||
| 
 | ||||
| 	/* Allocate memory for the command list. */ | ||||
| 	dd->port->command_list = | ||||
| 		dmam_alloc_coherent(&dd->pdev->dev, | ||||
| 			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), | ||||
| 			&dd->port->command_list_dma, | ||||
| 			GFP_KERNEL); | ||||
| 	if (!dd->port->command_list) { | ||||
| 		dev_err(&dd->pdev->dev, | ||||
| 			"Memory allocation: command list\n"); | ||||
| 		rv = -ENOMEM; | ||||
| 	/* DMA allocations */ | ||||
| 	rv = mtip_dma_alloc(dd); | ||||
| 	if (rv < 0) | ||||
| 		goto out1; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Clear the memory we have allocated. */ | ||||
| 	memset(dd->port->command_list, | ||||
| 		0, | ||||
| 		HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4)); | ||||
| 
 | ||||
| 	/* Setup the addresse of the RX FIS. */ | ||||
| 	dd->port->rxfis	    = dd->port->command_list + HW_CMD_SLOT_SZ; | ||||
| 	dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ; | ||||
| 
 | ||||
| 	/* Setup the address of the command tables. */ | ||||
| 	dd->port->command_table	  = dd->port->rxfis + AHCI_RX_FIS_SZ; | ||||
| 	dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ; | ||||
| 
 | ||||
| 	/* Setup the address of the identify data. */ | ||||
| 	dd->port->identify     = dd->port->command_table + | ||||
| 					HW_CMD_TBL_AR_SZ; | ||||
| 	dd->port->identify_dma = dd->port->command_tbl_dma + | ||||
| 					HW_CMD_TBL_AR_SZ; | ||||
| 
 | ||||
| 	/* Setup the address of the sector buffer - for some non-ncq cmds */ | ||||
| 	dd->port->sector_buffer	= (void *) dd->port->identify + ATA_SECT_SIZE; | ||||
| 	dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE; | ||||
| 
 | ||||
| 	/* Setup the address of the log buf - for read log command */ | ||||
| 	dd->port->log_buf = (void *)dd->port->sector_buffer  + ATA_SECT_SIZE; | ||||
| 	dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE; | ||||
| 
 | ||||
| 	/* Setup the address of the smart buf - for smart read data command */ | ||||
| 	dd->port->smart_buf = (void *)dd->port->log_buf  + ATA_SECT_SIZE; | ||||
| 	dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE; | ||||
| 
 | ||||
| 
 | ||||
| 	/* Point the command headers at the command tables. */ | ||||
| 	for (i = 0; i < num_command_slots; i++) { | ||||
| 		dd->port->commands[i].command_header = | ||||
| 					dd->port->command_list + | ||||
| 					(sizeof(struct mtip_cmd_hdr) * i); | ||||
| 		dd->port->commands[i].command_header_dma = | ||||
| 					dd->port->command_list_dma + | ||||
| 					(sizeof(struct mtip_cmd_hdr) * i); | ||||
| 
 | ||||
| 		dd->port->commands[i].command = | ||||
| 			dd->port->command_table + (HW_CMD_TBL_SZ * i); | ||||
| 		dd->port->commands[i].command_dma = | ||||
| 			dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i); | ||||
| 
 | ||||
| 		if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64) | ||||
| 			dd->port->commands[i].command_header->ctbau = | ||||
| 			__force_bit2int cpu_to_le32( | ||||
| 			(dd->port->commands[i].command_dma >> 16) >> 16); | ||||
| 		dd->port->commands[i].command_header->ctba = | ||||
| 			__force_bit2int cpu_to_le32( | ||||
| 			dd->port->commands[i].command_dma & 0xFFFFFFFF); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If this is not done, a bug is reported by the stock | ||||
| 		 * FC11 i386. Due to the fact that it has lots of kernel | ||||
| 		 * debugging enabled. | ||||
| 		 */ | ||||
| 		sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG); | ||||
| 
 | ||||
| 		/* Mark all commands as currently inactive.*/ | ||||
| 		atomic_set(&dd->port->commands[i].active, 0); | ||||
| 	} | ||||
| 
 | ||||
| 	/* Setup the pointers to the extended s_active and CI registers. */ | ||||
| 	for (i = 0; i < dd->slot_groups; i++) { | ||||
| @ -3594,12 +3665,8 @@ out3: | ||||
| 
 | ||||
| out2: | ||||
| 	mtip_deinit_port(dd->port); | ||||
| 	mtip_dma_free(dd); | ||||
| 
 | ||||
| 	/* Free the command/command header memory. */ | ||||
| 	dmam_free_coherent(&dd->pdev->dev, | ||||
| 				HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), | ||||
| 				dd->port->command_list, | ||||
| 				dd->port->command_list_dma); | ||||
| out1: | ||||
| 	/* Free the memory allocated for the for structure. */ | ||||
| 	kfree(dd->port); | ||||
| @ -3622,7 +3689,8 @@ static int mtip_hw_exit(struct driver_data *dd) | ||||
| 	 * saves its state. | ||||
| 	 */ | ||||
| 	if (!dd->sr) { | ||||
| 		if (!test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) | ||||
| 		if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) && | ||||
| 		    !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) | ||||
| 			if (mtip_standby_immediate(dd->port)) | ||||
| 				dev_warn(&dd->pdev->dev, | ||||
| 					"STANDBY IMMEDIATE failed\n"); | ||||
| @ -3641,11 +3709,9 @@ static int mtip_hw_exit(struct driver_data *dd) | ||||
| 	irq_set_affinity_hint(dd->pdev->irq, NULL); | ||||
| 	devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); | ||||
| 
 | ||||
| 	/* Free the command/command header memory. */ | ||||
| 	dmam_free_coherent(&dd->pdev->dev, | ||||
| 			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), | ||||
| 			dd->port->command_list, | ||||
| 			dd->port->command_list_dma); | ||||
| 	/* Free dma regions */ | ||||
| 	mtip_dma_free(dd); | ||||
| 
 | ||||
| 	/* Free the memory allocated for the for structure. */ | ||||
| 	kfree(dd->port); | ||||
| 	dd->port = NULL; | ||||
|  | ||||
| @ -69,7 +69,7 @@ | ||||
|  * Maximum number of scatter gather entries | ||||
|  * a single command may have. | ||||
|  */ | ||||
| #define MTIP_MAX_SG		128 | ||||
| #define MTIP_MAX_SG		504 | ||||
| 
 | ||||
| /*
 | ||||
|  * Maximum number of slot groups (Command Issue & s_active registers) | ||||
| @ -92,7 +92,7 @@ | ||||
| 
 | ||||
| /* Driver name and version strings */ | ||||
| #define MTIP_DRV_NAME		"mtip32xx" | ||||
| #define MTIP_DRV_VERSION	"1.2.6os3" | ||||
| #define MTIP_DRV_VERSION	"1.3.0" | ||||
| 
 | ||||
| /* Maximum number of minor device numbers per device. */ | ||||
| #define MTIP_MAX_MINORS		16 | ||||
| @ -391,15 +391,13 @@ struct mtip_port { | ||||
| 	 */ | ||||
| 	dma_addr_t rxfis_dma; | ||||
| 	/*
 | ||||
| 	 * Pointer to the beginning of the command table memory as used | ||||
| 	 * by the driver. | ||||
| 	 * Pointer to the DMA region for RX Fis, Identify, RLE10, and SMART | ||||
| 	 */ | ||||
| 	void *command_table; | ||||
| 	void *block1; | ||||
| 	/*
 | ||||
| 	 * Pointer to the beginning of the command table memory as used | ||||
| 	 * by the DMA. | ||||
| 	 * DMA address of region for RX Fis, Identify, RLE10, and SMART | ||||
| 	 */ | ||||
| 	dma_addr_t command_tbl_dma; | ||||
| 	dma_addr_t block1_dma; | ||||
| 	/*
 | ||||
| 	 * Pointer to the beginning of the identify data memory as used | ||||
| 	 * by the driver. | ||||
|  | ||||
| @ -616,6 +616,11 @@ static int __init null_init(void) | ||||
| 		irqmode = NULL_IRQ_NONE; | ||||
| 	} | ||||
| #endif | ||||
| 	if (bs > PAGE_SIZE) { | ||||
| 		pr_warn("null_blk: invalid block size\n"); | ||||
| 		pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); | ||||
| 		bs = PAGE_SIZE; | ||||
| 	} | ||||
| 
 | ||||
| 	if (queue_mode == NULL_Q_MQ && use_per_node_hctx) { | ||||
| 		if (submit_queues < nr_online_nodes) { | ||||
|  | ||||
| @ -581,7 +581,7 @@ static ssize_t pg_write(struct file *filp, const char __user *buf, size_t count, | ||||
| 
 | ||||
| 	if (hdr.magic != PG_MAGIC) | ||||
| 		return -EINVAL; | ||||
| 	if (hdr.dlen > PG_MAX_DATA) | ||||
| 	if (hdr.dlen < 0 || hdr.dlen > PG_MAX_DATA) | ||||
| 		return -EINVAL; | ||||
| 	if ((count - hs) > PG_MAX_DATA) | ||||
| 		return -EINVAL; | ||||
|  | ||||
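For context on the pg_write() hunk above: hdr.dlen is a signed integer copied from user space, so the old single upper-bound test lets negative values through, which is what the added "< 0" check stops. A stand-alone sketch of the two checks (PG_MAX_DATA is assumed here to be 32768, matching pg.c):

#include <stdio.h>

#define PG_MAX_DATA 32768	/* assumed to match pg.c */

int main(void)
{
	int dlen = -4;	/* hypothetical value copied from a malformed header */

	/* Old check: a negative dlen is not caught by the upper bound alone. */
	printf("old check rejects: %s\n",
	       dlen > PG_MAX_DATA ? "yes" : "no");

	/* New check from the patch: negative and oversized are both rejected. */
	printf("new check rejects: %s\n",
	       (dlen < 0 || dlen > PG_MAX_DATA) ? "yes" : "no");
	return 0;
}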
| @ -706,7 +706,9 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * | ||||
| 			     WRITE : READ, __GFP_WAIT); | ||||
| 
 | ||||
| 	if (cgc->buflen) { | ||||
| 		if (blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, __GFP_WAIT)) | ||||
| 		ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, | ||||
| 				      __GFP_WAIT); | ||||
| 		if (ret) | ||||
| 			goto out; | ||||
| 	} | ||||
| 
 | ||||
|  | ||||
| @ -1744,20 +1744,6 @@ static void carm_remove_one (struct pci_dev *pdev) | ||||
| 	kfree(host); | ||||
| 	pci_release_regions(pdev); | ||||
| 	pci_disable_device(pdev); | ||||
| 	pci_set_drvdata(pdev, NULL); | ||||
| } | ||||
| 
 | ||||
| static int __init carm_init(void) | ||||
| { | ||||
| 	return pci_register_driver(&carm_driver); | ||||
| } | ||||
| 
 | ||||
| static void __exit carm_exit(void) | ||||
| { | ||||
| 	pci_unregister_driver(&carm_driver); | ||||
| } | ||||
| 
 | ||||
| module_init(carm_init); | ||||
| module_exit(carm_exit); | ||||
| 
 | ||||
| 
 | ||||
| module_pci_driver(carm_driver); | ||||
|  | ||||
| @ -561,11 +561,11 @@ static int gdrom_set_interrupt_handlers(void) | ||||
| 	int err; | ||||
| 
 | ||||
| 	err = request_irq(HW_EVENT_GDROM_CMD, gdrom_command_interrupt, | ||||
| 		IRQF_DISABLED, "gdrom_command", &gd); | ||||
| 		0, "gdrom_command", &gd); | ||||
| 	if (err) | ||||
| 		return err; | ||||
| 	err = request_irq(HW_EVENT_GDROM_DMA, gdrom_dma_interrupt, | ||||
| 		IRQF_DISABLED, "gdrom_dma", &gd); | ||||
| 		0, "gdrom_dma", &gd); | ||||
| 	if (err) | ||||
| 		free_irq(HW_EVENT_GDROM_CMD, &gd); | ||||
| 	return err; | ||||
|  | ||||
| @ -1,7 +1,8 @@ | ||||
| 
 | ||||
| obj-$(CONFIG_BCACHE)	+= bcache.o | ||||
| 
 | ||||
| bcache-y		:= alloc.o btree.o bset.o io.o journal.o writeback.o\
 | ||||
| 	movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o | ||||
| bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
 | ||||
| 	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
 | ||||
| 	util.o writeback.o | ||||
| 
 | ||||
| CFLAGS_request.o	+= -Iblock | ||||
|  | ||||
| @ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) | ||||
| { | ||||
| 	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); | ||||
| 
 | ||||
| 	if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && | ||||
| 	    CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) | ||||
| 		return false; | ||||
| 	if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) { | ||||
| 		unsigned i; | ||||
| 
 | ||||
| 		for (i = 0; i < RESERVE_NONE; i++) | ||||
| 			if (!fifo_full(&ca->free[i])) | ||||
| 				goto add; | ||||
| 
 | ||||
| 		return false; | ||||
| 	} | ||||
| add: | ||||
| 	b->prio = 0; | ||||
| 
 | ||||
| 	if (can_inc_bucket_gen(b) && | ||||
| @ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b) | ||||
| 	fifo_push(&ca->free_inc, b - ca->buckets); | ||||
| } | ||||
| 
 | ||||
| #define bucket_prio(b)				\ | ||||
| 	(((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) | ||||
| /*
 | ||||
|  * Determines what order we're going to reuse buckets, smallest bucket_prio() | ||||
|  * first: we also take into account the number of sectors of live data in that | ||||
|  * bucket, and in order for that multiply to make sense we have to scale the bucket priorities. | ||||
|  * | ||||
|  * Thus, we scale the bucket priorities so that the bucket with the smallest | ||||
|  * prio is worth 1/8th of what INITIAL_PRIO is worth. | ||||
|  */ | ||||
| 
 | ||||
| #define bucket_prio(b)							\ | ||||
| ({									\ | ||||
| 	unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;	\ | ||||
| 									\ | ||||
| 	(b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b);	\ | ||||
| }) | ||||
| 
 | ||||
| #define bucket_max_cmp(l, r)	(bucket_prio(l) < bucket_prio(r)) | ||||
| #define bucket_min_cmp(l, r)	(bucket_prio(l) > bucket_prio(r)) | ||||
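To make the scaling concrete, this stand-alone sketch (not part of the patch) mirrors the bucket_prio() macro above with plain parameters; INITIAL_PRIO is 32768 as defined elsewhere in this series, and the bucket values below are invented. Smaller bucket_prio() means the bucket is reused sooner.

#include <stdio.h>

#define INITIAL_PRIO 32768U

/* Same arithmetic as the bucket_prio() macro, with the implicit
 * ca->set->min_prio and GC_SECTORS_USED(b) passed in explicitly. */
static unsigned bucket_prio(unsigned prio, unsigned set_min_prio,
			    unsigned sectors_used)
{
	unsigned min_prio = (INITIAL_PRIO - set_min_prio) / 8;

	return (prio - set_min_prio + min_prio) * sectors_used;
}

int main(void)
{
	unsigned set_min_prio = 1000;

	/* A cold bucket with little live data scores far lower (so it is
	 * reused first) than a hotter bucket that is mostly full. */
	printf("cold, 8 live sectors:  %u\n",
	       bucket_prio(1000, set_min_prio, 8));
	printf("hot, 512 live sectors: %u\n",
	       bucket_prio(20000, set_min_prio, 512));
	return 0;
}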
| @ -304,6 +323,21 @@ do {									\ | ||||
| 	__set_current_state(TASK_RUNNING);				\ | ||||
| } while (0) | ||||
| 
 | ||||
| static int bch_allocator_push(struct cache *ca, long bucket) | ||||
| { | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	/* Prios/gens are actually the most important reserve */ | ||||
| 	if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) | ||||
| 		return true; | ||||
| 
 | ||||
| 	for (i = 0; i < RESERVE_NR; i++) | ||||
| 		if (fifo_push(&ca->free[i], bucket)) | ||||
| 			return true; | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| static int bch_allocator_thread(void *arg) | ||||
| { | ||||
| 	struct cache *ca = arg; | ||||
| @ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg) | ||||
| 				mutex_lock(&ca->set->bucket_lock); | ||||
| 			} | ||||
| 
 | ||||
| 			allocator_wait(ca, !fifo_full(&ca->free)); | ||||
| 
 | ||||
| 			fifo_push(&ca->free, bucket); | ||||
| 			allocator_wait(ca, bch_allocator_push(ca, bucket)); | ||||
| 			wake_up(&ca->set->bucket_wait); | ||||
| 		} | ||||
| 
 | ||||
| @ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) | ||||
| long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) | ||||
| { | ||||
| 	DEFINE_WAIT(w); | ||||
| 	struct bucket *b; | ||||
| 	long r; | ||||
| 
 | ||||
| 	/* fastpath */ | ||||
| 	if (fifo_used(&ca->free) > ca->watermark[watermark]) { | ||||
| 		fifo_pop(&ca->free, r); | ||||
| 	if (fifo_pop(&ca->free[RESERVE_NONE], r) || | ||||
| 	    fifo_pop(&ca->free[reserve], r)) | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (!wait) | ||||
| 		return -1; | ||||
| 
 | ||||
| 	while (1) { | ||||
| 		if (fifo_used(&ca->free) > ca->watermark[watermark]) { | ||||
| 			fifo_pop(&ca->free, r); | ||||
| 			break; | ||||
| 		} | ||||
| 
 | ||||
| 	do { | ||||
| 		prepare_to_wait(&ca->set->bucket_wait, &w, | ||||
| 				TASK_UNINTERRUPTIBLE); | ||||
| 
 | ||||
| 		mutex_unlock(&ca->set->bucket_lock); | ||||
| 		schedule(); | ||||
| 		mutex_lock(&ca->set->bucket_lock); | ||||
| 	} | ||||
| 	} while (!fifo_pop(&ca->free[RESERVE_NONE], r) && | ||||
| 		 !fifo_pop(&ca->free[reserve], r)); | ||||
| 
 | ||||
| 	finish_wait(&ca->set->bucket_wait, &w); | ||||
| out: | ||||
| @ -401,12 +428,14 @@ out: | ||||
| 	if (expensive_debug_checks(ca->set)) { | ||||
| 		size_t iter; | ||||
| 		long i; | ||||
| 		unsigned j; | ||||
| 
 | ||||
| 		for (iter = 0; iter < prio_buckets(ca) * 2; iter++) | ||||
| 			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); | ||||
| 
 | ||||
| 		fifo_for_each(i, &ca->free, iter) | ||||
| 			BUG_ON(i == r); | ||||
| 		for (j = 0; j < RESERVE_NR; j++) | ||||
| 			fifo_for_each(i, &ca->free[j], iter) | ||||
| 				BUG_ON(i == r); | ||||
| 		fifo_for_each(i, &ca->free_inc, iter) | ||||
| 			BUG_ON(i == r); | ||||
| 		fifo_for_each(i, &ca->unused, iter) | ||||
| @ -419,7 +448,7 @@ out: | ||||
| 
 | ||||
| 	SET_GC_SECTORS_USED(b, ca->sb.bucket_size); | ||||
| 
 | ||||
| 	if (watermark <= WATERMARK_METADATA) { | ||||
| 	if (reserve <= RESERVE_PRIO) { | ||||
| 		SET_GC_MARK(b, GC_MARK_METADATA); | ||||
| 		SET_GC_MOVE(b, 0); | ||||
| 		b->prio = BTREE_PRIO; | ||||
| @ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | ||||
| int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, | ||||
| 			   struct bkey *k, int n, bool wait) | ||||
| { | ||||
| 	int i; | ||||
| @ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | ||||
| 
 | ||||
| 	for (i = 0; i < n; i++) { | ||||
| 		struct cache *ca = c->cache_by_alloc[i]; | ||||
| 		long b = bch_bucket_alloc(ca, watermark, wait); | ||||
| 		long b = bch_bucket_alloc(ca, reserve, wait); | ||||
| 
 | ||||
| 		if (b == -1) | ||||
| 			goto err; | ||||
| @ -478,12 +507,12 @@ err: | ||||
| 	return -1; | ||||
| } | ||||
| 
 | ||||
| int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | ||||
| int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, | ||||
| 			 struct bkey *k, int n, bool wait) | ||||
| { | ||||
| 	int ret; | ||||
| 	mutex_lock(&c->bucket_lock); | ||||
| 	ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); | ||||
| 	ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); | ||||
| 	mutex_unlock(&c->bucket_lock); | ||||
| 	return ret; | ||||
| } | ||||
| @ -573,8 +602,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, | ||||
| 
 | ||||
| 	while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { | ||||
| 		unsigned watermark = write_prio | ||||
| 			? WATERMARK_MOVINGGC | ||||
| 			: WATERMARK_NONE; | ||||
| 			? RESERVE_MOVINGGC | ||||
| 			: RESERVE_NONE; | ||||
| 
 | ||||
| 		spin_unlock(&c->data_bucket_lock); | ||||
| 
 | ||||
| @ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca) | ||||
| 	 * Then 8 for btree allocations | ||||
| 	 * Then half for the moving garbage collector | ||||
| 	 */ | ||||
| 
 | ||||
| #if 0 | ||||
| 	ca->watermark[WATERMARK_PRIO] = 0; | ||||
| 
 | ||||
| 	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); | ||||
| @ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca) | ||||
| 
 | ||||
| 	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + | ||||
| 		ca->watermark[WATERMARK_MOVINGGC]; | ||||
| 
 | ||||
| #endif | ||||
| 	return 0; | ||||
| } | ||||
|  | ||||
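The allocator changes above replace the single free FIFO and watermark array with per-reserve FIFOs; bch_allocator_push() tops up RESERVE_PRIO first and then falls back to the remaining reserves in order. A stand-alone sketch of that ordering, where fixed-size arrays stand in for bcache's DECLARE_FIFO()s and the sizes are invented:

#include <stdbool.h>
#include <stdio.h>

enum alloc_reserve { RESERVE_BTREE, RESERVE_PRIO, RESERVE_MOVINGGC,
		     RESERVE_NONE, RESERVE_NR };

#define FIFO_SIZE 4	/* invented; real reserve sizes differ */

struct fifo { long data[FIFO_SIZE]; unsigned used; };

static bool fifo_push(struct fifo *f, long v)
{
	if (f->used == FIFO_SIZE)
		return false;
	f->data[f->used++] = v;
	return true;
}

static bool allocator_push(struct fifo reserves[RESERVE_NR], long bucket)
{
	unsigned i;

	/* Prios/gens are the most important reserve, so it is refilled first. */
	if (fifo_push(&reserves[RESERVE_PRIO], bucket))
		return true;

	for (i = 0; i < RESERVE_NR; i++)
		if (fifo_push(&reserves[i], bucket))
			return true;

	return false;	/* all reserves full: the allocator thread would wait */
}

int main(void)
{
	struct fifo reserves[RESERVE_NR] = { { { 0 } } };
	long bucket;

	for (bucket = 0; bucket < 32; bucket++)
		if (!allocator_push(reserves, bucket))
			break;

	printf("all reserves full after %ld buckets\n", bucket);
	return 0;
}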
| @ -187,6 +187,7 @@ | ||||
| #include <linux/types.h> | ||||
| #include <linux/workqueue.h> | ||||
| 
 | ||||
| #include "bset.h" | ||||
| #include "util.h" | ||||
| #include "closure.h" | ||||
| 
 | ||||
| @ -309,7 +310,8 @@ struct cached_dev { | ||||
| 	struct cache_sb		sb; | ||||
| 	struct bio		sb_bio; | ||||
| 	struct bio_vec		sb_bv[1]; | ||||
| 	struct closure_with_waitlist sb_write; | ||||
| 	struct closure		sb_write; | ||||
| 	struct semaphore	sb_write_mutex; | ||||
| 
 | ||||
| 	/* Refcount on the cache set. Always nonzero when we're caching. */ | ||||
| 	atomic_t		count; | ||||
| @ -382,12 +384,12 @@ struct cached_dev { | ||||
| 	unsigned		writeback_rate_p_term_inverse; | ||||
| }; | ||||
| 
 | ||||
| enum alloc_watermarks { | ||||
| 	WATERMARK_PRIO, | ||||
| 	WATERMARK_METADATA, | ||||
| 	WATERMARK_MOVINGGC, | ||||
| 	WATERMARK_NONE, | ||||
| 	WATERMARK_MAX | ||||
| enum alloc_reserve { | ||||
| 	RESERVE_BTREE, | ||||
| 	RESERVE_PRIO, | ||||
| 	RESERVE_MOVINGGC, | ||||
| 	RESERVE_NONE, | ||||
| 	RESERVE_NR, | ||||
| }; | ||||
| 
 | ||||
| struct cache { | ||||
| @ -399,8 +401,6 @@ struct cache { | ||||
| 	struct kobject		kobj; | ||||
| 	struct block_device	*bdev; | ||||
| 
 | ||||
| 	unsigned		watermark[WATERMARK_MAX]; | ||||
| 
 | ||||
| 	struct task_struct	*alloc_thread; | ||||
| 
 | ||||
| 	struct closure		prio; | ||||
| @ -429,7 +429,7 @@ struct cache { | ||||
| 	 * because all the data they contained was overwritten), so we only | ||||
| 	 * need to discard them before they can be moved to the free list. | ||||
| 	 */ | ||||
| 	DECLARE_FIFO(long, free); | ||||
| 	DECLARE_FIFO(long, free)[RESERVE_NR]; | ||||
| 	DECLARE_FIFO(long, free_inc); | ||||
| 	DECLARE_FIFO(long, unused); | ||||
| 
 | ||||
| @ -514,7 +514,8 @@ struct cache_set { | ||||
| 	uint64_t		cached_dev_sectors; | ||||
| 	struct closure		caching; | ||||
| 
 | ||||
| 	struct closure_with_waitlist sb_write; | ||||
| 	struct closure		sb_write; | ||||
| 	struct semaphore	sb_write_mutex; | ||||
| 
 | ||||
| 	mempool_t		*search; | ||||
| 	mempool_t		*bio_meta; | ||||
| @ -629,13 +630,15 @@ struct cache_set { | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 	struct btree		*verify_data; | ||||
| 	struct bset		*verify_ondisk; | ||||
| 	struct mutex		verify_lock; | ||||
| #endif | ||||
| 
 | ||||
| 	unsigned		nr_uuids; | ||||
| 	struct uuid_entry	*uuids; | ||||
| 	BKEY_PADDED(uuid_bucket); | ||||
| 	struct closure_with_waitlist uuid_write; | ||||
| 	struct closure		uuid_write; | ||||
| 	struct semaphore	uuid_write_mutex; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * A btree node on disk could have too many bsets for an iterator to fit | ||||
| @ -643,13 +646,7 @@ struct cache_set { | ||||
| 	 */ | ||||
| 	mempool_t		*fill_iter; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * btree_sort() is a merge sort and requires temporary space - single | ||||
| 	 * element mempool | ||||
| 	 */ | ||||
| 	struct mutex		sort_lock; | ||||
| 	struct bset		*sort; | ||||
| 	unsigned		sort_crit_factor; | ||||
| 	struct bset_sort_state	sort; | ||||
| 
 | ||||
| 	/* List of buckets we're currently writing data to */ | ||||
| 	struct list_head	data_buckets; | ||||
| @ -665,7 +662,6 @@ struct cache_set { | ||||
| 	unsigned		congested_read_threshold_us; | ||||
| 	unsigned		congested_write_threshold_us; | ||||
| 
 | ||||
| 	struct time_stats	sort_time; | ||||
| 	struct time_stats	btree_gc_time; | ||||
| 	struct time_stats	btree_split_time; | ||||
| 	struct time_stats	btree_read_time; | ||||
| @ -683,9 +679,9 @@ struct cache_set { | ||||
| 	unsigned		error_decay; | ||||
| 
 | ||||
| 	unsigned short		journal_delay_ms; | ||||
| 	bool			expensive_debug_checks; | ||||
| 	unsigned		verify:1; | ||||
| 	unsigned		key_merging_disabled:1; | ||||
| 	unsigned		expensive_debug_checks:1; | ||||
| 	unsigned		gc_always_rewrite:1; | ||||
| 	unsigned		shrinker_disabled:1; | ||||
| 	unsigned		copy_gc_enabled:1; | ||||
| @ -707,13 +703,8 @@ struct bbio { | ||||
| 	struct bio		bio; | ||||
| }; | ||||
| 
 | ||||
| static inline unsigned local_clock_us(void) | ||||
| { | ||||
| 	return local_clock() >> 10; | ||||
| } | ||||
| 
 | ||||
| #define BTREE_PRIO		USHRT_MAX | ||||
| #define INITIAL_PRIO		32768 | ||||
| #define INITIAL_PRIO		32768U | ||||
| 
 | ||||
| #define btree_bytes(c)		((c)->btree_pages * PAGE_SIZE) | ||||
| #define btree_blocks(b)							\ | ||||
| @ -726,21 +717,6 @@ static inline unsigned local_clock_us(void) | ||||
| #define bucket_bytes(c)		((c)->sb.bucket_size << 9) | ||||
| #define block_bytes(c)		((c)->sb.block_size << 9) | ||||
| 
 | ||||
| #define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t)) | ||||
| #define set_bytes(i)		__set_bytes(i, i->keys) | ||||
| 
 | ||||
| #define __set_blocks(i, k, c)	DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) | ||||
| #define set_blocks(i, c)	__set_blocks(i, (i)->keys, c) | ||||
| 
 | ||||
| #define node(i, j)		((struct bkey *) ((i)->d + (j))) | ||||
| #define end(i)			node(i, (i)->keys) | ||||
| 
 | ||||
| #define index(i, b)							\ | ||||
| 	((size_t) (((void *) i - (void *) (b)->sets[0].data) /		\ | ||||
| 		   block_bytes(b->c))) | ||||
| 
 | ||||
| #define btree_data_space(b)	(PAGE_SIZE << (b)->page_order) | ||||
| 
 | ||||
| #define prios_per_bucket(c)				\ | ||||
| 	((bucket_bytes(c) - sizeof(struct prio_set)) /	\ | ||||
| 	 sizeof(struct bucket_disk)) | ||||
| @ -783,20 +759,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, | ||||
| 	return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); | ||||
| } | ||||
| 
 | ||||
| /* Btree key macros */ | ||||
| 
 | ||||
| static inline void bkey_init(struct bkey *k) | ||||
| static inline uint8_t gen_after(uint8_t a, uint8_t b) | ||||
| { | ||||
| 	*k = ZERO_KEY; | ||||
| 	uint8_t r = a - b; | ||||
| 	return r > 128U ? 0 : r; | ||||
| } | ||||
| 
 | ||||
| static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, | ||||
| 				unsigned i) | ||||
| { | ||||
| 	return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); | ||||
| } | ||||
| 
 | ||||
| static inline bool ptr_available(struct cache_set *c, const struct bkey *k, | ||||
| 				 unsigned i) | ||||
| { | ||||
| 	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); | ||||
| } | ||||
| 
 | ||||
| /* Btree key macros */ | ||||
| 
 | ||||
| /*
 | ||||
|  * This is used for various on disk data structures - cache_sb, prio_set, bset, | ||||
|  * jset: The checksum is _always_ the first 8 bytes of these structs | ||||
|  */ | ||||
| #define csum_set(i)							\ | ||||
| 	bch_crc64(((void *) (i)) + sizeof(uint64_t),			\ | ||||
| 	      ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) | ||||
| 		  ((void *) bset_bkey_last(i)) -			\ | ||||
| 		  (((void *) (i)) + sizeof(uint64_t))) | ||||
| 
 | ||||
| /* Error handling macros */ | ||||
| 
 | ||||
|  | ||||
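The ptr_stale()/gen_after() helpers moved into bcache.h above rely on 8-bit modular arithmetic so that generation counters can wrap. A stand-alone sketch of just gen_after(), exercised with invented generation values:

#include <stdint.h>
#include <stdio.h>

/* Copied from the hunk above. */
static uint8_t gen_after(uint8_t a, uint8_t b)
{
	uint8_t r = a - b;

	return r > 128U ? 0 : r;
}

int main(void)
{
	/* Bucket gen has wrapped past the pointer gen: pointer is 8 gens stale. */
	printf("%u\n", gen_after(2, 250));	/* prints 8 */
	/* Difference greater than 128 is treated as "not after": not stale.    */
	printf("%u\n", gen_after(250, 2));	/* prints 0 */
	return 0;
}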
										
											
(File diff suppressed because it is too large)
							| @ -1,7 +1,11 @@ | ||||
| #ifndef _BCACHE_BSET_H | ||||
| #define _BCACHE_BSET_H | ||||
| 
 | ||||
| #include <linux/slab.h> | ||||
| #include <linux/bcache.h> | ||||
| #include <linux/kernel.h> | ||||
| #include <linux/types.h> | ||||
| 
 | ||||
| #include "util.h" /* for time_stats */ | ||||
| 
 | ||||
| /*
 | ||||
|  * BKEYS: | ||||
| @ -142,20 +146,13 @@ | ||||
|  * first key in that range of bytes again. | ||||
|  */ | ||||
| 
 | ||||
| /* Btree key comparison/iteration */ | ||||
| struct btree_keys; | ||||
| struct btree_iter; | ||||
| struct btree_iter_set; | ||||
| struct bkey_float; | ||||
| 
 | ||||
| #define MAX_BSETS		4U | ||||
| 
 | ||||
| struct btree_iter { | ||||
| 	size_t size, used; | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 	struct btree *b; | ||||
| #endif | ||||
| 	struct btree_iter_set { | ||||
| 		struct bkey *k, *end; | ||||
| 	} data[MAX_BSETS]; | ||||
| }; | ||||
| 
 | ||||
| struct bset_tree { | ||||
| 	/*
 | ||||
| 	 * We construct a binary tree in an array as if the array | ||||
| @ -165,14 +162,14 @@ struct bset_tree { | ||||
| 	 */ | ||||
| 
 | ||||
| 	/* size of the binary tree and prev array */ | ||||
| 	unsigned	size; | ||||
| 	unsigned		size; | ||||
| 
 | ||||
| 	/* function of size - precalculated for to_inorder() */ | ||||
| 	unsigned	extra; | ||||
| 	unsigned		extra; | ||||
| 
 | ||||
| 	/* copy of the last key in the set */ | ||||
| 	struct bkey	end; | ||||
| 	struct bkey_float *tree; | ||||
| 	struct bkey		end; | ||||
| 	struct bkey_float	*tree; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The nodes in the bset tree point to specific keys - this | ||||
| @ -182,12 +179,219 @@ struct bset_tree { | ||||
| 	 * to keep bkey_float to 4 bytes and prev isn't used in the fast | ||||
| 	 * path. | ||||
| 	 */ | ||||
| 	uint8_t		*prev; | ||||
| 	uint8_t			*prev; | ||||
| 
 | ||||
| 	/* The actual btree node, with pointers to each sorted set */ | ||||
| 	struct bset	*data; | ||||
| 	struct bset		*data; | ||||
| }; | ||||
| 
 | ||||
| struct btree_keys_ops { | ||||
| 	bool		(*sort_cmp)(struct btree_iter_set, | ||||
| 				    struct btree_iter_set); | ||||
| 	struct bkey	*(*sort_fixup)(struct btree_iter *, struct bkey *); | ||||
| 	bool		(*insert_fixup)(struct btree_keys *, struct bkey *, | ||||
| 					struct btree_iter *, struct bkey *); | ||||
| 	bool		(*key_invalid)(struct btree_keys *, | ||||
| 				       const struct bkey *); | ||||
| 	bool		(*key_bad)(struct btree_keys *, const struct bkey *); | ||||
| 	bool		(*key_merge)(struct btree_keys *, | ||||
| 				     struct bkey *, struct bkey *); | ||||
| 	void		(*key_to_text)(char *, size_t, const struct bkey *); | ||||
| 	void		(*key_dump)(struct btree_keys *, const struct bkey *); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Only used for deciding whether to use START_KEY(k) or just the key | ||||
| 	 * itself in a couple places | ||||
| 	 */ | ||||
| 	bool		is_extents; | ||||
| }; | ||||
| 
 | ||||
| struct btree_keys { | ||||
| 	const struct btree_keys_ops	*ops; | ||||
| 	uint8_t			page_order; | ||||
| 	uint8_t			nsets; | ||||
| 	unsigned		last_set_unwritten:1; | ||||
| 	bool			*expensive_debug_checks; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Sets of sorted keys - the real btree node - plus a binary search tree | ||||
| 	 * | ||||
| 	 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point | ||||
| 	 * to the memory we have allocated for this btree node. Additionally, | ||||
| 	 * set[0]->data points to the entire btree node as it exists on disk. | ||||
| 	 */ | ||||
| 	struct bset_tree	set[MAX_BSETS]; | ||||
| }; | ||||
| 
 | ||||
| static inline struct bset_tree *bset_tree_last(struct btree_keys *b) | ||||
| { | ||||
| 	return b->set + b->nsets; | ||||
| } | ||||
| 
 | ||||
| static inline bool bset_written(struct btree_keys *b, struct bset_tree *t) | ||||
| { | ||||
| 	return t <= b->set + b->nsets - b->last_set_unwritten; | ||||
| } | ||||
| 
 | ||||
| static inline bool bkey_written(struct btree_keys *b, struct bkey *k) | ||||
| { | ||||
| 	return !b->last_set_unwritten || k < b->set[b->nsets].data->start; | ||||
| } | ||||
| 
 | ||||
| static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i) | ||||
| { | ||||
| 	return ((size_t) i) - ((size_t) b->set->data); | ||||
| } | ||||
| 
 | ||||
| static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) | ||||
| { | ||||
| 	return bset_byte_offset(b, i) >> 9; | ||||
| } | ||||
| 
 | ||||
| #define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t)) | ||||
| #define set_bytes(i)		__set_bytes(i, i->keys) | ||||
| 
 | ||||
| #define __set_blocks(i, k, block_bytes)				\ | ||||
| 	DIV_ROUND_UP(__set_bytes(i, k), block_bytes) | ||||
| #define set_blocks(i, block_bytes)				\ | ||||
| 	__set_blocks(i, (i)->keys, block_bytes) | ||||
| 
 | ||||
| static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) | ||||
| { | ||||
| 	struct bset_tree *t = bset_tree_last(b); | ||||
| 
 | ||||
| 	BUG_ON((PAGE_SIZE << b->page_order) < | ||||
| 	       (bset_byte_offset(b, t->data) + set_bytes(t->data))); | ||||
| 
 | ||||
| 	if (!b->last_set_unwritten) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	return ((PAGE_SIZE << b->page_order) - | ||||
| 		(bset_byte_offset(b, t->data) + set_bytes(t->data))) / | ||||
| 		sizeof(u64); | ||||
| } | ||||
| 
 | ||||
| static inline struct bset *bset_next_set(struct btree_keys *b, | ||||
| 					 unsigned block_bytes) | ||||
| { | ||||
| 	struct bset *i = bset_tree_last(b)->data; | ||||
| 
 | ||||
| 	return ((void *) i) + roundup(set_bytes(i), block_bytes); | ||||
| } | ||||
| 
 | ||||
| void bch_btree_keys_free(struct btree_keys *); | ||||
| int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t); | ||||
| void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, | ||||
| 			 bool *); | ||||
| 
 | ||||
| void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); | ||||
| void bch_bset_build_written_tree(struct btree_keys *); | ||||
| void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); | ||||
| bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *); | ||||
| void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); | ||||
| unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, | ||||
| 			      struct bkey *); | ||||
| 
 | ||||
| enum { | ||||
| 	BTREE_INSERT_STATUS_NO_INSERT = 0, | ||||
| 	BTREE_INSERT_STATUS_INSERT, | ||||
| 	BTREE_INSERT_STATUS_BACK_MERGE, | ||||
| 	BTREE_INSERT_STATUS_OVERWROTE, | ||||
| 	BTREE_INSERT_STATUS_FRONT_MERGE, | ||||
| }; | ||||
| 
 | ||||
| /* Btree key iteration */ | ||||
| 
 | ||||
| struct btree_iter { | ||||
| 	size_t size, used; | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 	struct btree_keys *b; | ||||
| #endif | ||||
| 	struct btree_iter_set { | ||||
| 		struct bkey *k, *end; | ||||
| 	} data[MAX_BSETS]; | ||||
| }; | ||||
| 
 | ||||
| typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *); | ||||
| 
 | ||||
| struct bkey *bch_btree_iter_next(struct btree_iter *); | ||||
| struct bkey *bch_btree_iter_next_filter(struct btree_iter *, | ||||
| 					struct btree_keys *, ptr_filter_fn); | ||||
| 
 | ||||
| void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); | ||||
| struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *, | ||||
| 				 struct bkey *); | ||||
| 
 | ||||
| struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *, | ||||
| 			       const struct bkey *); | ||||
| 
 | ||||
| /*
 | ||||
|  * Returns the first key that is strictly greater than search | ||||
|  */ | ||||
| static inline struct bkey *bch_bset_search(struct btree_keys *b, | ||||
| 					   struct bset_tree *t, | ||||
| 					   const struct bkey *search) | ||||
| { | ||||
| 	return search ? __bch_bset_search(b, t, search) : t->data->start; | ||||
| } | ||||
| 
 | ||||
| #define for_each_key_filter(b, k, iter, filter)				\ | ||||
| 	for (bch_btree_iter_init((b), (iter), NULL);			\ | ||||
| 	     ((k) = bch_btree_iter_next_filter((iter), (b), filter));) | ||||
| 
 | ||||
| #define for_each_key(b, k, iter)					\ | ||||
| 	for (bch_btree_iter_init((b), (iter), NULL);			\ | ||||
| 	     ((k) = bch_btree_iter_next(iter));) | ||||
| 
 | ||||
| /* Sorting */ | ||||
| 
 | ||||
| struct bset_sort_state { | ||||
| 	mempool_t		*pool; | ||||
| 
 | ||||
| 	unsigned		page_order; | ||||
| 	unsigned		crit_factor; | ||||
| 
 | ||||
| 	struct time_stats	time; | ||||
| }; | ||||
| 
 | ||||
| void bch_bset_sort_state_free(struct bset_sort_state *); | ||||
| int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); | ||||
| void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *); | ||||
| void bch_btree_sort_into(struct btree_keys *, struct btree_keys *, | ||||
| 			 struct bset_sort_state *); | ||||
| void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, | ||||
| 				    struct bset_sort_state *); | ||||
| void bch_btree_sort_partial(struct btree_keys *, unsigned, | ||||
| 			    struct bset_sort_state *); | ||||
| 
 | ||||
| static inline void bch_btree_sort(struct btree_keys *b, | ||||
| 				  struct bset_sort_state *state) | ||||
| { | ||||
| 	bch_btree_sort_partial(b, 0, state); | ||||
| } | ||||
| 
 | ||||
| struct bset_stats { | ||||
| 	size_t sets_written, sets_unwritten; | ||||
| 	size_t bytes_written, bytes_unwritten; | ||||
| 	size_t floats, failed; | ||||
| }; | ||||
| 
 | ||||
| void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *); | ||||
| 
 | ||||
| /* Bkey utility code */ | ||||
| 
 | ||||
| #define bset_bkey_last(i)	bkey_idx((struct bkey *) (i)->d, (i)->keys) | ||||
| 
 | ||||
| static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) | ||||
| { | ||||
| 	return bkey_idx(i->start, idx); | ||||
| } | ||||
| 
 | ||||
| static inline void bkey_init(struct bkey *k) | ||||
| { | ||||
| 	*k = ZERO_KEY; | ||||
| } | ||||
| 
 | ||||
| static __always_inline int64_t bkey_cmp(const struct bkey *l, | ||||
| 					const struct bkey *r) | ||||
| { | ||||
| @ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, | ||||
| 		: (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); | ||||
| } | ||||
| 
 | ||||
| void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, | ||||
| 			      unsigned); | ||||
| bool __bch_cut_front(const struct bkey *, struct bkey *); | ||||
| bool __bch_cut_back(const struct bkey *, struct bkey *); | ||||
| 
 | ||||
| static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) | ||||
| { | ||||
| 	BUG_ON(bkey_cmp(where, k) > 0); | ||||
| 	return __bch_cut_front(where, k); | ||||
| } | ||||
| 
 | ||||
| static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) | ||||
| { | ||||
| 	BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); | ||||
| 	return __bch_cut_back(where, k); | ||||
| } | ||||
| 
 | ||||
| #define PRECEDING_KEY(_k)					\ | ||||
| ({								\ | ||||
| 	struct bkey *_ret = NULL;				\ | ||||
| 								\ | ||||
| 	if (KEY_INODE(_k) || KEY_OFFSET(_k)) {			\ | ||||
| 		_ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0);	\ | ||||
| 								\ | ||||
| 		if (!_ret->low)					\ | ||||
| 			_ret->high--;				\ | ||||
| 		_ret->low--;					\ | ||||
| 	}							\ | ||||
| 								\ | ||||
| 	_ret;							\ | ||||
| }) | ||||
| 
 | ||||
| static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) | ||||
| { | ||||
| 	return b->ops->key_invalid(b, k); | ||||
| } | ||||
| 
 | ||||
| static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) | ||||
| { | ||||
| 	return b->ops->key_bad(b, k); | ||||
| } | ||||
| 
 | ||||
| static inline void bch_bkey_to_text(struct btree_keys *b, char *buf, | ||||
| 				    size_t size, const struct bkey *k) | ||||
| { | ||||
| 	return b->ops->key_to_text(buf, size, k); | ||||
| } | ||||
| 
 | ||||
| static inline bool bch_bkey_equal_header(const struct bkey *l, | ||||
| 					 const struct bkey *r) | ||||
| { | ||||
| 	return (KEY_DIRTY(l) == KEY_DIRTY(r) && | ||||
| 		KEY_PTRS(l) == KEY_PTRS(r) && | ||||
| 		KEY_CSUM(l) == KEY_CSUM(r)); | ||||
| } | ||||
| 
 | ||||
| /* Keylists */ | ||||
| 
 | ||||
| struct keylist { | ||||
| @ -257,136 +517,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l) | ||||
| 
 | ||||
| struct bkey *bch_keylist_pop(struct keylist *); | ||||
| void bch_keylist_pop_front(struct keylist *); | ||||
| int bch_keylist_realloc(struct keylist *, int, struct cache_set *); | ||||
| int __bch_keylist_realloc(struct keylist *, unsigned); | ||||
| 
 | ||||
| void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, | ||||
| 			      unsigned); | ||||
| bool __bch_cut_front(const struct bkey *, struct bkey *); | ||||
| bool __bch_cut_back(const struct bkey *, struct bkey *); | ||||
| /* Debug stuff */ | ||||
| 
 | ||||
| static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) | ||||
| { | ||||
| 	BUG_ON(bkey_cmp(where, k) > 0); | ||||
| 	return __bch_cut_front(where, k); | ||||
| } | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 
 | ||||
| static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) | ||||
| { | ||||
| 	BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); | ||||
| 	return __bch_cut_back(where, k); | ||||
| } | ||||
| int __bch_count_data(struct btree_keys *); | ||||
| void __bch_check_keys(struct btree_keys *, const char *, ...); | ||||
| void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); | ||||
| void bch_dump_bucket(struct btree_keys *); | ||||
| 
 | ||||
| const char *bch_ptr_status(struct cache_set *, const struct bkey *); | ||||
| bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); | ||||
| bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); | ||||
| #else | ||||
| 
 | ||||
| bool bch_ptr_bad(struct btree *, const struct bkey *); | ||||
| 
 | ||||
| static inline uint8_t gen_after(uint8_t a, uint8_t b) | ||||
| { | ||||
| 	uint8_t r = a - b; | ||||
| 	return r > 128U ? 0 : r; | ||||
| } | ||||
| 
 | ||||
| static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, | ||||
| 				unsigned i) | ||||
| { | ||||
| 	return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); | ||||
| } | ||||
| 
 | ||||
| static inline bool ptr_available(struct cache_set *c, const struct bkey *k, | ||||
| 				 unsigned i) | ||||
| { | ||||
| 	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); | ||||
| 
 | ||||
| struct bkey *bch_btree_iter_next(struct btree_iter *); | ||||
| struct bkey *bch_btree_iter_next_filter(struct btree_iter *, | ||||
| 					struct btree *, ptr_filter_fn); | ||||
| 
 | ||||
| void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); | ||||
| struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, | ||||
| 				   struct bkey *, struct bset_tree *); | ||||
| 
 | ||||
| /* 32 bits total: */ | ||||
| #define BKEY_MID_BITS		3 | ||||
| #define BKEY_EXPONENT_BITS	7 | ||||
| #define BKEY_MANTISSA_BITS	22 | ||||
| #define BKEY_MANTISSA_MASK	((1 << BKEY_MANTISSA_BITS) - 1) | ||||
| 
 | ||||
| struct bkey_float { | ||||
| 	unsigned	exponent:BKEY_EXPONENT_BITS; | ||||
| 	unsigned	m:BKEY_MID_BITS; | ||||
| 	unsigned	mantissa:BKEY_MANTISSA_BITS; | ||||
| } __packed; | ||||
| 
 | ||||
| /*
 | ||||
|  * BSET_CACHELINE was originally intended to match the hardware cacheline size - | ||||
|  * it used to be 64, but I realized the lookup code would touch slightly less | ||||
|  * memory if it was 128. | ||||
|  * | ||||
|  * It defines the number of bytes (in struct bset) per struct bkey_float in | ||||
|  * the auxiliary search tree - when we're done searching the bset_float tree we | ||||
|  * have this many bytes left that we do a linear search over. | ||||
|  * | ||||
|  * Since (after level 5) every level of the bset_tree is on a new cacheline, | ||||
|  * we're touching one fewer cacheline in the bset tree in exchange for one more | ||||
|  * cacheline in the linear search - but the linear search might stop before it | ||||
|  * gets to the second cacheline. | ||||
|  */ | ||||
| 
 | ||||
| #define BSET_CACHELINE		128 | ||||
| #define bset_tree_space(b)	(btree_data_space(b) / BSET_CACHELINE) | ||||
| 
 | ||||
| #define bset_tree_bytes(b)	(bset_tree_space(b) * sizeof(struct bkey_float)) | ||||
| #define bset_prev_bytes(b)	(bset_tree_space(b) * sizeof(uint8_t)) | ||||
| 
 | ||||
| void bch_bset_init_next(struct btree *); | ||||
| 
 | ||||
| void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); | ||||
| void bch_bset_fix_lookup_table(struct btree *, struct bkey *); | ||||
| 
 | ||||
| struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, | ||||
| 			   const struct bkey *); | ||||
| 
 | ||||
| /*
 | ||||
|  * Returns the first key that is strictly greater than search | ||||
|  */ | ||||
| static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, | ||||
| 					   const struct bkey *search) | ||||
| { | ||||
| 	return search ? __bch_bset_search(b, t, search) : t->data->start; | ||||
| } | ||||
| 
 | ||||
| #define PRECEDING_KEY(_k)					\ | ||||
| ({								\ | ||||
| 	struct bkey *_ret = NULL;				\ | ||||
| 								\ | ||||
| 	if (KEY_INODE(_k) || KEY_OFFSET(_k)) {			\ | ||||
| 		_ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0);	\ | ||||
| 								\ | ||||
| 		if (!_ret->low)					\ | ||||
| 			_ret->high--;				\ | ||||
| 		_ret->low--;					\ | ||||
| 	}							\ | ||||
| 								\ | ||||
| 	_ret;							\ | ||||
| }) | ||||
| 
 | ||||
| bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); | ||||
| void bch_btree_sort_lazy(struct btree *); | ||||
| void bch_btree_sort_into(struct btree *, struct btree *); | ||||
| void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); | ||||
| void bch_btree_sort_partial(struct btree *, unsigned); | ||||
| 
 | ||||
| static inline void bch_btree_sort(struct btree *b) | ||||
| { | ||||
| 	bch_btree_sort_partial(b, 0); | ||||
| } | ||||
| 
 | ||||
| int bch_bset_print_stats(struct cache_set *, char *); | ||||
| static inline int __bch_count_data(struct btree_keys *b) { return -1; } | ||||
| static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} | ||||
| static inline void bch_dump_bucket(struct btree_keys *b) {} | ||||
| void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| static inline bool btree_keys_expensive_checks(struct btree_keys *b) | ||||
| { | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 	return *b->expensive_debug_checks; | ||||
| #else | ||||
| 	return false; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| static inline int bch_count_data(struct btree_keys *b) | ||||
| { | ||||
| 	return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1; | ||||
| } | ||||
| 
 | ||||
| #define bch_check_keys(b, ...)						\ | ||||
| do {									\ | ||||
| 	if (btree_keys_expensive_checks(b))				\ | ||||
| 		__bch_check_keys(b, __VA_ARGS__);			\ | ||||
| } while (0) | ||||
| 
 | ||||
| #endif | ||||
|  | ||||
										
											
File diff suppressed because it is too large
							| @ -130,20 +130,12 @@ struct btree { | ||||
| 	unsigned long		flags; | ||||
| 	uint16_t		written;	/* would be nice to kill */ | ||||
| 	uint8_t			level; | ||||
| 	uint8_t			nsets; | ||||
| 	uint8_t			page_order; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Set of sorted keys - the real btree node - plus a binary search tree | ||||
| 	 * | ||||
| 	 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point | ||||
| 	 * to the memory we have allocated for this btree node. Additionally, | ||||
| 	 * set[0]->data points to the entire btree node as it exists on disk. | ||||
| 	 */ | ||||
| 	struct bset_tree	sets[MAX_BSETS]; | ||||
| 	struct btree_keys	keys; | ||||
| 
 | ||||
| 	/* For outstanding btree writes, used as a lock - protects write_idx */ | ||||
| 	struct closure_with_waitlist	io; | ||||
| 	struct closure		io; | ||||
| 	struct semaphore	io_mutex; | ||||
| 
 | ||||
| 	struct list_head	list; | ||||
| 	struct delayed_work	work; | ||||
| @ -179,24 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b) | ||||
| 	return b->writes + (btree_node_write_idx(b) ^ 1); | ||||
| } | ||||
| 
 | ||||
| static inline unsigned bset_offset(struct btree *b, struct bset *i) | ||||
| static inline struct bset *btree_bset_first(struct btree *b) | ||||
| { | ||||
| 	return (((size_t) i) - ((size_t) b->sets->data)) >> 9; | ||||
| 	return b->keys.set->data; | ||||
| } | ||||
| 
 | ||||
| static inline struct bset *write_block(struct btree *b) | ||||
| static inline struct bset *btree_bset_last(struct btree *b) | ||||
| { | ||||
| 	return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); | ||||
| 	return bset_tree_last(&b->keys)->data; | ||||
| } | ||||
| 
 | ||||
| static inline bool bset_written(struct btree *b, struct bset_tree *t) | ||||
| static inline unsigned bset_block_offset(struct btree *b, struct bset *i) | ||||
| { | ||||
| 	return t->data < write_block(b); | ||||
| } | ||||
| 
 | ||||
| static inline bool bkey_written(struct btree *b, struct bkey *k) | ||||
| { | ||||
| 	return k < write_block(b)->start; | ||||
| 	return bset_sector_offset(&b->keys, i) >> b->c->block_bits; | ||||
| } | ||||
| 
 | ||||
| static inline void set_gc_sectors(struct cache_set *c) | ||||
| @ -204,21 +191,6 @@ static inline void set_gc_sectors(struct cache_set *c) | ||||
| 	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); | ||||
| } | ||||
| 
 | ||||
| static inline struct bkey *bch_btree_iter_init(struct btree *b, | ||||
| 					       struct btree_iter *iter, | ||||
| 					       struct bkey *search) | ||||
| { | ||||
| 	return __bch_btree_iter_init(b, iter, search, b->sets); | ||||
| } | ||||
| 
 | ||||
| static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) | ||||
| { | ||||
| 	if (b->level) | ||||
| 		return bch_btree_ptr_invalid(b->c, k); | ||||
| 	else | ||||
| 		return bch_extent_ptr_invalid(b->c, k); | ||||
| } | ||||
| 
 | ||||
| void bkey_put(struct cache_set *c, struct bkey *k); | ||||
| 
 | ||||
| /* Looping macros */ | ||||
| @ -229,17 +201,12 @@ void bkey_put(struct cache_set *c, struct bkey *k); | ||||
| 	     iter++)							\ | ||||
| 		hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) | ||||
| 
 | ||||
| #define for_each_key_filter(b, k, iter, filter)				\ | ||||
| 	for (bch_btree_iter_init((b), (iter), NULL);			\ | ||||
| 	     ((k) = bch_btree_iter_next_filter((iter), b, filter));) | ||||
| 
 | ||||
| #define for_each_key(b, k, iter)					\ | ||||
| 	for (bch_btree_iter_init((b), (iter), NULL);			\ | ||||
| 	     ((k) = bch_btree_iter_next(iter));) | ||||
| 
 | ||||
| /* Recursing down the btree */ | ||||
| 
 | ||||
| struct btree_op { | ||||
| 	/* for waiting on btree reserve in btree_split() */ | ||||
| 	wait_queue_t		wait; | ||||
| 
 | ||||
| 	/* Btree level at which we start taking write locks */ | ||||
| 	short			lock; | ||||
| 
 | ||||
| @ -249,6 +216,7 @@ struct btree_op { | ||||
| static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) | ||||
| { | ||||
| 	memset(op, 0, sizeof(struct btree_op)); | ||||
| 	init_wait(&op->wait); | ||||
| 	op->lock = write_lock_level; | ||||
| } | ||||
| 
 | ||||
| @ -267,7 +235,7 @@ static inline void rw_unlock(bool w, struct btree *b) | ||||
| 	(w ? up_write : up_read)(&b->lock); | ||||
| } | ||||
| 
 | ||||
| void bch_btree_node_read(struct btree *); | ||||
| void bch_btree_node_read_done(struct btree *); | ||||
| void bch_btree_node_write(struct btree *, struct closure *); | ||||
| 
 | ||||
| void bch_btree_set_root(struct btree *); | ||||
|  | ||||
| @ -11,19 +11,6 @@ | ||||
| 
 | ||||
| #include "closure.h" | ||||
| 
 | ||||
| #define CL_FIELD(type, field)					\ | ||||
| 	case TYPE_ ## type:					\ | ||||
| 	return &container_of(cl, struct type, cl)->field | ||||
| 
 | ||||
| static struct closure_waitlist *closure_waitlist(struct closure *cl) | ||||
| { | ||||
| 	switch (cl->type) { | ||||
| 		CL_FIELD(closure_with_waitlist, wait); | ||||
| 	default: | ||||
| 		return NULL; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static inline void closure_put_after_sub(struct closure *cl, int flags) | ||||
| { | ||||
| 	int r = flags & CLOSURE_REMAINING_MASK; | ||||
| @ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) | ||||
| 			closure_queue(cl); | ||||
| 		} else { | ||||
| 			struct closure *parent = cl->parent; | ||||
| 			struct closure_waitlist *wait = closure_waitlist(cl); | ||||
| 			closure_fn *destructor = cl->fn; | ||||
| 
 | ||||
| 			closure_debug_destroy(cl); | ||||
| 
 | ||||
| 			smp_mb(); | ||||
| 			atomic_set(&cl->remaining, -1); | ||||
| 
 | ||||
| 			if (wait) | ||||
| 				closure_wake_up(wait); | ||||
| 
 | ||||
| 			if (destructor) | ||||
| 				destructor(cl); | ||||
| 
 | ||||
| @ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v) | ||||
| } | ||||
| EXPORT_SYMBOL(closure_sub); | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_put - decrement a closure's refcount | ||||
|  */ | ||||
| void closure_put(struct closure *cl) | ||||
| { | ||||
| 	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); | ||||
| } | ||||
| EXPORT_SYMBOL(closure_put); | ||||
| 
 | ||||
| static void set_waiting(struct closure *cl, unsigned long f) | ||||
| { | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| 	cl->waiting_on = f; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * __closure_wake_up - wake up all closures on a wait list, without memory barrier | ||||
|  */ | ||||
| void __closure_wake_up(struct closure_waitlist *wait_list) | ||||
| { | ||||
| 	struct llist_node *list; | ||||
| @ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list) | ||||
| 		cl = container_of(reverse, struct closure, list); | ||||
| 		reverse = llist_next(reverse); | ||||
| 
 | ||||
| 		set_waiting(cl, 0); | ||||
| 		closure_set_waiting(cl, 0); | ||||
| 		closure_sub(cl, CLOSURE_WAITING + 1); | ||||
| 	} | ||||
| } | ||||
| EXPORT_SYMBOL(__closure_wake_up); | ||||
| 
 | ||||
| bool closure_wait(struct closure_waitlist *list, struct closure *cl) | ||||
| /**
 | ||||
|  * closure_wait - add a closure to a waitlist | ||||
|  * | ||||
|  * @waitlist will own a ref on @cl, which will be released when | ||||
|  * closure_wake_up() is called on @waitlist. | ||||
|  * | ||||
|  */ | ||||
| bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) | ||||
| { | ||||
| 	if (atomic_read(&cl->remaining) & CLOSURE_WAITING) | ||||
| 		return false; | ||||
| 
 | ||||
| 	set_waiting(cl, _RET_IP_); | ||||
| 	closure_set_waiting(cl, _RET_IP_); | ||||
| 	atomic_add(CLOSURE_WAITING + 1, &cl->remaining); | ||||
| 	llist_add(&cl->list, &list->list); | ||||
| 	llist_add(&cl->list, &waitlist->list); | ||||
| 
 | ||||
| 	return true; | ||||
| } | ||||
| EXPORT_SYMBOL(closure_wait); | ||||
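As a usage illustration only (all example_* names are invented, and this assumes kernel context with this closure API in scope): a running closure parks itself on a waitlist with closure_wait() and resumes once some other context fires the list with closure_wake_up(), matching the "wait until someone runs closure_wake_up()" usage described in closure.h.

/* Sketch only; the example_* names are invented. */
static struct closure_waitlist example_waitlist;
static struct workqueue_struct *example_wq;	/* assumed to be created elsewhere */

static void example_resume(struct closure *cl)
{
	/* Runs out of example_wq once example_fire() releases the waitlist's ref. */
	closure_return(cl);		/* drop the last ref; parent (if any) resumes */
}

/* Called as the closure's current function, i.e. cl is already running here
 * and not parked on any other waitlist (closure_wait() would return false). */
static void example_park(struct closure *cl)
{
	closure_wait(&example_waitlist, cl);	/* the waitlist now owns a ref on cl */

	/* Drops our running ref and returns; example_resume() cannot run until
	 * the waitlist ref is dropped too, i.e. until example_fire() is called. */
	continue_at(cl, example_resume, example_wq);
}

/* Producer side: wake (and drop the refs held for) everything parked above. */
static void example_fire(void)
{
	closure_wake_up(&example_waitlist);
}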
| 
 | ||||
| /**
 | ||||
|  * closure_sync() - sleep until a closure has nothing left to wait on | ||||
|  * closure_sync - sleep until a closure has nothing left to wait on | ||||
|  * | ||||
|  * Sleeps until the refcount hits 1 - the thread that's running the closure owns | ||||
|  * the last refcount. | ||||
| @ -148,46 +134,6 @@ void closure_sync(struct closure *cl) | ||||
| } | ||||
| EXPORT_SYMBOL(closure_sync); | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_trylock() - try to acquire the closure, without waiting | ||||
|  * @cl:		closure to lock | ||||
|  * | ||||
|  * Returns true if the closure was successfully locked. | ||||
|  */ | ||||
| bool closure_trylock(struct closure *cl, struct closure *parent) | ||||
| { | ||||
| 	if (atomic_cmpxchg(&cl->remaining, -1, | ||||
| 			   CLOSURE_REMAINING_INITIALIZER) != -1) | ||||
| 		return false; | ||||
| 
 | ||||
| 	smp_mb(); | ||||
| 
 | ||||
| 	cl->parent = parent; | ||||
| 	if (parent) | ||||
| 		closure_get(parent); | ||||
| 
 | ||||
| 	closure_set_ret_ip(cl); | ||||
| 	closure_debug_create(cl); | ||||
| 	return true; | ||||
| } | ||||
| EXPORT_SYMBOL(closure_trylock); | ||||
| 
 | ||||
| void __closure_lock(struct closure *cl, struct closure *parent, | ||||
| 		    struct closure_waitlist *wait_list) | ||||
| { | ||||
| 	struct closure wait; | ||||
| 	closure_init_stack(&wait); | ||||
| 
 | ||||
| 	while (1) { | ||||
| 		if (closure_trylock(cl, parent)) | ||||
| 			return; | ||||
| 
 | ||||
| 		closure_wait_event(wait_list, &wait, | ||||
| 				   atomic_read(&cl->remaining) == -1); | ||||
| 	} | ||||
| } | ||||
| EXPORT_SYMBOL(__closure_lock); | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| 
 | ||||
| static LIST_HEAD(closure_list); | ||||
|  | ||||
| @ -72,30 +72,6 @@ | ||||
|  * closure - _always_ use continue_at(). Doing so consistently will help | ||||
|  * eliminate an entire class of particularly pernicious races. | ||||
|  * | ||||
|  * For a closure to wait on an arbitrary event, we need to introduce waitlists: | ||||
|  * | ||||
|  * struct closure_waitlist list; | ||||
|  * closure_wait_event(list, cl, condition); | ||||
|  * closure_wake_up(wait_list); | ||||
|  * | ||||
|  * These work analogously to wait_event() and wake_up() - except that instead of | ||||
|  * operating on the current thread (for wait_event()) and lists of threads, they | ||||
|  * operate on an explicit closure and lists of closures. | ||||
|  * | ||||
|  * Because it's a closure we can now wait either synchronously or | ||||
|  * asynchronously. closure_wait_event() returns the current value of the | ||||
|  * condition, and if it returned false continue_at() or closure_sync() can be | ||||
|  * used to wait for it to become true. | ||||
|  * | ||||
|  * It's useful for waiting on things when you can't sleep in the context in | ||||
|  * which you must check the condition (perhaps a spinlock held, or you might be | ||||
|  * beneath generic_make_request() - in which case you can't sleep on IO). | ||||
|  * | ||||
|  * closure_wait_event() will wait either synchronously or asynchronously, | ||||
|  * depending on whether the closure is in blocking mode or not. You can pick a | ||||
|  * mode explicitly with closure_wait_event_sync() and | ||||
|  * closure_wait_event_async(), which do just what you might expect. | ||||
|  * | ||||
|  * Lastly, you might have a wait list dedicated to a specific event, and have no | ||||
|  * need for specifying the condition - you just want to wait until someone runs | ||||
|  * closure_wake_up() on the appropriate wait list. In that case, just use | ||||
| @ -121,40 +97,6 @@ | ||||
|  * All this implies that a closure should typically be embedded in a particular | ||||
|  * struct (which its refcount will normally control the lifetime of), and that | ||||
|  * struct can very much be thought of as a stack frame. | ||||
|  * | ||||
|  * Locking: | ||||
|  * | ||||
|  * Closures are based on work items but they can be thought of as more like | ||||
|  * threads - in that like threads and unlike work items they have a well | ||||
|  * defined lifetime; they are created (with closure_init()) and eventually | ||||
|  * complete after a continue_at(cl, NULL, NULL). | ||||
|  * | ||||
|  * Suppose you've got some larger structure with a closure embedded in it that's | ||||
|  * used for periodically doing garbage collection. You only want one garbage | ||||
|  * collection happening at a time, so the natural thing to do is protect it with | ||||
|  * a lock. However, it's difficult to use a lock protecting a closure correctly | ||||
|  * because the unlock should come after the last continue_at() (additionally, if | ||||
|  * you're using the closure asynchronously a mutex won't work since a mutex has | ||||
|  * to be unlocked by the same process that locked it). | ||||
|  * | ||||
|  * So to make it less error prone and more efficient, we also have the ability | ||||
|  * to use closures as locks: | ||||
|  * | ||||
|  * closure_init_unlocked(); | ||||
|  * closure_trylock(); | ||||
|  * | ||||
|  * That's all we need for trylock() - the last closure_put() implicitly unlocks | ||||
|  * it for you.  But for closure_lock(), we also need a wait list: | ||||
|  * | ||||
|  * struct closure_with_waitlist frobnicator_cl; | ||||
|  * | ||||
|  * closure_init_unlocked(&frobnicator_cl); | ||||
|  * closure_lock(&frobnicator_cl); | ||||
|  * | ||||
|  * A closure_with_waitlist embeds a closure and a wait list - much like struct | ||||
|  * delayed_work embeds a work item and a timer_list. The important thing is, use | ||||
|  * it exactly like you would a regular closure and closure_put() will magically | ||||
|  * handle everything for you. | ||||
|  */ | ||||
| 
 | ||||
| struct closure; | ||||
| @ -164,12 +106,6 @@ struct closure_waitlist { | ||||
| 	struct llist_head	list; | ||||
| }; | ||||
| 
 | ||||
| enum closure_type { | ||||
| 	TYPE_closure				= 0, | ||||
| 	TYPE_closure_with_waitlist		= 1, | ||||
| 	MAX_CLOSURE_TYPE			= 1, | ||||
| }; | ||||
| 
 | ||||
| enum closure_state { | ||||
| 	/*
 | ||||
| 	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by | ||||
| @ -224,8 +160,6 @@ struct closure { | ||||
| 
 | ||||
| 	atomic_t		remaining; | ||||
| 
 | ||||
| 	enum closure_type	type; | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| #define CLOSURE_MAGIC_DEAD	0xc054dead | ||||
| #define CLOSURE_MAGIC_ALIVE	0xc054a11e | ||||
| @ -237,34 +171,12 @@ struct closure { | ||||
| #endif | ||||
| }; | ||||
| 
 | ||||
| struct closure_with_waitlist { | ||||
| 	struct closure		cl; | ||||
| 	struct closure_waitlist	wait; | ||||
| }; | ||||
| 
 | ||||
| extern unsigned invalid_closure_type(void); | ||||
| 
 | ||||
| #define __CLOSURE_TYPE(cl, _t)						\ | ||||
| 	  __builtin_types_compatible_p(typeof(cl), struct _t)		\ | ||||
| 		? TYPE_ ## _t :						\ | ||||
| 
 | ||||
| #define __closure_type(cl)						\ | ||||
| (									\ | ||||
| 	__CLOSURE_TYPE(cl, closure)					\ | ||||
| 	__CLOSURE_TYPE(cl, closure_with_waitlist)			\ | ||||
| 	invalid_closure_type()						\ | ||||
| ) | ||||
| 
 | ||||
| void closure_sub(struct closure *cl, int v); | ||||
| void closure_put(struct closure *cl); | ||||
| void __closure_wake_up(struct closure_waitlist *list); | ||||
| bool closure_wait(struct closure_waitlist *list, struct closure *cl); | ||||
| void closure_sync(struct closure *cl); | ||||
| 
 | ||||
| bool closure_trylock(struct closure *cl, struct closure *parent); | ||||
| void __closure_lock(struct closure *cl, struct closure *parent, | ||||
| 		    struct closure_waitlist *wait_list); | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| 
 | ||||
| void closure_debug_init(void); | ||||
| @ -293,114 +205,13 @@ static inline void closure_set_ret_ip(struct closure *cl) | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| static inline void closure_get(struct closure *cl) | ||||
| static inline void closure_set_waiting(struct closure *cl, unsigned long f) | ||||
| { | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| 	BUG_ON((atomic_inc_return(&cl->remaining) & | ||||
| 		CLOSURE_REMAINING_MASK) <= 1); | ||||
| #else | ||||
| 	atomic_inc(&cl->remaining); | ||||
| 	cl->waiting_on = f; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| static inline void closure_set_stopped(struct closure *cl) | ||||
| { | ||||
| 	atomic_sub(CLOSURE_RUNNING, &cl->remaining); | ||||
| } | ||||
| 
 | ||||
| static inline bool closure_is_unlocked(struct closure *cl) | ||||
| { | ||||
| 	return atomic_read(&cl->remaining) == -1; | ||||
| } | ||||
| 
 | ||||
| static inline void do_closure_init(struct closure *cl, struct closure *parent, | ||||
| 				   bool running) | ||||
| { | ||||
| 	cl->parent = parent; | ||||
| 	if (parent) | ||||
| 		closure_get(parent); | ||||
| 
 | ||||
| 	if (running) { | ||||
| 		closure_debug_create(cl); | ||||
| 		atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); | ||||
| 	} else | ||||
| 		atomic_set(&cl->remaining, -1); | ||||
| 
 | ||||
| 	closure_set_ip(cl); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Hack to get at the embedded closure if there is one, by doing an unsafe cast: | ||||
|  * the result of __closure_type() is thrown away, it's used merely for type | ||||
|  * checking. | ||||
|  */ | ||||
| #define __to_internal_closure(cl)				\ | ||||
| ({								\ | ||||
| 	BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE);	\ | ||||
| 	(struct closure *) cl;					\ | ||||
| }) | ||||
| 
 | ||||
| #define closure_init_type(cl, parent, running)			\ | ||||
| do {								\ | ||||
| 	struct closure *_cl = __to_internal_closure(cl);	\ | ||||
| 	_cl->type = __closure_type(*(cl));			\ | ||||
| 	do_closure_init(_cl, parent, running);			\ | ||||
| } while (0) | ||||
| 
 | ||||
| /**
 | ||||
|  * __closure_init() - Initialize a closure, skipping the memset() | ||||
|  * | ||||
|  * May be used instead of closure_init() when memory has already been zeroed. | ||||
|  */ | ||||
| #define __closure_init(cl, parent)				\ | ||||
| 	closure_init_type(cl, parent, true) | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_init() - Initialize a closure, setting the refcount to 1 | ||||
|  * @cl:		closure to initialize | ||||
|  * @parent:	parent of the new closure. cl will take a refcount on it for its | ||||
|  *		lifetime; may be NULL. | ||||
|  */ | ||||
| #define closure_init(cl, parent)				\ | ||||
| do {								\ | ||||
| 	memset((cl), 0, sizeof(*(cl)));				\ | ||||
| 	__closure_init(cl, parent);				\ | ||||
| } while (0) | ||||
| 
 | ||||
| static inline void closure_init_stack(struct closure *cl) | ||||
| { | ||||
| 	memset(cl, 0, sizeof(struct closure)); | ||||
| 	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_init_unlocked() - Initialize a closure but leave it unlocked. | ||||
|  * @cl:		closure to initialize | ||||
|  * | ||||
|  * For when the closure will be used as a lock. The closure may not be used | ||||
|  * until after a closure_lock() or closure_trylock(). | ||||
|  */ | ||||
| #define closure_init_unlocked(cl)				\ | ||||
| do {								\ | ||||
| 	memset((cl), 0, sizeof(*(cl)));				\ | ||||
| 	closure_init_type(cl, NULL, false);			\ | ||||
| } while (0) | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_lock() - lock and initialize a closure. | ||||
|  * @cl:		the closure to lock | ||||
|  * @parent:	the new parent for this closure | ||||
|  * | ||||
|  * The closure must be of one of the types that has a waitlist (otherwise we | ||||
|  * wouldn't be able to sleep on contention). | ||||
|  * | ||||
|  * @parent has exactly the same meaning as in closure_init(); if non null, the | ||||
|  * closure will take a reference on @parent which will be released when it is | ||||
|  * unlocked. | ||||
|  */ | ||||
| #define closure_lock(cl, parent)				\ | ||||
| 	__closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) | ||||
| 
 | ||||
| static inline void __closure_end_sleep(struct closure *cl) | ||||
| { | ||||
| 	__set_current_state(TASK_RUNNING); | ||||
| @ -419,65 +230,9 @@ static inline void __closure_start_sleep(struct closure *cl) | ||||
| 		atomic_add(CLOSURE_SLEEPING, &cl->remaining); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_wake_up() - wake up all closures on a wait list. | ||||
|  */ | ||||
| static inline void closure_wake_up(struct closure_waitlist *list) | ||||
| static inline void closure_set_stopped(struct closure *cl) | ||||
| { | ||||
| 	smp_mb(); | ||||
| 	__closure_wake_up(list); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Wait on an event, synchronously or asynchronously - analogous to wait_event() | ||||
|  * but for closures. | ||||
|  * | ||||
|  * The loop is oddly structured so as to avoid a race; we must check the | ||||
|  * condition again after we've added ourselves to the waitlist. We know if we were | ||||
|  * already on the waitlist because closure_wait() returns false; thus, we only | ||||
|  * schedule or break if closure_wait() returns false. If it returns true, we | ||||
|  * just loop again - rechecking the condition. | ||||
|  * | ||||
|  * The __closure_wake_up() is necessary because we may race with the event | ||||
|  * becoming true; i.e. we see event false -> wait -> recheck condition, but the | ||||
|  * thread that made the event true may have called closure_wake_up() before we | ||||
|  * added ourself to the wait list. | ||||
|  * | ||||
|  * We have to call closure_sync() at the end instead of just | ||||
|  * __closure_end_sleep() because a different thread might've called | ||||
|  * closure_wake_up() before us and gotten preempted before they dropped the | ||||
|  * refcount on our closure. If this was a stack allocated closure, that would be | ||||
|  * bad. | ||||
|  */ | ||||
| #define closure_wait_event(list, cl, condition)				\ | ||||
| ({									\ | ||||
| 	typeof(condition) ret;						\ | ||||
| 									\ | ||||
| 	while (1) {							\ | ||||
| 		ret = (condition);					\ | ||||
| 		if (ret) {						\ | ||||
| 			__closure_wake_up(list);			\ | ||||
| 			closure_sync(cl);				\ | ||||
| 			break;						\ | ||||
| 		}							\ | ||||
| 									\ | ||||
| 		__closure_start_sleep(cl);				\ | ||||
| 									\ | ||||
| 		if (!closure_wait(list, cl))				\ | ||||
| 			schedule();					\ | ||||
| 	}								\ | ||||
| 									\ | ||||
| 	ret;								\ | ||||
| }) | ||||
| 
 | ||||
| static inline void closure_queue(struct closure *cl) | ||||
| { | ||||
| 	struct workqueue_struct *wq = cl->wq; | ||||
| 	if (wq) { | ||||
| 		INIT_WORK(&cl->work, cl->work.func); | ||||
| 		BUG_ON(!queue_work(wq, &cl->work)); | ||||
| 	} else | ||||
| 		cl->fn(cl); | ||||
| 	atomic_sub(CLOSURE_RUNNING, &cl->remaining); | ||||
| } | ||||
| 
 | ||||
| static inline void set_closure_fn(struct closure *cl, closure_fn *fn, | ||||
| @ -491,6 +246,76 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, | ||||
| 	smp_mb__before_atomic_dec(); | ||||
| } | ||||
| 
 | ||||
| static inline void closure_queue(struct closure *cl) | ||||
| { | ||||
| 	struct workqueue_struct *wq = cl->wq; | ||||
| 	if (wq) { | ||||
| 		INIT_WORK(&cl->work, cl->work.func); | ||||
| 		BUG_ON(!queue_work(wq, &cl->work)); | ||||
| 	} else | ||||
| 		cl->fn(cl); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_get - increment a closure's refcount | ||||
|  */ | ||||
| static inline void closure_get(struct closure *cl) | ||||
| { | ||||
| #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||||
| 	BUG_ON((atomic_inc_return(&cl->remaining) & | ||||
| 		CLOSURE_REMAINING_MASK) <= 1); | ||||
| #else | ||||
| 	atomic_inc(&cl->remaining); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_init - Initialize a closure, setting the refcount to 1 | ||||
|  * @cl:		closure to initialize | ||||
|  * @parent:	parent of the new closure. cl will take a refcount on it for its | ||||
|  *		lifetime; may be NULL. | ||||
|  */ | ||||
| static inline void closure_init(struct closure *cl, struct closure *parent) | ||||
| { | ||||
| 	memset(cl, 0, sizeof(struct closure)); | ||||
| 	cl->parent = parent; | ||||
| 	if (parent) | ||||
| 		closure_get(parent); | ||||
| 
 | ||||
| 	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); | ||||
| 
 | ||||
| 	closure_debug_create(cl); | ||||
| 	closure_set_ip(cl); | ||||
| } | ||||
| 
 | ||||
| static inline void closure_init_stack(struct closure *cl) | ||||
| { | ||||
| 	memset(cl, 0, sizeof(struct closure)); | ||||
| 	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_wake_up - wake up all closures on a wait list. | ||||
|  */ | ||||
| static inline void closure_wake_up(struct closure_waitlist *list) | ||||
| { | ||||
| 	smp_mb(); | ||||
| 	__closure_wake_up(list); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * continue_at - jump to another function with barrier | ||||
|  * | ||||
|  * After @cl is no longer waiting on anything (i.e. all outstanding refs have | ||||
|  * been dropped with closure_put()), it will resume execution at @fn running out | ||||
|  * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). | ||||
|  * | ||||
|  * NOTE: This macro expands to a return in the calling function! | ||||
|  * | ||||
|  * This is because after calling continue_at() you no longer have a ref on @cl, | ||||
|  * and whatever @cl owns may be freed out from under you - a running closure fn | ||||
|  * has a ref on its own closure which continue_at() drops. | ||||
|  */ | ||||
| #define continue_at(_cl, _fn, _wq)					\ | ||||
| do {									\ | ||||
| 	set_closure_fn(_cl, _fn, _wq);					\ | ||||
| @ -498,8 +323,28 @@ do {									\ | ||||
| 	return;								\ | ||||
| } while (0) | ||||
| 
 | ||||
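To show how continue_at() typically chains with the closure_call() and closure_return() helpers documented below, here is a minimal two-stage sketch. All example_* names are invented, and the "async work" is faked by an immediate closure_put(); this is only an illustration under those assumptions.

/* Sketch only: one closure drives a two-stage operation. */
struct example_op {
	struct closure		cl;
	/* per-operation state would live here */
};

static struct workqueue_struct *example_wq;	/* assumed to be created elsewhere */

static void example_stage2(struct closure *cl)
{
	struct example_op *op = container_of(cl, struct example_op, cl);

	/* Reached only after every ref taken during stage 1 has been dropped. */
	(void) op;			/* real code would finish the operation here */
	closure_return(cl);		/* drop the last ref; the parent resumes */
}

static void example_async_done(struct example_op *op)
{
	closure_put(&op->cl);		/* the completion path drops the ref it owned */
}

static void example_stage1(struct closure *cl)
{
	struct example_op *op = container_of(cl, struct example_op, cl);

	closure_get(cl);		/* ref owned by the "async work" below */
	example_async_done(op);		/* stand-in for submitting real async work */

	/* Expands to a return; stage 2 may already be running after this line. */
	continue_at(cl, example_stage2, example_wq);
}

static void example_submit(struct example_op *op, struct closure *parent)
{
	/* closure_init() + run stage 1 directly; parent then waits on op->cl. */
	closure_call(&op->cl, example_stage1, NULL, parent);
}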
| /**
 | ||||
|  * closure_return - finish execution of a closure | ||||
|  * | ||||
|  * This is used to indicate that @cl is finished: when all outstanding refs on | ||||
|  * @cl have been dropped @cl's ref on its parent closure (as passed to | ||||
|  * closure_init()) will be dropped, if one was specified - thus this can be | ||||
|  * thought of as returning to the parent closure. | ||||
|  */ | ||||
| #define closure_return(_cl)	continue_at((_cl), NULL, NULL) | ||||
| 
 | ||||
| /**
 | ||||
|  * continue_at_nobarrier - jump to another function without barrier | ||||
|  * | ||||
|  * Causes @fn to be executed out of @cl, in @wq context (or called directly if | ||||
|  * @wq is NULL). | ||||
|  * | ||||
|  * NOTE: like continue_at(), this macro expands to a return in the caller! | ||||
|  * | ||||
|  * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, | ||||
|  * thus it's not safe to touch anything protected by @cl after a | ||||
|  * continue_at_nobarrier(). | ||||
|  */ | ||||
| #define continue_at_nobarrier(_cl, _fn, _wq)				\ | ||||
| do {									\ | ||||
| 	set_closure_fn(_cl, _fn, _wq);					\ | ||||
| @ -507,6 +352,15 @@ do {									\ | ||||
| 	return;								\ | ||||
| } while (0) | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_return_with_destructor - finish execution of a closure, with destructor | ||||
|  * | ||||
|  * Works like closure_return(), except @destructor will be called when all | ||||
|  * outstanding refs on @cl have been dropped; @destructor may be used to safely | ||||
|  * free the memory occupied by @cl, and it is called with the ref on the parent | ||||
|  * closure still held - so @destructor could safely return an item to a | ||||
|  * freelist protected by @cl's parent. | ||||
|  */ | ||||
| #define closure_return_with_destructor(_cl, _destructor)		\ | ||||
| do {									\ | ||||
| 	set_closure_fn(_cl, _destructor, NULL);				\ | ||||
| @ -514,6 +368,13 @@ do {									\ | ||||
| 	return;								\ | ||||
| } while (0) | ||||
| 
 | ||||
| /**
 | ||||
|  * closure_call - execute @fn out of a new, uninitialized closure | ||||
|  * | ||||
|  * Typically used when running out of one closure, and we want to run @fn | ||||
|  * asynchronously out of a new closure - @parent will then wait for @cl to | ||||
|  * finish. | ||||
|  */ | ||||
| static inline void closure_call(struct closure *cl, closure_fn fn, | ||||
| 				struct workqueue_struct *wq, | ||||
| 				struct closure *parent) | ||||
| @ -522,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn, | ||||
| 	continue_at_nobarrier(cl, fn, wq); | ||||
| } | ||||
| 
 | ||||
| static inline void closure_trylock_call(struct closure *cl, closure_fn fn, | ||||
| 					struct workqueue_struct *wq, | ||||
| 					struct closure *parent) | ||||
| { | ||||
| 	if (closure_trylock(cl, parent)) | ||||
| 		continue_at_nobarrier(cl, fn, wq); | ||||
| } | ||||
| 
 | ||||
| #endif /* _LINUX_CLOSURE_H */ | ||||
|  | ||||
| @ -8,6 +8,7 @@ | ||||
| #include "bcache.h" | ||||
| #include "btree.h" | ||||
| #include "debug.h" | ||||
| #include "extents.h" | ||||
| 
 | ||||
| #include <linux/console.h> | ||||
| #include <linux/debugfs.h> | ||||
| @ -17,156 +18,88 @@ | ||||
| 
 | ||||
| static struct dentry *debug; | ||||
| 
 | ||||
| const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | ||||
| { | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(k); i++) | ||||
| 		if (ptr_available(c, k, i)) { | ||||
| 			struct cache *ca = PTR_CACHE(c, k, i); | ||||
| 			size_t bucket = PTR_BUCKET_NR(c, k, i); | ||||
| 			size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||||
| 
 | ||||
| 			if (KEY_SIZE(k) + r > c->sb.bucket_size) | ||||
| 				return "bad, length too big"; | ||||
| 			if (bucket <  ca->sb.first_bucket) | ||||
| 				return "bad, short offset"; | ||||
| 			if (bucket >= ca->sb.nbuckets) | ||||
| 				return "bad, offset past end of device"; | ||||
| 			if (ptr_stale(c, k, i)) | ||||
| 				return "stale"; | ||||
| 		} | ||||
| 
 | ||||
| 	if (!bkey_cmp(k, &ZERO_KEY)) | ||||
| 		return "bad, null key"; | ||||
| 	if (!KEY_PTRS(k)) | ||||
| 		return "bad, no pointers"; | ||||
| 	if (!KEY_SIZE(k)) | ||||
| 		return "zeroed key"; | ||||
| 	return ""; | ||||
| } | ||||
| 
 | ||||
| int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) | ||||
| { | ||||
| 	unsigned i = 0; | ||||
| 	char *out = buf, *end = buf + size; | ||||
| 
 | ||||
| #define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__)) | ||||
| 
 | ||||
| 	p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); | ||||
| 
 | ||||
| 	if (KEY_PTRS(k)) | ||||
| 		while (1) { | ||||
| 			p("%llu:%llu gen %llu", | ||||
| 			  PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); | ||||
| 
 | ||||
| 			if (++i == KEY_PTRS(k)) | ||||
| 				break; | ||||
| 
 | ||||
| 			p(", "); | ||||
| 		} | ||||
| 
 | ||||
| 	p("]"); | ||||
| 
 | ||||
| 	if (KEY_DIRTY(k)) | ||||
| 		p(" dirty"); | ||||
| 	if (KEY_CSUM(k)) | ||||
| 		p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | ||||
| #undef p | ||||
| 	return out - buf; | ||||
| } | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 
 | ||||
| static void dump_bset(struct btree *b, struct bset *i) | ||||
| { | ||||
| 	struct bkey *k, *next; | ||||
| 	unsigned j; | ||||
| 	char buf[80]; | ||||
| #define for_each_written_bset(b, start, i)				\ | ||||
| 	for (i = (start);						\ | ||||
| 	     (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ | ||||
| 	     i->seq == (start)->seq;					\ | ||||
| 	     i = (void *) i + set_blocks(i, block_bytes(b->c)) *	\ | ||||
| 		 block_bytes(b->c)) | ||||
| 
 | ||||
| 	for (k = i->start; k < end(i); k = next) { | ||||
| 		next = bkey_next(k); | ||||
| 
 | ||||
| 		bch_bkey_to_text(buf, sizeof(buf), k); | ||||
| 		printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), | ||||
| 		       (uint64_t *) k - i->d, i->keys, buf); | ||||
| 
 | ||||
| 		for (j = 0; j < KEY_PTRS(k); j++) { | ||||
| 			size_t n = PTR_BUCKET_NR(b->c, k, j); | ||||
| 			printk(" bucket %zu", n); | ||||
| 
 | ||||
| 			if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) | ||||
| 				printk(" prio %i", | ||||
| 				       PTR_BUCKET(b->c, k, j)->prio); | ||||
| 		} | ||||
| 
 | ||||
| 		printk(" %s\n", bch_ptr_status(b->c, k)); | ||||
| 
 | ||||
| 		if (next < end(i) && | ||||
| 		    bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0) | ||||
| 			printk(KERN_ERR "Key skipped backwards\n"); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void bch_dump_bucket(struct btree *b) | ||||
| { | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	console_lock(); | ||||
| 	for (i = 0; i <= b->nsets; i++) | ||||
| 		dump_bset(b, b->sets[i].data); | ||||
| 	console_unlock(); | ||||
| } | ||||
| 
 | ||||
| void bch_btree_verify(struct btree *b, struct bset *new) | ||||
| void bch_btree_verify(struct btree *b) | ||||
| { | ||||
| 	struct btree *v = b->c->verify_data; | ||||
| 	struct closure cl; | ||||
| 	closure_init_stack(&cl); | ||||
| 	struct bset *ondisk, *sorted, *inmemory; | ||||
| 	struct bio *bio; | ||||
| 
 | ||||
| 	if (!b->c->verify) | ||||
| 	if (!b->c->verify || !b->c->verify_ondisk) | ||||
| 		return; | ||||
| 
 | ||||
| 	closure_wait_event(&b->io.wait, &cl, | ||||
| 			   atomic_read(&b->io.cl.remaining) == -1); | ||||
| 
 | ||||
| 	down(&b->io_mutex); | ||||
| 	mutex_lock(&b->c->verify_lock); | ||||
| 
 | ||||
| 	ondisk = b->c->verify_ondisk; | ||||
| 	sorted = b->c->verify_data->keys.set->data; | ||||
| 	inmemory = b->keys.set->data; | ||||
| 
 | ||||
| 	bkey_copy(&v->key, &b->key); | ||||
| 	v->written = 0; | ||||
| 	v->level = b->level; | ||||
| 	v->keys.ops = b->keys.ops; | ||||
| 
 | ||||
| 	bch_btree_node_read(v); | ||||
| 	closure_wait_event(&v->io.wait, &cl, | ||||
| 			   atomic_read(&b->io.cl.remaining) == -1); | ||||
| 	bio = bch_bbio_alloc(b->c); | ||||
| 	bio->bi_bdev		= PTR_CACHE(b->c, &b->key, 0)->bdev; | ||||
| 	bio->bi_iter.bi_sector	= PTR_OFFSET(&b->key, 0); | ||||
| 	bio->bi_iter.bi_size	= KEY_SIZE(&v->key) << 9; | ||||
| 	bch_bio_map(bio, sorted); | ||||
| 
 | ||||
| 	if (new->keys != v->sets[0].data->keys || | ||||
| 	    memcmp(new->start, | ||||
| 		   v->sets[0].data->start, | ||||
| 		   (void *) end(new) - (void *) new->start)) { | ||||
| 		unsigned i, j; | ||||
| 	submit_bio_wait(REQ_META|READ_SYNC, bio); | ||||
| 	bch_bbio_free(bio, b->c); | ||||
| 
 | ||||
| 	memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); | ||||
| 
 | ||||
| 	bch_btree_node_read_done(v); | ||||
| 	sorted = v->keys.set->data; | ||||
| 
 | ||||
| 	if (inmemory->keys != sorted->keys || | ||||
| 	    memcmp(inmemory->start, | ||||
| 		   sorted->start, | ||||
| 		   (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) { | ||||
| 		struct bset *i; | ||||
| 		unsigned j; | ||||
| 
 | ||||
| 		console_lock(); | ||||
| 
 | ||||
| 		printk(KERN_ERR "*** original memory node:\n"); | ||||
| 		for (i = 0; i <= b->nsets; i++) | ||||
| 			dump_bset(b, b->sets[i].data); | ||||
| 		printk(KERN_ERR "*** in memory:\n"); | ||||
| 		bch_dump_bset(&b->keys, inmemory, 0); | ||||
| 
 | ||||
| 		printk(KERN_ERR "*** sorted memory node:\n"); | ||||
| 		dump_bset(b, new); | ||||
| 		printk(KERN_ERR "*** read back in:\n"); | ||||
| 		bch_dump_bset(&v->keys, sorted, 0); | ||||
| 
 | ||||
| 		printk(KERN_ERR "*** on disk node:\n"); | ||||
| 		dump_bset(v, v->sets[0].data); | ||||
| 		for_each_written_bset(b, ondisk, i) { | ||||
| 			unsigned block = ((void *) i - (void *) ondisk) / | ||||
| 				block_bytes(b->c); | ||||
| 
 | ||||
| 		for (j = 0; j < new->keys; j++) | ||||
| 			if (new->d[j] != v->sets[0].data->d[j]) | ||||
| 			printk(KERN_ERR "*** on disk block %u:\n", block); | ||||
| 			bch_dump_bset(&b->keys, i, block); | ||||
| 		} | ||||
| 
 | ||||
| 		printk(KERN_ERR "*** block %zu not written\n", | ||||
| 		       ((void *) i - (void *) ondisk) / block_bytes(b->c)); | ||||
| 
 | ||||
| 		for (j = 0; j < inmemory->keys; j++) | ||||
| 			if (inmemory->d[j] != sorted->d[j]) | ||||
| 				break; | ||||
| 
 | ||||
| 		printk(KERN_ERR "b->written %u\n", b->written); | ||||
| 
 | ||||
| 		console_unlock(); | ||||
| 		panic("verify failed at %u\n", j); | ||||
| 	} | ||||
| 
 | ||||
| 	mutex_unlock(&b->c->verify_lock); | ||||
| 	up(&b->io_mutex); | ||||
| } | ||||
| 
 | ||||
| void bch_data_verify(struct cached_dev *dc, struct bio *bio) | ||||
| @ -207,74 +140,6 @@ out_put: | ||||
| 	bio_put(check); | ||||
| } | ||||
| 
 | ||||
| int __bch_count_data(struct btree *b) | ||||
| { | ||||
| 	unsigned ret = 0; | ||||
| 	struct btree_iter iter; | ||||
| 	struct bkey *k; | ||||
| 
 | ||||
| 	if (!b->level) | ||||
| 		for_each_key(b, k, &iter) | ||||
| 			ret += KEY_SIZE(k); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| void __bch_check_keys(struct btree *b, const char *fmt, ...) | ||||
| { | ||||
| 	va_list args; | ||||
| 	struct bkey *k, *p = NULL; | ||||
| 	struct btree_iter iter; | ||||
| 	const char *err; | ||||
| 
 | ||||
| 	for_each_key(b, k, &iter) { | ||||
| 		if (!b->level) { | ||||
| 			err = "Keys out of order"; | ||||
| 			if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) | ||||
| 				goto bug; | ||||
| 
 | ||||
| 			if (bch_ptr_invalid(b, k)) | ||||
| 				continue; | ||||
| 
 | ||||
| 			err =  "Overlapping keys"; | ||||
| 			if (p && bkey_cmp(p, &START_KEY(k)) > 0) | ||||
| 				goto bug; | ||||
| 		} else { | ||||
| 			if (bch_ptr_bad(b, k)) | ||||
| 				continue; | ||||
| 
 | ||||
| 			err = "Duplicate keys"; | ||||
| 			if (p && !bkey_cmp(p, k)) | ||||
| 				goto bug; | ||||
| 		} | ||||
| 		p = k; | ||||
| 	} | ||||
| 
 | ||||
| 	err = "Key larger than btree node key"; | ||||
| 	if (p && bkey_cmp(p, &b->key) > 0) | ||||
| 		goto bug; | ||||
| 
 | ||||
| 	return; | ||||
| bug: | ||||
| 	bch_dump_bucket(b); | ||||
| 
 | ||||
| 	va_start(args, fmt); | ||||
| 	vprintk(fmt, args); | ||||
| 	va_end(args); | ||||
| 
 | ||||
| 	panic("bcache error: %s:\n", err); | ||||
| } | ||||
| 
 | ||||
| void bch_btree_iter_next_check(struct btree_iter *iter) | ||||
| { | ||||
| 	struct bkey *k = iter->data->k, *next = bkey_next(k); | ||||
| 
 | ||||
| 	if (next < iter->data->end && | ||||
| 	    bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) { | ||||
| 		bch_dump_bucket(iter->b); | ||||
| 		panic("Key skipped backwards\n"); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #ifdef CONFIG_DEBUG_FS | ||||
| @ -321,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, | ||||
| 		if (!w) | ||||
| 			break; | ||||
| 
 | ||||
| 		bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); | ||||
| 		bch_extent_to_text(kbuf, sizeof(kbuf), &w->key); | ||||
| 		i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); | ||||
| 		bch_keybuf_del(&i->keys, w); | ||||
| 	} | ||||
|  | ||||
| @ -1,47 +1,30 @@ | ||||
| #ifndef _BCACHE_DEBUG_H | ||||
| #define _BCACHE_DEBUG_H | ||||
| 
 | ||||
| /* Btree/bkey debug printing */ | ||||
| 
 | ||||
| int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); | ||||
| struct bio; | ||||
| struct cached_dev; | ||||
| struct cache_set; | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 
 | ||||
| void bch_btree_verify(struct btree *, struct bset *); | ||||
| void bch_btree_verify(struct btree *); | ||||
| void bch_data_verify(struct cached_dev *, struct bio *); | ||||
| int __bch_count_data(struct btree *); | ||||
| void __bch_check_keys(struct btree *, const char *, ...); | ||||
| void bch_btree_iter_next_check(struct btree_iter *); | ||||
| 
 | ||||
| #define EBUG_ON(cond)			BUG_ON(cond) | ||||
| #define expensive_debug_checks(c)	((c)->expensive_debug_checks) | ||||
| #define key_merging_disabled(c)		((c)->key_merging_disabled) | ||||
| #define bypass_torture_test(d)		((d)->bypass_torture_test) | ||||
| 
 | ||||
| #else /* DEBUG */ | ||||
| 
 | ||||
| static inline void bch_btree_verify(struct btree *b, struct bset *i) {} | ||||
| static inline void bch_btree_verify(struct btree *b) {} | ||||
| static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} | ||||
| static inline int __bch_count_data(struct btree *b) { return -1; } | ||||
| static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} | ||||
| static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} | ||||
| 
 | ||||
| #define EBUG_ON(cond)			do { if (cond); } while (0) | ||||
| #define expensive_debug_checks(c)	0 | ||||
| #define key_merging_disabled(c)		0 | ||||
| #define bypass_torture_test(d)		0 | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #define bch_count_data(b)						\ | ||||
| 	(expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) | ||||
| 
 | ||||
| #define bch_check_keys(b, ...)						\ | ||||
| do {									\ | ||||
| 	if (expensive_debug_checks((b)->c))				\ | ||||
| 		__bch_check_keys(b, __VA_ARGS__);			\ | ||||
| } while (0) | ||||
| 
 | ||||
| #ifdef CONFIG_DEBUG_FS | ||||
| void bch_debug_init_cache_set(struct cache_set *); | ||||
| #else | ||||
|  | ||||
							
								
								
									
drivers/md/bcache/extents.c (new file, 616 lines)
							| @ -0,0 +1,616 @@ | ||||
| /*
 | ||||
|  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> | ||||
|  * | ||||
|  * Uses a block device as cache for other block devices; optimized for SSDs. | ||||
|  * All allocation is done in buckets, which should match the erase block size | ||||
|  * of the device. | ||||
|  * | ||||
|  * Buckets containing cached data are kept on a heap sorted by priority; | ||||
|  * bucket priority is increased on cache hit, and periodically all the buckets | ||||
|  * on the heap have their priority scaled down. This currently is just used as | ||||
|  * an LRU but in the future should allow for more intelligent heuristics. | ||||
|  * | ||||
|  * Buckets have an 8 bit counter; freeing is accomplished by incrementing the | ||||
|  * counter. Garbage collection is used to remove stale pointers. | ||||
|  * | ||||
|  * Indexing is done via a btree; nodes are not necessarily fully sorted, rather | ||||
|  * as keys are inserted we only sort the pages that have not yet been written. | ||||
|  * When garbage collection is run, we resort the entire node. | ||||
|  * | ||||
|  * All configuration is done via sysfs; see Documentation/bcache.txt. | ||||
|  */ | ||||
| 
 | ||||
| #include "bcache.h" | ||||
| #include "btree.h" | ||||
| #include "debug.h" | ||||
| #include "extents.h" | ||||
| #include "writeback.h" | ||||
| 
 | ||||
| static void sort_key_next(struct btree_iter *iter, | ||||
| 			  struct btree_iter_set *i) | ||||
| { | ||||
| 	i->k = bkey_next(i->k); | ||||
| 
 | ||||
| 	if (i->k == i->end) | ||||
| 		*i = iter->data[--iter->used]; | ||||
| } | ||||
| 
 | ||||
| static bool bch_key_sort_cmp(struct btree_iter_set l, | ||||
| 			     struct btree_iter_set r) | ||||
| { | ||||
| 	int64_t c = bkey_cmp(l.k, r.k); | ||||
| 
 | ||||
| 	return c ? c > 0 : l.k < r.k; | ||||
| } | ||||
| 
 | ||||
| static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) | ||||
| { | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(k); i++) | ||||
| 		if (ptr_available(c, k, i)) { | ||||
| 			struct cache *ca = PTR_CACHE(c, k, i); | ||||
| 			size_t bucket = PTR_BUCKET_NR(c, k, i); | ||||
| 			size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||||
| 
 | ||||
| 			if (KEY_SIZE(k) + r > c->sb.bucket_size || | ||||
| 			    bucket <  ca->sb.first_bucket || | ||||
| 			    bucket >= ca->sb.nbuckets) | ||||
| 				return true; | ||||
| 		} | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| /* Common among btree and extent ptrs */ | ||||
| 
 | ||||
| static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | ||||
| { | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(k); i++) | ||||
| 		if (ptr_available(c, k, i)) { | ||||
| 			struct cache *ca = PTR_CACHE(c, k, i); | ||||
| 			size_t bucket = PTR_BUCKET_NR(c, k, i); | ||||
| 			size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||||
| 
 | ||||
| 			if (KEY_SIZE(k) + r > c->sb.bucket_size) | ||||
| 				return "bad, length too big"; | ||||
| 			if (bucket <  ca->sb.first_bucket) | ||||
| 				return "bad, short offset"; | ||||
| 			if (bucket >= ca->sb.nbuckets) | ||||
| 				return "bad, offset past end of device"; | ||||
| 			if (ptr_stale(c, k, i)) | ||||
| 				return "stale"; | ||||
| 		} | ||||
| 
 | ||||
| 	if (!bkey_cmp(k, &ZERO_KEY)) | ||||
| 		return "bad, null key"; | ||||
| 	if (!KEY_PTRS(k)) | ||||
| 		return "bad, no pointers"; | ||||
| 	if (!KEY_SIZE(k)) | ||||
| 		return "zeroed key"; | ||||
| 	return ""; | ||||
| } | ||||
| 
 | ||||
| void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) | ||||
| { | ||||
| 	unsigned i = 0; | ||||
| 	char *out = buf, *end = buf + size; | ||||
| 
 | ||||
| #define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__)) | ||||
| 
 | ||||
| 	p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(k); i++) { | ||||
| 		if (i) | ||||
| 			p(", "); | ||||
| 
 | ||||
| 		if (PTR_DEV(k, i) == PTR_CHECK_DEV) | ||||
| 			p("check dev"); | ||||
| 		else | ||||
| 			p("%llu:%llu gen %llu", PTR_DEV(k, i), | ||||
| 			  PTR_OFFSET(k, i), PTR_GEN(k, i)); | ||||
| 	} | ||||
| 
 | ||||
| 	p("]"); | ||||
| 
 | ||||
| 	if (KEY_DIRTY(k)) | ||||
| 		p(" dirty"); | ||||
| 	if (KEY_CSUM(k)) | ||||
| 		p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | ||||
| #undef p | ||||
| } | ||||
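For reference, the output of this helper is a single line shaped like the example below. The numbers are invented; the layout follows the format strings above (inode:start, length, one dev:offset/gen triple per pointer, then optional flags).

/*
 * Example (invented values): a dirty 128-sector extent of inode 3 ending at
 * sector 4096, backed by a single pointer on cache device 0:
 *
 *	3:3968 len 128 -> [0:524416 gen 11] dirty
 */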
| 
 | ||||
| static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) | ||||
| { | ||||
| 	struct btree *b = container_of(keys, struct btree, keys); | ||||
| 	unsigned j; | ||||
| 	char buf[80]; | ||||
| 
 | ||||
| 	bch_extent_to_text(buf, sizeof(buf), k); | ||||
| 	printk(" %s", buf); | ||||
| 
 | ||||
| 	for (j = 0; j < KEY_PTRS(k); j++) { | ||||
| 		size_t n = PTR_BUCKET_NR(b->c, k, j); | ||||
| 		printk(" bucket %zu", n); | ||||
| 
 | ||||
| 		if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) | ||||
| 			printk(" prio %i", | ||||
| 			       PTR_BUCKET(b->c, k, j)->prio); | ||||
| 	} | ||||
| 
 | ||||
| 	printk(" %s\n", bch_ptr_status(b->c, k)); | ||||
| } | ||||
| 
 | ||||
| /* Btree ptrs */ | ||||
| 
 | ||||
| bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) | ||||
| { | ||||
| 	char buf[80]; | ||||
| 
 | ||||
| 	if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) | ||||
| 		goto bad; | ||||
| 
 | ||||
| 	if (__ptr_invalid(c, k)) | ||||
| 		goto bad; | ||||
| 
 | ||||
| 	return false; | ||||
| bad: | ||||
| 	bch_extent_to_text(buf, sizeof(buf), k); | ||||
| 	cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); | ||||
| 	return true; | ||||
| } | ||||
| 
 | ||||
| static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) | ||||
| { | ||||
| 	struct btree *b = container_of(bk, struct btree, keys); | ||||
| 	return __bch_btree_ptr_invalid(b->c, k); | ||||
| } | ||||
| 
 | ||||
| static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) | ||||
| { | ||||
| 	unsigned i; | ||||
| 	char buf[80]; | ||||
| 	struct bucket *g; | ||||
| 
 | ||||
| 	if (mutex_trylock(&b->c->bucket_lock)) { | ||||
| 		for (i = 0; i < KEY_PTRS(k); i++) | ||||
| 			if (ptr_available(b->c, k, i)) { | ||||
| 				g = PTR_BUCKET(b->c, k, i); | ||||
| 
 | ||||
| 				if (KEY_DIRTY(k) || | ||||
| 				    g->prio != BTREE_PRIO || | ||||
| 				    (b->c->gc_mark_valid && | ||||
| 				     GC_MARK(g) != GC_MARK_METADATA)) | ||||
| 					goto err; | ||||
| 			} | ||||
| 
 | ||||
| 		mutex_unlock(&b->c->bucket_lock); | ||||
| 	} | ||||
| 
 | ||||
| 	return false; | ||||
| err: | ||||
| 	mutex_unlock(&b->c->bucket_lock); | ||||
| 	bch_extent_to_text(buf, sizeof(buf), k); | ||||
| 	btree_bug(b, | ||||
| "inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | ||||
| 		  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), | ||||
| 		  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||||
| 	return true; | ||||
| } | ||||
| 
 | ||||
| static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) | ||||
| { | ||||
| 	struct btree *b = container_of(bk, struct btree, keys); | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	if (!bkey_cmp(k, &ZERO_KEY) || | ||||
| 	    !KEY_PTRS(k) || | ||||
| 	    bch_ptr_invalid(bk, k)) | ||||
| 		return true; | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(k); i++) | ||||
| 		if (!ptr_available(b->c, k, i) || | ||||
| 		    ptr_stale(b->c, k, i)) | ||||
| 			return true; | ||||
| 
 | ||||
| 	if (expensive_debug_checks(b->c) && | ||||
| 	    btree_ptr_bad_expensive(b, k)) | ||||
| 		return true; | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, | ||||
| 				       struct bkey *insert, | ||||
| 				       struct btree_iter *iter, | ||||
| 				       struct bkey *replace_key) | ||||
| { | ||||
| 	struct btree *b = container_of(bk, struct btree, keys); | ||||
| 
 | ||||
| 	if (!KEY_OFFSET(insert)) | ||||
| 		btree_current_write(b)->prio_blocked++; | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| const struct btree_keys_ops bch_btree_keys_ops = { | ||||
| 	.sort_cmp	= bch_key_sort_cmp, | ||||
| 	.insert_fixup	= bch_btree_ptr_insert_fixup, | ||||
| 	.key_invalid	= bch_btree_ptr_invalid, | ||||
| 	.key_bad	= bch_btree_ptr_bad, | ||||
| 	.key_to_text	= bch_extent_to_text, | ||||
| 	.key_dump	= bch_bkey_dump, | ||||
| }; | ||||
| 
 | ||||
| /* Extents */ | ||||
| 
 | ||||
| /*
 | ||||
|  * Returns true if l > r - unless l == r, in which case returns true if l is | ||||
|  * older than r. | ||||
|  * | ||||
|  * Necessary for btree_sort_fixup() - if there are multiple keys that compare | ||||
|  * equal in different sets, we have to process them newest to oldest. | ||||
|  */ | ||||
| static bool bch_extent_sort_cmp(struct btree_iter_set l, | ||||
| 				struct btree_iter_set r) | ||||
| { | ||||
| 	int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); | ||||
| 
 | ||||
| 	return c ? c > 0 : l.k < r.k; | ||||
| } | ||||
| 
 | ||||
| static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, | ||||
| 					  struct bkey *tmp) | ||||
| { | ||||
| 	while (iter->used > 1) { | ||||
| 		struct btree_iter_set *top = iter->data, *i = top + 1; | ||||
| 
 | ||||
| 		if (iter->used > 2 && | ||||
| 		    bch_extent_sort_cmp(i[0], i[1])) | ||||
| 			i++; | ||||
| 
 | ||||
| 		if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) | ||||
| 			break; | ||||
| 
 | ||||
| 		if (!KEY_SIZE(i->k)) { | ||||
| 			sort_key_next(iter, i); | ||||
| 			heap_sift(iter, i - top, bch_extent_sort_cmp); | ||||
| 			continue; | ||||
| 		} | ||||
| 
 | ||||
| 		if (top->k > i->k) { | ||||
| 			if (bkey_cmp(top->k, i->k) >= 0) | ||||
| 				sort_key_next(iter, i); | ||||
| 			else | ||||
| 				bch_cut_front(top->k, i->k); | ||||
| 
 | ||||
| 			heap_sift(iter, i - top, bch_extent_sort_cmp); | ||||
| 		} else { | ||||
| 			/* can't happen because of comparison func */ | ||||
| 			BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); | ||||
| 
 | ||||
| 			if (bkey_cmp(i->k, top->k) < 0) { | ||||
| 				bkey_copy(tmp, top->k); | ||||
| 
 | ||||
| 				bch_cut_back(&START_KEY(i->k), tmp); | ||||
| 				bch_cut_front(i->k, top->k); | ||||
| 				heap_sift(iter, 0, bch_extent_sort_cmp); | ||||
| 
 | ||||
| 				return tmp; | ||||
| 			} else { | ||||
| 				bch_cut_back(&START_KEY(i->k), top->k); | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| static bool bch_extent_insert_fixup(struct btree_keys *b, | ||||
| 				    struct bkey *insert, | ||||
| 				    struct btree_iter *iter, | ||||
| 				    struct bkey *replace_key) | ||||
| { | ||||
| 	struct cache_set *c = container_of(b, struct btree, keys)->c; | ||||
| 
 | ||||
| 	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) | ||||
| 	{ | ||||
| 		if (KEY_DIRTY(k)) | ||||
| 			bcache_dev_sectors_dirty_add(c, KEY_INODE(k), | ||||
| 						     offset, -sectors); | ||||
| 	} | ||||
| 
 | ||||
| 	uint64_t old_offset; | ||||
| 	unsigned old_size, sectors_found = 0; | ||||
| 
 | ||||
| 	BUG_ON(!KEY_OFFSET(insert)); | ||||
| 	BUG_ON(!KEY_SIZE(insert)); | ||||
| 
 | ||||
| 	while (1) { | ||||
| 		struct bkey *k = bch_btree_iter_next(iter); | ||||
| 		if (!k) | ||||
| 			break; | ||||
| 
 | ||||
| 		if (bkey_cmp(&START_KEY(k), insert) >= 0) { | ||||
| 			if (KEY_SIZE(k)) | ||||
| 				break; | ||||
| 			else | ||||
| 				continue; | ||||
| 		} | ||||
| 
 | ||||
| 		if (bkey_cmp(k, &START_KEY(insert)) <= 0) | ||||
| 			continue; | ||||
| 
 | ||||
| 		old_offset = KEY_START(k); | ||||
| 		old_size = KEY_SIZE(k); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * We might overlap with 0 size extents; we can't skip these | ||||
| 		 * because if they're in the set we're inserting to we have to | ||||
| 		 * adjust them so they don't overlap with the key we're | ||||
| 		 * inserting. But we don't want to check them for replace | ||||
| 		 * operations. | ||||
| 		 */ | ||||
| 
 | ||||
| 		if (replace_key && KEY_SIZE(k)) { | ||||
| 			/*
 | ||||
| 			 * k might have been split since we inserted/found the | ||||
| 			 * key we're replacing | ||||
| 			 */ | ||||
| 			unsigned i; | ||||
| 			uint64_t offset = KEY_START(k) - | ||||
| 				KEY_START(replace_key); | ||||
| 
 | ||||
| 			/* But it must be a subset of the replace key */ | ||||
| 			if (KEY_START(k) < KEY_START(replace_key) || | ||||
| 			    KEY_OFFSET(k) > KEY_OFFSET(replace_key)) | ||||
| 				goto check_failed; | ||||
| 
 | ||||
| 			/* We didn't find a key that we were supposed to */ | ||||
| 			if (KEY_START(k) > KEY_START(insert) + sectors_found) | ||||
| 				goto check_failed; | ||||
| 
 | ||||
| 			if (!bch_bkey_equal_header(k, replace_key)) | ||||
| 				goto check_failed; | ||||
| 
 | ||||
| 			/* skip past gen */ | ||||
| 			offset <<= 8; | ||||
| 
 | ||||
| 			BUG_ON(!KEY_PTRS(replace_key)); | ||||
| 
 | ||||
| 			for (i = 0; i < KEY_PTRS(replace_key); i++) | ||||
| 				if (k->ptr[i] != replace_key->ptr[i] + offset) | ||||
| 					goto check_failed; | ||||
| 
 | ||||
| 			sectors_found = KEY_OFFSET(k) - KEY_START(insert); | ||||
| 		} | ||||
| 
 | ||||
| 		if (bkey_cmp(insert, k) < 0 && | ||||
| 		    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { | ||||
| 			/*
 | ||||
| 			 * We overlapped in the middle of an existing key: that | ||||
| 			 * means we have to split the old key. But we have to do | ||||
| 			 * slightly different things depending on whether the | ||||
| 			 * old key has been written out yet. | ||||
| 			 */ | ||||
| 
 | ||||
| 			struct bkey *top; | ||||
| 
 | ||||
| 			subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); | ||||
| 
 | ||||
| 			if (bkey_written(b, k)) { | ||||
| 				/*
 | ||||
| 				 * We insert a new key to cover the top of the | ||||
| 				 * old key, and the old key is modified in place | ||||
| 				 * to represent the bottom split. | ||||
| 				 * | ||||
| 				 * It's completely arbitrary whether the new key | ||||
| 				 * is the top or the bottom, but it has to match | ||||
| 				 * up with what btree_sort_fixup() does - it | ||||
| 				 * doesn't check for this kind of overlap, it | ||||
| 				 * depends on us inserting a new key for the top | ||||
| 				 * here. | ||||
| 				 */ | ||||
| 				top = bch_bset_search(b, bset_tree_last(b), | ||||
| 						      insert); | ||||
| 				bch_bset_insert(b, top, k); | ||||
| 			} else { | ||||
| 				BKEY_PADDED(key) temp; | ||||
| 				bkey_copy(&temp.key, k); | ||||
| 				bch_bset_insert(b, k, &temp.key); | ||||
| 				top = bkey_next(k); | ||||
| 			} | ||||
| 
 | ||||
| 			bch_cut_front(insert, top); | ||||
| 			bch_cut_back(&START_KEY(insert), k); | ||||
| 			bch_bset_fix_invalidated_key(b, k); | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		if (bkey_cmp(insert, k) < 0) { | ||||
| 			bch_cut_front(insert, k); | ||||
| 		} else { | ||||
| 			if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) | ||||
| 				old_offset = KEY_START(insert); | ||||
| 
 | ||||
| 			if (bkey_written(b, k) && | ||||
| 			    bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { | ||||
| 				/*
 | ||||
| 				 * Completely overwrote, so we don't have to | ||||
| 				 * invalidate the binary search tree | ||||
| 				 */ | ||||
| 				bch_cut_front(k, k); | ||||
| 			} else { | ||||
| 				__bch_cut_back(&START_KEY(insert), k); | ||||
| 				bch_bset_fix_invalidated_key(b, k); | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); | ||||
| 	} | ||||
| 
 | ||||
| check_failed: | ||||
| 	if (replace_key) { | ||||
| 		if (!sectors_found) { | ||||
| 			return true; | ||||
| 		} else if (sectors_found < KEY_SIZE(insert)) { | ||||
| 			SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - | ||||
| 				       (KEY_SIZE(insert) - sectors_found)); | ||||
| 			SET_KEY_SIZE(insert, sectors_found); | ||||
| 		} | ||||
| 	} | ||||
| out: | ||||
| 	if (KEY_DIRTY(insert)) | ||||
| 		bcache_dev_sectors_dirty_add(c, KEY_INODE(insert), | ||||
| 					     KEY_START(insert), | ||||
| 					     KEY_SIZE(insert)); | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
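
The loop above ends up handling three overlap shapes: the inserted extent can clip the front of an existing key, clip its back, or land in the middle, in which case the old key is split into a bottom half kept in place and a top half inserted as a new key. A minimal userspace sketch of that case analysis, using a made-up start/end extent struct rather than bcache's bkey layout, and assuming the two extents do overlap:

#include <stdint.h>
#include <stdio.h>

/* Toy extent: [start, end) in sectors. Not bcache's bkey; only an
 * illustration of the overlap cases handled by insert_fixup(). */
struct ext { uint64_t start, end; };

static void fixup(struct ext *old, const struct ext *ins)
{
	if (ins->start <= old->start && ins->end >= old->end) {
		/* Complete overwrite: the old extent disappears. */
		old->end = old->start;
	} else if (ins->start > old->start && ins->end < old->end) {
		/* Middle overlap: split into bottom [old->start, ins->start)
		 * and top [ins->end, old->end). Here we just print the split;
		 * the real code inserts a new key for the top half. */
		printf("split: [%llu,%llu) and [%llu,%llu)\n",
		       (unsigned long long)old->start,
		       (unsigned long long)ins->start,
		       (unsigned long long)ins->end,
		       (unsigned long long)old->end);
	} else if (ins->start <= old->start) {
		/* Front overlap: cut the front of the old extent. */
		old->start = ins->end;
	} else {
		/* Back overlap: cut the back of the old extent. */
		old->end = ins->start;
	}
}

int main(void)
{
	struct ext old = { 0, 100 }, ins = { 20, 40 };

	fixup(&old, &ins);	/* prints: split: [0,20) and [40,100) */
	return 0;
}
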
| 
 | ||||
| static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) | ||||
| { | ||||
| 	struct btree *b = container_of(bk, struct btree, keys); | ||||
| 	char buf[80]; | ||||
| 
 | ||||
| 	if (!KEY_SIZE(k)) | ||||
| 		return true; | ||||
| 
 | ||||
| 	if (KEY_SIZE(k) > KEY_OFFSET(k)) | ||||
| 		goto bad; | ||||
| 
 | ||||
| 	if (__ptr_invalid(b->c, k)) | ||||
| 		goto bad; | ||||
| 
 | ||||
| 	return false; | ||||
| bad: | ||||
| 	bch_extent_to_text(buf, sizeof(buf), k); | ||||
| 	cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); | ||||
| 	return true; | ||||
| } | ||||
| 
 | ||||
| static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, | ||||
| 				     unsigned ptr) | ||||
| { | ||||
| 	struct bucket *g = PTR_BUCKET(b->c, k, ptr); | ||||
| 	char buf[80]; | ||||
| 
 | ||||
| 	if (mutex_trylock(&b->c->bucket_lock)) { | ||||
| 		if (b->c->gc_mark_valid && | ||||
| 		    ((GC_MARK(g) != GC_MARK_DIRTY && | ||||
| 		      KEY_DIRTY(k)) || | ||||
| 		     GC_MARK(g) == GC_MARK_METADATA)) | ||||
| 			goto err; | ||||
| 
 | ||||
| 		if (g->prio == BTREE_PRIO) | ||||
| 			goto err; | ||||
| 
 | ||||
| 		mutex_unlock(&b->c->bucket_lock); | ||||
| 	} | ||||
| 
 | ||||
| 	return false; | ||||
| err: | ||||
| 	mutex_unlock(&b->c->bucket_lock); | ||||
| 	bch_extent_to_text(buf, sizeof(buf), k); | ||||
| 	btree_bug(b, | ||||
| "inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | ||||
| 		  buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), | ||||
| 		  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||||
| 	return true; | ||||
| } | ||||
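
bch_extent_bad_expensive() only runs its consistency check when mutex_trylock() succeeds, so the debug path never blocks on bucket_lock. A rough pthreads equivalent of that trylock-guarded check; plain userspace code, not the kernel mutex API:

#include <pthread.h>
#include <stdio.h>

/* Sketch of the pattern above: run an expensive consistency check only
 * if the lock protecting the state can be taken without blocking. */
static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

static int check_consistency(void)
{
	if (pthread_mutex_trylock(&state_lock))
		return 0;		/* contended: skip the check */

	/* ... inspect shared state here ... */
	pthread_mutex_unlock(&state_lock);
	return 1;
}

int main(void)
{
	printf("checked: %d\n", check_consistency());
	return 0;
}
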
| 
 | ||||
| static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) | ||||
| { | ||||
| 	struct btree *b = container_of(bk, struct btree, keys); | ||||
| 	struct bucket *g; | ||||
| 	unsigned i, stale; | ||||
| 
 | ||||
| 	if (!KEY_PTRS(k) || | ||||
| 	    bch_extent_invalid(bk, k)) | ||||
| 		return true; | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(k); i++) | ||||
| 		if (!ptr_available(b->c, k, i)) | ||||
| 			return true; | ||||
| 
 | ||||
| 	if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) | ||||
| 		return false; | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(k); i++) { | ||||
| 		g = PTR_BUCKET(b->c, k, i); | ||||
| 		stale = ptr_stale(b->c, k, i); | ||||
| 
 | ||||
| 		btree_bug_on(stale > 96, b, | ||||
| 			     "key too stale: %i, need_gc %u", | ||||
| 			     stale, b->c->need_gc); | ||||
| 
 | ||||
| 		btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), | ||||
| 			     b, "stale dirty pointer"); | ||||
| 
 | ||||
| 		if (stale) | ||||
| 			return true; | ||||
| 
 | ||||
| 		if (expensive_debug_checks(b->c) && | ||||
| 		    bch_extent_bad_expensive(b, k, i)) | ||||
| 			return true; | ||||
| 	} | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
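
The staleness test above relies on per-bucket generation numbers: a pointer records the bucket generation it was created against and becomes stale once the bucket has been reused and its generation bumped. A sketch of that comparison with 8-bit wraparound; the gen_after() helper below is an assumed formulation for illustration, not a quote of the bcache source:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: staleness via 8-bit generation counters with a
 * wraparound-safe comparison. */
static int gen_after(uint8_t a, uint8_t b)
{
	return (int8_t)(a - b) > 0;
}

int main(void)
{
	uint8_t bucket_gen = 3, ptr_gen = 1;

	/* The pointer was taken at gen 1; the bucket has since been
	 * invalidated twice, so the pointer is stale. */
	printf("stale: %d\n", gen_after(bucket_gen, ptr_gen));	/* 1 */

	/* Wraparound: gen 1 still counts as "after" gen 255. */
	printf("stale across wrap: %d\n", gen_after(1, 255));	/* 1 */
	return 0;
}
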
| 
 | ||||
| static uint64_t merge_chksums(struct bkey *l, struct bkey *r) | ||||
| { | ||||
| 	return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & | ||||
| 		~((uint64_t)1 << 63); | ||||
| } | ||||
| 
 | ||||
| static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r) | ||||
| { | ||||
| 	struct btree *b = container_of(bk, struct btree, keys); | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	if (key_merging_disabled(b->c)) | ||||
| 		return false; | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(l); i++) | ||||
| 		if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || | ||||
| 		    PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) | ||||
| 			return false; | ||||
| 
 | ||||
| 	/* Keys with no pointers aren't restricted to one bucket and could
 | ||||
| 	 * overflow KEY_SIZE | ||||
| 	 */ | ||||
| 	if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { | ||||
| 		SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); | ||||
| 		SET_KEY_SIZE(l, USHRT_MAX); | ||||
| 
 | ||||
| 		bch_cut_front(l, r); | ||||
| 		return false; | ||||
| 	} | ||||
| 
 | ||||
| 	if (KEY_CSUM(l)) { | ||||
| 		if (KEY_CSUM(r)) | ||||
| 			l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); | ||||
| 		else | ||||
| 			SET_KEY_CSUM(l, 0); | ||||
| 	} | ||||
| 
 | ||||
| 	SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); | ||||
| 	SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); | ||||
| 
 | ||||
| 	return true; | ||||
| } | ||||
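
Merging only happens when the right key's data continues exactly where the left key's ends, and the combined size must still fit in the 16-bit KEY_SIZE field; otherwise the code grows the left key to the cap and trims the front of the right key instead of merging. A toy model of that arithmetic, using my own struct where offset means "end sector" as with KEY_OFFSET:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model of merging two adjacent extents when their combined length
 * still fits in a 16-bit size field (cf. KEY_SIZE_BITS == 16). */
struct ext { uint64_t offset; uint32_t size; };	/* offset = end sector */

static bool merge(struct ext *l, struct ext *r)
{
	/* Only keys that line up back-to-back are candidates. */
	if (l->offset != r->offset - r->size)
		return false;

	if (l->size + r->size > UINT16_MAX) {
		/* Partial merge: grow l to the cap, shrink r accordingly,
		 * and report failure so the caller keeps both keys. */
		uint32_t moved = UINT16_MAX - l->size;

		l->offset += moved;
		l->size = UINT16_MAX;
		r->size -= moved;
		return false;
	}

	l->offset += r->size;
	l->size += r->size;
	return true;
}

int main(void)
{
	struct ext l = { .offset = 100, .size = 100 };
	struct ext r = { .offset = 150, .size = 50 };
	bool ok = merge(&l, &r);

	printf("merged: %d, l = [%llu, %u]\n", ok,
	       (unsigned long long)l.offset, l.size);	/* merged: 1, l = [150, 150] */
	return 0;
}
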
| 
 | ||||
| const struct btree_keys_ops bch_extent_keys_ops = { | ||||
| 	.sort_cmp	= bch_extent_sort_cmp, | ||||
| 	.sort_fixup	= bch_extent_sort_fixup, | ||||
| 	.insert_fixup	= bch_extent_insert_fixup, | ||||
| 	.key_invalid	= bch_extent_invalid, | ||||
| 	.key_bad	= bch_extent_bad, | ||||
| 	.key_merge	= bch_extent_merge, | ||||
| 	.key_to_text	= bch_extent_to_text, | ||||
| 	.key_dump	= bch_bkey_dump, | ||||
| 	.is_extents	= true, | ||||
| }; | ||||
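
This ops table is what lets the generic bset/btree_keys code stay ignorant of the key format: sorting, debug, and insert paths simply call through these pointers. A minimal sketch of that kind of dispatch in plain C; every name in it is invented for illustration:

#include <stdbool.h>
#include <stdio.h>

struct key { unsigned long long offset; unsigned size; };

struct key_ops {
	bool (*key_invalid)(const struct key *k);
	void (*key_to_text)(char *buf, size_t n, const struct key *k);
	bool is_extents;
};

static bool extent_invalid(const struct key *k)
{
	/* Mirrors the checks above: zero size, or size larger than offset. */
	return !k->size || k->size > k->offset;
}

static void extent_to_text(char *buf, size_t n, const struct key *k)
{
	snprintf(buf, n, "%llu -> %llu", k->offset - k->size, k->offset);
}

static const struct key_ops extent_key_ops = {
	.key_invalid	= extent_invalid,
	.key_to_text	= extent_to_text,
	.is_extents	= true,
};

/* Generic code: validates and prints a key without knowing its format. */
static void dump_key(const struct key_ops *ops, const struct key *k)
{
	char buf[80];

	ops->key_to_text(buf, sizeof(buf), k);
	printf("%s%s\n", buf, ops->key_invalid(k) ? " (invalid)" : "");
}

int main(void)
{
	struct key k = { .offset = 150, .size = 50 };

	dump_key(&extent_key_ops, &k);	/* prints "100 -> 150" */
	return 0;
}
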
							
								
								
									
drivers/md/bcache/extents.h (new file)
							| @ -0,0 +1,13 @@ | ||||
| #ifndef _BCACHE_EXTENTS_H | ||||
| #define _BCACHE_EXTENTS_H | ||||
| 
 | ||||
| extern const struct btree_keys_ops bch_btree_keys_ops; | ||||
| extern const struct btree_keys_ops bch_extent_keys_ops; | ||||
| 
 | ||||
| struct bkey; | ||||
| struct cache_set; | ||||
| 
 | ||||
| void bch_extent_to_text(char *, size_t, const struct bkey *); | ||||
| bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); | ||||
| 
 | ||||
| #endif /* _BCACHE_EXTENTS_H */ | ||||
| @ -44,11 +44,11 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, | ||||
| 
 | ||||
| 	closure_init_stack(&cl); | ||||
| 
 | ||||
| 	pr_debug("reading %llu", (uint64_t) bucket); | ||||
| 	pr_debug("reading %u", bucket_index); | ||||
| 
 | ||||
| 	while (offset < ca->sb.bucket_size) { | ||||
| reread:		left = ca->sb.bucket_size - offset; | ||||
| 		len = min_t(unsigned, left, PAGE_SECTORS * 8); | ||||
| 		len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); | ||||
| 
 | ||||
| 		bio_reset(bio); | ||||
| 		bio->bi_iter.bi_sector	= bucket + offset; | ||||
| @ -74,19 +74,28 @@ reread:		left = ca->sb.bucket_size - offset; | ||||
| 			struct list_head *where; | ||||
| 			size_t blocks, bytes = set_bytes(j); | ||||
| 
 | ||||
| 			if (j->magic != jset_magic(&ca->sb)) | ||||
| 			if (j->magic != jset_magic(&ca->sb)) { | ||||
| 				pr_debug("%u: bad magic", bucket_index); | ||||
| 				return ret; | ||||
| 			} | ||||
| 
 | ||||
| 			if (bytes > left << 9) | ||||
| 			if (bytes > left << 9 || | ||||
| 			    bytes > PAGE_SIZE << JSET_BITS) { | ||||
| 				pr_info("%u: too big, %zu bytes, offset %u", | ||||
| 					bucket_index, bytes, offset); | ||||
| 				return ret; | ||||
| 			} | ||||
| 
 | ||||
| 			if (bytes > len << 9) | ||||
| 				goto reread; | ||||
| 
 | ||||
| 			if (j->csum != csum_set(j)) | ||||
| 			if (j->csum != csum_set(j)) { | ||||
| 				pr_info("%u: bad csum, %zu bytes, offset %u", | ||||
| 					bucket_index, bytes, offset); | ||||
| 				return ret; | ||||
| 			} | ||||
| 
 | ||||
| 			blocks = set_blocks(j, ca->set); | ||||
| 			blocks = set_blocks(j, block_bytes(ca->set)); | ||||
| 
 | ||||
| 			while (!list_empty(list)) { | ||||
| 				i = list_first_entry(list, | ||||
| @ -275,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) | ||||
| 		} | ||||
| 
 | ||||
| 		for (k = i->j.start; | ||||
| 		     k < end(&i->j); | ||||
| 		     k < bset_bkey_last(&i->j); | ||||
| 		     k = bkey_next(k)) { | ||||
| 			unsigned j; | ||||
| 
 | ||||
| @ -313,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) | ||||
| 				 n, i->j.seq - 1, start, end); | ||||
| 
 | ||||
| 		for (k = i->j.start; | ||||
| 		     k < end(&i->j); | ||||
| 		     k < bset_bkey_last(&i->j); | ||||
| 		     k = bkey_next(k)) { | ||||
| 			trace_bcache_journal_replay_key(k); | ||||
| 
 | ||||
| @ -555,6 +564,14 @@ static void journal_write_done(struct closure *cl) | ||||
| 	continue_at_nobarrier(cl, journal_write, system_wq); | ||||
| } | ||||
| 
 | ||||
| static void journal_write_unlock(struct closure *cl) | ||||
| { | ||||
| 	struct cache_set *c = container_of(cl, struct cache_set, journal.io); | ||||
| 
 | ||||
| 	c->journal.io_in_flight = 0; | ||||
| 	spin_unlock(&c->journal.lock); | ||||
| } | ||||
| 
 | ||||
| static void journal_write_unlocked(struct closure *cl) | ||||
| 	__releases(c->journal.lock) | ||||
| { | ||||
| @ -562,22 +579,15 @@ static void journal_write_unlocked(struct closure *cl) | ||||
| 	struct cache *ca; | ||||
| 	struct journal_write *w = c->journal.cur; | ||||
| 	struct bkey *k = &c->journal.key; | ||||
| 	unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; | ||||
| 	unsigned i, sectors = set_blocks(w->data, block_bytes(c)) * | ||||
| 		c->sb.block_size; | ||||
| 
 | ||||
| 	struct bio *bio; | ||||
| 	struct bio_list list; | ||||
| 	bio_list_init(&list); | ||||
| 
 | ||||
| 	if (!w->need_write) { | ||||
| 		/*
 | ||||
| 		 * XXX: have to unlock closure before we unlock journal lock, | ||||
| 		 * else we race with bch_journal(). But this way we race | ||||
| 		 * against cache set unregister. Doh. | ||||
| 		 */ | ||||
| 		set_closure_fn(cl, NULL, NULL); | ||||
| 		closure_sub(cl, CLOSURE_RUNNING + 1); | ||||
| 		spin_unlock(&c->journal.lock); | ||||
| 		return; | ||||
| 		closure_return_with_destructor(cl, journal_write_unlock); | ||||
| 	} else if (journal_full(&c->journal)) { | ||||
| 		journal_reclaim(c); | ||||
| 		spin_unlock(&c->journal.lock); | ||||
| @ -586,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl) | ||||
| 		continue_at(cl, journal_write, system_wq); | ||||
| 	} | ||||
| 
 | ||||
| 	c->journal.blocks_free -= set_blocks(w->data, c); | ||||
| 	c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); | ||||
| 
 | ||||
| 	w->data->btree_level = c->root->level; | ||||
| 
 | ||||
| @ -653,10 +663,12 @@ static void journal_try_write(struct cache_set *c) | ||||
| 
 | ||||
| 	w->need_write = true; | ||||
| 
 | ||||
| 	if (closure_trylock(cl, &c->cl)) | ||||
| 		journal_write_unlocked(cl); | ||||
| 	else | ||||
| 	if (!c->journal.io_in_flight) { | ||||
| 		c->journal.io_in_flight = 1; | ||||
| 		closure_call(cl, journal_write_unlocked, NULL, &c->cl); | ||||
| 	} else { | ||||
| 		spin_unlock(&c->journal.lock); | ||||
| 	} | ||||
| } | ||||
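
journal_try_write() now starts a write only when no journal I/O is in flight; every other caller just drops the lock, and the completion path (journal_write_unlock() above) clears the flag so the next write can go. A userspace sketch of that gate, with a pthread mutex standing in for the spinlock and a direct function call standing in for the closure machinery:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t journal_lock = PTHREAD_MUTEX_INITIALIZER;
static bool io_in_flight;

static void write_done(void)
{
	pthread_mutex_lock(&journal_lock);
	io_in_flight = false;		/* allow the next write to start */
	pthread_mutex_unlock(&journal_lock);
}

static bool try_write(void)
{
	bool started = false;

	pthread_mutex_lock(&journal_lock);
	if (!io_in_flight) {
		io_in_flight = true;
		started = true;		/* would submit the write here */
	}
	pthread_mutex_unlock(&journal_lock);
	return started;
}

int main(void)
{
	printf("first:  %d\n", try_write());	/* 1: write started */
	printf("second: %d\n", try_write());	/* 0: one already in flight */
	write_done();
	printf("third:  %d\n", try_write());	/* 1 again */
	return 0;
}
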
| 
 | ||||
| static struct journal_write *journal_wait_for_write(struct cache_set *c, | ||||
| @ -664,6 +676,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | ||||
| { | ||||
| 	size_t sectors; | ||||
| 	struct closure cl; | ||||
| 	bool wait = false; | ||||
| 
 | ||||
| 	closure_init_stack(&cl); | ||||
| 
 | ||||
| @ -673,16 +686,19 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | ||||
| 		struct journal_write *w = c->journal.cur; | ||||
| 
 | ||||
| 		sectors = __set_blocks(w->data, w->data->keys + nkeys, | ||||
| 				       c) * c->sb.block_size; | ||||
| 				       block_bytes(c)) * c->sb.block_size; | ||||
| 
 | ||||
| 		if (sectors <= min_t(size_t, | ||||
| 				     c->journal.blocks_free * c->sb.block_size, | ||||
| 				     PAGE_SECTORS << JSET_BITS)) | ||||
| 			return w; | ||||
| 
 | ||||
| 		/* XXX: tracepoint */ | ||||
| 		if (wait) | ||||
| 			closure_wait(&c->journal.wait, &cl); | ||||
| 
 | ||||
| 		if (!journal_full(&c->journal)) { | ||||
| 			trace_bcache_journal_entry_full(c); | ||||
| 			if (wait) | ||||
| 				trace_bcache_journal_entry_full(c); | ||||
| 
 | ||||
| 			/*
 | ||||
| 			 * XXX: If we were inserting so many keys that they | ||||
| @ -692,12 +708,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | ||||
| 			 */ | ||||
| 			BUG_ON(!w->data->keys); | ||||
| 
 | ||||
| 			closure_wait(&w->wait, &cl); | ||||
| 			journal_try_write(c); /* unlocks */ | ||||
| 		} else { | ||||
| 			trace_bcache_journal_full(c); | ||||
| 			if (wait) | ||||
| 				trace_bcache_journal_full(c); | ||||
| 
 | ||||
| 			closure_wait(&c->journal.wait, &cl); | ||||
| 			journal_reclaim(c); | ||||
| 			spin_unlock(&c->journal.lock); | ||||
| 
 | ||||
| @ -706,6 +721,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | ||||
| 
 | ||||
| 		closure_sync(&cl); | ||||
| 		spin_lock(&c->journal.lock); | ||||
| 		wait = true; | ||||
| 	} | ||||
| } | ||||
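
The new wait flag makes the loop above try to free up space once without queueing itself; only after a full failed pass does it register on the waitlist and emit the "full" tracepoints. A stripped-down sketch of that control flow, with stand-in helpers in place of the journal reclaim and closure wait calls:

#include <stdbool.h>
#include <stdio.h>

static int attempts;

static bool room_available(void)
{
	return attempts > 2;	/* pretend room appears on the third try */
}

static void make_room(void) { }
static void register_waiter(void) { printf("registered to wait\n"); }

static void wait_for_room(void)
{
	bool wait = false;

	while (1) {
		attempts++;
		if (room_available()) {
			printf("got room after %d attempts\n", attempts);
			return;
		}

		if (wait)
			register_waiter();	/* only after a failed pass */

		make_room();
		wait = true;
	}
}

int main(void)
{
	wait_for_room();	/* registers once, succeeds on attempt 3 */
	return 0;
}
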
| 
 | ||||
| @ -736,7 +752,7 @@ atomic_t *bch_journal(struct cache_set *c, | ||||
| 
 | ||||
| 	w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); | ||||
| 
 | ||||
| 	memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); | ||||
| 	memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys)); | ||||
| 	w->data->keys += bch_keylist_nkeys(keys); | ||||
| 
 | ||||
| 	ret = &fifo_back(&c->journal.pin); | ||||
| @ -780,7 +796,6 @@ int bch_journal_alloc(struct cache_set *c) | ||||
| { | ||||
| 	struct journal *j = &c->journal; | ||||
| 
 | ||||
| 	closure_init_unlocked(&j->io); | ||||
| 	spin_lock_init(&j->lock); | ||||
| 	INIT_DELAYED_WORK(&j->work, journal_write_work); | ||||
| 
 | ||||
|  | ||||
| @ -104,6 +104,7 @@ struct journal { | ||||
| 	/* used when waiting because the journal was full */ | ||||
| 	struct closure_waitlist	wait; | ||||
| 	struct closure		io; | ||||
| 	int			io_in_flight; | ||||
| 	struct delayed_work	work; | ||||
| 
 | ||||
| 	/* Number of blocks free in the bucket(s) we're currently writing to */ | ||||
|  | ||||
| @ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c) | ||||
| 	for_each_cache(ca, c, i) { | ||||
| 		unsigned sectors_to_move = 0; | ||||
| 		unsigned reserve_sectors = ca->sb.bucket_size * | ||||
| 			min(fifo_used(&ca->free), ca->free.size / 2); | ||||
| 			fifo_used(&ca->free[RESERVE_MOVINGGC]); | ||||
| 
 | ||||
| 		ca->heap.used = 0; | ||||
| 
 | ||||
|  | ||||
| @ -254,6 +254,24 @@ static void bch_data_insert_keys(struct closure *cl) | ||||
| 	closure_return(cl); | ||||
| } | ||||
| 
 | ||||
| static int bch_keylist_realloc(struct keylist *l, unsigned u64s, | ||||
| 			       struct cache_set *c) | ||||
| { | ||||
| 	size_t oldsize = bch_keylist_nkeys(l); | ||||
| 	size_t newsize = oldsize + u64s; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The journalling code doesn't handle the case where the keys to insert | ||||
| 	 * is bigger than an empty write: If we just return -ENOMEM here, | ||||
| 	 * bio_insert() and bio_invalidate() will insert the keys created so far | ||||
| 	 * and finish the rest when the keylist is empty. | ||||
| 	 */ | ||||
| 	if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	return __bch_keylist_realloc(l, u64s); | ||||
| } | ||||
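
The wrapper above caps a keylist at what fits in a single journal write: one block minus the jset header. A small worked example of that bound; the header size used here is a placeholder, not the real sizeof(struct jset):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned block_bytes = 4096;	/* example block size */
	unsigned jset_header = 128;	/* placeholder header size */
	unsigned max_key_bytes = block_bytes - jset_header;

	printf("room for at most %u key u64s per journal write\n",
	       max_key_bytes / (unsigned)sizeof(uint64_t));	/* 496 */
	return 0;
}
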
| 
 | ||||
| static void bch_data_invalidate(struct closure *cl) | ||||
| { | ||||
| 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); | ||||
| @ -266,7 +284,7 @@ static void bch_data_invalidate(struct closure *cl) | ||||
| 		unsigned sectors = min(bio_sectors(bio), | ||||
| 				       1U << (KEY_SIZE_BITS - 1)); | ||||
| 
 | ||||
| 		if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) | ||||
| 		if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) | ||||
| 			goto out; | ||||
| 
 | ||||
| 		bio->bi_iter.bi_sector	+= sectors; | ||||
| @ -356,7 +374,7 @@ static void bch_data_insert_start(struct closure *cl) | ||||
| 
 | ||||
| 		/* 1 for the device pointer and 1 for the chksum */ | ||||
| 		if (bch_keylist_realloc(&op->insert_keys, | ||||
| 					1 + (op->csum ? 1 : 0), | ||||
| 					3 + (op->csum ? 1 : 0), | ||||
| 					op->c)) | ||||
| 			continue_at(cl, bch_data_insert_keys, bcache_wq); | ||||
| 
 | ||||
| @ -596,14 +614,12 @@ struct search { | ||||
| 	/* Stack frame for bio_complete */ | ||||
| 	struct closure		cl; | ||||
| 
 | ||||
| 	struct bcache_device	*d; | ||||
| 
 | ||||
| 	struct bbio		bio; | ||||
| 	struct bio		*orig_bio; | ||||
| 	struct bio		*cache_miss; | ||||
| 	struct bcache_device	*d; | ||||
| 
 | ||||
| 	unsigned		insert_bio_sectors; | ||||
| 
 | ||||
| 	unsigned		recoverable:1; | ||||
| 	unsigned		write:1; | ||||
| 	unsigned		read_dirty_data:1; | ||||
| @ -629,7 +645,8 @@ static void bch_cache_read_endio(struct bio *bio, int error) | ||||
| 
 | ||||
| 	if (error) | ||||
| 		s->iop.error = error; | ||||
| 	else if (ptr_stale(s->iop.c, &b->key, 0)) { | ||||
| 	else if (!KEY_DIRTY(&b->key) && | ||||
| 		 ptr_stale(s->iop.c, &b->key, 0)) { | ||||
| 		atomic_long_inc(&s->iop.c->cache_read_races); | ||||
| 		s->iop.error = -EINTR; | ||||
| 	} | ||||
| @ -710,10 +727,13 @@ static void cache_lookup(struct closure *cl) | ||||
| { | ||||
| 	struct search *s = container_of(cl, struct search, iop.cl); | ||||
| 	struct bio *bio = &s->bio.bio; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	int ret = bch_btree_map_keys(&s->op, s->iop.c, | ||||
| 				     &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), | ||||
| 				     cache_lookup_fn, MAP_END_KEY); | ||||
| 	bch_btree_op_init(&s->op, -1); | ||||
| 
 | ||||
| 	ret = bch_btree_map_keys(&s->op, s->iop.c, | ||||
| 				 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), | ||||
| 				 cache_lookup_fn, MAP_END_KEY); | ||||
| 	if (ret == -EAGAIN) | ||||
| 		continue_at(cl, cache_lookup, bcache_wq); | ||||
| 
 | ||||
| @ -754,12 +774,12 @@ static void bio_complete(struct search *s) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void do_bio_hook(struct search *s) | ||||
| static void do_bio_hook(struct search *s, struct bio *orig_bio) | ||||
| { | ||||
| 	struct bio *bio = &s->bio.bio; | ||||
| 
 | ||||
| 	bio_init(bio); | ||||
| 	__bio_clone_fast(bio, s->orig_bio); | ||||
| 	__bio_clone_fast(bio, orig_bio); | ||||
| 	bio->bi_end_io		= request_endio; | ||||
| 	bio->bi_private		= &s->cl; | ||||
| 
 | ||||
| @ -778,26 +798,32 @@ static void search_free(struct closure *cl) | ||||
| 	mempool_free(s, s->d->c->search); | ||||
| } | ||||
| 
 | ||||
| static struct search *search_alloc(struct bio *bio, struct bcache_device *d) | ||||
| static inline struct search *search_alloc(struct bio *bio, | ||||
| 					  struct bcache_device *d) | ||||
| { | ||||
| 	struct search *s; | ||||
| 
 | ||||
| 	s = mempool_alloc(d->c->search, GFP_NOIO); | ||||
| 	memset(s, 0, offsetof(struct search, iop.insert_keys)); | ||||
| 
 | ||||
| 	__closure_init(&s->cl, NULL); | ||||
| 	closure_init(&s->cl, NULL); | ||||
| 	do_bio_hook(s, bio); | ||||
| 
 | ||||
| 	s->iop.inode		= d->id; | ||||
| 	s->iop.c		= d->c; | ||||
| 	s->d			= d; | ||||
| 	s->op.lock		= -1; | ||||
| 	s->iop.write_point	= hash_long((unsigned long) current, 16); | ||||
| 	s->orig_bio		= bio; | ||||
| 	s->write		= (bio->bi_rw & REQ_WRITE) != 0; | ||||
| 	s->iop.flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; | ||||
| 	s->cache_miss		= NULL; | ||||
| 	s->d			= d; | ||||
| 	s->recoverable		= 1; | ||||
| 	s->write		= (bio->bi_rw & REQ_WRITE) != 0; | ||||
| 	s->read_dirty_data	= 0; | ||||
| 	s->start_time		= jiffies; | ||||
| 	do_bio_hook(s); | ||||
| 
 | ||||
| 	s->iop.c		= d->c; | ||||
| 	s->iop.bio		= NULL; | ||||
| 	s->iop.inode		= d->id; | ||||
| 	s->iop.write_point	= hash_long((unsigned long) current, 16); | ||||
| 	s->iop.write_prio	= 0; | ||||
| 	s->iop.error		= 0; | ||||
| 	s->iop.flags		= 0; | ||||
| 	s->iop.flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; | ||||
| 
 | ||||
| 	return s; | ||||
| } | ||||
| @ -843,7 +869,7 @@ static void cached_dev_read_error(struct closure *cl) | ||||
| 		trace_bcache_read_retry(s->orig_bio); | ||||
| 
 | ||||
| 		s->iop.error = 0; | ||||
| 		do_bio_hook(s); | ||||
| 		do_bio_hook(s, s->orig_bio); | ||||
| 
 | ||||
| 		/* XXX: invalidate cache */ | ||||
| 
 | ||||
|  | ||||
| @ -13,17 +13,22 @@ struct data_insert_op { | ||||
| 	uint16_t		write_prio; | ||||
| 	short			error; | ||||
| 
 | ||||
| 	unsigned		bypass:1; | ||||
| 	unsigned		writeback:1; | ||||
| 	unsigned		flush_journal:1; | ||||
| 	unsigned		csum:1; | ||||
| 	union { | ||||
| 		uint16_t	flags; | ||||
| 
 | ||||
| 	unsigned		replace:1; | ||||
| 	unsigned		replace_collision:1; | ||||
| 	struct { | ||||
| 		unsigned	bypass:1; | ||||
| 		unsigned	writeback:1; | ||||
| 		unsigned	flush_journal:1; | ||||
| 		unsigned	csum:1; | ||||
| 
 | ||||
| 	unsigned		insert_data_done:1; | ||||
| 		unsigned	replace:1; | ||||
| 		unsigned	replace_collision:1; | ||||
| 
 | ||||
| 		unsigned	insert_data_done:1; | ||||
| 	}; | ||||
| 	}; | ||||
| 
 | ||||
| 	/* Anything past this point won't get zeroed in search_alloc() */ | ||||
| 	struct keylist		insert_keys; | ||||
| 	BKEY_PADDED(replace_key); | ||||
| }; | ||||
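
Folding the bitfields into an anonymous union with a 16-bit flags word lets all of them be cleared in one store, which is what search_alloc() does in the request.c hunk above with s->iop.flags = 0. A compile-and-run sketch of the same layout trick, relying on C11 anonymous members as the kernel does:

#include <stdint.h>
#include <stdio.h>

struct op_flags {
	union {
		uint16_t flags;
		struct {
			unsigned bypass:1;
			unsigned writeback:1;
			unsigned flush_journal:1;
		};
	};
};

int main(void)
{
	struct op_flags op = { .flags = 0 };

	op.bypass = 1;
	op.flush_journal = 1;
	printf("flags = 0x%x\n", op.flags);	/* layout-dependent; typically 0x5 */

	op.flags = 0;				/* clears every bitfield at once */
	printf("bypass after clear = %u\n", op.bypass);	/* 0 */
	return 0;
}
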
|  | ||||
| @ -9,6 +9,7 @@ | ||||
| #include "bcache.h" | ||||
| #include "btree.h" | ||||
| #include "debug.h" | ||||
| #include "extents.h" | ||||
| #include "request.h" | ||||
| #include "writeback.h" | ||||
| 
 | ||||
| @ -225,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error) | ||||
| 	struct cached_dev *dc = bio->bi_private; | ||||
| 	/* XXX: error checking */ | ||||
| 
 | ||||
| 	closure_put(&dc->sb_write.cl); | ||||
| 	closure_put(&dc->sb_write); | ||||
| } | ||||
| 
 | ||||
| static void __write_super(struct cache_sb *sb, struct bio *bio) | ||||
| @ -263,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio) | ||||
| 	submit_bio(REQ_WRITE, bio); | ||||
| } | ||||
| 
 | ||||
| static void bch_write_bdev_super_unlock(struct closure *cl) | ||||
| { | ||||
| 	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); | ||||
| 
 | ||||
| 	up(&dc->sb_write_mutex); | ||||
| } | ||||
| 
 | ||||
| void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) | ||||
| { | ||||
| 	struct closure *cl = &dc->sb_write.cl; | ||||
| 	struct closure *cl = &dc->sb_write; | ||||
| 	struct bio *bio = &dc->sb_bio; | ||||
| 
 | ||||
| 	closure_lock(&dc->sb_write, parent); | ||||
| 	down(&dc->sb_write_mutex); | ||||
| 	closure_init(cl, parent); | ||||
| 
 | ||||
| 	bio_reset(bio); | ||||
| 	bio->bi_bdev	= dc->bdev; | ||||
| @ -278,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) | ||||
| 	closure_get(cl); | ||||
| 	__write_super(&dc->sb, bio); | ||||
| 
 | ||||
| 	closure_return(cl); | ||||
| 	closure_return_with_destructor(cl, bch_write_bdev_super_unlock); | ||||
| } | ||||
| 
 | ||||
| static void write_super_endio(struct bio *bio, int error) | ||||
| @ -286,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error) | ||||
| 	struct cache *ca = bio->bi_private; | ||||
| 
 | ||||
| 	bch_count_io_errors(ca, error, "writing superblock"); | ||||
| 	closure_put(&ca->set->sb_write.cl); | ||||
| 	closure_put(&ca->set->sb_write); | ||||
| } | ||||
| 
 | ||||
| static void bcache_write_super_unlock(struct closure *cl) | ||||
| { | ||||
| 	struct cache_set *c = container_of(cl, struct cache_set, sb_write); | ||||
| 
 | ||||
| 	up(&c->sb_write_mutex); | ||||
| } | ||||
| 
 | ||||
| void bcache_write_super(struct cache_set *c) | ||||
| { | ||||
| 	struct closure *cl = &c->sb_write.cl; | ||||
| 	struct closure *cl = &c->sb_write; | ||||
| 	struct cache *ca; | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	closure_lock(&c->sb_write, &c->cl); | ||||
| 	down(&c->sb_write_mutex); | ||||
| 	closure_init(cl, &c->cl); | ||||
| 
 | ||||
| 	c->sb.seq++; | ||||
| 
 | ||||
| @ -317,7 +334,7 @@ void bcache_write_super(struct cache_set *c) | ||||
| 		__write_super(&ca->sb, bio); | ||||
| 	} | ||||
| 
 | ||||
| 	closure_return(cl); | ||||
| 	closure_return_with_destructor(cl, bcache_write_super_unlock); | ||||
| } | ||||
| 
 | ||||
| /* UUID io */ | ||||
| @ -325,23 +342,31 @@ void bcache_write_super(struct cache_set *c) | ||||
| static void uuid_endio(struct bio *bio, int error) | ||||
| { | ||||
| 	struct closure *cl = bio->bi_private; | ||||
| 	struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); | ||||
| 	struct cache_set *c = container_of(cl, struct cache_set, uuid_write); | ||||
| 
 | ||||
| 	cache_set_err_on(error, c, "accessing uuids"); | ||||
| 	bch_bbio_free(bio, c); | ||||
| 	closure_put(cl); | ||||
| } | ||||
| 
 | ||||
| static void uuid_io_unlock(struct closure *cl) | ||||
| { | ||||
| 	struct cache_set *c = container_of(cl, struct cache_set, uuid_write); | ||||
| 
 | ||||
| 	up(&c->uuid_write_mutex); | ||||
| } | ||||
| 
 | ||||
| static void uuid_io(struct cache_set *c, unsigned long rw, | ||||
| 		    struct bkey *k, struct closure *parent) | ||||
| { | ||||
| 	struct closure *cl = &c->uuid_write.cl; | ||||
| 	struct closure *cl = &c->uuid_write; | ||||
| 	struct uuid_entry *u; | ||||
| 	unsigned i; | ||||
| 	char buf[80]; | ||||
| 
 | ||||
| 	BUG_ON(!parent); | ||||
| 	closure_lock(&c->uuid_write, parent); | ||||
| 	down(&c->uuid_write_mutex); | ||||
| 	closure_init(cl, parent); | ||||
| 
 | ||||
| 	for (i = 0; i < KEY_PTRS(k); i++) { | ||||
| 		struct bio *bio = bch_bbio_alloc(c); | ||||
| @ -359,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, | ||||
| 			break; | ||||
| 	} | ||||
| 
 | ||||
| 	bch_bkey_to_text(buf, sizeof(buf), k); | ||||
| 	bch_extent_to_text(buf, sizeof(buf), k); | ||||
| 	pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); | ||||
| 
 | ||||
| 	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) | ||||
| @ -368,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw, | ||||
| 				 u - c->uuids, u->uuid, u->label, | ||||
| 				 u->first_reg, u->last_reg, u->invalidated); | ||||
| 
 | ||||
| 	closure_return(cl); | ||||
| 	closure_return_with_destructor(cl, uuid_io_unlock); | ||||
| } | ||||
| 
 | ||||
| static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) | ||||
| { | ||||
| 	struct bkey *k = &j->uuid_bucket; | ||||
| 
 | ||||
| 	if (bch_btree_ptr_invalid(c, k)) | ||||
| 	if (__bch_btree_ptr_invalid(c, k)) | ||||
| 		return "bad uuid pointer"; | ||||
| 
 | ||||
| 	bkey_copy(&c->uuid_bucket, k); | ||||
| @ -420,7 +445,7 @@ static int __uuid_write(struct cache_set *c) | ||||
| 
 | ||||
| 	lockdep_assert_held(&bch_register_lock); | ||||
| 
 | ||||
| 	if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) | ||||
| 	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) | ||||
| 		return 1; | ||||
| 
 | ||||
| 	SET_KEY_SIZE(&k.key, c->sb.bucket_size); | ||||
| @ -538,8 +563,8 @@ void bch_prio_write(struct cache *ca) | ||||
| 	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), | ||||
| 			&ca->meta_sectors_written); | ||||
| 
 | ||||
| 	pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), | ||||
| 		 fifo_used(&ca->free_inc), fifo_used(&ca->unused)); | ||||
| 	//pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
 | ||||
| 	//	 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
 | ||||
| 
 | ||||
| 	for (i = prio_buckets(ca) - 1; i >= 0; --i) { | ||||
| 		long bucket; | ||||
| @ -558,7 +583,7 @@ void bch_prio_write(struct cache *ca) | ||||
| 		p->magic	= pset_magic(&ca->sb); | ||||
| 		p->csum		= bch_crc64(&p->magic, bucket_bytes(ca) - 8); | ||||
| 
 | ||||
| 		bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); | ||||
| 		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); | ||||
| 		BUG_ON(bucket == -1); | ||||
| 
 | ||||
| 		mutex_unlock(&ca->set->bucket_lock); | ||||
| @ -1098,7 +1123,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) | ||||
| 	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); | ||||
| 	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); | ||||
| 	INIT_WORK(&dc->detach, cached_dev_detach_finish); | ||||
| 	closure_init_unlocked(&dc->sb_write); | ||||
| 	sema_init(&dc->sb_write_mutex, 1); | ||||
| 	INIT_LIST_HEAD(&dc->io_lru); | ||||
| 	spin_lock_init(&dc->io_lock); | ||||
| 	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); | ||||
| @ -1110,6 +1135,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) | ||||
| 		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); | ||||
| 	} | ||||
| 
 | ||||
| 	dc->disk.stripe_size = q->limits.io_opt >> 9; | ||||
| 
 | ||||
| 	if (dc->disk.stripe_size) | ||||
| 		dc->partial_stripes_expensive = | ||||
| 			q->limits.raid_partial_stripes_expensive; | ||||
| 
 | ||||
| 	ret = bcache_device_init(&dc->disk, block_size, | ||||
| 			 dc->bdev->bd_part->nr_sects - dc->sb.data_offset); | ||||
| 	if (ret) | ||||
| @ -1321,8 +1352,8 @@ static void cache_set_free(struct closure *cl) | ||||
| 		if (ca) | ||||
| 			kobject_put(&ca->kobj); | ||||
| 
 | ||||
| 	bch_bset_sort_state_free(&c->sort); | ||||
| 	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); | ||||
| 	free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); | ||||
| 
 | ||||
| 	if (c->bio_split) | ||||
| 		bioset_free(c->bio_split); | ||||
| @ -1447,21 +1478,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | ||||
| 	c->block_bits		= ilog2(sb->block_size); | ||||
| 	c->nr_uuids		= bucket_bytes(c) / sizeof(struct uuid_entry); | ||||
| 
 | ||||
| 	c->btree_pages		= c->sb.bucket_size / PAGE_SECTORS; | ||||
| 	c->btree_pages		= bucket_pages(c); | ||||
| 	if (c->btree_pages > BTREE_MAX_PAGES) | ||||
| 		c->btree_pages = max_t(int, c->btree_pages / 4, | ||||
| 				       BTREE_MAX_PAGES); | ||||
| 
 | ||||
| 	c->sort_crit_factor = int_sqrt(c->btree_pages); | ||||
| 
 | ||||
| 	closure_init_unlocked(&c->sb_write); | ||||
| 	sema_init(&c->sb_write_mutex, 1); | ||||
| 	mutex_init(&c->bucket_lock); | ||||
| 	init_waitqueue_head(&c->try_wait); | ||||
| 	init_waitqueue_head(&c->bucket_wait); | ||||
| 	closure_init_unlocked(&c->uuid_write); | ||||
| 	mutex_init(&c->sort_lock); | ||||
| 	sema_init(&c->uuid_write_mutex, 1); | ||||
| 
 | ||||
| 	spin_lock_init(&c->sort_time.lock); | ||||
| 	spin_lock_init(&c->btree_gc_time.lock); | ||||
| 	spin_lock_init(&c->btree_split_time.lock); | ||||
| 	spin_lock_init(&c->btree_read_time.lock); | ||||
| @ -1489,11 +1516,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | ||||
| 				bucket_pages(c))) || | ||||
| 	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || | ||||
| 	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | ||||
| 	    !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || | ||||
| 	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || | ||||
| 	    bch_journal_alloc(c) || | ||||
| 	    bch_btree_cache_alloc(c) || | ||||
| 	    bch_open_buckets_alloc(c)) | ||||
| 	    bch_open_buckets_alloc(c) || | ||||
| 	    bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) | ||||
| 		goto err; | ||||
| 
 | ||||
| 	c->congested_read_threshold_us	= 2000; | ||||
| @ -1549,7 +1576,7 @@ static void run_cache_set(struct cache_set *c) | ||||
| 		k = &j->btree_root; | ||||
| 
 | ||||
| 		err = "bad btree root"; | ||||
| 		if (bch_btree_ptr_invalid(c, k)) | ||||
| 		if (__bch_btree_ptr_invalid(c, k)) | ||||
| 			goto err; | ||||
| 
 | ||||
| 		err = "error reading btree root"; | ||||
| @ -1743,6 +1770,7 @@ err: | ||||
| void bch_cache_release(struct kobject *kobj) | ||||
| { | ||||
| 	struct cache *ca = container_of(kobj, struct cache, kobj); | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	if (ca->set) | ||||
| 		ca->set->cache[ca->sb.nr_this_dev] = NULL; | ||||
| @ -1756,7 +1784,9 @@ void bch_cache_release(struct kobject *kobj) | ||||
| 	free_heap(&ca->heap); | ||||
| 	free_fifo(&ca->unused); | ||||
| 	free_fifo(&ca->free_inc); | ||||
| 	free_fifo(&ca->free); | ||||
| 
 | ||||
| 	for (i = 0; i < RESERVE_NR; i++) | ||||
| 		free_fifo(&ca->free[i]); | ||||
| 
 | ||||
| 	if (ca->sb_bio.bi_inline_vecs[0].bv_page) | ||||
| 		put_page(ca->sb_bio.bi_io_vec[0].bv_page); | ||||
| @ -1782,10 +1812,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) | ||||
| 	ca->journal.bio.bi_max_vecs = 8; | ||||
| 	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; | ||||
| 
 | ||||
| 	free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; | ||||
| 	free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2); | ||||
| 	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; | ||||
| 
 | ||||
| 	if (!init_fifo(&ca->free,	free, GFP_KERNEL) || | ||||
| 	if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || | ||||
| 	    !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || | ||||
| 	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || | ||||
| 	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || | ||||
| 	    !init_fifo(&ca->free_inc,	free << 2, GFP_KERNEL) || | ||||
| 	    !init_fifo(&ca->unused,	free << 2, GFP_KERNEL) || | ||||
| 	    !init_heap(&ca->heap,	free << 3, GFP_KERNEL) || | ||||
| @ -2030,7 +2062,8 @@ static void bcache_exit(void) | ||||
| 		kobject_put(bcache_kobj); | ||||
| 	if (bcache_wq) | ||||
| 		destroy_workqueue(bcache_wq); | ||||
| 	unregister_blkdev(bcache_major, "bcache"); | ||||
| 	if (bcache_major) | ||||
| 		unregister_blkdev(bcache_major, "bcache"); | ||||
| 	unregister_reboot_notifier(&reboot); | ||||
| } | ||||
| 
 | ||||
|  | ||||
| @ -102,7 +102,6 @@ rw_attribute(bypass_torture_test); | ||||
| rw_attribute(key_merging_disabled); | ||||
| rw_attribute(gc_always_rewrite); | ||||
| rw_attribute(expensive_debug_checks); | ||||
| rw_attribute(freelist_percent); | ||||
| rw_attribute(cache_replacement_policy); | ||||
| rw_attribute(btree_shrinker_disabled); | ||||
| rw_attribute(copy_gc_enabled); | ||||
| @ -401,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = { | ||||
| }; | ||||
| KTYPE(bch_flash_dev); | ||||
| 
 | ||||
| struct bset_stats_op { | ||||
| 	struct btree_op op; | ||||
| 	size_t nodes; | ||||
| 	struct bset_stats stats; | ||||
| }; | ||||
| 
 | ||||
| static int btree_bset_stats(struct btree_op *b_op, struct btree *b) | ||||
| { | ||||
| 	struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); | ||||
| 
 | ||||
| 	op->nodes++; | ||||
| 	bch_btree_keys_stats(&b->keys, &op->stats); | ||||
| 
 | ||||
| 	return MAP_CONTINUE; | ||||
| } | ||||
| 
 | ||||
| int bch_bset_print_stats(struct cache_set *c, char *buf) | ||||
| { | ||||
| 	struct bset_stats_op op; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	memset(&op, 0, sizeof(op)); | ||||
| 	bch_btree_op_init(&op.op, -1); | ||||
| 
 | ||||
| 	ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats); | ||||
| 	if (ret < 0) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	return snprintf(buf, PAGE_SIZE, | ||||
| 			"btree nodes:		%zu\n" | ||||
| 			"written sets:		%zu\n" | ||||
| 			"unwritten sets:		%zu\n" | ||||
| 			"written key bytes:	%zu\n" | ||||
| 			"unwritten key bytes:	%zu\n" | ||||
| 			"floats:			%zu\n" | ||||
| 			"failed:			%zu\n", | ||||
| 			op.nodes, | ||||
| 			op.stats.sets_written, op.stats.sets_unwritten, | ||||
| 			op.stats.bytes_written, op.stats.bytes_unwritten, | ||||
| 			op.stats.floats, op.stats.failed); | ||||
| } | ||||
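
bch_bset_print_stats() now gathers its numbers by walking the btree with a callback: the per-walk state embeds the generic btree_op and the callback recovers it with container_of(). A self-contained sketch of that embedding pattern with invented names:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct walk_op { int (*fn)(struct walk_op *op, int node); };

struct stats_op {
	struct walk_op op;	/* base op seen by the generic walker */
	size_t nodes;		/* private accumulator */
};

static int count_node(struct walk_op *base, int node)
{
	struct stats_op *op = container_of(base, struct stats_op, op);

	op->nodes++;
	return 0;		/* "MAP_CONTINUE": keep walking */
}

static void walk_nodes(struct walk_op *op)	/* stand-in for the btree walker */
{
	for (int node = 0; node < 5; node++)
		if (op->fn(op, node))
			break;
}

int main(void)
{
	struct stats_op op = { .op = { .fn = count_node } };

	walk_nodes(&op.op);
	printf("visited %zu nodes\n", op.nodes);	/* 5 */
	return 0;
}
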
| 
 | ||||
| SHOW(__bch_cache_set) | ||||
| { | ||||
| 	unsigned root_usage(struct cache_set *c) | ||||
| @ -419,7 +460,7 @@ lock_root: | ||||
| 			rw_lock(false, b, b->level); | ||||
| 		} while (b != c->root); | ||||
| 
 | ||||
| 		for_each_key_filter(b, k, &iter, bch_ptr_bad) | ||||
| 		for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) | ||||
| 			bytes += bkey_bytes(k); | ||||
| 
 | ||||
| 		rw_unlock(false, b); | ||||
| @ -434,7 +475,7 @@ lock_root: | ||||
| 
 | ||||
| 		mutex_lock(&c->bucket_lock); | ||||
| 		list_for_each_entry(b, &c->btree_cache, list) | ||||
| 			ret += 1 << (b->page_order + PAGE_SHIFT); | ||||
| 			ret += 1 << (b->keys.page_order + PAGE_SHIFT); | ||||
| 
 | ||||
| 		mutex_unlock(&c->bucket_lock); | ||||
| 		return ret; | ||||
| @ -491,7 +532,7 @@ lock_root: | ||||
| 
 | ||||
| 	sysfs_print_time_stats(&c->btree_gc_time,	btree_gc, sec, ms); | ||||
| 	sysfs_print_time_stats(&c->btree_split_time,	btree_split, sec, us); | ||||
| 	sysfs_print_time_stats(&c->sort_time,		btree_sort, ms, us); | ||||
| 	sysfs_print_time_stats(&c->sort.time,		btree_sort, ms, us); | ||||
| 	sysfs_print_time_stats(&c->btree_read_time,	btree_read, ms, us); | ||||
| 	sysfs_print_time_stats(&c->try_harder_time,	try_harder, ms, us); | ||||
| 
 | ||||
| @ -711,9 +752,6 @@ SHOW(__bch_cache) | ||||
| 	sysfs_print(io_errors, | ||||
| 		    atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); | ||||
| 
 | ||||
| 	sysfs_print(freelist_percent, ca->free.size * 100 / | ||||
| 		    ((size_t) ca->sb.nbuckets)); | ||||
| 
 | ||||
| 	if (attr == &sysfs_cache_replacement_policy) | ||||
| 		return bch_snprint_string_list(buf, PAGE_SIZE, | ||||
| 					       cache_replacement_policies, | ||||
| @ -820,32 +858,6 @@ STORE(__bch_cache) | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_freelist_percent) { | ||||
| 		DECLARE_FIFO(long, free); | ||||
| 		long i; | ||||
| 		size_t p = strtoul_or_return(buf); | ||||
| 
 | ||||
| 		p = clamp_t(size_t, | ||||
| 			    ((size_t) ca->sb.nbuckets * p) / 100, | ||||
| 			    roundup_pow_of_two(ca->sb.nbuckets) >> 9, | ||||
| 			    ca->sb.nbuckets / 2); | ||||
| 
 | ||||
| 		if (!init_fifo_exact(&free, p, GFP_KERNEL)) | ||||
| 			return -ENOMEM; | ||||
| 
 | ||||
| 		mutex_lock(&ca->set->bucket_lock); | ||||
| 
 | ||||
| 		fifo_move(&free, &ca->free); | ||||
| 		fifo_swap(&free, &ca->free); | ||||
| 
 | ||||
| 		mutex_unlock(&ca->set->bucket_lock); | ||||
| 
 | ||||
| 		while (fifo_pop(&free, i)) | ||||
| 			atomic_dec(&ca->buckets[i].pin); | ||||
| 
 | ||||
| 		free_fifo(&free); | ||||
| 	} | ||||
| 
 | ||||
| 	if (attr == &sysfs_clear_stats) { | ||||
| 		atomic_long_set(&ca->sectors_written, 0); | ||||
| 		atomic_long_set(&ca->btree_sectors_written, 0); | ||||
| @ -869,7 +881,6 @@ static struct attribute *bch_cache_files[] = { | ||||
| 	&sysfs_metadata_written, | ||||
| 	&sysfs_io_errors, | ||||
| 	&sysfs_clear_stats, | ||||
| 	&sysfs_freelist_percent, | ||||
| 	&sysfs_cache_replacement_policy, | ||||
| 	NULL | ||||
| }; | ||||
|  | ||||
| @ -2,6 +2,7 @@ | ||||
| #ifndef _BCACHE_UTIL_H | ||||
| #define _BCACHE_UTIL_H | ||||
| 
 | ||||
| #include <linux/blkdev.h> | ||||
| #include <linux/errno.h> | ||||
| #include <linux/kernel.h> | ||||
| #include <linux/llist.h> | ||||
| @ -17,11 +18,13 @@ struct closure; | ||||
| 
 | ||||
| #ifdef CONFIG_BCACHE_DEBUG | ||||
| 
 | ||||
| #define EBUG_ON(cond)			BUG_ON(cond) | ||||
| #define atomic_dec_bug(v)	BUG_ON(atomic_dec_return(v) < 0) | ||||
| #define atomic_inc_bug(v, i)	BUG_ON(atomic_inc_return(v) <= i) | ||||
| 
 | ||||
| #else /* DEBUG */ | ||||
| 
 | ||||
| #define EBUG_ON(cond)			do { if (cond); } while (0) | ||||
| #define atomic_dec_bug(v)	atomic_dec(v) | ||||
| #define atomic_inc_bug(v, i)	atomic_inc(v) | ||||
| 
 | ||||
| @ -391,6 +394,11 @@ struct time_stats { | ||||
| 
 | ||||
| void bch_time_stats_update(struct time_stats *stats, uint64_t time); | ||||
| 
 | ||||
| static inline unsigned local_clock_us(void) | ||||
| { | ||||
| 	return local_clock() >> 10; | ||||
| } | ||||
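
local_clock() reports nanoseconds, so shifting right by 10 divides by 1024 rather than 1000: it avoids a division and lands about 2.3% low, which is close enough for the coarse timing this helper feeds. A quick check of the approximation:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ns = 5000000;				/* 5 ms */

	printf("exact: %llu us, shifted: %llu us\n",
	       (unsigned long long)(ns / 1000),		/* 5000 */
	       (unsigned long long)(ns >> 10));		/* 4882 */
	return 0;
}
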
| 
 | ||||
| #define NSEC_PER_ns			1L | ||||
| #define NSEC_PER_us			NSEC_PER_USEC | ||||
| #define NSEC_PER_ms			NSEC_PER_MSEC | ||||
|  | ||||
| @ -6103,6 +6103,7 @@ static int run(struct mddev *mddev) | ||||
| 		blk_queue_io_min(mddev->queue, chunk_size); | ||||
| 		blk_queue_io_opt(mddev->queue, chunk_size * | ||||
| 				 (conf->raid_disks - conf->max_degraded)); | ||||
| 		mddev->queue->limits.raid_partial_stripes_expensive = 1; | ||||
| 		/*
 | ||||
| 		 * We can only discard a whole stripe. It doesn't make sense to | ||||
| 		 * discard data disk but write parity disk | ||||
|  | ||||
| @ -1312,7 +1312,7 @@ static void bh_lru_install(struct buffer_head *bh) | ||||
| 		} | ||||
| 		while (out < BH_LRU_SIZE) | ||||
| 			bhs[out++] = NULL; | ||||
| 		memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); | ||||
| 		memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); | ||||
| 	} | ||||
| 	bh_lru_unlock(); | ||||
| 
 | ||||
|  | ||||
| @ -291,6 +291,7 @@ struct queue_limits { | ||||
| 	unsigned char		discard_misaligned; | ||||
| 	unsigned char		cluster; | ||||
| 	unsigned char		discard_zeroes_data; | ||||
| 	unsigned char		raid_partial_stripes_expensive; | ||||
| }; | ||||
| 
 | ||||
| struct request_queue { | ||||
|  | ||||
| @ -247,7 +247,7 @@ TRACE_EVENT(bcache_btree_write, | ||||
| 	TP_fast_assign( | ||||
| 		__entry->bucket	= PTR_BUCKET_NR(b->c, &b->key, 0); | ||||
| 		__entry->block	= b->written; | ||||
| 		__entry->keys	= b->sets[b->nsets].data->keys; | ||||
| 		__entry->keys	= b->keys.set[b->keys.nsets].data->keys; | ||||
| 	), | ||||
| 
 | ||||
| 	TP_printk("bucket %zu", __entry->bucket) | ||||
| @ -411,7 +411,7 @@ TRACE_EVENT(bcache_alloc_invalidate, | ||||
| 	), | ||||
| 
 | ||||
| 	TP_fast_assign( | ||||
| 		__entry->free		= fifo_used(&ca->free); | ||||
| 		__entry->free		= fifo_used(&ca->free[RESERVE_NONE]); | ||||
| 		__entry->free_inc	= fifo_used(&ca->free_inc); | ||||
| 		__entry->free_inc_size	= ca->free_inc.size; | ||||
| 		__entry->unused		= fifo_used(&ca->unused); | ||||
| @ -422,8 +422,8 @@ TRACE_EVENT(bcache_alloc_invalidate, | ||||
| ); | ||||
| 
 | ||||
| TRACE_EVENT(bcache_alloc_fail, | ||||
| 	TP_PROTO(struct cache *ca), | ||||
| 	TP_ARGS(ca), | ||||
| 	TP_PROTO(struct cache *ca, unsigned reserve), | ||||
| 	TP_ARGS(ca, reserve), | ||||
| 
 | ||||
| 	TP_STRUCT__entry( | ||||
| 		__field(unsigned,	free			) | ||||
| @ -433,7 +433,7 @@ TRACE_EVENT(bcache_alloc_fail, | ||||
| 	), | ||||
| 
 | ||||
| 	TP_fast_assign( | ||||
| 		__entry->free		= fifo_used(&ca->free); | ||||
| 		__entry->free		= fifo_used(&ca->free[reserve]); | ||||
| 		__entry->free_inc	= fifo_used(&ca->free_inc); | ||||
| 		__entry->unused		= fifo_used(&ca->unused); | ||||
| 		__entry->blocked	= atomic_read(&ca->set->prio_blocked); | ||||
|  | ||||
| @ -39,6 +39,7 @@ static inline void SET_##name(struct bkey *k, unsigned i, __u64 v)	\ | ||||
| } | ||||
| 
 | ||||
| #define KEY_SIZE_BITS		16 | ||||
| #define KEY_MAX_U64S		8 | ||||
| 
 | ||||
| KEY_FIELD(KEY_PTRS,	high, 60, 3) | ||||
| KEY_FIELD(HEADER_SIZE,	high, 58, 2) | ||||
| @ -118,7 +119,7 @@ static inline struct bkey *bkey_next(const struct bkey *k) | ||||
| 	return (struct bkey *) (d + bkey_u64s(k)); | ||||
| } | ||||
| 
 | ||||
| static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys) | ||||
| static inline struct bkey *bkey_idx(const struct bkey *k, unsigned nr_keys) | ||||
| { | ||||
| 	__u64 *d = (void *) k; | ||||
| 	return (struct bkey *) (d + nr_keys); | ||||
|  | ||||
| @ -185,7 +185,8 @@ enum { | ||||
| 				 * to clear media change status */ | ||||
| 	FD_UNUSED_BIT, | ||||
| 	FD_DISK_CHANGED_BIT,	/* disk has been changed since last i/o */ | ||||
| 	FD_DISK_WRITABLE_BIT	/* disk is writable */ | ||||
| 	FD_DISK_WRITABLE_BIT,	/* disk is writable */ | ||||
| 	FD_OPEN_SHOULD_FAIL_BIT | ||||
| }; | ||||
| 
 | ||||
| #define FDSETDRVPRM _IOW(2, 0x90, struct floppy_drive_params) | ||||
|  | ||||