diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 23022cf20d26..c02fa27dd3f3 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2426,7 +2426,7 @@ static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw, offset = to_interleave_offset(offset, mmio); writeq(cmd, mmio->addr.base + offset); - nvdimm_flush(nfit_blk->nd_region); + nvdimm_flush(nfit_blk->nd_region, NULL); if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH) readq(mmio->addr.base + offset); @@ -2475,7 +2475,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, } if (rw) - nvdimm_flush(nfit_blk->nd_region); + nvdimm_flush(nfit_blk->nd_region, NULL); rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0; return rc; diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 2109cfe80219..5f184e751c82 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -388,7 +388,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, * No 'host' or dax_operations since there is no access to this * device outside of mmap of the resulting character device. */ - dax_dev = alloc_dax(dev_dax, NULL, NULL); + dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC); if (!dax_dev) goto err; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 4e5ae7e8b557..8ab12068eea3 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -195,6 +195,8 @@ enum dax_device_flags { DAXDEV_ALIVE, /* gate whether dax_flush() calls the low level flush routine */ DAXDEV_WRITE_CACHE, + /* flag to check if device supports synchronous flush */ + DAXDEV_SYNC, }; /** @@ -372,6 +374,18 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(dax_write_cache_enabled); +bool __dax_synchronous(struct dax_device *dax_dev) +{ + return test_bit(DAXDEV_SYNC, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(__dax_synchronous); + +void __set_dax_synchronous(struct dax_device *dax_dev) +{ + set_bit(DAXDEV_SYNC, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(__set_dax_synchronous); + bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); @@ -526,7 +540,7 @@ static void dax_add_host(struct dax_device *dax_dev, const char *host) } struct dax_device *alloc_dax(void *private, const char *__host, - const struct dax_operations *ops) + const struct dax_operations *ops, unsigned long flags) { struct dax_device *dax_dev; const char *host; @@ -549,6 +563,9 @@ struct dax_device *alloc_dax(void *private, const char *__host, dax_add_host(dax_dev, host); dax_dev->ops = ops; dax_dev->private = private; + if (flags & DAXDEV_F_SYNC) + set_dax_synchronous(dax_dev); + return dax_dev; err_dev: diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ec8b27e20de3..caaee8032afe 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -881,7 +881,7 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) EXPORT_SYMBOL_GPL(dm_table_set_type); /* validate the dax capability of the target device span */ -static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, +int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { int blocksize = *(int *) data; @@ -890,7 +890,15 @@ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, start, len); } -bool dm_table_supports_dax(struct dm_table *t, int blocksize) +/* Check devices support synchronous DAX */ +static int device_synchronous(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + return dax_synchronous(dev->dax_dev); +} + +bool dm_table_supports_dax(struct dm_table *t, + iterate_devices_callout_fn iterate_fn, int *blocksize) { struct dm_target *ti; unsigned i; @@ -903,8 +911,7 @@ bool dm_table_supports_dax(struct dm_table *t, int blocksize) return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_supports_dax, - &blocksize)) + !ti->type->iterate_devices(ti, iterate_fn, blocksize)) return false; } @@ -940,6 +947,7 @@ static int dm_table_determine_type(struct dm_table *t) struct dm_target *tgt; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); + int page_size = PAGE_SIZE; if (t->type != DM_TYPE_NONE) { /* target already set the table's type */ @@ -984,7 +992,7 @@ static int dm_table_determine_type(struct dm_table *t) verify_bio_based: /* We must use this table as bio-based */ t->type = DM_TYPE_BIO_BASED; - if (dm_table_supports_dax(t, PAGE_SIZE) || + if (dm_table_supports_dax(t, device_supports_dax, &page_size) || (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { t->type = DM_TYPE_DAX_BIO_BASED; } else { @@ -1883,6 +1891,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { bool wc = false, fua = false; + int page_size = PAGE_SIZE; /* * Copy table's limits to the DM device's request_queue @@ -1910,8 +1919,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); - if (dm_table_supports_dax(t, PAGE_SIZE)) + if (dm_table_supports_dax(t, device_supports_dax, &page_size)) { blk_queue_flag_set(QUEUE_FLAG_DAX, q); + if (dm_table_supports_dax(t, device_synchronous, NULL)) + set_dax_synchronous(t->md->dax_dev); + } else blk_queue_flag_clear(QUEUE_FLAG_DAX, q); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 61f1152b74e9..d0beef033e2f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1117,7 +1117,7 @@ static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bd if (!map) return false; - ret = dm_table_supports_dax(map, blocksize); + ret = dm_table_supports_dax(map, device_supports_dax, &blocksize); dm_put_live_table(md, srcu_idx); @@ -1989,7 +1989,8 @@ static struct mapped_device *alloc_dev(int minor) sprintf(md->disk->disk_name, "dm-%d", minor); if (IS_ENABLED(CONFIG_DAX_DRIVER)) { - md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); + md->dax_dev = alloc_dax(md, md->disk->disk_name, + &dm_dax_ops, 0); if (!md->dax_dev) goto bad; } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 17e3db54404c..0475673337f3 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -72,7 +72,10 @@ bool dm_table_bio_based(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); -bool dm_table_supports_dax(struct dm_table *t, int blocksize); +bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn, + int *blocksize); +int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data); void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 6f2a088afad6..cefe233e0b52 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_ND_BLK) += nd_blk.o obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o obj-$(CONFIG_OF_PMEM) += of_pmem.o +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o nd_pmem-y := pmem.o diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 26c1c7618891..2985ca949912 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -255,7 +255,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); sector_t sector = offset >> 9; - int rc = 0; + int rc = 0, ret = 0; if (unlikely(!size)) return 0; @@ -293,7 +293,9 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, } memcpy_flushcache(nsio->addr + offset, buf, size); - nvdimm_flush(to_nd_region(ndns->dev.parent)); + ret = nvdimm_flush(to_nd_region(ndns->dev.parent), NULL); + if (ret) + rc = ret; return rc; } diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index a434a5964cb9..2d8d7e554877 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1822,8 +1822,8 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, && !guid_equal(&nd_set->type_guid, &nd_label->type_guid)) { dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n", - nd_set->type_guid.b, - nd_label->type_guid.b); + &nd_set->type_guid, + &nd_label->type_guid); continue; } @@ -2227,8 +2227,8 @@ static struct device *create_namespace_blk(struct nd_region *nd_region, if (namespace_label_has(ndd, type_guid)) { if (!guid_equal(&nd_set->type_guid, &nd_label->type_guid)) { dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n", - nd_set->type_guid.b, - nd_label->type_guid.b); + &nd_set->type_guid, + &nd_label->type_guid); return ERR_PTR(-EAGAIN); } diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index d24304c0e6d7..1b9955651379 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -155,6 +155,7 @@ struct nd_region { struct badblocks bb; struct nd_interleave_set *nd_set; struct nd_percpu_lane __percpu *lane; + int (*flush)(struct nd_region *nd_region, struct bio *bio); struct nd_mapping mapping[0]; }; diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c new file mode 100644 index 000000000000..10351d5b49fa --- /dev/null +++ b/drivers/nvdimm/nd_virtio.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio_pmem.c: Virtio pmem Driver + * + * Discovers persistent memory range information + * from host and provides a virtio based flushing + * interface. + */ +#include "virtio_pmem.h" +#include "nd.h" + + /* The interrupt handler */ +void virtio_pmem_host_ack(struct virtqueue *vq) +{ + struct virtio_pmem *vpmem = vq->vdev->priv; + struct virtio_pmem_request *req_data, *req_buf; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vpmem->pmem_lock, flags); + while ((req_data = virtqueue_get_buf(vq, &len)) != NULL) { + req_data->done = true; + wake_up(&req_data->host_acked); + + if (!list_empty(&vpmem->req_list)) { + req_buf = list_first_entry(&vpmem->req_list, + struct virtio_pmem_request, list); + req_buf->wq_buf_avail = true; + wake_up(&req_buf->wq_buf); + list_del(&req_buf->list); + } + } + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); +} +EXPORT_SYMBOL_GPL(virtio_pmem_host_ack); + + /* The request submission function */ +static int virtio_pmem_flush(struct nd_region *nd_region) +{ + struct virtio_device *vdev = nd_region->provider_data; + struct virtio_pmem *vpmem = vdev->priv; + struct virtio_pmem_request *req_data; + struct scatterlist *sgs[2], sg, ret; + unsigned long flags; + int err, err1; + + might_sleep(); + req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); + if (!req_data) + return -ENOMEM; + + req_data->done = false; + init_waitqueue_head(&req_data->host_acked); + init_waitqueue_head(&req_data->wq_buf); + INIT_LIST_HEAD(&req_data->list); + req_data->req.type = cpu_to_le32(VIRTIO_PMEM_REQ_TYPE_FLUSH); + sg_init_one(&sg, &req_data->req, sizeof(req_data->req)); + sgs[0] = &sg; + sg_init_one(&ret, &req_data->resp.ret, sizeof(req_data->resp)); + sgs[1] = &ret; + + spin_lock_irqsave(&vpmem->pmem_lock, flags); + /* + * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual + * queue does not have free descriptor. We add the request + * to req_list and wait for host_ack to wake us up when free + * slots are available. + */ + while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req_data, + GFP_ATOMIC)) == -ENOSPC) { + + dev_info(&vdev->dev, "failed to send command to virtio pmem device, no free slots in the virtqueue\n"); + req_data->wq_buf_avail = false; + list_add_tail(&req_data->list, &vpmem->req_list); + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); + + /* A host response results in "host_ack" getting called */ + wait_event(req_data->wq_buf, req_data->wq_buf_avail); + spin_lock_irqsave(&vpmem->pmem_lock, flags); + } + err1 = virtqueue_kick(vpmem->req_vq); + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); + /* + * virtqueue_add_sgs failed with error different than -ENOSPC, we can't + * do anything about that. + */ + if (err || !err1) { + dev_info(&vdev->dev, "failed to send command to virtio pmem device\n"); + err = -EIO; + } else { + /* A host repsonse results in "host_ack" getting called */ + wait_event(req_data->host_acked, req_data->done); + err = le32_to_cpu(req_data->resp.ret); + } + + kfree(req_data); + return err; +}; + +/* The asynchronous flush callback function */ +int async_pmem_flush(struct nd_region *nd_region, struct bio *bio) +{ + /* + * Create child bio for asynchronous flush and chain with + * parent bio. Otherwise directly call nd_region flush. + */ + if (bio && bio->bi_iter.bi_sector != -1) { + struct bio *child = bio_alloc(GFP_ATOMIC, 0); + + if (!child) + return -ENOMEM; + bio_copy_dev(child, bio); + child->bi_opf = REQ_PREFLUSH; + child->bi_iter.bi_sector = -1; + bio_chain(child, bio); + submit_bio(child); + return 0; + } + if (virtio_pmem_flush(nd_region)) + return -EIO; + + return 0; +}; +EXPORT_SYMBOL_GPL(async_pmem_flush); +MODULE_LICENSE("GPL"); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index e7d8cc9f41e8..2bf3acd69613 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -184,6 +184,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { + int ret = 0; blk_status_t rc = 0; bool do_acct; unsigned long start; @@ -193,7 +194,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) struct nd_region *nd_region = to_region(pmem); if (bio->bi_opf & REQ_PREFLUSH) - nvdimm_flush(nd_region); + ret = nvdimm_flush(nd_region, bio); do_acct = nd_iostat_start(bio, &start); bio_for_each_segment(bvec, bio, iter) { @@ -208,7 +209,10 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) nd_iostat_end(bio, start); if (bio->bi_opf & REQ_FUA) - nvdimm_flush(nd_region); + ret = nvdimm_flush(nd_region, bio); + + if (ret) + bio->bi_status = errno_to_blk_status(ret); bio_endio(bio); return BLK_QC_T_NONE; @@ -362,6 +366,7 @@ static int pmem_attach_disk(struct device *dev, struct gendisk *disk; void *addr; int rc; + unsigned long flags = 0UL; pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); if (!pmem) @@ -457,14 +462,15 @@ static int pmem_attach_disk(struct device *dev, nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res); disk->bb = &pmem->bb; - dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops); + if (is_nvdimm_sync(nd_region)) + flags = DAXDEV_F_SYNC; + dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags); if (!dax_dev) { put_disk(disk); return -ENOMEM; } dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); pmem->dax_dev = dax_dev; - gendev = disk_to_dev(disk); gendev->groups = pmem_attribute_groups; @@ -522,14 +528,14 @@ static int nd_pmem_remove(struct device *dev) sysfs_put(pmem->bb_state); pmem->bb_state = NULL; } - nvdimm_flush(to_nd_region(dev->parent)); + nvdimm_flush(to_nd_region(dev->parent), NULL); return 0; } static void nd_pmem_shutdown(struct device *dev) { - nvdimm_flush(to_nd_region(dev->parent)); + nvdimm_flush(to_nd_region(dev->parent), NULL); } static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 4fed9ce9c2fe..56f2227f192a 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -287,7 +287,9 @@ static ssize_t deep_flush_store(struct device *dev, struct device_attribute *att return rc; if (!flush) return -EINVAL; - nvdimm_flush(nd_region); + rc = nvdimm_flush(nd_region, NULL); + if (rc) + return rc; return len; } @@ -1077,6 +1079,11 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, dev->of_node = ndr_desc->of_node; nd_region->ndr_size = resource_size(ndr_desc->res); nd_region->ndr_start = ndr_desc->res->start; + if (ndr_desc->flush) + nd_region->flush = ndr_desc->flush; + else + nd_region->flush = NULL; + nd_device_register(dev); return nd_region; @@ -1117,11 +1124,24 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, } EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create); +int nvdimm_flush(struct nd_region *nd_region, struct bio *bio) +{ + int rc = 0; + + if (!nd_region->flush) + rc = generic_nvdimm_flush(nd_region); + else { + if (nd_region->flush(nd_region, bio)) + rc = -EIO; + } + + return rc; +} /** * nvdimm_flush - flush any posted write queues between the cpu and pmem media * @nd_region: blk or interleaved pmem region */ -void nvdimm_flush(struct nd_region *nd_region) +int generic_nvdimm_flush(struct nd_region *nd_region) { struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev); int i, idx; @@ -1145,6 +1165,8 @@ void nvdimm_flush(struct nd_region *nd_region) if (ndrd_get_flush_wpq(ndrd, i, 0)) writeq(1, ndrd_get_flush_wpq(ndrd, i, idx)); wmb(); + + return 0; } EXPORT_SYMBOL_GPL(nvdimm_flush); @@ -1189,6 +1211,13 @@ int nvdimm_has_cache(struct nd_region *nd_region) } EXPORT_SYMBOL_GPL(nvdimm_has_cache); +bool is_nvdimm_sync(struct nd_region *nd_region) +{ + return is_nd_pmem(&nd_region->dev) && + !test_bit(ND_REGION_ASYNC, &nd_region->flags); +} +EXPORT_SYMBOL_GPL(is_nvdimm_sync); + struct conflict_context { struct nd_region *nd_region; resource_size_t start, size; diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c new file mode 100644 index 000000000000..5e3d07b47e0c --- /dev/null +++ b/drivers/nvdimm/virtio_pmem.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio_pmem.c: Virtio pmem Driver + * + * Discovers persistent memory range information + * from host and registers the virtual pmem device + * with libnvdimm core. + */ +#include "virtio_pmem.h" +#include "nd.h" + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + + /* Initialize virt queue */ +static int init_vq(struct virtio_pmem *vpmem) +{ + /* single vq */ + vpmem->req_vq = virtio_find_single_vq(vpmem->vdev, + virtio_pmem_host_ack, "flush_queue"); + if (IS_ERR(vpmem->req_vq)) + return PTR_ERR(vpmem->req_vq); + + spin_lock_init(&vpmem->pmem_lock); + INIT_LIST_HEAD(&vpmem->req_list); + + return 0; +}; + +static int virtio_pmem_probe(struct virtio_device *vdev) +{ + struct nd_region_desc ndr_desc = {}; + int nid = dev_to_node(&vdev->dev); + struct nd_region *nd_region; + struct virtio_pmem *vpmem; + struct resource res; + int err = 0; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), GFP_KERNEL); + if (!vpmem) { + err = -ENOMEM; + goto out_err; + } + + vpmem->vdev = vdev; + vdev->priv = vpmem; + err = init_vq(vpmem); + if (err) { + dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n"); + goto out_err; + } + + virtio_cread(vpmem->vdev, struct virtio_pmem_config, + start, &vpmem->start); + virtio_cread(vpmem->vdev, struct virtio_pmem_config, + size, &vpmem->size); + + res.start = vpmem->start; + res.end = vpmem->start + vpmem->size - 1; + vpmem->nd_desc.provider_name = "virtio-pmem"; + vpmem->nd_desc.module = THIS_MODULE; + + vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev, + &vpmem->nd_desc); + if (!vpmem->nvdimm_bus) { + dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n"); + err = -ENXIO; + goto out_vq; + } + + dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus); + + ndr_desc.res = &res; + ndr_desc.numa_node = nid; + ndr_desc.flush = async_pmem_flush; + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + set_bit(ND_REGION_ASYNC, &ndr_desc.flags); + nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc); + if (!nd_region) { + dev_err(&vdev->dev, "failed to create nvdimm region\n"); + err = -ENXIO; + goto out_nd; + } + nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent); + return 0; +out_nd: + nvdimm_bus_unregister(vpmem->nvdimm_bus); +out_vq: + vdev->config->del_vqs(vdev); +out_err: + return err; +} + +static void virtio_pmem_remove(struct virtio_device *vdev) +{ + struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev); + + nvdimm_bus_unregister(nvdimm_bus); + vdev->config->del_vqs(vdev); + vdev->config->reset(vdev); +} + +static struct virtio_driver virtio_pmem_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtio_pmem_probe, + .remove = virtio_pmem_remove, +}; + +module_virtio_driver(virtio_pmem_driver); +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio pmem driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h new file mode 100644 index 000000000000..0dddefe594c4 --- /dev/null +++ b/drivers/nvdimm/virtio_pmem.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * virtio_pmem.h: virtio pmem Driver + * + * Discovers persistent memory range information + * from host and provides a virtio based flushing + * interface. + **/ + +#ifndef _LINUX_VIRTIO_PMEM_H +#define _LINUX_VIRTIO_PMEM_H + +#include +#include +#include +#include + +struct virtio_pmem_request { + struct virtio_pmem_req req; + struct virtio_pmem_resp resp; + + /* Wait queue to process deferred work after ack from host */ + wait_queue_head_t host_acked; + bool done; + + /* Wait queue to process deferred work after virt queue buffer avail */ + wait_queue_head_t wq_buf; + bool wq_buf_avail; + struct list_head list; +}; + +struct virtio_pmem { + struct virtio_device *vdev; + + /* Virtio pmem request queue */ + struct virtqueue *req_vq; + + /* nvdimm bus registers virtio pmem device */ + struct nvdimm_bus *nvdimm_bus; + struct nvdimm_bus_descriptor nd_desc; + + /* List to store deferred work if virtqueue is full */ + struct list_head req_list; + + /* Synchronize virtqueue data */ + spinlock_t pmem_lock; + + /* Memory region information */ + __u64 start; + __u64 size; +}; + +void virtio_pmem_host_ack(struct virtqueue *vq); +int async_pmem_flush(struct nd_region *nd_region, struct bio *bio); +#endif diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index d04d4378ca50..63502ca537eb 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -679,7 +679,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char goto put_dev; dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name, - &dcssblk_dax_ops); + &dcssblk_dax_ops, DAXDEV_F_SYNC); if (!dev_info->dax_dev) { rc = -ENOMEM; goto put_dev; diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 023fc3bc01c6..078615cf2afc 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -43,6 +43,17 @@ config VIRTIO_PCI_LEGACY If unsure, say Y. +config VIRTIO_PMEM + tristate "Support for virtio pmem driver" + depends on VIRTIO + depends on LIBNVDIMM + help + This driver provides access to virtio-pmem devices, storage devices + that are mapped into the physical address space - similar to NVDIMMs + - with a virtio-based flushing interface. + + If unsure, say Y. + config VIRTIO_BALLOON tristate "Virtio balloon driver" depends on VIRTIO diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f4a24a46245e..70b0438dbc94 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -371,15 +371,17 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct dax_device *dax_dev = sbi->s_daxdev; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(sbi))) return -EIO; /* - * We don't support synchronous mappings for non-DAX files. At least - * until someone comes with a sensible use case. + * We don't support synchronous mappings for non-DAX files and + * for DAX files if underneath dax_device is not synchronous. */ - if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC)) + if (!daxdev_mapping_supported(vma, dax_dev)) return -EOPNOTSUPP; file_accessed(file); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e93bacbd49ae..28101bbc0b78 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1197,11 +1197,14 @@ xfs_file_mmap( struct file *filp, struct vm_area_struct *vma) { + struct dax_device *dax_dev; + + dax_dev = xfs_find_daxdev_for_inode(file_inode(filp)); /* - * We don't support synchronous mappings for non-DAX files. At least - * until someone comes with a sensible use case. + * We don't support synchronous mappings for non-DAX files and + * for DAX files if underneath dax_device is not synchronous. */ - if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC)) + if (!daxdev_mapping_supported(vma, dax_dev)) return -EOPNOTSUPP; file_accessed(filp); diff --git a/include/linux/dax.h b/include/linux/dax.h index becaea5f4488..9bd8528bd305 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -7,6 +7,9 @@ #include #include +/* Flag for synchronous flush */ +#define DAXDEV_F_SYNC (1UL << 0) + typedef unsigned long dax_entry_t; struct iomap_ops; @@ -38,18 +41,40 @@ extern struct attribute_group dax_attribute_group; #if IS_ENABLED(CONFIG_DAX) struct dax_device *dax_get_by_host(const char *host); struct dax_device *alloc_dax(void *private, const char *host, - const struct dax_operations *ops); + const struct dax_operations *ops, unsigned long flags); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); +bool __dax_synchronous(struct dax_device *dax_dev); +static inline bool dax_synchronous(struct dax_device *dax_dev) +{ + return __dax_synchronous(dax_dev); +} +void __set_dax_synchronous(struct dax_device *dax_dev); +static inline void set_dax_synchronous(struct dax_device *dax_dev) +{ + __set_dax_synchronous(dax_dev); +} +/* + * Check if given mapping is supported by the file / underlying device. + */ +static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, + struct dax_device *dax_dev) +{ + if (!(vma->vm_flags & VM_SYNC)) + return true; + if (!IS_DAX(file_inode(vma->vm_file))) + return false; + return dax_synchronous(dax_dev); +} #else static inline struct dax_device *dax_get_by_host(const char *host) { return NULL; } static inline struct dax_device *alloc_dax(void *private, const char *host, - const struct dax_operations *ops) + const struct dax_operations *ops, unsigned long flags) { /* * Callers should check IS_ENABLED(CONFIG_DAX) to know if this @@ -70,6 +95,18 @@ static inline bool dax_write_cache_enabled(struct dax_device *dax_dev) { return false; } +static inline bool dax_synchronous(struct dax_device *dax_dev) +{ + return true; +} +static inline void set_dax_synchronous(struct dax_device *dax_dev) +{ +} +static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, + struct dax_device *dax_dev) +{ + return !(vma->vm_flags & VM_SYNC); +} #endif struct writeback_control; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 03d5c3aece9d..7a64b3ddb408 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -11,6 +11,7 @@ #include #include #include +#include struct badrange_entry { u64 start; @@ -57,6 +58,9 @@ enum { */ ND_REGION_PERSIST_MEMCTRL = 2, + /* Platform provides asynchronous flush mechanism */ + ND_REGION_ASYNC = 3, + /* mark newly adjusted resources as requiring a label update */ DPA_RESOURCE_ADJUSTED = 1 << 0, }; @@ -113,6 +117,7 @@ struct nd_mapping_desc { int position; }; +struct nd_region; struct nd_region_desc { struct resource *res; struct nd_mapping_desc *mapping; @@ -125,6 +130,7 @@ struct nd_region_desc { int target_node; unsigned long flags; struct device_node *of_node; + int (*flush)(struct nd_region *nd_region, struct bio *bio); }; struct device; @@ -252,10 +258,12 @@ unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr); unsigned int nd_region_acquire_lane(struct nd_region *nd_region); void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); u64 nd_fletcher64(void *addr, size_t len, bool le); -void nvdimm_flush(struct nd_region *nd_region); +int nvdimm_flush(struct nd_region *nd_region, struct bio *bio); +int generic_nvdimm_flush(struct nd_region *nd_region); int nvdimm_has_flush(struct nd_region *nd_region); int nvdimm_has_cache(struct nd_region *nd_region); int nvdimm_in_overwrite(struct nvdimm *nvdimm); +bool is_nvdimm_sync(struct nd_region *nd_region); static inline int nvdimm_ctl(struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc) diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index cfe47c5d9a56..348fd0176f75 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -44,5 +44,6 @@ #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ #define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ +#define VIRTIO_ID_PMEM 27 /* virtio pmem */ #endif /* _LINUX_VIRTIO_IDS_H */ diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h new file mode 100644 index 000000000000..9a63ed6d062f --- /dev/null +++ b/include/uapi/linux/virtio_pmem.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Definitions for virtio-pmem devices. + * + * Copyright (C) 2019 Red Hat, Inc. + * + * Author(s): Pankaj Gupta + */ + +#ifndef _UAPI_LINUX_VIRTIO_PMEM_H +#define _UAPI_LINUX_VIRTIO_PMEM_H + +#include +#include +#include + +struct virtio_pmem_config { + __u64 start; + __u64 size; +}; + +#define VIRTIO_PMEM_REQ_TYPE_FLUSH 0 + +struct virtio_pmem_resp { + /* Host return status corresponding to flush request */ + __le32 ret; +}; + +struct virtio_pmem_req { + /* command type */ + __le32 type; +}; + +#endif