mirror of
https://github.com/torvalds/linux.git
synced 2024-11-26 22:21:42 +00:00
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull ceph updates from Sage Weil: "This is a big batch. From Ilya we have: - rbd support for more than ~250 mapped devices (now uses same scheme that SCSI does for device major/minor numbering) - crush updates for new mapping behaviors (will be needed for coming erasure coding support, among other things) - preliminary support for tiered storage pools There is also a big series fixing a pile of cephfs bugs with clustered MDSs from Yan Zheng, ACL support for cephfs from Guangliang Zhao, ceph fscache improvements from Li Wang, improved behavior when we get ENOSPC from Josh Durgin, some readv/writev improvements from Majianpeng, and the usual mix of small cleanups" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (76 commits) ceph: cast PAGE_SIZE to size_t in ceph_sync_write() ceph: fix dout() compile warnings in ceph_filemap_fault() libceph: support CEPH_FEATURE_OSD_CACHEPOOL feature libceph: follow redirect replies from osds libceph: rename ceph_osd_request::r_{oloc,oid} to r_base_{oloc,oid} libceph: follow {read,write}_tier fields on osd request submission libceph: add ceph_pg_pool_by_id() libceph: CEPH_OSD_FLAG_* enum update libceph: replace ceph_calc_ceph_pg() with ceph_oloc_oid_to_pg() libceph: introduce and start using oid abstraction libceph: rename MAX_OBJ_NAME_SIZE to CEPH_MAX_OID_NAME_LEN libceph: move ceph_file_layout helpers to ceph_fs.h libceph: start using oloc abstraction libceph: dout() is missing a newline libceph: add ceph_kv{malloc,free}() and switch to them libceph: support CEPH_FEATURE_EXPORT_PEER ceph: add imported caps when handling cap export message ceph: add open export target session helper ceph: remove exported caps when handling cap import message ceph: handle session flush message ...
This commit is contained in:
commit
d891ea23d5
@ -18,6 +18,28 @@ Removal of a device:
|
|||||||
|
|
||||||
$ echo <dev-id> > /sys/bus/rbd/remove
|
$ echo <dev-id> > /sys/bus/rbd/remove
|
||||||
|
|
||||||
|
What: /sys/bus/rbd/add_single_major
|
||||||
|
Date: December 2013
|
||||||
|
KernelVersion: 3.14
|
||||||
|
Contact: Sage Weil <sage@inktank.com>
|
||||||
|
Description: Available only if rbd module is inserted with single_major
|
||||||
|
parameter set to true.
|
||||||
|
Usage is the same as for /sys/bus/rbd/add. If present, it
|
||||||
|
should be used instead of the latter: any attempts to use
|
||||||
|
/sys/bus/rbd/add if /sys/bus/rbd/add_single_major is
|
||||||
|
available will fail for backwards compatibility reasons.
|
||||||
|
|
||||||
|
What: /sys/bus/rbd/remove_single_major
|
||||||
|
Date: December 2013
|
||||||
|
KernelVersion: 3.14
|
||||||
|
Contact: Sage Weil <sage@inktank.com>
|
||||||
|
Description: Available only if rbd module is inserted with single_major
|
||||||
|
parameter set to true.
|
||||||
|
Usage is the same as for /sys/bus/rbd/remove. If present, it
|
||||||
|
should be used instead of the latter: any attempts to use
|
||||||
|
/sys/bus/rbd/remove if /sys/bus/rbd/remove_single_major is
|
||||||
|
available will fail for backwards compatibility reasons.
|
||||||
|
|
||||||
Entries under /sys/bus/rbd/devices/<dev-id>/
|
Entries under /sys/bus/rbd/devices/<dev-id>/
|
||||||
--------------------------------------------
|
--------------------------------------------
|
||||||
|
|
||||||
@ -33,6 +55,10 @@ major
|
|||||||
|
|
||||||
The block device major number.
|
The block device major number.
|
||||||
|
|
||||||
|
minor
|
||||||
|
|
||||||
|
The block device minor number. (December 2013, since 3.14.)
|
||||||
|
|
||||||
name
|
name
|
||||||
|
|
||||||
The name of the rbd image.
|
The name of the rbd image.
|
||||||
|
@ -7075,7 +7075,7 @@ F: drivers/media/parport/*-qcam*
|
|||||||
RADOS BLOCK DEVICE (RBD)
|
RADOS BLOCK DEVICE (RBD)
|
||||||
M: Yehuda Sadeh <yehuda@inktank.com>
|
M: Yehuda Sadeh <yehuda@inktank.com>
|
||||||
M: Sage Weil <sage@inktank.com>
|
M: Sage Weil <sage@inktank.com>
|
||||||
M: Alex Elder <elder@inktank.com>
|
M: Alex Elder <elder@kernel.org>
|
||||||
M: ceph-devel@vger.kernel.org
|
M: ceph-devel@vger.kernel.org
|
||||||
W: http://ceph.com/
|
W: http://ceph.com/
|
||||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
|
T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
|
||||||
|
@ -41,6 +41,7 @@
|
|||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
#include <linux/blkdev.h>
|
#include <linux/blkdev.h>
|
||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
|
#include <linux/idr.h>
|
||||||
|
|
||||||
#include "rbd_types.h"
|
#include "rbd_types.h"
|
||||||
|
|
||||||
@ -89,9 +90,9 @@ static int atomic_dec_return_safe(atomic_t *v)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define RBD_DRV_NAME "rbd"
|
#define RBD_DRV_NAME "rbd"
|
||||||
#define RBD_DRV_NAME_LONG "rbd (rados block device)"
|
|
||||||
|
|
||||||
#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
|
#define RBD_MINORS_PER_MAJOR 256
|
||||||
|
#define RBD_SINGLE_MAJOR_PART_SHIFT 4
|
||||||
|
|
||||||
#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
|
#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
|
||||||
#define RBD_MAX_SNAP_NAME_LEN \
|
#define RBD_MAX_SNAP_NAME_LEN \
|
||||||
@ -323,6 +324,7 @@ struct rbd_device {
|
|||||||
int dev_id; /* blkdev unique id */
|
int dev_id; /* blkdev unique id */
|
||||||
|
|
||||||
int major; /* blkdev assigned major */
|
int major; /* blkdev assigned major */
|
||||||
|
int minor;
|
||||||
struct gendisk *disk; /* blkdev's gendisk and rq */
|
struct gendisk *disk; /* blkdev's gendisk and rq */
|
||||||
|
|
||||||
u32 image_format; /* Either 1 or 2 */
|
u32 image_format; /* Either 1 or 2 */
|
||||||
@ -386,6 +388,17 @@ static struct kmem_cache *rbd_img_request_cache;
|
|||||||
static struct kmem_cache *rbd_obj_request_cache;
|
static struct kmem_cache *rbd_obj_request_cache;
|
||||||
static struct kmem_cache *rbd_segment_name_cache;
|
static struct kmem_cache *rbd_segment_name_cache;
|
||||||
|
|
||||||
|
static int rbd_major;
|
||||||
|
static DEFINE_IDA(rbd_dev_id_ida);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Default to false for now, as single-major requires >= 0.75 version of
|
||||||
|
* userspace rbd utility.
|
||||||
|
*/
|
||||||
|
static bool single_major = false;
|
||||||
|
module_param(single_major, bool, S_IRUGO);
|
||||||
|
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
|
||||||
|
|
||||||
static int rbd_img_request_submit(struct rbd_img_request *img_request);
|
static int rbd_img_request_submit(struct rbd_img_request *img_request);
|
||||||
|
|
||||||
static void rbd_dev_device_release(struct device *dev);
|
static void rbd_dev_device_release(struct device *dev);
|
||||||
@ -394,18 +407,52 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf,
|
|||||||
size_t count);
|
size_t count);
|
||||||
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
|
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
|
||||||
size_t count);
|
size_t count);
|
||||||
|
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
|
||||||
|
size_t count);
|
||||||
|
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
|
||||||
|
size_t count);
|
||||||
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
|
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
|
||||||
static void rbd_spec_put(struct rbd_spec *spec);
|
static void rbd_spec_put(struct rbd_spec *spec);
|
||||||
|
|
||||||
|
static int rbd_dev_id_to_minor(int dev_id)
|
||||||
|
{
|
||||||
|
return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int minor_to_rbd_dev_id(int minor)
|
||||||
|
{
|
||||||
|
return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
|
||||||
|
}
|
||||||
|
|
||||||
static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
|
static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
|
||||||
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
|
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
|
||||||
|
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
|
||||||
|
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
|
||||||
|
|
||||||
static struct attribute *rbd_bus_attrs[] = {
|
static struct attribute *rbd_bus_attrs[] = {
|
||||||
&bus_attr_add.attr,
|
&bus_attr_add.attr,
|
||||||
&bus_attr_remove.attr,
|
&bus_attr_remove.attr,
|
||||||
|
&bus_attr_add_single_major.attr,
|
||||||
|
&bus_attr_remove_single_major.attr,
|
||||||
NULL,
|
NULL,
|
||||||
};
|
};
|
||||||
ATTRIBUTE_GROUPS(rbd_bus);
|
|
||||||
|
static umode_t rbd_bus_is_visible(struct kobject *kobj,
|
||||||
|
struct attribute *attr, int index)
|
||||||
|
{
|
||||||
|
if (!single_major &&
|
||||||
|
(attr == &bus_attr_add_single_major.attr ||
|
||||||
|
attr == &bus_attr_remove_single_major.attr))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
return attr->mode;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const struct attribute_group rbd_bus_group = {
|
||||||
|
.attrs = rbd_bus_attrs,
|
||||||
|
.is_visible = rbd_bus_is_visible,
|
||||||
|
};
|
||||||
|
__ATTRIBUTE_GROUPS(rbd_bus);
|
||||||
|
|
||||||
static struct bus_type rbd_bus_type = {
|
static struct bus_type rbd_bus_type = {
|
||||||
.name = "rbd",
|
.name = "rbd",
|
||||||
@ -1041,9 +1088,9 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
|
|||||||
name_format = "%s.%012llx";
|
name_format = "%s.%012llx";
|
||||||
if (rbd_dev->image_format == 2)
|
if (rbd_dev->image_format == 2)
|
||||||
name_format = "%s.%016llx";
|
name_format = "%s.%016llx";
|
||||||
ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format,
|
ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
|
||||||
rbd_dev->header.object_prefix, segment);
|
rbd_dev->header.object_prefix, segment);
|
||||||
if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
|
if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
|
||||||
pr_err("error formatting segment name for #%llu (%d)\n",
|
pr_err("error formatting segment name for #%llu (%d)\n",
|
||||||
segment, ret);
|
segment, ret);
|
||||||
kfree(name);
|
kfree(name);
|
||||||
@ -1761,11 +1808,8 @@ static struct ceph_osd_request *rbd_osd_req_create(
|
|||||||
osd_req->r_callback = rbd_osd_req_callback;
|
osd_req->r_callback = rbd_osd_req_callback;
|
||||||
osd_req->r_priv = obj_request;
|
osd_req->r_priv = obj_request;
|
||||||
|
|
||||||
osd_req->r_oid_len = strlen(obj_request->object_name);
|
osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
|
||||||
rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
|
ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
|
||||||
memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
|
|
||||||
|
|
||||||
osd_req->r_file_layout = rbd_dev->layout; /* struct */
|
|
||||||
|
|
||||||
return osd_req;
|
return osd_req;
|
||||||
}
|
}
|
||||||
@ -1802,11 +1846,8 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
|
|||||||
osd_req->r_callback = rbd_osd_req_callback;
|
osd_req->r_callback = rbd_osd_req_callback;
|
||||||
osd_req->r_priv = obj_request;
|
osd_req->r_priv = obj_request;
|
||||||
|
|
||||||
osd_req->r_oid_len = strlen(obj_request->object_name);
|
osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
|
||||||
rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
|
ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
|
||||||
memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
|
|
||||||
|
|
||||||
osd_req->r_file_layout = rbd_dev->layout; /* struct */
|
|
||||||
|
|
||||||
return osd_req;
|
return osd_req;
|
||||||
}
|
}
|
||||||
@ -2866,7 +2907,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
|
|||||||
* Request sync osd watch/unwatch. The value of "start" determines
|
* Request sync osd watch/unwatch. The value of "start" determines
|
||||||
* whether a watch request is being initiated or torn down.
|
* whether a watch request is being initiated or torn down.
|
||||||
*/
|
*/
|
||||||
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
|
static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
|
||||||
{
|
{
|
||||||
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
|
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
|
||||||
struct rbd_obj_request *obj_request;
|
struct rbd_obj_request *obj_request;
|
||||||
@ -2941,6 +2982,22 @@ out_cancel:
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
|
||||||
|
{
|
||||||
|
return __rbd_dev_header_watch_sync(rbd_dev, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = __rbd_dev_header_watch_sync(rbd_dev, false);
|
||||||
|
if (ret) {
|
||||||
|
rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
|
||||||
|
ret);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Synchronous osd object method call. Returns the number of bytes
|
* Synchronous osd object method call. Returns the number of bytes
|
||||||
* returned in the outbound buffer, or a negative error code.
|
* returned in the outbound buffer, or a negative error code.
|
||||||
@ -3388,14 +3445,18 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
|
|||||||
u64 segment_size;
|
u64 segment_size;
|
||||||
|
|
||||||
/* create gendisk info */
|
/* create gendisk info */
|
||||||
disk = alloc_disk(RBD_MINORS_PER_MAJOR);
|
disk = alloc_disk(single_major ?
|
||||||
|
(1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
|
||||||
|
RBD_MINORS_PER_MAJOR);
|
||||||
if (!disk)
|
if (!disk)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
|
snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
|
||||||
rbd_dev->dev_id);
|
rbd_dev->dev_id);
|
||||||
disk->major = rbd_dev->major;
|
disk->major = rbd_dev->major;
|
||||||
disk->first_minor = 0;
|
disk->first_minor = rbd_dev->minor;
|
||||||
|
if (single_major)
|
||||||
|
disk->flags |= GENHD_FL_EXT_DEVT;
|
||||||
disk->fops = &rbd_bd_ops;
|
disk->fops = &rbd_bd_ops;
|
||||||
disk->private_data = rbd_dev;
|
disk->private_data = rbd_dev;
|
||||||
|
|
||||||
@ -3467,7 +3528,14 @@ static ssize_t rbd_major_show(struct device *dev,
|
|||||||
return sprintf(buf, "%d\n", rbd_dev->major);
|
return sprintf(buf, "%d\n", rbd_dev->major);
|
||||||
|
|
||||||
return sprintf(buf, "(none)\n");
|
return sprintf(buf, "(none)\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
static ssize_t rbd_minor_show(struct device *dev,
|
||||||
|
struct device_attribute *attr, char *buf)
|
||||||
|
{
|
||||||
|
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
|
||||||
|
|
||||||
|
return sprintf(buf, "%d\n", rbd_dev->minor);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ssize_t rbd_client_id_show(struct device *dev,
|
static ssize_t rbd_client_id_show(struct device *dev,
|
||||||
@ -3589,6 +3657,7 @@ static ssize_t rbd_image_refresh(struct device *dev,
|
|||||||
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
|
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
|
||||||
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
|
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
|
||||||
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
|
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
|
||||||
|
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
|
||||||
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
|
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
|
||||||
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
|
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
|
||||||
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
|
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
|
||||||
@ -3602,6 +3671,7 @@ static struct attribute *rbd_attrs[] = {
|
|||||||
&dev_attr_size.attr,
|
&dev_attr_size.attr,
|
||||||
&dev_attr_features.attr,
|
&dev_attr_features.attr,
|
||||||
&dev_attr_major.attr,
|
&dev_attr_major.attr,
|
||||||
|
&dev_attr_minor.attr,
|
||||||
&dev_attr_client_id.attr,
|
&dev_attr_client_id.attr,
|
||||||
&dev_attr_pool.attr,
|
&dev_attr_pool.attr,
|
||||||
&dev_attr_pool_id.attr,
|
&dev_attr_pool_id.attr,
|
||||||
@ -4372,21 +4442,29 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
|
|||||||
device_unregister(&rbd_dev->dev);
|
device_unregister(&rbd_dev->dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Get a unique rbd identifier for the given new rbd_dev, and add
|
* Get a unique rbd identifier for the given new rbd_dev, and add
|
||||||
* the rbd_dev to the global list. The minimum rbd id is 1.
|
* the rbd_dev to the global list.
|
||||||
*/
|
*/
|
||||||
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
|
static int rbd_dev_id_get(struct rbd_device *rbd_dev)
|
||||||
{
|
{
|
||||||
rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
|
int new_dev_id;
|
||||||
|
|
||||||
|
new_dev_id = ida_simple_get(&rbd_dev_id_ida,
|
||||||
|
0, minor_to_rbd_dev_id(1 << MINORBITS),
|
||||||
|
GFP_KERNEL);
|
||||||
|
if (new_dev_id < 0)
|
||||||
|
return new_dev_id;
|
||||||
|
|
||||||
|
rbd_dev->dev_id = new_dev_id;
|
||||||
|
|
||||||
spin_lock(&rbd_dev_list_lock);
|
spin_lock(&rbd_dev_list_lock);
|
||||||
list_add_tail(&rbd_dev->node, &rbd_dev_list);
|
list_add_tail(&rbd_dev->node, &rbd_dev_list);
|
||||||
spin_unlock(&rbd_dev_list_lock);
|
spin_unlock(&rbd_dev_list_lock);
|
||||||
dout("rbd_dev %p given dev id %llu\n", rbd_dev,
|
|
||||||
(unsigned long long) rbd_dev->dev_id);
|
dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -4395,49 +4473,13 @@ static void rbd_dev_id_get(struct rbd_device *rbd_dev)
|
|||||||
*/
|
*/
|
||||||
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
|
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
|
||||||
{
|
{
|
||||||
struct list_head *tmp;
|
|
||||||
int rbd_id = rbd_dev->dev_id;
|
|
||||||
int max_id;
|
|
||||||
|
|
||||||
rbd_assert(rbd_id > 0);
|
|
||||||
|
|
||||||
dout("rbd_dev %p released dev id %llu\n", rbd_dev,
|
|
||||||
(unsigned long long) rbd_dev->dev_id);
|
|
||||||
spin_lock(&rbd_dev_list_lock);
|
spin_lock(&rbd_dev_list_lock);
|
||||||
list_del_init(&rbd_dev->node);
|
list_del_init(&rbd_dev->node);
|
||||||
|
|
||||||
/*
|
|
||||||
* If the id being "put" is not the current maximum, there
|
|
||||||
* is nothing special we need to do.
|
|
||||||
*/
|
|
||||||
if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
|
|
||||||
spin_unlock(&rbd_dev_list_lock);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We need to update the current maximum id. Search the
|
|
||||||
* list to find out what it is. We're more likely to find
|
|
||||||
* the maximum at the end, so search the list backward.
|
|
||||||
*/
|
|
||||||
max_id = 0;
|
|
||||||
list_for_each_prev(tmp, &rbd_dev_list) {
|
|
||||||
struct rbd_device *rbd_dev;
|
|
||||||
|
|
||||||
rbd_dev = list_entry(tmp, struct rbd_device, node);
|
|
||||||
if (rbd_dev->dev_id > max_id)
|
|
||||||
max_id = rbd_dev->dev_id;
|
|
||||||
}
|
|
||||||
spin_unlock(&rbd_dev_list_lock);
|
spin_unlock(&rbd_dev_list_lock);
|
||||||
|
|
||||||
/*
|
ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
|
||||||
* The max id could have been updated by rbd_dev_id_get(), in
|
|
||||||
* which case it now accurately reflects the new maximum.
|
dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
|
||||||
* Be careful not to overwrite the maximum value in that
|
|
||||||
* case.
|
|
||||||
*/
|
|
||||||
atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
|
|
||||||
dout(" max dev id has been reset\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -4860,20 +4902,29 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
|
|||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
/* generate unique id: find highest unique id, add one */
|
/* Get an id and fill in device name. */
|
||||||
rbd_dev_id_get(rbd_dev);
|
|
||||||
|
ret = rbd_dev_id_get(rbd_dev);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
|
||||||
/* Fill in the device name, now that we have its id. */
|
|
||||||
BUILD_BUG_ON(DEV_NAME_LEN
|
BUILD_BUG_ON(DEV_NAME_LEN
|
||||||
< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
|
< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
|
||||||
sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
|
sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
|
||||||
|
|
||||||
/* Get our block major device number. */
|
/* Record our major and minor device numbers. */
|
||||||
|
|
||||||
ret = register_blkdev(0, rbd_dev->name);
|
if (!single_major) {
|
||||||
if (ret < 0)
|
ret = register_blkdev(0, rbd_dev->name);
|
||||||
goto err_out_id;
|
if (ret < 0)
|
||||||
rbd_dev->major = ret;
|
goto err_out_id;
|
||||||
|
|
||||||
|
rbd_dev->major = ret;
|
||||||
|
rbd_dev->minor = 0;
|
||||||
|
} else {
|
||||||
|
rbd_dev->major = rbd_major;
|
||||||
|
rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
|
||||||
|
}
|
||||||
|
|
||||||
/* Set up the blkdev mapping. */
|
/* Set up the blkdev mapping. */
|
||||||
|
|
||||||
@ -4905,7 +4956,8 @@ err_out_mapping:
|
|||||||
err_out_disk:
|
err_out_disk:
|
||||||
rbd_free_disk(rbd_dev);
|
rbd_free_disk(rbd_dev);
|
||||||
err_out_blkdev:
|
err_out_blkdev:
|
||||||
unregister_blkdev(rbd_dev->major, rbd_dev->name);
|
if (!single_major)
|
||||||
|
unregister_blkdev(rbd_dev->major, rbd_dev->name);
|
||||||
err_out_id:
|
err_out_id:
|
||||||
rbd_dev_id_put(rbd_dev);
|
rbd_dev_id_put(rbd_dev);
|
||||||
rbd_dev_mapping_clear(rbd_dev);
|
rbd_dev_mapping_clear(rbd_dev);
|
||||||
@ -4961,7 +5013,6 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev)
|
|||||||
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
|
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
int tmp;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Get the id from the image id object. Unless there's an
|
* Get the id from the image id object. Unless there's an
|
||||||
@ -4980,7 +5031,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
|
|||||||
goto err_out_format;
|
goto err_out_format;
|
||||||
|
|
||||||
if (mapping) {
|
if (mapping) {
|
||||||
ret = rbd_dev_header_watch_sync(rbd_dev, true);
|
ret = rbd_dev_header_watch_sync(rbd_dev);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out_header_name;
|
goto out_header_name;
|
||||||
}
|
}
|
||||||
@ -5007,12 +5058,8 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
|
|||||||
err_out_probe:
|
err_out_probe:
|
||||||
rbd_dev_unprobe(rbd_dev);
|
rbd_dev_unprobe(rbd_dev);
|
||||||
err_out_watch:
|
err_out_watch:
|
||||||
if (mapping) {
|
if (mapping)
|
||||||
tmp = rbd_dev_header_watch_sync(rbd_dev, false);
|
rbd_dev_header_unwatch_sync(rbd_dev);
|
||||||
if (tmp)
|
|
||||||
rbd_warn(rbd_dev, "unable to tear down "
|
|
||||||
"watch request (%d)\n", tmp);
|
|
||||||
}
|
|
||||||
out_header_name:
|
out_header_name:
|
||||||
kfree(rbd_dev->header_name);
|
kfree(rbd_dev->header_name);
|
||||||
rbd_dev->header_name = NULL;
|
rbd_dev->header_name = NULL;
|
||||||
@ -5026,9 +5073,9 @@ err_out_format:
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ssize_t rbd_add(struct bus_type *bus,
|
static ssize_t do_rbd_add(struct bus_type *bus,
|
||||||
const char *buf,
|
const char *buf,
|
||||||
size_t count)
|
size_t count)
|
||||||
{
|
{
|
||||||
struct rbd_device *rbd_dev = NULL;
|
struct rbd_device *rbd_dev = NULL;
|
||||||
struct ceph_options *ceph_opts = NULL;
|
struct ceph_options *ceph_opts = NULL;
|
||||||
@ -5090,6 +5137,12 @@ static ssize_t rbd_add(struct bus_type *bus,
|
|||||||
|
|
||||||
rc = rbd_dev_device_setup(rbd_dev);
|
rc = rbd_dev_device_setup(rbd_dev);
|
||||||
if (rc) {
|
if (rc) {
|
||||||
|
/*
|
||||||
|
* rbd_dev_header_unwatch_sync() can't be moved into
|
||||||
|
* rbd_dev_image_release() without refactoring, see
|
||||||
|
* commit 1f3ef78861ac.
|
||||||
|
*/
|
||||||
|
rbd_dev_header_unwatch_sync(rbd_dev);
|
||||||
rbd_dev_image_release(rbd_dev);
|
rbd_dev_image_release(rbd_dev);
|
||||||
goto err_out_module;
|
goto err_out_module;
|
||||||
}
|
}
|
||||||
@ -5110,6 +5163,23 @@ err_out_module:
|
|||||||
return (ssize_t)rc;
|
return (ssize_t)rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static ssize_t rbd_add(struct bus_type *bus,
|
||||||
|
const char *buf,
|
||||||
|
size_t count)
|
||||||
|
{
|
||||||
|
if (single_major)
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
return do_rbd_add(bus, buf, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ssize_t rbd_add_single_major(struct bus_type *bus,
|
||||||
|
const char *buf,
|
||||||
|
size_t count)
|
||||||
|
{
|
||||||
|
return do_rbd_add(bus, buf, count);
|
||||||
|
}
|
||||||
|
|
||||||
static void rbd_dev_device_release(struct device *dev)
|
static void rbd_dev_device_release(struct device *dev)
|
||||||
{
|
{
|
||||||
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
|
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
|
||||||
@ -5117,8 +5187,8 @@ static void rbd_dev_device_release(struct device *dev)
|
|||||||
rbd_free_disk(rbd_dev);
|
rbd_free_disk(rbd_dev);
|
||||||
clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
|
clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
|
||||||
rbd_dev_mapping_clear(rbd_dev);
|
rbd_dev_mapping_clear(rbd_dev);
|
||||||
unregister_blkdev(rbd_dev->major, rbd_dev->name);
|
if (!single_major)
|
||||||
rbd_dev->major = 0;
|
unregister_blkdev(rbd_dev->major, rbd_dev->name);
|
||||||
rbd_dev_id_put(rbd_dev);
|
rbd_dev_id_put(rbd_dev);
|
||||||
rbd_dev_mapping_clear(rbd_dev);
|
rbd_dev_mapping_clear(rbd_dev);
|
||||||
}
|
}
|
||||||
@ -5149,9 +5219,9 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static ssize_t rbd_remove(struct bus_type *bus,
|
static ssize_t do_rbd_remove(struct bus_type *bus,
|
||||||
const char *buf,
|
const char *buf,
|
||||||
size_t count)
|
size_t count)
|
||||||
{
|
{
|
||||||
struct rbd_device *rbd_dev = NULL;
|
struct rbd_device *rbd_dev = NULL;
|
||||||
struct list_head *tmp;
|
struct list_head *tmp;
|
||||||
@ -5191,16 +5261,14 @@ static ssize_t rbd_remove(struct bus_type *bus,
|
|||||||
if (ret < 0 || already)
|
if (ret < 0 || already)
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
ret = rbd_dev_header_watch_sync(rbd_dev, false);
|
rbd_dev_header_unwatch_sync(rbd_dev);
|
||||||
if (ret)
|
|
||||||
rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* flush remaining watch callbacks - these must be complete
|
* flush remaining watch callbacks - these must be complete
|
||||||
* before the osd_client is shutdown
|
* before the osd_client is shutdown
|
||||||
*/
|
*/
|
||||||
dout("%s: flushing notifies", __func__);
|
dout("%s: flushing notifies", __func__);
|
||||||
ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
|
ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Don't free anything from rbd_dev->disk until after all
|
* Don't free anything from rbd_dev->disk until after all
|
||||||
* notifies are completely processed. Otherwise
|
* notifies are completely processed. Otherwise
|
||||||
@ -5214,6 +5282,23 @@ static ssize_t rbd_remove(struct bus_type *bus,
|
|||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static ssize_t rbd_remove(struct bus_type *bus,
|
||||||
|
const char *buf,
|
||||||
|
size_t count)
|
||||||
|
{
|
||||||
|
if (single_major)
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
return do_rbd_remove(bus, buf, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ssize_t rbd_remove_single_major(struct bus_type *bus,
|
||||||
|
const char *buf,
|
||||||
|
size_t count)
|
||||||
|
{
|
||||||
|
return do_rbd_remove(bus, buf, count);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* create control files in sysfs
|
* create control files in sysfs
|
||||||
* /sys/bus/rbd/...
|
* /sys/bus/rbd/...
|
||||||
@ -5259,7 +5344,7 @@ static int rbd_slab_init(void)
|
|||||||
|
|
||||||
rbd_assert(!rbd_segment_name_cache);
|
rbd_assert(!rbd_segment_name_cache);
|
||||||
rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
|
rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
|
||||||
MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
|
CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
|
||||||
if (rbd_segment_name_cache)
|
if (rbd_segment_name_cache)
|
||||||
return 0;
|
return 0;
|
||||||
out_err:
|
out_err:
|
||||||
@ -5295,24 +5380,45 @@ static int __init rbd_init(void)
|
|||||||
|
|
||||||
if (!libceph_compatible(NULL)) {
|
if (!libceph_compatible(NULL)) {
|
||||||
rbd_warn(NULL, "libceph incompatibility (quitting)");
|
rbd_warn(NULL, "libceph incompatibility (quitting)");
|
||||||
|
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
rc = rbd_slab_init();
|
rc = rbd_slab_init();
|
||||||
if (rc)
|
if (rc)
|
||||||
return rc;
|
return rc;
|
||||||
|
|
||||||
|
if (single_major) {
|
||||||
|
rbd_major = register_blkdev(0, RBD_DRV_NAME);
|
||||||
|
if (rbd_major < 0) {
|
||||||
|
rc = rbd_major;
|
||||||
|
goto err_out_slab;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
rc = rbd_sysfs_init();
|
rc = rbd_sysfs_init();
|
||||||
if (rc)
|
if (rc)
|
||||||
rbd_slab_exit();
|
goto err_out_blkdev;
|
||||||
else
|
|
||||||
pr_info("loaded " RBD_DRV_NAME_LONG "\n");
|
|
||||||
|
|
||||||
|
if (single_major)
|
||||||
|
pr_info("loaded (major %d)\n", rbd_major);
|
||||||
|
else
|
||||||
|
pr_info("loaded\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
err_out_blkdev:
|
||||||
|
if (single_major)
|
||||||
|
unregister_blkdev(rbd_major, RBD_DRV_NAME);
|
||||||
|
err_out_slab:
|
||||||
|
rbd_slab_exit();
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void __exit rbd_exit(void)
|
static void __exit rbd_exit(void)
|
||||||
{
|
{
|
||||||
rbd_sysfs_cleanup();
|
rbd_sysfs_cleanup();
|
||||||
|
if (single_major)
|
||||||
|
unregister_blkdev(rbd_major, RBD_DRV_NAME);
|
||||||
rbd_slab_exit();
|
rbd_slab_exit();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -5322,9 +5428,8 @@ module_exit(rbd_exit);
|
|||||||
MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
|
MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
|
||||||
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
|
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
|
||||||
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
|
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
|
||||||
MODULE_DESCRIPTION("rados block device");
|
|
||||||
|
|
||||||
/* following authorship retained from original osdblk.c */
|
/* following authorship retained from original osdblk.c */
|
||||||
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
|
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
|
||||||
|
|
||||||
|
MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
|
||||||
MODULE_LICENSE("GPL");
|
MODULE_LICENSE("GPL");
|
||||||
|
@ -25,3 +25,16 @@ config CEPH_FSCACHE
|
|||||||
caching support for Ceph clients using FS-Cache
|
caching support for Ceph clients using FS-Cache
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
config CEPH_FS_POSIX_ACL
|
||||||
|
bool "Ceph POSIX Access Control Lists"
|
||||||
|
depends on CEPH_FS
|
||||||
|
select FS_POSIX_ACL
|
||||||
|
help
|
||||||
|
POSIX Access Control Lists (ACLs) support permissions for users and
|
||||||
|
groups beyond the owner/group/world scheme.
|
||||||
|
|
||||||
|
To learn more about Access Control Lists, visit the POSIX ACLs for
|
||||||
|
Linux website <http://acl.bestbits.at/>.
|
||||||
|
|
||||||
|
If you don't know what Access Control Lists are, say N
|
||||||
|
@ -10,3 +10,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
|
|||||||
debugfs.o
|
debugfs.o
|
||||||
|
|
||||||
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
|
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
|
||||||
|
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
|
||||||
|
332
fs/ceph/acl.c
Normal file
332
fs/ceph/acl.c
Normal file
@ -0,0 +1,332 @@
|
|||||||
|
/*
|
||||||
|
* linux/fs/ceph/acl.c
|
||||||
|
*
|
||||||
|
* Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU General Public
|
||||||
|
* License v2 as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public
|
||||||
|
* License along with this program; if not, write to the
|
||||||
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||||
|
* Boston, MA 02111-1307, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <linux/ceph/ceph_debug.h>
|
||||||
|
#include <linux/fs.h>
|
||||||
|
#include <linux/string.h>
|
||||||
|
#include <linux/xattr.h>
|
||||||
|
#include <linux/posix_acl_xattr.h>
|
||||||
|
#include <linux/posix_acl.h>
|
||||||
|
#include <linux/sched.h>
|
||||||
|
#include <linux/slab.h>
|
||||||
|
|
||||||
|
#include "super.h"
|
||||||
|
|
||||||
|
static inline void ceph_set_cached_acl(struct inode *inode,
|
||||||
|
int type, struct posix_acl *acl)
|
||||||
|
{
|
||||||
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
|
||||||
|
spin_lock(&ci->i_ceph_lock);
|
||||||
|
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
|
||||||
|
set_cached_acl(inode, type, acl);
|
||||||
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
|
||||||
|
int type)
|
||||||
|
{
|
||||||
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
struct posix_acl *acl = ACL_NOT_CACHED;
|
||||||
|
|
||||||
|
spin_lock(&ci->i_ceph_lock);
|
||||||
|
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
|
||||||
|
acl = get_cached_acl(inode, type);
|
||||||
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
|
||||||
|
return acl;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Drop every ACL cached on @inode.  Thin exported wrapper around
 * forget_all_cached_acls() so other ceph source files can invalidate
 * the ACL cache.
 */
void ceph_forget_all_cached_acls(struct inode *inode)
{
	forget_all_cached_acls(inode);
}
|
||||||
|
|
||||||
|
struct posix_acl *ceph_get_acl(struct inode *inode, int type)
|
||||||
|
{
|
||||||
|
int size;
|
||||||
|
const char *name;
|
||||||
|
char *value = NULL;
|
||||||
|
struct posix_acl *acl;
|
||||||
|
|
||||||
|
if (!IS_POSIXACL(inode))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
acl = ceph_get_cached_acl(inode, type);
|
||||||
|
if (acl != ACL_NOT_CACHED)
|
||||||
|
return acl;
|
||||||
|
|
||||||
|
switch (type) {
|
||||||
|
case ACL_TYPE_ACCESS:
|
||||||
|
name = POSIX_ACL_XATTR_ACCESS;
|
||||||
|
break;
|
||||||
|
case ACL_TYPE_DEFAULT:
|
||||||
|
name = POSIX_ACL_XATTR_DEFAULT;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
BUG();
|
||||||
|
}
|
||||||
|
|
||||||
|
size = __ceph_getxattr(inode, name, "", 0);
|
||||||
|
if (size > 0) {
|
||||||
|
value = kzalloc(size, GFP_NOFS);
|
||||||
|
if (!value)
|
||||||
|
return ERR_PTR(-ENOMEM);
|
||||||
|
size = __ceph_getxattr(inode, name, value, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (size > 0)
|
||||||
|
acl = posix_acl_from_xattr(&init_user_ns, value, size);
|
||||||
|
else if (size == -ERANGE || size == -ENODATA || size == 0)
|
||||||
|
acl = NULL;
|
||||||
|
else
|
||||||
|
acl = ERR_PTR(-EIO);
|
||||||
|
|
||||||
|
kfree(value);
|
||||||
|
|
||||||
|
if (!IS_ERR(acl))
|
||||||
|
ceph_set_cached_acl(inode, type, acl);
|
||||||
|
|
||||||
|
return acl;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int ceph_set_acl(struct dentry *dentry, struct inode *inode,
|
||||||
|
struct posix_acl *acl, int type)
|
||||||
|
{
|
||||||
|
int ret = 0, size = 0;
|
||||||
|
const char *name = NULL;
|
||||||
|
char *value = NULL;
|
||||||
|
struct iattr newattrs;
|
||||||
|
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
|
||||||
|
|
||||||
|
if (acl) {
|
||||||
|
ret = posix_acl_valid(acl);
|
||||||
|
if (ret < 0)
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (type) {
|
||||||
|
case ACL_TYPE_ACCESS:
|
||||||
|
name = POSIX_ACL_XATTR_ACCESS;
|
||||||
|
if (acl) {
|
||||||
|
ret = posix_acl_equiv_mode(acl, &new_mode);
|
||||||
|
if (ret < 0)
|
||||||
|
goto out;
|
||||||
|
if (ret == 0)
|
||||||
|
acl = NULL;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case ACL_TYPE_DEFAULT:
|
||||||
|
if (!S_ISDIR(inode->i_mode)) {
|
||||||
|
ret = acl ? -EINVAL : 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
name = POSIX_ACL_XATTR_DEFAULT;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
ret = -EINVAL;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (acl) {
|
||||||
|
size = posix_acl_xattr_size(acl->a_count);
|
||||||
|
value = kmalloc(size, GFP_NOFS);
|
||||||
|
if (!value) {
|
||||||
|
ret = -ENOMEM;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
|
||||||
|
if (ret < 0)
|
||||||
|
goto out_free;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (new_mode != old_mode) {
|
||||||
|
newattrs.ia_mode = new_mode;
|
||||||
|
newattrs.ia_valid = ATTR_MODE;
|
||||||
|
ret = ceph_setattr(dentry, &newattrs);
|
||||||
|
if (ret)
|
||||||
|
goto out_free;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (value)
|
||||||
|
ret = __ceph_setxattr(dentry, name, value, size, 0);
|
||||||
|
else
|
||||||
|
ret = __ceph_removexattr(dentry, name);
|
||||||
|
|
||||||
|
if (ret) {
|
||||||
|
if (new_mode != old_mode) {
|
||||||
|
newattrs.ia_mode = old_mode;
|
||||||
|
newattrs.ia_valid = ATTR_MODE;
|
||||||
|
ceph_setattr(dentry, &newattrs);
|
||||||
|
}
|
||||||
|
goto out_free;
|
||||||
|
}
|
||||||
|
|
||||||
|
ceph_set_cached_acl(inode, type, acl);
|
||||||
|
|
||||||
|
out_free:
|
||||||
|
kfree(value);
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir)
|
||||||
|
{
|
||||||
|
struct posix_acl *acl = NULL;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
if (!S_ISLNK(inode->i_mode)) {
|
||||||
|
if (IS_POSIXACL(dir)) {
|
||||||
|
acl = ceph_get_acl(dir, ACL_TYPE_DEFAULT);
|
||||||
|
if (IS_ERR(acl)) {
|
||||||
|
ret = PTR_ERR(acl);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!acl)
|
||||||
|
inode->i_mode &= ~current_umask();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (IS_POSIXACL(dir) && acl) {
|
||||||
|
if (S_ISDIR(inode->i_mode)) {
|
||||||
|
ret = ceph_set_acl(dentry, inode, acl,
|
||||||
|
ACL_TYPE_DEFAULT);
|
||||||
|
if (ret)
|
||||||
|
goto out_release;
|
||||||
|
}
|
||||||
|
ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
|
||||||
|
if (ret < 0)
|
||||||
|
goto out;
|
||||||
|
else if (ret > 0)
|
||||||
|
ret = ceph_set_acl(dentry, inode, acl, ACL_TYPE_ACCESS);
|
||||||
|
else
|
||||||
|
cache_no_acl(inode);
|
||||||
|
} else {
|
||||||
|
cache_no_acl(inode);
|
||||||
|
}
|
||||||
|
|
||||||
|
out_release:
|
||||||
|
posix_acl_release(acl);
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
|
||||||
|
{
|
||||||
|
struct posix_acl *acl;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
if (S_ISLNK(inode->i_mode)) {
|
||||||
|
ret = -EOPNOTSUPP;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!IS_POSIXACL(inode))
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
acl = ceph_get_acl(inode, ACL_TYPE_ACCESS);
|
||||||
|
if (IS_ERR_OR_NULL(acl)) {
|
||||||
|
ret = PTR_ERR(acl);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
|
||||||
|
if (ret)
|
||||||
|
goto out;
|
||||||
|
ret = ceph_set_acl(dentry, inode, acl, ACL_TYPE_ACCESS);
|
||||||
|
posix_acl_release(acl);
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * xattr handler ->get for the POSIX ACL attributes.
 *
 * Serialises the ACL of @type into @value (at most @size bytes) via
 * posix_acl_to_xattr() and returns its result.  Returns -EOPNOTSUPP when
 * the inode is not IS_POSIXACL(), -ENODATA when there is no ACL, or the
 * error from ceph_get_acl().  @name is unused.
 */
static int ceph_xattr_acl_get(struct dentry *dentry, const char *name,
				void *value, size_t size, int type)
{
	struct inode *inode = dentry->d_inode;
	struct posix_acl *acl;
	int len;

	if (!IS_POSIXACL(inode))
		return -EOPNOTSUPP;

	acl = ceph_get_acl(inode, type);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (!acl)
		return -ENODATA;

	len = posix_acl_to_xattr(&init_user_ns, acl, value, size);
	posix_acl_release(acl);

	return len;
}
|
||||||
|
|
||||||
|
/*
 * xattr handler ->set for the POSIX ACL attributes.
 *
 * Requires inode_owner_or_capable() (-EPERM otherwise) and an
 * IS_POSIXACL() inode (-EOPNOTSUPP otherwise).  A non-NULL @value is
 * parsed with posix_acl_from_xattr() and validated; a NULL @value is
 * passed on as a NULL ACL, which ceph_set_acl() treats as removal.
 * @name and @flags are unused.
 */
static int ceph_xattr_acl_set(struct dentry *dentry, const char *name,
			const void *value, size_t size, int flags, int type)
{
	struct inode *inode = dentry->d_inode;
	struct posix_acl *acl = NULL;
	int err;

	if (!inode_owner_or_capable(inode))
		return -EPERM;

	if (!IS_POSIXACL(inode))
		return -EOPNOTSUPP;

	if (value != NULL) {
		acl = posix_acl_from_xattr(&init_user_ns, value, size);
		if (IS_ERR(acl))
			return PTR_ERR(acl);
	}

	/* validate only when an ACL object was actually produced */
	err = acl ? posix_acl_valid(acl) : 0;
	if (!err)
		err = ceph_set_acl(dentry, inode, acl, type);

	posix_acl_release(acl);
	return err;
}
|
||||||
|
|
||||||
|
const struct xattr_handler ceph_xattr_acl_default_handler = {
|
||||||
|
.prefix = POSIX_ACL_XATTR_DEFAULT,
|
||||||
|
.flags = ACL_TYPE_DEFAULT,
|
||||||
|
.get = ceph_xattr_acl_get,
|
||||||
|
.set = ceph_xattr_acl_set,
|
||||||
|
};
|
||||||
|
|
||||||
|
const struct xattr_handler ceph_xattr_acl_access_handler = {
|
||||||
|
.prefix = POSIX_ACL_XATTR_ACCESS,
|
||||||
|
.flags = ACL_TYPE_ACCESS,
|
||||||
|
.get = ceph_xattr_acl_get,
|
||||||
|
.set = ceph_xattr_acl_set,
|
||||||
|
};
|
@ -209,6 +209,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
|
|||||||
err = 0;
|
err = 0;
|
||||||
if (err < 0) {
|
if (err < 0) {
|
||||||
SetPageError(page);
|
SetPageError(page);
|
||||||
|
ceph_fscache_readpage_cancel(inode, page);
|
||||||
goto out;
|
goto out;
|
||||||
} else {
|
} else {
|
||||||
if (err < PAGE_CACHE_SIZE) {
|
if (err < PAGE_CACHE_SIZE) {
|
||||||
@ -256,6 +257,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
|
|||||||
for (i = 0; i < num_pages; i++) {
|
for (i = 0; i < num_pages; i++) {
|
||||||
struct page *page = osd_data->pages[i];
|
struct page *page = osd_data->pages[i];
|
||||||
|
|
||||||
|
if (rc < 0)
|
||||||
|
goto unlock;
|
||||||
if (bytes < (int)PAGE_CACHE_SIZE) {
|
if (bytes < (int)PAGE_CACHE_SIZE) {
|
||||||
/* zero (remainder of) page */
|
/* zero (remainder of) page */
|
||||||
int s = bytes < 0 ? 0 : bytes;
|
int s = bytes < 0 ? 0 : bytes;
|
||||||
@ -266,6 +269,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
|
|||||||
flush_dcache_page(page);
|
flush_dcache_page(page);
|
||||||
SetPageUptodate(page);
|
SetPageUptodate(page);
|
||||||
ceph_readpage_to_fscache(inode, page);
|
ceph_readpage_to_fscache(inode, page);
|
||||||
|
unlock:
|
||||||
unlock_page(page);
|
unlock_page(page);
|
||||||
page_cache_release(page);
|
page_cache_release(page);
|
||||||
bytes -= PAGE_CACHE_SIZE;
|
bytes -= PAGE_CACHE_SIZE;
|
||||||
@ -1207,6 +1211,41 @@ const struct address_space_operations ceph_aops = {
|
|||||||
/*
|
/*
|
||||||
* vm ops
|
* vm ops
|
||||||
*/
|
*/
|
||||||
|
static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||||
|
{
|
||||||
|
struct inode *inode = file_inode(vma->vm_file);
|
||||||
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
struct ceph_file_info *fi = vma->vm_file->private_data;
|
||||||
|
loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
|
||||||
|
int want, got, ret;
|
||||||
|
|
||||||
|
dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
|
||||||
|
inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
|
||||||
|
if (fi->fmode & CEPH_FILE_MODE_LAZY)
|
||||||
|
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
|
||||||
|
else
|
||||||
|
want = CEPH_CAP_FILE_CACHE;
|
||||||
|
while (1) {
|
||||||
|
got = 0;
|
||||||
|
ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
|
||||||
|
if (ret == 0)
|
||||||
|
break;
|
||||||
|
if (ret != -ERESTARTSYS) {
|
||||||
|
WARN_ON(1);
|
||||||
|
return VM_FAULT_SIGBUS;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
|
||||||
|
inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
|
||||||
|
|
||||||
|
ret = filemap_fault(vma, vmf);
|
||||||
|
|
||||||
|
dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
|
||||||
|
inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
|
||||||
|
ceph_put_cap_refs(ci, got);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Reuse write_begin here for simplicity.
|
* Reuse write_begin here for simplicity.
|
||||||
@ -1214,23 +1253,41 @@ const struct address_space_operations ceph_aops = {
|
|||||||
static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||||
{
|
{
|
||||||
struct inode *inode = file_inode(vma->vm_file);
|
struct inode *inode = file_inode(vma->vm_file);
|
||||||
struct page *page = vmf->page;
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
struct ceph_file_info *fi = vma->vm_file->private_data;
|
||||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
||||||
|
struct page *page = vmf->page;
|
||||||
loff_t off = page_offset(page);
|
loff_t off = page_offset(page);
|
||||||
loff_t size, len;
|
loff_t size = i_size_read(inode);
|
||||||
int ret;
|
size_t len;
|
||||||
|
int want, got, ret;
|
||||||
|
|
||||||
/* Update time before taking page lock */
|
|
||||||
file_update_time(vma->vm_file);
|
|
||||||
|
|
||||||
size = i_size_read(inode);
|
|
||||||
if (off + PAGE_CACHE_SIZE <= size)
|
if (off + PAGE_CACHE_SIZE <= size)
|
||||||
len = PAGE_CACHE_SIZE;
|
len = PAGE_CACHE_SIZE;
|
||||||
else
|
else
|
||||||
len = size & ~PAGE_CACHE_MASK;
|
len = size & ~PAGE_CACHE_MASK;
|
||||||
|
|
||||||
dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
|
dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
|
||||||
off, len, page, page->index);
|
inode, ceph_vinop(inode), off, len, size);
|
||||||
|
if (fi->fmode & CEPH_FILE_MODE_LAZY)
|
||||||
|
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
|
||||||
|
else
|
||||||
|
want = CEPH_CAP_FILE_BUFFER;
|
||||||
|
while (1) {
|
||||||
|
got = 0;
|
||||||
|
ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len);
|
||||||
|
if (ret == 0)
|
||||||
|
break;
|
||||||
|
if (ret != -ERESTARTSYS) {
|
||||||
|
WARN_ON(1);
|
||||||
|
return VM_FAULT_SIGBUS;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
|
||||||
|
inode, off, len, ceph_cap_string(got));
|
||||||
|
|
||||||
|
/* Update time before taking page lock */
|
||||||
|
file_update_time(vma->vm_file);
|
||||||
|
|
||||||
lock_page(page);
|
lock_page(page);
|
||||||
|
|
||||||
@ -1252,14 +1309,26 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
|||||||
ret = VM_FAULT_SIGBUS;
|
ret = VM_FAULT_SIGBUS;
|
||||||
}
|
}
|
||||||
out:
|
out:
|
||||||
dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
|
if (ret != VM_FAULT_LOCKED) {
|
||||||
if (ret != VM_FAULT_LOCKED)
|
|
||||||
unlock_page(page);
|
unlock_page(page);
|
||||||
|
} else {
|
||||||
|
int dirty;
|
||||||
|
spin_lock(&ci->i_ceph_lock);
|
||||||
|
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
|
||||||
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
if (dirty)
|
||||||
|
__mark_inode_dirty(inode, dirty);
|
||||||
|
}
|
||||||
|
|
||||||
|
dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
|
||||||
|
inode, off, len, ceph_cap_string(got), ret);
|
||||||
|
ceph_put_cap_refs(ci, got);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct vm_operations_struct ceph_vmops = {
|
static struct vm_operations_struct ceph_vmops = {
|
||||||
.fault = filemap_fault,
|
.fault = ceph_filemap_fault,
|
||||||
.page_mkwrite = ceph_page_mkwrite,
|
.page_mkwrite = ceph_page_mkwrite,
|
||||||
.remap_pages = generic_file_remap_pages,
|
.remap_pages = generic_file_remap_pages,
|
||||||
};
|
};
|
||||||
|
@ -67,6 +67,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
|
|||||||
return fscache_maybe_release_page(ci->fscache, page, gfp);
|
return fscache_maybe_release_page(ci->fscache, page, gfp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void ceph_fscache_readpage_cancel(struct inode *inode,
|
||||||
|
struct page *page)
|
||||||
|
{
|
||||||
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
|
||||||
|
__fscache_uncache_page(ci->fscache, page);
|
||||||
|
}
|
||||||
|
|
||||||
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
|
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
|
||||||
struct list_head *pages)
|
struct list_head *pages)
|
||||||
{
|
{
|
||||||
@ -145,6 +153,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * No-op variant of ceph_fscache_readpage_cancel().
 * NOTE(review): presumably the build without fscache support -- the
 * guarding preprocessor conditional is outside this view; confirm.
 */
static inline void ceph_fscache_readpage_cancel(struct inode *inode,
						struct page *page)
{
	/* nothing to cancel */
}
|
||||||
|
|
||||||
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
|
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
|
||||||
struct list_head *pages)
|
struct list_head *pages)
|
||||||
{
|
{
|
||||||
|
344
fs/ceph/caps.c
344
fs/ceph/caps.c
@ -555,21 +555,34 @@ retry:
|
|||||||
cap->ci = ci;
|
cap->ci = ci;
|
||||||
__insert_cap_node(ci, cap);
|
__insert_cap_node(ci, cap);
|
||||||
|
|
||||||
/* clear out old exporting info? (i.e. on cap import) */
|
|
||||||
if (ci->i_cap_exporting_mds == mds) {
|
|
||||||
ci->i_cap_exporting_issued = 0;
|
|
||||||
ci->i_cap_exporting_mseq = 0;
|
|
||||||
ci->i_cap_exporting_mds = -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* add to session cap list */
|
/* add to session cap list */
|
||||||
cap->session = session;
|
cap->session = session;
|
||||||
spin_lock(&session->s_cap_lock);
|
spin_lock(&session->s_cap_lock);
|
||||||
list_add_tail(&cap->session_caps, &session->s_caps);
|
list_add_tail(&cap->session_caps, &session->s_caps);
|
||||||
session->s_nr_caps++;
|
session->s_nr_caps++;
|
||||||
spin_unlock(&session->s_cap_lock);
|
spin_unlock(&session->s_cap_lock);
|
||||||
} else if (new_cap)
|
} else {
|
||||||
ceph_put_cap(mdsc, new_cap);
|
if (new_cap)
|
||||||
|
ceph_put_cap(mdsc, new_cap);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* auth mds of the inode changed. we received the cap export
|
||||||
|
* message, but still haven't received the cap import message.
|
||||||
|
* handle_cap_export() updated the new auth MDS' cap.
|
||||||
|
*
|
||||||
|
* "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
|
||||||
|
* a message that was sent before the cap import message. So
|
||||||
|
* don't remove caps.
|
||||||
|
*/
|
||||||
|
if (ceph_seq_cmp(seq, cap->seq) <= 0) {
|
||||||
|
WARN_ON(cap != ci->i_auth_cap);
|
||||||
|
WARN_ON(cap->cap_id != cap_id);
|
||||||
|
seq = cap->seq;
|
||||||
|
mseq = cap->mseq;
|
||||||
|
issued |= cap->issued;
|
||||||
|
flags |= CEPH_CAP_FLAG_AUTH;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!ci->i_snap_realm) {
|
if (!ci->i_snap_realm) {
|
||||||
/*
|
/*
|
||||||
@ -611,15 +624,9 @@ retry:
|
|||||||
if (ci->i_auth_cap == NULL ||
|
if (ci->i_auth_cap == NULL ||
|
||||||
ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
|
ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
|
||||||
ci->i_auth_cap = cap;
|
ci->i_auth_cap = cap;
|
||||||
} else if (ci->i_auth_cap == cap) {
|
ci->i_cap_exporting_issued = 0;
|
||||||
ci->i_auth_cap = NULL;
|
} else {
|
||||||
spin_lock(&mdsc->cap_dirty_lock);
|
WARN_ON(ci->i_auth_cap == cap);
|
||||||
if (!list_empty(&ci->i_dirty_item)) {
|
|
||||||
dout(" moving %p to cap_dirty_migrating\n", inode);
|
|
||||||
list_move(&ci->i_dirty_item,
|
|
||||||
&mdsc->cap_dirty_migrating);
|
|
||||||
}
|
|
||||||
spin_unlock(&mdsc->cap_dirty_lock);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
|
dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
|
||||||
@ -628,7 +635,7 @@ retry:
|
|||||||
cap->cap_id = cap_id;
|
cap->cap_id = cap_id;
|
||||||
cap->issued = issued;
|
cap->issued = issued;
|
||||||
cap->implemented |= issued;
|
cap->implemented |= issued;
|
||||||
if (mseq > cap->mseq)
|
if (ceph_seq_cmp(mseq, cap->mseq) > 0)
|
||||||
cap->mds_wanted = wanted;
|
cap->mds_wanted = wanted;
|
||||||
else
|
else
|
||||||
cap->mds_wanted |= wanted;
|
cap->mds_wanted |= wanted;
|
||||||
@ -816,7 +823,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
|
|||||||
|
|
||||||
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
|
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
|
||||||
cap = rb_entry(p, struct ceph_cap, ci_node);
|
cap = rb_entry(p, struct ceph_cap, ci_node);
|
||||||
if (cap != ocap && __cap_is_valid(cap) &&
|
if (cap != ocap &&
|
||||||
(cap->implemented & ~cap->issued & mask))
|
(cap->implemented & ~cap->issued & mask))
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -888,7 +895,19 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
|
|||||||
*/
|
*/
|
||||||
static int __ceph_is_any_caps(struct ceph_inode_info *ci)
|
static int __ceph_is_any_caps(struct ceph_inode_info *ci)
|
||||||
{
|
{
|
||||||
return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
|
return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ceph_is_any_caps(struct inode *inode)
|
||||||
|
{
|
||||||
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
spin_lock(&ci->i_ceph_lock);
|
||||||
|
ret = __ceph_is_any_caps(ci);
|
||||||
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1383,13 +1402,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
|
|||||||
ci->i_snap_realm->cached_context);
|
ci->i_snap_realm->cached_context);
|
||||||
dout(" inode %p now dirty snapc %p auth cap %p\n",
|
dout(" inode %p now dirty snapc %p auth cap %p\n",
|
||||||
&ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
|
&ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
|
||||||
|
WARN_ON(!ci->i_auth_cap);
|
||||||
BUG_ON(!list_empty(&ci->i_dirty_item));
|
BUG_ON(!list_empty(&ci->i_dirty_item));
|
||||||
spin_lock(&mdsc->cap_dirty_lock);
|
spin_lock(&mdsc->cap_dirty_lock);
|
||||||
if (ci->i_auth_cap)
|
list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
|
||||||
list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
|
|
||||||
else
|
|
||||||
list_add(&ci->i_dirty_item,
|
|
||||||
&mdsc->cap_dirty_migrating);
|
|
||||||
spin_unlock(&mdsc->cap_dirty_lock);
|
spin_unlock(&mdsc->cap_dirty_lock);
|
||||||
if (ci->i_flushing_caps == 0) {
|
if (ci->i_flushing_caps == 0) {
|
||||||
ihold(inode);
|
ihold(inode);
|
||||||
@ -1735,13 +1751,12 @@ ack:
|
|||||||
/*
|
/*
|
||||||
* Try to flush dirty caps back to the auth mds.
|
* Try to flush dirty caps back to the auth mds.
|
||||||
*/
|
*/
|
||||||
static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
|
static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
|
||||||
unsigned *flush_tid)
|
|
||||||
{
|
{
|
||||||
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
int unlock_session = session ? 0 : 1;
|
|
||||||
int flushing = 0;
|
int flushing = 0;
|
||||||
|
struct ceph_mds_session *session = NULL;
|
||||||
|
|
||||||
retry:
|
retry:
|
||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
@ -1755,13 +1770,14 @@ retry:
|
|||||||
int want = __ceph_caps_wanted(ci);
|
int want = __ceph_caps_wanted(ci);
|
||||||
int delayed;
|
int delayed;
|
||||||
|
|
||||||
if (!session) {
|
if (!session || session != cap->session) {
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
if (session)
|
||||||
|
mutex_unlock(&session->s_mutex);
|
||||||
session = cap->session;
|
session = cap->session;
|
||||||
mutex_lock(&session->s_mutex);
|
mutex_lock(&session->s_mutex);
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
}
|
||||||
BUG_ON(session != cap->session);
|
|
||||||
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
|
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
@ -1780,7 +1796,7 @@ retry:
|
|||||||
out:
|
out:
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
out_unlocked:
|
out_unlocked:
|
||||||
if (session && unlock_session)
|
if (session)
|
||||||
mutex_unlock(&session->s_mutex);
|
mutex_unlock(&session->s_mutex);
|
||||||
return flushing;
|
return flushing;
|
||||||
}
|
}
|
||||||
@ -1865,7 +1881,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
|
|||||||
return ret;
|
return ret;
|
||||||
mutex_lock(&inode->i_mutex);
|
mutex_lock(&inode->i_mutex);
|
||||||
|
|
||||||
dirty = try_flush_caps(inode, NULL, &flush_tid);
|
dirty = try_flush_caps(inode, &flush_tid);
|
||||||
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
|
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1900,7 +1916,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
|
|||||||
|
|
||||||
dout("write_inode %p wait=%d\n", inode, wait);
|
dout("write_inode %p wait=%d\n", inode, wait);
|
||||||
if (wait) {
|
if (wait) {
|
||||||
dirty = try_flush_caps(inode, NULL, &flush_tid);
|
dirty = try_flush_caps(inode, &flush_tid);
|
||||||
if (dirty)
|
if (dirty)
|
||||||
err = wait_event_interruptible(ci->i_cap_wq,
|
err = wait_event_interruptible(ci->i_cap_wq,
|
||||||
caps_are_flushed(inode, flush_tid));
|
caps_are_flushed(inode, flush_tid));
|
||||||
@ -2350,11 +2366,11 @@ static void invalidate_aliases(struct inode *inode)
|
|||||||
d_prune_aliases(inode);
|
d_prune_aliases(inode);
|
||||||
/*
|
/*
|
||||||
* For non-directory inode, d_find_alias() only returns
|
* For non-directory inode, d_find_alias() only returns
|
||||||
* connected dentry. After calling d_invalidate(), the
|
* hashed dentry. After calling d_invalidate(), the
|
||||||
* dentry become disconnected.
|
* dentry becomes unhashed.
|
||||||
*
|
*
|
||||||
* For directory inode, d_find_alias() can return
|
* For directory inode, d_find_alias() can return
|
||||||
* disconnected dentry. But directory inode should have
|
* unhashed dentry. But directory inode should have
|
||||||
* one alias at most.
|
* one alias at most.
|
||||||
*/
|
*/
|
||||||
while ((dn = d_find_alias(inode))) {
|
while ((dn = d_find_alias(inode))) {
|
||||||
@ -2408,6 +2424,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
|
|||||||
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
|
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
|
||||||
inode->i_size);
|
inode->i_size);
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* auth mds of the inode changed. we received the cap export message,
|
||||||
|
* but still haven't received the cap import message. handle_cap_export
|
||||||
|
* updated the new auth MDS' cap.
|
||||||
|
*
|
||||||
|
* "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
|
||||||
|
* that was sent before the cap import message. So don't remove caps.
|
||||||
|
*/
|
||||||
|
if (ceph_seq_cmp(seq, cap->seq) <= 0) {
|
||||||
|
WARN_ON(cap != ci->i_auth_cap);
|
||||||
|
WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
|
||||||
|
seq = cap->seq;
|
||||||
|
newcaps |= cap->issued;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If CACHE is being revoked, and we have no dirty buffers,
|
* If CACHE is being revoked, and we have no dirty buffers,
|
||||||
* try to invalidate (once). (If there are dirty buffers, we
|
* try to invalidate (once). (If there are dirty buffers, we
|
||||||
@ -2434,6 +2466,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
|
|||||||
issued |= implemented | __ceph_caps_dirty(ci);
|
issued |= implemented | __ceph_caps_dirty(ci);
|
||||||
|
|
||||||
cap->cap_gen = session->s_cap_gen;
|
cap->cap_gen = session->s_cap_gen;
|
||||||
|
cap->seq = seq;
|
||||||
|
|
||||||
__check_cap_issue(ci, cap, newcaps);
|
__check_cap_issue(ci, cap, newcaps);
|
||||||
|
|
||||||
@ -2464,6 +2497,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
|
|||||||
ceph_buffer_put(ci->i_xattrs.blob);
|
ceph_buffer_put(ci->i_xattrs.blob);
|
||||||
ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
|
ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
|
||||||
ci->i_xattrs.version = version;
|
ci->i_xattrs.version = version;
|
||||||
|
ceph_forget_all_cached_acls(inode);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2483,6 +2517,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
|
|||||||
le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
|
le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
|
||||||
&atime);
|
&atime);
|
||||||
|
|
||||||
|
|
||||||
|
/* file layout may have changed */
|
||||||
|
ci->i_layout = grant->layout;
|
||||||
|
|
||||||
/* max size increase? */
|
/* max size increase? */
|
||||||
if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
|
if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
|
||||||
dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
|
dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
|
||||||
@ -2511,11 +2549,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
|
|||||||
check_caps = 1;
|
check_caps = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
cap->seq = seq;
|
|
||||||
|
|
||||||
/* file layout may have changed */
|
|
||||||
ci->i_layout = grant->layout;
|
|
||||||
|
|
||||||
/* revocation, grant, or no-op? */
|
/* revocation, grant, or no-op? */
|
||||||
if (cap->issued & ~newcaps) {
|
if (cap->issued & ~newcaps) {
|
||||||
int revoking = cap->issued & ~newcaps;
|
int revoking = cap->issued & ~newcaps;
|
||||||
@ -2741,65 +2774,114 @@ static void handle_cap_trunc(struct inode *inode,
|
|||||||
* caller holds s_mutex
|
* caller holds s_mutex
|
||||||
*/
|
*/
|
||||||
static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
|
static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
|
||||||
struct ceph_mds_session *session,
|
struct ceph_mds_cap_peer *ph,
|
||||||
int *open_target_sessions)
|
struct ceph_mds_session *session)
|
||||||
{
|
{
|
||||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
||||||
|
struct ceph_mds_session *tsession = NULL;
|
||||||
|
struct ceph_cap *cap, *tcap;
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
int mds = session->s_mds;
|
u64 t_cap_id;
|
||||||
unsigned mseq = le32_to_cpu(ex->migrate_seq);
|
unsigned mseq = le32_to_cpu(ex->migrate_seq);
|
||||||
struct ceph_cap *cap = NULL, *t;
|
unsigned t_seq, t_mseq;
|
||||||
struct rb_node *p;
|
int target, issued;
|
||||||
int remember = 1;
|
int mds = session->s_mds;
|
||||||
|
|
||||||
dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
|
if (ph) {
|
||||||
inode, ci, mds, mseq);
|
t_cap_id = le64_to_cpu(ph->cap_id);
|
||||||
|
t_seq = le32_to_cpu(ph->seq);
|
||||||
spin_lock(&ci->i_ceph_lock);
|
t_mseq = le32_to_cpu(ph->mseq);
|
||||||
|
target = le32_to_cpu(ph->mds);
|
||||||
/* make sure we haven't seen a higher mseq */
|
} else {
|
||||||
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
|
t_cap_id = t_seq = t_mseq = 0;
|
||||||
t = rb_entry(p, struct ceph_cap, ci_node);
|
target = -1;
|
||||||
if (ceph_seq_cmp(t->mseq, mseq) > 0) {
|
|
||||||
dout(" higher mseq on cap from mds%d\n",
|
|
||||||
t->session->s_mds);
|
|
||||||
remember = 0;
|
|
||||||
}
|
|
||||||
if (t->session->s_mds == mds)
|
|
||||||
cap = t;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cap) {
|
dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
|
||||||
if (remember) {
|
inode, ci, mds, mseq, target);
|
||||||
/* make note */
|
retry:
|
||||||
ci->i_cap_exporting_mds = mds;
|
spin_lock(&ci->i_ceph_lock);
|
||||||
ci->i_cap_exporting_mseq = mseq;
|
cap = __get_cap_for_mds(ci, mds);
|
||||||
ci->i_cap_exporting_issued = cap->issued;
|
if (!cap)
|
||||||
|
goto out_unlock;
|
||||||
|
|
||||||
/*
|
if (target < 0) {
|
||||||
* make sure we have open sessions with all possible
|
__ceph_remove_cap(cap, false);
|
||||||
* export targets, so that we get the matching IMPORT
|
goto out_unlock;
|
||||||
*/
|
}
|
||||||
*open_target_sessions = 1;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* we can't flush dirty caps that we've seen the
|
* now we know we haven't received the cap import message yet
|
||||||
* EXPORT but no IMPORT for
|
* because the exported cap still exists.
|
||||||
*/
|
*/
|
||||||
spin_lock(&mdsc->cap_dirty_lock);
|
|
||||||
if (!list_empty(&ci->i_dirty_item)) {
|
issued = cap->issued;
|
||||||
dout(" moving %p to cap_dirty_migrating\n",
|
WARN_ON(issued != cap->implemented);
|
||||||
inode);
|
|
||||||
list_move(&ci->i_dirty_item,
|
tcap = __get_cap_for_mds(ci, target);
|
||||||
&mdsc->cap_dirty_migrating);
|
if (tcap) {
|
||||||
|
/* already have caps from the target */
|
||||||
|
if (tcap->cap_id != t_cap_id ||
|
||||||
|
ceph_seq_cmp(tcap->seq, t_seq) < 0) {
|
||||||
|
dout(" updating import cap %p mds%d\n", tcap, target);
|
||||||
|
tcap->cap_id = t_cap_id;
|
||||||
|
tcap->seq = t_seq - 1;
|
||||||
|
tcap->issue_seq = t_seq - 1;
|
||||||
|
tcap->mseq = t_mseq;
|
||||||
|
tcap->issued |= issued;
|
||||||
|
tcap->implemented |= issued;
|
||||||
|
if (cap == ci->i_auth_cap)
|
||||||
|
ci->i_auth_cap = tcap;
|
||||||
|
if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
|
||||||
|
spin_lock(&mdsc->cap_dirty_lock);
|
||||||
|
list_move_tail(&ci->i_flushing_item,
|
||||||
|
&tcap->session->s_cap_flushing);
|
||||||
|
spin_unlock(&mdsc->cap_dirty_lock);
|
||||||
}
|
}
|
||||||
spin_unlock(&mdsc->cap_dirty_lock);
|
|
||||||
}
|
}
|
||||||
__ceph_remove_cap(cap, false);
|
__ceph_remove_cap(cap, false);
|
||||||
|
goto out_unlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tsession) {
|
||||||
|
int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
|
||||||
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
/* add placeholder for the export target */
|
||||||
|
ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
|
||||||
|
t_seq - 1, t_mseq, (u64)-1, flag, NULL);
|
||||||
|
goto retry;
|
||||||
}
|
}
|
||||||
/* else, we already released it */
|
|
||||||
|
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
mutex_unlock(&session->s_mutex);
|
||||||
|
|
||||||
|
/* open target session */
|
||||||
|
tsession = ceph_mdsc_open_export_target_session(mdsc, target);
|
||||||
|
if (!IS_ERR(tsession)) {
|
||||||
|
if (mds > target) {
|
||||||
|
mutex_lock(&session->s_mutex);
|
||||||
|
mutex_lock_nested(&tsession->s_mutex,
|
||||||
|
SINGLE_DEPTH_NESTING);
|
||||||
|
} else {
|
||||||
|
mutex_lock(&tsession->s_mutex);
|
||||||
|
mutex_lock_nested(&session->s_mutex,
|
||||||
|
SINGLE_DEPTH_NESTING);
|
||||||
|
}
|
||||||
|
ceph_add_cap_releases(mdsc, tsession);
|
||||||
|
} else {
|
||||||
|
WARN_ON(1);
|
||||||
|
tsession = NULL;
|
||||||
|
target = -1;
|
||||||
|
}
|
||||||
|
goto retry;
|
||||||
|
|
||||||
|
out_unlock:
|
||||||
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
mutex_unlock(&session->s_mutex);
|
||||||
|
if (tsession) {
|
||||||
|
mutex_unlock(&tsession->s_mutex);
|
||||||
|
ceph_put_mds_session(tsession);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -2810,10 +2892,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
|
|||||||
*/
|
*/
|
||||||
static void handle_cap_import(struct ceph_mds_client *mdsc,
|
static void handle_cap_import(struct ceph_mds_client *mdsc,
|
||||||
struct inode *inode, struct ceph_mds_caps *im,
|
struct inode *inode, struct ceph_mds_caps *im,
|
||||||
|
struct ceph_mds_cap_peer *ph,
|
||||||
struct ceph_mds_session *session,
|
struct ceph_mds_session *session,
|
||||||
void *snaptrace, int snaptrace_len)
|
void *snaptrace, int snaptrace_len)
|
||||||
{
|
{
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
struct ceph_cap *cap;
|
||||||
int mds = session->s_mds;
|
int mds = session->s_mds;
|
||||||
unsigned issued = le32_to_cpu(im->caps);
|
unsigned issued = le32_to_cpu(im->caps);
|
||||||
unsigned wanted = le32_to_cpu(im->wanted);
|
unsigned wanted = le32_to_cpu(im->wanted);
|
||||||
@ -2821,28 +2905,44 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
|
|||||||
unsigned mseq = le32_to_cpu(im->migrate_seq);
|
unsigned mseq = le32_to_cpu(im->migrate_seq);
|
||||||
u64 realmino = le64_to_cpu(im->realm);
|
u64 realmino = le64_to_cpu(im->realm);
|
||||||
u64 cap_id = le64_to_cpu(im->cap_id);
|
u64 cap_id = le64_to_cpu(im->cap_id);
|
||||||
|
u64 p_cap_id;
|
||||||
|
int peer;
|
||||||
|
|
||||||
if (ci->i_cap_exporting_mds >= 0 &&
|
if (ph) {
|
||||||
ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
|
p_cap_id = le64_to_cpu(ph->cap_id);
|
||||||
dout("handle_cap_import inode %p ci %p mds%d mseq %d"
|
peer = le32_to_cpu(ph->mds);
|
||||||
" - cleared exporting from mds%d\n",
|
|
||||||
inode, ci, mds, mseq,
|
|
||||||
ci->i_cap_exporting_mds);
|
|
||||||
ci->i_cap_exporting_issued = 0;
|
|
||||||
ci->i_cap_exporting_mseq = 0;
|
|
||||||
ci->i_cap_exporting_mds = -1;
|
|
||||||
|
|
||||||
spin_lock(&mdsc->cap_dirty_lock);
|
|
||||||
if (!list_empty(&ci->i_dirty_item)) {
|
|
||||||
dout(" moving %p back to cap_dirty\n", inode);
|
|
||||||
list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
|
|
||||||
}
|
|
||||||
spin_unlock(&mdsc->cap_dirty_lock);
|
|
||||||
} else {
|
} else {
|
||||||
dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
|
p_cap_id = 0;
|
||||||
inode, ci, mds, mseq);
|
peer = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
|
||||||
|
inode, ci, mds, mseq, peer);
|
||||||
|
|
||||||
|
spin_lock(&ci->i_ceph_lock);
|
||||||
|
cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
|
||||||
|
if (cap && cap->cap_id == p_cap_id) {
|
||||||
|
dout(" remove export cap %p mds%d flags %d\n",
|
||||||
|
cap, peer, ph->flags);
|
||||||
|
if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
|
||||||
|
(cap->seq != le32_to_cpu(ph->seq) ||
|
||||||
|
cap->mseq != le32_to_cpu(ph->mseq))) {
|
||||||
|
pr_err("handle_cap_import: mismatched seq/mseq: "
|
||||||
|
"ino (%llx.%llx) mds%d seq %d mseq %d "
|
||||||
|
"importer mds%d has peer seq %d mseq %d\n",
|
||||||
|
ceph_vinop(inode), peer, cap->seq,
|
||||||
|
cap->mseq, mds, le32_to_cpu(ph->seq),
|
||||||
|
le32_to_cpu(ph->mseq));
|
||||||
|
}
|
||||||
|
ci->i_cap_exporting_issued = cap->issued;
|
||||||
|
__ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* make sure we re-request max_size, if necessary */
|
||||||
|
ci->i_wanted_max_size = 0;
|
||||||
|
ci->i_requested_max_size = 0;
|
||||||
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
|
||||||
down_write(&mdsc->snap_rwsem);
|
down_write(&mdsc->snap_rwsem);
|
||||||
ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
|
ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
|
||||||
false);
|
false);
|
||||||
@ -2853,11 +2953,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
|
|||||||
kick_flushing_inode_caps(mdsc, session, inode);
|
kick_flushing_inode_caps(mdsc, session, inode);
|
||||||
up_read(&mdsc->snap_rwsem);
|
up_read(&mdsc->snap_rwsem);
|
||||||
|
|
||||||
/* make sure we re-request max_size, if necessary */
|
|
||||||
spin_lock(&ci->i_ceph_lock);
|
|
||||||
ci->i_wanted_max_size = 0; /* reset */
|
|
||||||
ci->i_requested_max_size = 0;
|
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -2875,6 +2970,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
|
|||||||
struct ceph_inode_info *ci;
|
struct ceph_inode_info *ci;
|
||||||
struct ceph_cap *cap;
|
struct ceph_cap *cap;
|
||||||
struct ceph_mds_caps *h;
|
struct ceph_mds_caps *h;
|
||||||
|
struct ceph_mds_cap_peer *peer = NULL;
|
||||||
int mds = session->s_mds;
|
int mds = session->s_mds;
|
||||||
int op;
|
int op;
|
||||||
u32 seq, mseq;
|
u32 seq, mseq;
|
||||||
@ -2885,12 +2981,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
|
|||||||
void *snaptrace;
|
void *snaptrace;
|
||||||
size_t snaptrace_len;
|
size_t snaptrace_len;
|
||||||
void *flock;
|
void *flock;
|
||||||
|
void *end;
|
||||||
u32 flock_len;
|
u32 flock_len;
|
||||||
int open_target_sessions = 0;
|
|
||||||
|
|
||||||
dout("handle_caps from mds%d\n", mds);
|
dout("handle_caps from mds%d\n", mds);
|
||||||
|
|
||||||
/* decode */
|
/* decode */
|
||||||
|
end = msg->front.iov_base + msg->front.iov_len;
|
||||||
tid = le64_to_cpu(msg->hdr.tid);
|
tid = le64_to_cpu(msg->hdr.tid);
|
||||||
if (msg->front.iov_len < sizeof(*h))
|
if (msg->front.iov_len < sizeof(*h))
|
||||||
goto bad;
|
goto bad;
|
||||||
@ -2908,17 +3005,28 @@ void ceph_handle_caps(struct ceph_mds_session *session,
|
|||||||
snaptrace_len = le32_to_cpu(h->snap_trace_len);
|
snaptrace_len = le32_to_cpu(h->snap_trace_len);
|
||||||
|
|
||||||
if (le16_to_cpu(msg->hdr.version) >= 2) {
|
if (le16_to_cpu(msg->hdr.version) >= 2) {
|
||||||
void *p, *end;
|
void *p = snaptrace + snaptrace_len;
|
||||||
|
|
||||||
p = snaptrace + snaptrace_len;
|
|
||||||
end = msg->front.iov_base + msg->front.iov_len;
|
|
||||||
ceph_decode_32_safe(&p, end, flock_len, bad);
|
ceph_decode_32_safe(&p, end, flock_len, bad);
|
||||||
|
if (p + flock_len > end)
|
||||||
|
goto bad;
|
||||||
flock = p;
|
flock = p;
|
||||||
} else {
|
} else {
|
||||||
flock = NULL;
|
flock = NULL;
|
||||||
flock_len = 0;
|
flock_len = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (le16_to_cpu(msg->hdr.version) >= 3) {
|
||||||
|
if (op == CEPH_CAP_OP_IMPORT) {
|
||||||
|
void *p = flock + flock_len;
|
||||||
|
if (p + sizeof(*peer) > end)
|
||||||
|
goto bad;
|
||||||
|
peer = p;
|
||||||
|
} else if (op == CEPH_CAP_OP_EXPORT) {
|
||||||
|
/* recorded in unused fields */
|
||||||
|
peer = (void *)&h->size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
mutex_lock(&session->s_mutex);
|
mutex_lock(&session->s_mutex);
|
||||||
session->s_seq++;
|
session->s_seq++;
|
||||||
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
|
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
|
||||||
@ -2951,11 +3059,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
|
|||||||
goto done;
|
goto done;
|
||||||
|
|
||||||
case CEPH_CAP_OP_EXPORT:
|
case CEPH_CAP_OP_EXPORT:
|
||||||
handle_cap_export(inode, h, session, &open_target_sessions);
|
handle_cap_export(inode, h, peer, session);
|
||||||
goto done;
|
goto done_unlocked;
|
||||||
|
|
||||||
case CEPH_CAP_OP_IMPORT:
|
case CEPH_CAP_OP_IMPORT:
|
||||||
handle_cap_import(mdsc, inode, h, session,
|
handle_cap_import(mdsc, inode, h, peer, session,
|
||||||
snaptrace, snaptrace_len);
|
snaptrace, snaptrace_len);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3007,8 +3115,6 @@ done:
|
|||||||
done_unlocked:
|
done_unlocked:
|
||||||
if (inode)
|
if (inode)
|
||||||
iput(inode);
|
iput(inode);
|
||||||
if (open_target_sessions)
|
|
||||||
ceph_mdsc_open_export_target_sessions(mdsc, session);
|
|
||||||
return;
|
return;
|
||||||
|
|
||||||
bad:
|
bad:
|
||||||
|
@ -693,6 +693,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
|
|||||||
if (!err && !req->r_reply_info.head->is_dentry)
|
if (!err && !req->r_reply_info.head->is_dentry)
|
||||||
err = ceph_handle_notrace_create(dir, dentry);
|
err = ceph_handle_notrace_create(dir, dentry);
|
||||||
ceph_mdsc_put_request(req);
|
ceph_mdsc_put_request(req);
|
||||||
|
|
||||||
|
if (!err)
|
||||||
|
err = ceph_init_acl(dentry, dentry->d_inode, dir);
|
||||||
|
|
||||||
if (err)
|
if (err)
|
||||||
d_drop(dentry);
|
d_drop(dentry);
|
||||||
return err;
|
return err;
|
||||||
@ -1037,14 +1041,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
|
|||||||
valid = 1;
|
valid = 1;
|
||||||
} else if (dentry_lease_is_valid(dentry) ||
|
} else if (dentry_lease_is_valid(dentry) ||
|
||||||
dir_lease_is_valid(dir, dentry)) {
|
dir_lease_is_valid(dir, dentry)) {
|
||||||
valid = 1;
|
if (dentry->d_inode)
|
||||||
|
valid = ceph_is_any_caps(dentry->d_inode);
|
||||||
|
else
|
||||||
|
valid = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
|
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
|
||||||
if (valid)
|
if (valid) {
|
||||||
ceph_dentry_lru_touch(dentry);
|
ceph_dentry_lru_touch(dentry);
|
||||||
else
|
} else {
|
||||||
|
ceph_dir_clear_complete(dir);
|
||||||
d_drop(dentry);
|
d_drop(dentry);
|
||||||
|
}
|
||||||
iput(dir);
|
iput(dir);
|
||||||
return valid;
|
return valid;
|
||||||
}
|
}
|
||||||
@ -1293,6 +1302,7 @@ const struct inode_operations ceph_dir_iops = {
|
|||||||
.getxattr = ceph_getxattr,
|
.getxattr = ceph_getxattr,
|
||||||
.listxattr = ceph_listxattr,
|
.listxattr = ceph_listxattr,
|
||||||
.removexattr = ceph_removexattr,
|
.removexattr = ceph_removexattr,
|
||||||
|
.get_acl = ceph_get_acl,
|
||||||
.mknod = ceph_mknod,
|
.mknod = ceph_mknod,
|
||||||
.symlink = ceph_symlink,
|
.symlink = ceph_symlink,
|
||||||
.mkdir = ceph_mkdir,
|
.mkdir = ceph_mkdir,
|
||||||
|
435
fs/ceph/file.c
435
fs/ceph/file.c
@ -408,51 +408,92 @@ more:
|
|||||||
*
|
*
|
||||||
* If the read spans object boundary, just do multiple reads.
|
* If the read spans object boundary, just do multiple reads.
|
||||||
*/
|
*/
|
||||||
static ssize_t ceph_sync_read(struct file *file, char __user *data,
|
static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
|
||||||
unsigned len, loff_t *poff, int *checkeof)
|
int *checkeof)
|
||||||
{
|
{
|
||||||
|
struct file *file = iocb->ki_filp;
|
||||||
struct inode *inode = file_inode(file);
|
struct inode *inode = file_inode(file);
|
||||||
struct page **pages;
|
struct page **pages;
|
||||||
u64 off = *poff;
|
u64 off = iocb->ki_pos;
|
||||||
int num_pages, ret;
|
int num_pages, ret;
|
||||||
|
size_t len = i->count;
|
||||||
|
|
||||||
dout("sync_read on file %p %llu~%u %s\n", file, off, len,
|
dout("sync_read on file %p %llu~%u %s\n", file, off,
|
||||||
|
(unsigned)len,
|
||||||
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
|
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
|
||||||
|
|
||||||
if (file->f_flags & O_DIRECT) {
|
|
||||||
num_pages = calc_pages_for((unsigned long)data, len);
|
|
||||||
pages = ceph_get_direct_page_vector(data, num_pages, true);
|
|
||||||
} else {
|
|
||||||
num_pages = calc_pages_for(off, len);
|
|
||||||
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
|
|
||||||
}
|
|
||||||
if (IS_ERR(pages))
|
|
||||||
return PTR_ERR(pages);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* flush any page cache pages in this range. this
|
* flush any page cache pages in this range. this
|
||||||
* will make concurrent normal and sync io slow,
|
* will make concurrent normal and sync io slow,
|
||||||
* but it will at least behave sensibly when they are
|
* but it will at least behave sensibly when they are
|
||||||
* in sequence.
|
* in sequence.
|
||||||
*/
|
*/
|
||||||
ret = filemap_write_and_wait(inode->i_mapping);
|
ret = filemap_write_and_wait_range(inode->i_mapping, off,
|
||||||
|
off + len);
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
goto done;
|
return ret;
|
||||||
|
|
||||||
ret = striped_read(inode, off, len, pages, num_pages, checkeof,
|
if (file->f_flags & O_DIRECT) {
|
||||||
file->f_flags & O_DIRECT,
|
while (iov_iter_count(i)) {
|
||||||
(unsigned long)data & ~PAGE_MASK);
|
void __user *data = i->iov[0].iov_base + i->iov_offset;
|
||||||
|
size_t len = i->iov[0].iov_len - i->iov_offset;
|
||||||
|
|
||||||
if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
|
num_pages = calc_pages_for((unsigned long)data, len);
|
||||||
ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
|
pages = ceph_get_direct_page_vector(data,
|
||||||
if (ret >= 0)
|
num_pages, true);
|
||||||
*poff = off + ret;
|
if (IS_ERR(pages))
|
||||||
|
return PTR_ERR(pages);
|
||||||
|
|
||||||
done:
|
ret = striped_read(inode, off, len,
|
||||||
if (file->f_flags & O_DIRECT)
|
pages, num_pages, checkeof,
|
||||||
ceph_put_page_vector(pages, num_pages, true);
|
1, (unsigned long)data & ~PAGE_MASK);
|
||||||
else
|
ceph_put_page_vector(pages, num_pages, true);
|
||||||
|
|
||||||
|
if (ret <= 0)
|
||||||
|
break;
|
||||||
|
off += ret;
|
||||||
|
iov_iter_advance(i, ret);
|
||||||
|
if (ret < len)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
num_pages = calc_pages_for(off, len);
|
||||||
|
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
|
||||||
|
if (IS_ERR(pages))
|
||||||
|
return PTR_ERR(pages);
|
||||||
|
ret = striped_read(inode, off, len, pages,
|
||||||
|
num_pages, checkeof, 0, 0);
|
||||||
|
if (ret > 0) {
|
||||||
|
int l, k = 0;
|
||||||
|
size_t left = len = ret;
|
||||||
|
|
||||||
|
while (left) {
|
||||||
|
void __user *data = i->iov[0].iov_base
|
||||||
|
+ i->iov_offset;
|
||||||
|
l = min(i->iov[0].iov_len - i->iov_offset,
|
||||||
|
left);
|
||||||
|
|
||||||
|
ret = ceph_copy_page_vector_to_user(&pages[k],
|
||||||
|
data, off,
|
||||||
|
l);
|
||||||
|
if (ret > 0) {
|
||||||
|
iov_iter_advance(i, ret);
|
||||||
|
left -= ret;
|
||||||
|
off += ret;
|
||||||
|
k = calc_pages_for(iocb->ki_pos,
|
||||||
|
len - left + 1) - 1;
|
||||||
|
BUG_ON(k >= num_pages && left);
|
||||||
|
} else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
ceph_release_page_vector(pages, num_pages);
|
ceph_release_page_vector(pages, num_pages);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (off > iocb->ki_pos) {
|
||||||
|
ret = off - iocb->ki_pos;
|
||||||
|
iocb->ki_pos = off;
|
||||||
|
}
|
||||||
|
|
||||||
dout("sync_read result %d\n", ret);
|
dout("sync_read result %d\n", ret);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@ -489,83 +530,79 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Synchronous write, straight from __user pointer or user pages (if
|
* Synchronous write, straight from __user pointer or user pages.
|
||||||
* O_DIRECT).
|
|
||||||
*
|
*
|
||||||
* If write spans object boundary, just do multiple writes. (For a
|
* If write spans object boundary, just do multiple writes. (For a
|
||||||
* correct atomic write, we should e.g. take write locks on all
|
* correct atomic write, we should e.g. take write locks on all
|
||||||
* objects, rollback on failure, etc.)
|
* objects, rollback on failure, etc.)
|
||||||
*/
|
*/
|
||||||
static ssize_t ceph_sync_write(struct file *file, const char __user *data,
|
static ssize_t
|
||||||
size_t left, loff_t pos, loff_t *ppos)
|
ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
|
||||||
|
unsigned long nr_segs, size_t count)
|
||||||
{
|
{
|
||||||
|
struct file *file = iocb->ki_filp;
|
||||||
struct inode *inode = file_inode(file);
|
struct inode *inode = file_inode(file);
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||||
struct ceph_snap_context *snapc;
|
struct ceph_snap_context *snapc;
|
||||||
struct ceph_vino vino;
|
struct ceph_vino vino;
|
||||||
struct ceph_osd_request *req;
|
struct ceph_osd_request *req;
|
||||||
int num_ops = 1;
|
|
||||||
struct page **pages;
|
struct page **pages;
|
||||||
int num_pages;
|
int num_pages;
|
||||||
u64 len;
|
|
||||||
int written = 0;
|
int written = 0;
|
||||||
int flags;
|
int flags;
|
||||||
int check_caps = 0;
|
int check_caps = 0;
|
||||||
int page_align, io_align;
|
int page_align;
|
||||||
unsigned long buf_align;
|
|
||||||
int ret;
|
int ret;
|
||||||
struct timespec mtime = CURRENT_TIME;
|
struct timespec mtime = CURRENT_TIME;
|
||||||
bool own_pages = false;
|
loff_t pos = iocb->ki_pos;
|
||||||
|
struct iov_iter i;
|
||||||
|
|
||||||
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
|
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
|
||||||
return -EROFS;
|
return -EROFS;
|
||||||
|
|
||||||
dout("sync_write on file %p %lld~%u %s\n", file, pos,
|
dout("sync_direct_write on file %p %lld~%u\n", file, pos,
|
||||||
(unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
|
(unsigned)count);
|
||||||
|
|
||||||
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
|
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
ret = invalidate_inode_pages2_range(inode->i_mapping,
|
ret = invalidate_inode_pages2_range(inode->i_mapping,
|
||||||
pos >> PAGE_CACHE_SHIFT,
|
pos >> PAGE_CACHE_SHIFT,
|
||||||
(pos + left) >> PAGE_CACHE_SHIFT);
|
(pos + count) >> PAGE_CACHE_SHIFT);
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
dout("invalidate_inode_pages2_range returned %d\n", ret);
|
dout("invalidate_inode_pages2_range returned %d\n", ret);
|
||||||
|
|
||||||
flags = CEPH_OSD_FLAG_ORDERSNAP |
|
flags = CEPH_OSD_FLAG_ORDERSNAP |
|
||||||
CEPH_OSD_FLAG_ONDISK |
|
CEPH_OSD_FLAG_ONDISK |
|
||||||
CEPH_OSD_FLAG_WRITE;
|
CEPH_OSD_FLAG_WRITE;
|
||||||
if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
|
|
||||||
flags |= CEPH_OSD_FLAG_ACK;
|
|
||||||
else
|
|
||||||
num_ops++; /* Also include a 'startsync' command. */
|
|
||||||
|
|
||||||
/*
|
iov_iter_init(&i, iov, nr_segs, count, 0);
|
||||||
* we may need to do multiple writes here if we span an object
|
|
||||||
* boundary. this isn't atomic, unfortunately. :(
|
|
||||||
*/
|
|
||||||
more:
|
|
||||||
io_align = pos & ~PAGE_MASK;
|
|
||||||
buf_align = (unsigned long)data & ~PAGE_MASK;
|
|
||||||
len = left;
|
|
||||||
|
|
||||||
snapc = ci->i_snap_realm->cached_context;
|
while (iov_iter_count(&i) > 0) {
|
||||||
vino = ceph_vino(inode);
|
void __user *data = i.iov->iov_base + i.iov_offset;
|
||||||
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
|
u64 len = i.iov->iov_len - i.iov_offset;
|
||||||
vino, pos, &len, num_ops,
|
|
||||||
CEPH_OSD_OP_WRITE, flags, snapc,
|
|
||||||
ci->i_truncate_seq, ci->i_truncate_size,
|
|
||||||
false);
|
|
||||||
if (IS_ERR(req))
|
|
||||||
return PTR_ERR(req);
|
|
||||||
|
|
||||||
/* write from beginning of first page, regardless of io alignment */
|
page_align = (unsigned long)data & ~PAGE_MASK;
|
||||||
page_align = file->f_flags & O_DIRECT ? buf_align : io_align;
|
|
||||||
num_pages = calc_pages_for(page_align, len);
|
snapc = ci->i_snap_realm->cached_context;
|
||||||
if (file->f_flags & O_DIRECT) {
|
vino = ceph_vino(inode);
|
||||||
|
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
|
||||||
|
vino, pos, &len,
|
||||||
|
2,/*include a 'startsync' command*/
|
||||||
|
CEPH_OSD_OP_WRITE, flags, snapc,
|
||||||
|
ci->i_truncate_seq,
|
||||||
|
ci->i_truncate_size,
|
||||||
|
false);
|
||||||
|
if (IS_ERR(req)) {
|
||||||
|
ret = PTR_ERR(req);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
num_pages = calc_pages_for(page_align, len);
|
||||||
pages = ceph_get_direct_page_vector(data, num_pages, false);
|
pages = ceph_get_direct_page_vector(data, num_pages, false);
|
||||||
if (IS_ERR(pages)) {
|
if (IS_ERR(pages)) {
|
||||||
ret = PTR_ERR(pages);
|
ret = PTR_ERR(pages);
|
||||||
@ -577,60 +614,175 @@ more:
|
|||||||
* may block.
|
* may block.
|
||||||
*/
|
*/
|
||||||
truncate_inode_pages_range(inode->i_mapping, pos,
|
truncate_inode_pages_range(inode->i_mapping, pos,
|
||||||
(pos+len) | (PAGE_CACHE_SIZE-1));
|
(pos+len) | (PAGE_CACHE_SIZE-1));
|
||||||
} else {
|
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
|
||||||
|
false, false);
|
||||||
|
|
||||||
|
/* BUG_ON(vino.snap != CEPH_NOSNAP); */
|
||||||
|
ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
|
||||||
|
|
||||||
|
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
|
||||||
|
if (!ret)
|
||||||
|
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
|
||||||
|
|
||||||
|
ceph_put_page_vector(pages, num_pages, false);
|
||||||
|
|
||||||
|
out:
|
||||||
|
ceph_osdc_put_request(req);
|
||||||
|
if (ret == 0) {
|
||||||
|
pos += len;
|
||||||
|
written += len;
|
||||||
|
iov_iter_advance(&i, (size_t)len);
|
||||||
|
|
||||||
|
if (pos > i_size_read(inode)) {
|
||||||
|
check_caps = ceph_inode_set_size(inode, pos);
|
||||||
|
if (check_caps)
|
||||||
|
ceph_check_caps(ceph_inode(inode),
|
||||||
|
CHECK_CAPS_AUTHONLY,
|
||||||
|
NULL);
|
||||||
|
}
|
||||||
|
} else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ret != -EOLDSNAPC && written > 0) {
|
||||||
|
iocb->ki_pos = pos;
|
||||||
|
ret = written;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Synchronous write, straight from __user pointer or user pages.
|
||||||
|
*
|
||||||
|
* If write spans object boundary, just do multiple writes. (For a
|
||||||
|
* correct atomic write, we should e.g. take write locks on all
|
||||||
|
* objects, rollback on failure, etc.)
|
||||||
|
*/
|
||||||
|
static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
|
||||||
|
unsigned long nr_segs, size_t count)
|
||||||
|
{
|
||||||
|
struct file *file = iocb->ki_filp;
|
||||||
|
struct inode *inode = file_inode(file);
|
||||||
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||||
|
struct ceph_snap_context *snapc;
|
||||||
|
struct ceph_vino vino;
|
||||||
|
struct ceph_osd_request *req;
|
||||||
|
struct page **pages;
|
||||||
|
u64 len;
|
||||||
|
int num_pages;
|
||||||
|
int written = 0;
|
||||||
|
int flags;
|
||||||
|
int check_caps = 0;
|
||||||
|
int ret;
|
||||||
|
struct timespec mtime = CURRENT_TIME;
|
||||||
|
loff_t pos = iocb->ki_pos;
|
||||||
|
struct iov_iter i;
|
||||||
|
|
||||||
|
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
|
||||||
|
return -EROFS;
|
||||||
|
|
||||||
|
dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count);
|
||||||
|
|
||||||
|
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
|
||||||
|
if (ret < 0)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
ret = invalidate_inode_pages2_range(inode->i_mapping,
|
||||||
|
pos >> PAGE_CACHE_SHIFT,
|
||||||
|
(pos + count) >> PAGE_CACHE_SHIFT);
|
||||||
|
if (ret < 0)
|
||||||
|
dout("invalidate_inode_pages2_range returned %d\n", ret);
|
||||||
|
|
||||||
|
flags = CEPH_OSD_FLAG_ORDERSNAP |
|
||||||
|
CEPH_OSD_FLAG_ONDISK |
|
||||||
|
CEPH_OSD_FLAG_WRITE |
|
||||||
|
CEPH_OSD_FLAG_ACK;
|
||||||
|
|
||||||
|
iov_iter_init(&i, iov, nr_segs, count, 0);
|
||||||
|
|
||||||
|
while ((len = iov_iter_count(&i)) > 0) {
|
||||||
|
size_t left;
|
||||||
|
int n;
|
||||||
|
|
||||||
|
snapc = ci->i_snap_realm->cached_context;
|
||||||
|
vino = ceph_vino(inode);
|
||||||
|
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
|
||||||
|
vino, pos, &len, 1,
|
||||||
|
CEPH_OSD_OP_WRITE, flags, snapc,
|
||||||
|
ci->i_truncate_seq,
|
||||||
|
ci->i_truncate_size,
|
||||||
|
false);
|
||||||
|
if (IS_ERR(req)) {
|
||||||
|
ret = PTR_ERR(req);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* write from beginning of first page,
|
||||||
|
* regardless of io alignment
|
||||||
|
*/
|
||||||
|
num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
|
||||||
|
|
||||||
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
|
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
|
||||||
if (IS_ERR(pages)) {
|
if (IS_ERR(pages)) {
|
||||||
ret = PTR_ERR(pages);
|
ret = PTR_ERR(pages);
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
|
|
||||||
|
left = len;
|
||||||
|
for (n = 0; n < num_pages; n++) {
|
||||||
|
size_t plen = min_t(size_t, left, PAGE_SIZE);
|
||||||
|
ret = iov_iter_copy_from_user(pages[n], &i, 0, plen);
|
||||||
|
if (ret != plen) {
|
||||||
|
ret = -EFAULT;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
left -= ret;
|
||||||
|
iov_iter_advance(&i, ret);
|
||||||
|
}
|
||||||
|
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
ceph_release_page_vector(pages, num_pages);
|
ceph_release_page_vector(pages, num_pages);
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((file->f_flags & O_SYNC) == 0) {
|
/* get a second commit callback */
|
||||||
/* get a second commit callback */
|
req->r_unsafe_callback = ceph_sync_write_unsafe;
|
||||||
req->r_unsafe_callback = ceph_sync_write_unsafe;
|
req->r_inode = inode;
|
||||||
req->r_inode = inode;
|
|
||||||
own_pages = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
|
|
||||||
false, own_pages);
|
|
||||||
|
|
||||||
/* BUG_ON(vino.snap != CEPH_NOSNAP); */
|
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
|
||||||
ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
|
false, true);
|
||||||
|
|
||||||
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
|
/* BUG_ON(vino.snap != CEPH_NOSNAP); */
|
||||||
if (!ret)
|
ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
|
||||||
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
|
|
||||||
|
|
||||||
if (file->f_flags & O_DIRECT)
|
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
|
||||||
ceph_put_page_vector(pages, num_pages, false);
|
if (!ret)
|
||||||
else if (file->f_flags & O_SYNC)
|
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
|
||||||
ceph_release_page_vector(pages, num_pages);
|
|
||||||
|
|
||||||
out:
|
out:
|
||||||
ceph_osdc_put_request(req);
|
ceph_osdc_put_request(req);
|
||||||
if (ret == 0) {
|
if (ret == 0) {
|
||||||
pos += len;
|
pos += len;
|
||||||
written += len;
|
written += len;
|
||||||
left -= len;
|
|
||||||
data += len;
|
|
||||||
if (left)
|
|
||||||
goto more;
|
|
||||||
|
|
||||||
|
if (pos > i_size_read(inode)) {
|
||||||
|
check_caps = ceph_inode_set_size(inode, pos);
|
||||||
|
if (check_caps)
|
||||||
|
ceph_check_caps(ceph_inode(inode),
|
||||||
|
CHECK_CAPS_AUTHONLY,
|
||||||
|
NULL);
|
||||||
|
}
|
||||||
|
} else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ret != -EOLDSNAPC && written > 0) {
|
||||||
ret = written;
|
ret = written;
|
||||||
*ppos = pos;
|
iocb->ki_pos = pos;
|
||||||
if (pos > i_size_read(inode))
|
|
||||||
check_caps = ceph_inode_set_size(inode, pos);
|
|
||||||
if (check_caps)
|
|
||||||
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
|
|
||||||
NULL);
|
|
||||||
} else if (ret != -EOLDSNAPC && written > 0) {
|
|
||||||
ret = written;
|
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@ -647,55 +799,84 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
|
|||||||
{
|
{
|
||||||
struct file *filp = iocb->ki_filp;
|
struct file *filp = iocb->ki_filp;
|
||||||
struct ceph_file_info *fi = filp->private_data;
|
struct ceph_file_info *fi = filp->private_data;
|
||||||
loff_t *ppos = &iocb->ki_pos;
|
size_t len = iocb->ki_nbytes;
|
||||||
size_t len = iov->iov_len;
|
|
||||||
struct inode *inode = file_inode(filp);
|
struct inode *inode = file_inode(filp);
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
void __user *base = iov->iov_base;
|
|
||||||
ssize_t ret;
|
ssize_t ret;
|
||||||
int want, got = 0;
|
int want, got = 0;
|
||||||
int checkeof = 0, read = 0;
|
int checkeof = 0, read = 0;
|
||||||
|
|
||||||
dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
|
|
||||||
inode, ceph_vinop(inode), pos, (unsigned)len, inode);
|
|
||||||
again:
|
again:
|
||||||
|
dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
|
||||||
|
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
|
||||||
|
|
||||||
if (fi->fmode & CEPH_FILE_MODE_LAZY)
|
if (fi->fmode & CEPH_FILE_MODE_LAZY)
|
||||||
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
|
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
|
||||||
else
|
else
|
||||||
want = CEPH_CAP_FILE_CACHE;
|
want = CEPH_CAP_FILE_CACHE;
|
||||||
ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
|
ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
goto out;
|
return ret;
|
||||||
dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
|
|
||||||
inode, ceph_vinop(inode), pos, (unsigned)len,
|
|
||||||
ceph_cap_string(got));
|
|
||||||
|
|
||||||
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
|
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
|
||||||
(iocb->ki_filp->f_flags & O_DIRECT) ||
|
(iocb->ki_filp->f_flags & O_DIRECT) ||
|
||||||
(fi->flags & CEPH_F_SYNC))
|
(fi->flags & CEPH_F_SYNC)) {
|
||||||
/* hmm, this isn't really async... */
|
struct iov_iter i;
|
||||||
ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
|
|
||||||
else
|
|
||||||
ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
|
|
||||||
|
|
||||||
|
dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
|
||||||
|
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
|
||||||
|
ceph_cap_string(got));
|
||||||
|
|
||||||
|
if (!read) {
|
||||||
|
ret = generic_segment_checks(iov, &nr_segs,
|
||||||
|
&len, VERIFY_WRITE);
|
||||||
|
if (ret)
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
iov_iter_init(&i, iov, nr_segs, len, read);
|
||||||
|
|
||||||
|
/* hmm, this isn't really async... */
|
||||||
|
ret = ceph_sync_read(iocb, &i, &checkeof);
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* We can't modify the content of iov,
|
||||||
|
* so we only read from beginning.
|
||||||
|
*/
|
||||||
|
if (read) {
|
||||||
|
iocb->ki_pos = pos;
|
||||||
|
len = iocb->ki_nbytes;
|
||||||
|
read = 0;
|
||||||
|
}
|
||||||
|
dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
|
||||||
|
inode, ceph_vinop(inode), pos, (unsigned)len,
|
||||||
|
ceph_cap_string(got));
|
||||||
|
|
||||||
|
ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
|
||||||
|
}
|
||||||
out:
|
out:
|
||||||
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
|
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
|
||||||
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
|
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
|
||||||
ceph_put_cap_refs(ci, got);
|
ceph_put_cap_refs(ci, got);
|
||||||
|
|
||||||
if (checkeof && ret >= 0) {
|
if (checkeof && ret >= 0) {
|
||||||
int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
|
int statret = ceph_do_getattr(inode,
|
||||||
|
CEPH_STAT_CAP_SIZE);
|
||||||
|
|
||||||
/* hit EOF or hole? */
|
/* hit EOF or hole? */
|
||||||
if (statret == 0 && *ppos < inode->i_size) {
|
if (statret == 0 && iocb->ki_pos < inode->i_size &&
|
||||||
dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size);
|
ret < len) {
|
||||||
|
dout("sync_read hit hole, ppos %lld < size %lld"
|
||||||
|
", reading more\n", iocb->ki_pos,
|
||||||
|
inode->i_size);
|
||||||
|
|
||||||
read += ret;
|
read += ret;
|
||||||
base += ret;
|
|
||||||
len -= ret;
|
len -= ret;
|
||||||
checkeof = 0;
|
checkeof = 0;
|
||||||
goto again;
|
goto again;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ret >= 0)
|
if (ret >= 0)
|
||||||
ret += read;
|
ret += read;
|
||||||
|
|
||||||
@ -772,11 +953,13 @@ retry_snap:
|
|||||||
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
|
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
|
||||||
|
|
||||||
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
|
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
|
||||||
(iocb->ki_filp->f_flags & O_DIRECT) ||
|
(file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
|
||||||
(fi->flags & CEPH_F_SYNC)) {
|
|
||||||
mutex_unlock(&inode->i_mutex);
|
mutex_unlock(&inode->i_mutex);
|
||||||
written = ceph_sync_write(file, iov->iov_base, count,
|
if (file->f_flags & O_DIRECT)
|
||||||
pos, &iocb->ki_pos);
|
written = ceph_sync_direct_write(iocb, iov,
|
||||||
|
nr_segs, count);
|
||||||
|
else
|
||||||
|
written = ceph_sync_write(iocb, iov, nr_segs, count);
|
||||||
if (written == -EOLDSNAPC) {
|
if (written == -EOLDSNAPC) {
|
||||||
dout("aio_write %p %llx.%llx %llu~%u"
|
dout("aio_write %p %llx.%llx %llu~%u"
|
||||||
"got EOLDSNAPC, retrying\n",
|
"got EOLDSNAPC, retrying\n",
|
||||||
@ -1018,7 +1201,7 @@ static long ceph_fallocate(struct file *file, int mode,
|
|||||||
loff_t offset, loff_t length)
|
loff_t offset, loff_t length)
|
||||||
{
|
{
|
||||||
struct ceph_file_info *fi = file->private_data;
|
struct ceph_file_info *fi = file->private_data;
|
||||||
struct inode *inode = file->f_dentry->d_inode;
|
struct inode *inode = file_inode(file);
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
struct ceph_osd_client *osdc =
|
struct ceph_osd_client *osdc =
|
||||||
&ceph_inode_to_client(inode)->client->osdc;
|
&ceph_inode_to_client(inode)->client->osdc;
|
||||||
|
@ -95,6 +95,7 @@ const struct inode_operations ceph_file_iops = {
|
|||||||
.getxattr = ceph_getxattr,
|
.getxattr = ceph_getxattr,
|
||||||
.listxattr = ceph_listxattr,
|
.listxattr = ceph_listxattr,
|
||||||
.removexattr = ceph_removexattr,
|
.removexattr = ceph_removexattr,
|
||||||
|
.get_acl = ceph_get_acl,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -335,12 +336,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
|
|||||||
ci->i_hold_caps_min = 0;
|
ci->i_hold_caps_min = 0;
|
||||||
ci->i_hold_caps_max = 0;
|
ci->i_hold_caps_max = 0;
|
||||||
INIT_LIST_HEAD(&ci->i_cap_delay_list);
|
INIT_LIST_HEAD(&ci->i_cap_delay_list);
|
||||||
ci->i_cap_exporting_mds = 0;
|
|
||||||
ci->i_cap_exporting_mseq = 0;
|
|
||||||
ci->i_cap_exporting_issued = 0;
|
|
||||||
INIT_LIST_HEAD(&ci->i_cap_snaps);
|
INIT_LIST_HEAD(&ci->i_cap_snaps);
|
||||||
ci->i_head_snapc = NULL;
|
ci->i_head_snapc = NULL;
|
||||||
ci->i_snap_caps = 0;
|
ci->i_snap_caps = 0;
|
||||||
|
ci->i_cap_exporting_issued = 0;
|
||||||
|
|
||||||
for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
|
for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
|
||||||
ci->i_nr_by_mode[i] = 0;
|
ci->i_nr_by_mode[i] = 0;
|
||||||
@ -436,6 +435,16 @@ void ceph_destroy_inode(struct inode *inode)
|
|||||||
call_rcu(&inode->i_rcu, ceph_i_callback);
|
call_rcu(&inode->i_rcu, ceph_i_callback);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ceph_drop_inode(struct inode *inode)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Positve dentry and corresponding inode are always accompanied
|
||||||
|
* in MDS reply. So no need to keep inode in the cache after
|
||||||
|
* dropping all its aliases.
|
||||||
|
*/
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Helpers to fill in size, ctime, mtime, and atime. We have to be
|
* Helpers to fill in size, ctime, mtime, and atime. We have to be
|
||||||
* careful because either the client or MDS may have more up to date
|
* careful because either the client or MDS may have more up to date
|
||||||
@ -670,6 +679,7 @@ static int fill_inode(struct inode *inode,
|
|||||||
memcpy(ci->i_xattrs.blob->vec.iov_base,
|
memcpy(ci->i_xattrs.blob->vec.iov_base,
|
||||||
iinfo->xattr_data, iinfo->xattr_len);
|
iinfo->xattr_data, iinfo->xattr_len);
|
||||||
ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
|
ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
|
||||||
|
ceph_forget_all_cached_acls(inode);
|
||||||
xattr_blob = NULL;
|
xattr_blob = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1454,7 +1464,8 @@ static void ceph_invalidate_work(struct work_struct *work)
|
|||||||
dout("invalidate_pages %p gen %d revoking %d\n", inode,
|
dout("invalidate_pages %p gen %d revoking %d\n", inode,
|
||||||
ci->i_rdcache_gen, ci->i_rdcache_revoking);
|
ci->i_rdcache_gen, ci->i_rdcache_revoking);
|
||||||
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
|
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
|
||||||
/* nevermind! */
|
if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
|
||||||
|
check = 1;
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
mutex_unlock(&ci->i_truncate_mutex);
|
mutex_unlock(&ci->i_truncate_mutex);
|
||||||
goto out;
|
goto out;
|
||||||
@ -1475,13 +1486,14 @@ static void ceph_invalidate_work(struct work_struct *work)
|
|||||||
dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
|
dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
|
||||||
inode, orig_gen, ci->i_rdcache_gen,
|
inode, orig_gen, ci->i_rdcache_gen,
|
||||||
ci->i_rdcache_revoking);
|
ci->i_rdcache_revoking);
|
||||||
|
if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
|
||||||
|
check = 1;
|
||||||
}
|
}
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
mutex_unlock(&ci->i_truncate_mutex);
|
mutex_unlock(&ci->i_truncate_mutex);
|
||||||
|
out:
|
||||||
if (check)
|
if (check)
|
||||||
ceph_check_caps(ci, 0, NULL);
|
ceph_check_caps(ci, 0, NULL);
|
||||||
out:
|
|
||||||
iput(inode);
|
iput(inode);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1602,6 +1614,7 @@ static const struct inode_operations ceph_symlink_iops = {
|
|||||||
.getxattr = ceph_getxattr,
|
.getxattr = ceph_getxattr,
|
||||||
.listxattr = ceph_listxattr,
|
.listxattr = ceph_listxattr,
|
||||||
.removexattr = ceph_removexattr,
|
.removexattr = ceph_removexattr,
|
||||||
|
.get_acl = ceph_get_acl,
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1675,6 +1688,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
|
|||||||
dirtied |= CEPH_CAP_AUTH_EXCL;
|
dirtied |= CEPH_CAP_AUTH_EXCL;
|
||||||
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
|
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
|
||||||
attr->ia_mode != inode->i_mode) {
|
attr->ia_mode != inode->i_mode) {
|
||||||
|
inode->i_mode = attr->ia_mode;
|
||||||
req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
|
req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
|
||||||
mask |= CEPH_SETATTR_MODE;
|
mask |= CEPH_SETATTR_MODE;
|
||||||
release |= CEPH_CAP_AUTH_SHARED;
|
release |= CEPH_CAP_AUTH_SHARED;
|
||||||
@ -1790,6 +1804,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
|
|||||||
if (inode_dirty_flags)
|
if (inode_dirty_flags)
|
||||||
__mark_inode_dirty(inode, inode_dirty_flags);
|
__mark_inode_dirty(inode, inode_dirty_flags);
|
||||||
|
|
||||||
|
if (ia_valid & ATTR_MODE) {
|
||||||
|
err = ceph_acl_chmod(dentry, inode);
|
||||||
|
if (err)
|
||||||
|
goto out_put;
|
||||||
|
}
|
||||||
|
|
||||||
if (mask) {
|
if (mask) {
|
||||||
req->r_inode = inode;
|
req->r_inode = inode;
|
||||||
ihold(inode);
|
ihold(inode);
|
||||||
@ -1809,6 +1829,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
|
|||||||
return err;
|
return err;
|
||||||
out:
|
out:
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
out_put:
|
||||||
ceph_mdsc_put_request(req);
|
ceph_mdsc_put_request(req);
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
@ -183,6 +183,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
|
|||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
struct ceph_osd_client *osdc =
|
struct ceph_osd_client *osdc =
|
||||||
&ceph_sb_to_client(inode->i_sb)->client->osdc;
|
&ceph_sb_to_client(inode->i_sb)->client->osdc;
|
||||||
|
struct ceph_object_locator oloc;
|
||||||
|
struct ceph_object_id oid;
|
||||||
u64 len = 1, olen;
|
u64 len = 1, olen;
|
||||||
u64 tmp;
|
u64 tmp;
|
||||||
struct ceph_pg pgid;
|
struct ceph_pg pgid;
|
||||||
@ -211,8 +213,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
|
|||||||
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
|
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
|
||||||
ceph_ino(inode), dl.object_no);
|
ceph_ino(inode), dl.object_no);
|
||||||
|
|
||||||
r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
|
oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
|
||||||
ceph_file_layout_pg_pool(ci->i_layout));
|
ceph_oid_set_name(&oid, dl.object_name);
|
||||||
|
|
||||||
|
r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
|
||||||
if (r < 0) {
|
if (r < 0) {
|
||||||
up_read(&osdc->map_sem);
|
up_read(&osdc->map_sem);
|
||||||
return r;
|
return r;
|
||||||
|
@ -63,7 +63,7 @@ static const struct ceph_connection_operations mds_con_ops;
|
|||||||
*/
|
*/
|
||||||
static int parse_reply_info_in(void **p, void *end,
|
static int parse_reply_info_in(void **p, void *end,
|
||||||
struct ceph_mds_reply_info_in *info,
|
struct ceph_mds_reply_info_in *info,
|
||||||
int features)
|
u64 features)
|
||||||
{
|
{
|
||||||
int err = -EIO;
|
int err = -EIO;
|
||||||
|
|
||||||
@ -98,7 +98,7 @@ bad:
|
|||||||
*/
|
*/
|
||||||
static int parse_reply_info_trace(void **p, void *end,
|
static int parse_reply_info_trace(void **p, void *end,
|
||||||
struct ceph_mds_reply_info_parsed *info,
|
struct ceph_mds_reply_info_parsed *info,
|
||||||
int features)
|
u64 features)
|
||||||
{
|
{
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
@ -145,7 +145,7 @@ out_bad:
|
|||||||
*/
|
*/
|
||||||
static int parse_reply_info_dir(void **p, void *end,
|
static int parse_reply_info_dir(void **p, void *end,
|
||||||
struct ceph_mds_reply_info_parsed *info,
|
struct ceph_mds_reply_info_parsed *info,
|
||||||
int features)
|
u64 features)
|
||||||
{
|
{
|
||||||
u32 num, i = 0;
|
u32 num, i = 0;
|
||||||
int err;
|
int err;
|
||||||
@ -217,7 +217,7 @@ out_bad:
|
|||||||
*/
|
*/
|
||||||
static int parse_reply_info_filelock(void **p, void *end,
|
static int parse_reply_info_filelock(void **p, void *end,
|
||||||
struct ceph_mds_reply_info_parsed *info,
|
struct ceph_mds_reply_info_parsed *info,
|
||||||
int features)
|
u64 features)
|
||||||
{
|
{
|
||||||
if (*p + sizeof(*info->filelock_reply) > end)
|
if (*p + sizeof(*info->filelock_reply) > end)
|
||||||
goto bad;
|
goto bad;
|
||||||
@ -238,7 +238,7 @@ bad:
|
|||||||
*/
|
*/
|
||||||
static int parse_reply_info_create(void **p, void *end,
|
static int parse_reply_info_create(void **p, void *end,
|
||||||
struct ceph_mds_reply_info_parsed *info,
|
struct ceph_mds_reply_info_parsed *info,
|
||||||
int features)
|
u64 features)
|
||||||
{
|
{
|
||||||
if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
|
if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
|
||||||
if (*p == end) {
|
if (*p == end) {
|
||||||
@ -262,7 +262,7 @@ bad:
|
|||||||
*/
|
*/
|
||||||
static int parse_reply_info_extra(void **p, void *end,
|
static int parse_reply_info_extra(void **p, void *end,
|
||||||
struct ceph_mds_reply_info_parsed *info,
|
struct ceph_mds_reply_info_parsed *info,
|
||||||
int features)
|
u64 features)
|
||||||
{
|
{
|
||||||
if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
|
if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
|
||||||
return parse_reply_info_filelock(p, end, info, features);
|
return parse_reply_info_filelock(p, end, info, features);
|
||||||
@ -280,7 +280,7 @@ static int parse_reply_info_extra(void **p, void *end,
|
|||||||
*/
|
*/
|
||||||
static int parse_reply_info(struct ceph_msg *msg,
|
static int parse_reply_info(struct ceph_msg *msg,
|
||||||
struct ceph_mds_reply_info_parsed *info,
|
struct ceph_mds_reply_info_parsed *info,
|
||||||
int features)
|
u64 features)
|
||||||
{
|
{
|
||||||
void *p, *end;
|
void *p, *end;
|
||||||
u32 len;
|
u32 len;
|
||||||
@ -713,14 +713,15 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
|
|||||||
struct dentry *dn = get_nonsnap_parent(parent);
|
struct dentry *dn = get_nonsnap_parent(parent);
|
||||||
inode = dn->d_inode;
|
inode = dn->d_inode;
|
||||||
dout("__choose_mds using nonsnap parent %p\n", inode);
|
dout("__choose_mds using nonsnap parent %p\n", inode);
|
||||||
} else if (req->r_dentry->d_inode) {
|
} else {
|
||||||
/* dentry target */
|
/* dentry target */
|
||||||
inode = req->r_dentry->d_inode;
|
inode = req->r_dentry->d_inode;
|
||||||
} else {
|
if (!inode || mode == USE_AUTH_MDS) {
|
||||||
/* dir + name */
|
/* dir + name */
|
||||||
inode = dir;
|
inode = dir;
|
||||||
hash = ceph_dentry_hash(dir, req->r_dentry);
|
hash = ceph_dentry_hash(dir, req->r_dentry);
|
||||||
is_hash = true;
|
is_hash = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -846,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc,
|
|||||||
*
|
*
|
||||||
* called under mdsc->mutex
|
* called under mdsc->mutex
|
||||||
*/
|
*/
|
||||||
|
static struct ceph_mds_session *
|
||||||
|
__open_export_target_session(struct ceph_mds_client *mdsc, int target)
|
||||||
|
{
|
||||||
|
struct ceph_mds_session *session;
|
||||||
|
|
||||||
|
session = __ceph_lookup_mds_session(mdsc, target);
|
||||||
|
if (!session) {
|
||||||
|
session = register_session(mdsc, target);
|
||||||
|
if (IS_ERR(session))
|
||||||
|
return session;
|
||||||
|
}
|
||||||
|
if (session->s_state == CEPH_MDS_SESSION_NEW ||
|
||||||
|
session->s_state == CEPH_MDS_SESSION_CLOSING)
|
||||||
|
__open_session(mdsc, session);
|
||||||
|
|
||||||
|
return session;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ceph_mds_session *
|
||||||
|
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
|
||||||
|
{
|
||||||
|
struct ceph_mds_session *session;
|
||||||
|
|
||||||
|
dout("open_export_target_session to mds%d\n", target);
|
||||||
|
|
||||||
|
mutex_lock(&mdsc->mutex);
|
||||||
|
session = __open_export_target_session(mdsc, target);
|
||||||
|
mutex_unlock(&mdsc->mutex);
|
||||||
|
|
||||||
|
return session;
|
||||||
|
}
|
||||||
|
|
||||||
static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
|
static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
|
||||||
struct ceph_mds_session *session)
|
struct ceph_mds_session *session)
|
||||||
{
|
{
|
||||||
struct ceph_mds_info *mi;
|
struct ceph_mds_info *mi;
|
||||||
struct ceph_mds_session *ts;
|
struct ceph_mds_session *ts;
|
||||||
int i, mds = session->s_mds;
|
int i, mds = session->s_mds;
|
||||||
int target;
|
|
||||||
|
|
||||||
if (mds >= mdsc->mdsmap->m_max_mds)
|
if (mds >= mdsc->mdsmap->m_max_mds)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
mi = &mdsc->mdsmap->m_info[mds];
|
mi = &mdsc->mdsmap->m_info[mds];
|
||||||
dout("open_export_target_sessions for mds%d (%d targets)\n",
|
dout("open_export_target_sessions for mds%d (%d targets)\n",
|
||||||
session->s_mds, mi->num_export_targets);
|
session->s_mds, mi->num_export_targets);
|
||||||
|
|
||||||
for (i = 0; i < mi->num_export_targets; i++) {
|
for (i = 0; i < mi->num_export_targets; i++) {
|
||||||
target = mi->export_targets[i];
|
ts = __open_export_target_session(mdsc, mi->export_targets[i]);
|
||||||
ts = __ceph_lookup_mds_session(mdsc, target);
|
if (!IS_ERR(ts))
|
||||||
if (!ts) {
|
ceph_put_mds_session(ts);
|
||||||
ts = register_session(mdsc, target);
|
|
||||||
if (IS_ERR(ts))
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (session->s_state == CEPH_MDS_SESSION_NEW ||
|
|
||||||
session->s_state == CEPH_MDS_SESSION_CLOSING)
|
|
||||||
__open_session(mdsc, session);
|
|
||||||
else
|
|
||||||
dout(" mds%d target mds%d %p is %s\n", session->s_mds,
|
|
||||||
i, ts, session_state_name(ts->s_state));
|
|
||||||
ceph_put_mds_session(ts);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1136,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
|
||||||
|
struct ceph_mds_session *session, u64 seq)
|
||||||
|
{
|
||||||
|
struct ceph_msg *msg;
|
||||||
|
|
||||||
|
dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
|
||||||
|
session->s_mds, session_state_name(session->s_state), seq);
|
||||||
|
msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
|
||||||
|
if (!msg)
|
||||||
|
return -ENOMEM;
|
||||||
|
ceph_con_send(&session->s_con, msg);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Note new cap ttl, and any transition from stale -> not stale (fresh?).
|
* Note new cap ttl, and any transition from stale -> not stale (fresh?).
|
||||||
*
|
*
|
||||||
@ -1214,7 +1251,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
|
|||||||
{
|
{
|
||||||
struct ceph_mds_session *session = arg;
|
struct ceph_mds_session *session = arg;
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
int used, oissued, mine;
|
int used, wanted, oissued, mine;
|
||||||
|
|
||||||
if (session->s_trim_caps <= 0)
|
if (session->s_trim_caps <= 0)
|
||||||
return -1;
|
return -1;
|
||||||
@ -1222,14 +1259,19 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
|
|||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
mine = cap->issued | cap->implemented;
|
mine = cap->issued | cap->implemented;
|
||||||
used = __ceph_caps_used(ci);
|
used = __ceph_caps_used(ci);
|
||||||
|
wanted = __ceph_caps_file_wanted(ci);
|
||||||
oissued = __ceph_caps_issued_other(ci, cap);
|
oissued = __ceph_caps_issued_other(ci, cap);
|
||||||
|
|
||||||
dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
|
dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
|
||||||
inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
|
inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
|
||||||
ceph_cap_string(used));
|
ceph_cap_string(used), ceph_cap_string(wanted));
|
||||||
if (ci->i_dirty_caps)
|
if (cap == ci->i_auth_cap) {
|
||||||
goto out; /* dirty caps */
|
if (ci->i_dirty_caps | ci->i_flushing_caps)
|
||||||
if ((used & ~oissued) & mine)
|
goto out;
|
||||||
|
if ((used | wanted) & CEPH_CAP_ANY_WR)
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
if ((used | wanted) & ~oissued & mine)
|
||||||
goto out; /* we need these caps */
|
goto out; /* we need these caps */
|
||||||
|
|
||||||
session->s_trim_caps--;
|
session->s_trim_caps--;
|
||||||
@ -2156,26 +2198,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
|
|||||||
*/
|
*/
|
||||||
if (result == -ESTALE) {
|
if (result == -ESTALE) {
|
||||||
dout("got ESTALE on request %llu", req->r_tid);
|
dout("got ESTALE on request %llu", req->r_tid);
|
||||||
if (!req->r_inode) {
|
if (req->r_direct_mode != USE_AUTH_MDS) {
|
||||||
/* do nothing; not an authority problem */
|
|
||||||
} else if (req->r_direct_mode != USE_AUTH_MDS) {
|
|
||||||
dout("not using auth, setting for that now");
|
dout("not using auth, setting for that now");
|
||||||
req->r_direct_mode = USE_AUTH_MDS;
|
req->r_direct_mode = USE_AUTH_MDS;
|
||||||
__do_request(mdsc, req);
|
__do_request(mdsc, req);
|
||||||
mutex_unlock(&mdsc->mutex);
|
mutex_unlock(&mdsc->mutex);
|
||||||
goto out;
|
goto out;
|
||||||
} else {
|
} else {
|
||||||
struct ceph_inode_info *ci = ceph_inode(req->r_inode);
|
int mds = __choose_mds(mdsc, req);
|
||||||
struct ceph_cap *cap = NULL;
|
if (mds >= 0 && mds != req->r_session->s_mds) {
|
||||||
|
dout("but auth changed, so resending");
|
||||||
if (req->r_session)
|
|
||||||
cap = ceph_get_cap_for_mds(ci,
|
|
||||||
req->r_session->s_mds);
|
|
||||||
|
|
||||||
dout("already using auth");
|
|
||||||
if ((!cap || cap != ci->i_auth_cap) ||
|
|
||||||
(cap->mseq != req->r_sent_on_mseq)) {
|
|
||||||
dout("but cap changed, so resending");
|
|
||||||
__do_request(mdsc, req);
|
__do_request(mdsc, req);
|
||||||
mutex_unlock(&mdsc->mutex);
|
mutex_unlock(&mdsc->mutex);
|
||||||
goto out;
|
goto out;
|
||||||
@ -2400,6 +2432,10 @@ static void handle_session(struct ceph_mds_session *session,
|
|||||||
trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
|
trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case CEPH_SESSION_FLUSHMSG:
|
||||||
|
send_flushmsg_ack(mdsc, session, seq);
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
|
pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
|
||||||
WARN_ON(1);
|
WARN_ON(1);
|
||||||
|
@ -383,6 +383,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
|
|||||||
extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
|
extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
|
||||||
struct ceph_msg *msg);
|
struct ceph_msg *msg);
|
||||||
|
|
||||||
|
extern struct ceph_mds_session *
|
||||||
|
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
|
||||||
extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
|
extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
|
||||||
struct ceph_mds_session *session);
|
struct ceph_mds_session *session);
|
||||||
|
|
||||||
|
@ -41,6 +41,8 @@ const char *ceph_session_op_name(int op)
|
|||||||
case CEPH_SESSION_RENEWCAPS: return "renewcaps";
|
case CEPH_SESSION_RENEWCAPS: return "renewcaps";
|
||||||
case CEPH_SESSION_STALE: return "stale";
|
case CEPH_SESSION_STALE: return "stale";
|
||||||
case CEPH_SESSION_RECALL_STATE: return "recall_state";
|
case CEPH_SESSION_RECALL_STATE: return "recall_state";
|
||||||
|
case CEPH_SESSION_FLUSHMSG: return "flushmsg";
|
||||||
|
case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
|
||||||
}
|
}
|
||||||
return "???";
|
return "???";
|
||||||
}
|
}
|
||||||
|
@ -490,10 +490,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
|
|||||||
struct ceph_options *opt)
|
struct ceph_options *opt)
|
||||||
{
|
{
|
||||||
struct ceph_fs_client *fsc;
|
struct ceph_fs_client *fsc;
|
||||||
const unsigned supported_features =
|
const u64 supported_features =
|
||||||
CEPH_FEATURE_FLOCK |
|
CEPH_FEATURE_FLOCK |
|
||||||
CEPH_FEATURE_DIRLAYOUTHASH;
|
CEPH_FEATURE_DIRLAYOUTHASH;
|
||||||
const unsigned required_features = 0;
|
const u64 required_features = 0;
|
||||||
int page_count;
|
int page_count;
|
||||||
size_t size;
|
size_t size;
|
||||||
int err = -ENOMEM;
|
int err = -ENOMEM;
|
||||||
@ -686,6 +686,7 @@ static const struct super_operations ceph_super_ops = {
|
|||||||
.alloc_inode = ceph_alloc_inode,
|
.alloc_inode = ceph_alloc_inode,
|
||||||
.destroy_inode = ceph_destroy_inode,
|
.destroy_inode = ceph_destroy_inode,
|
||||||
.write_inode = ceph_write_inode,
|
.write_inode = ceph_write_inode,
|
||||||
|
.drop_inode = ceph_drop_inode,
|
||||||
.sync_fs = ceph_sync_fs,
|
.sync_fs = ceph_sync_fs,
|
||||||
.put_super = ceph_put_super,
|
.put_super = ceph_put_super,
|
||||||
.show_options = ceph_show_options,
|
.show_options = ceph_show_options,
|
||||||
@ -818,7 +819,11 @@ static int ceph_set_super(struct super_block *s, void *data)
|
|||||||
|
|
||||||
s->s_flags = fsc->mount_options->sb_flags;
|
s->s_flags = fsc->mount_options->sb_flags;
|
||||||
s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
|
s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
|
||||||
|
#ifdef CONFIG_CEPH_FS_POSIX_ACL
|
||||||
|
s->s_flags |= MS_POSIXACL;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
s->s_xattr = ceph_xattr_handlers;
|
||||||
s->s_fs_info = fsc;
|
s->s_fs_info = fsc;
|
||||||
fsc->sb = s;
|
fsc->sb = s;
|
||||||
|
|
||||||
|
@ -287,14 +287,12 @@ struct ceph_inode_info {
|
|||||||
unsigned long i_hold_caps_min; /* jiffies */
|
unsigned long i_hold_caps_min; /* jiffies */
|
||||||
unsigned long i_hold_caps_max; /* jiffies */
|
unsigned long i_hold_caps_max; /* jiffies */
|
||||||
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
|
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
|
||||||
int i_cap_exporting_mds; /* to handle cap migration between */
|
|
||||||
unsigned i_cap_exporting_mseq; /* mds's. */
|
|
||||||
unsigned i_cap_exporting_issued;
|
|
||||||
struct ceph_cap_reservation i_cap_migration_resv;
|
struct ceph_cap_reservation i_cap_migration_resv;
|
||||||
struct list_head i_cap_snaps; /* snapped state pending flush to mds */
|
struct list_head i_cap_snaps; /* snapped state pending flush to mds */
|
||||||
struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
|
struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
|
||||||
dirty|flushing caps */
|
dirty|flushing caps */
|
||||||
unsigned i_snap_caps; /* cap bits for snapped files */
|
unsigned i_snap_caps; /* cap bits for snapped files */
|
||||||
|
unsigned i_cap_exporting_issued;
|
||||||
|
|
||||||
int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
|
int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
|
||||||
|
|
||||||
@ -335,7 +333,6 @@ struct ceph_inode_info {
|
|||||||
u32 i_fscache_gen; /* sequence, for delayed fscache validate */
|
u32 i_fscache_gen; /* sequence, for delayed fscache validate */
|
||||||
struct work_struct i_revalidate_work;
|
struct work_struct i_revalidate_work;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
struct inode vfs_inode; /* at end */
|
struct inode vfs_inode; /* at end */
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -529,6 +526,8 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
|
|||||||
}
|
}
|
||||||
extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
|
extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
|
||||||
|
|
||||||
|
extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
|
||||||
|
struct ceph_cap *ocap, int mask);
|
||||||
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
|
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
|
||||||
extern int __ceph_caps_used(struct ceph_inode_info *ci);
|
extern int __ceph_caps_used(struct ceph_inode_info *ci);
|
||||||
|
|
||||||
@ -691,6 +690,7 @@ extern const struct inode_operations ceph_file_iops;
|
|||||||
|
|
||||||
extern struct inode *ceph_alloc_inode(struct super_block *sb);
|
extern struct inode *ceph_alloc_inode(struct super_block *sb);
|
||||||
extern void ceph_destroy_inode(struct inode *inode);
|
extern void ceph_destroy_inode(struct inode *inode);
|
||||||
|
extern int ceph_drop_inode(struct inode *inode);
|
||||||
|
|
||||||
extern struct inode *ceph_get_inode(struct super_block *sb,
|
extern struct inode *ceph_get_inode(struct super_block *sb,
|
||||||
struct ceph_vino vino);
|
struct ceph_vino vino);
|
||||||
@ -724,6 +724,9 @@ extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
|||||||
/* xattr.c */
|
/* xattr.c */
|
||||||
extern int ceph_setxattr(struct dentry *, const char *, const void *,
|
extern int ceph_setxattr(struct dentry *, const char *, const void *,
|
||||||
size_t, int);
|
size_t, int);
|
||||||
|
int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int);
|
||||||
|
ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
|
||||||
|
int __ceph_removexattr(struct dentry *, const char *);
|
||||||
extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
|
extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
|
||||||
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
|
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
|
||||||
extern int ceph_removexattr(struct dentry *, const char *);
|
extern int ceph_removexattr(struct dentry *, const char *);
|
||||||
@ -732,6 +735,39 @@ extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
|
|||||||
extern void __init ceph_xattr_init(void);
|
extern void __init ceph_xattr_init(void);
|
||||||
extern void ceph_xattr_exit(void);
|
extern void ceph_xattr_exit(void);
|
||||||
|
|
||||||
|
/* acl.c */
|
||||||
|
extern const struct xattr_handler ceph_xattr_acl_access_handler;
|
||||||
|
extern const struct xattr_handler ceph_xattr_acl_default_handler;
|
||||||
|
extern const struct xattr_handler *ceph_xattr_handlers[];
|
||||||
|
|
||||||
|
#ifdef CONFIG_CEPH_FS_POSIX_ACL
|
||||||
|
|
||||||
|
struct posix_acl *ceph_get_acl(struct inode *, int);
|
||||||
|
int ceph_init_acl(struct dentry *, struct inode *, struct inode *);
|
||||||
|
int ceph_acl_chmod(struct dentry *, struct inode *);
|
||||||
|
void ceph_forget_all_cached_acls(struct inode *inode);
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define ceph_get_acl NULL
|
||||||
|
|
||||||
|
static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode,
|
||||||
|
struct inode *dir)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void ceph_forget_all_cached_acls(struct inode *inode)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/* caps.c */
|
/* caps.c */
|
||||||
extern const char *ceph_cap_string(int c);
|
extern const char *ceph_cap_string(int c);
|
||||||
extern void ceph_handle_caps(struct ceph_mds_session *session,
|
extern void ceph_handle_caps(struct ceph_mds_session *session,
|
||||||
@ -744,6 +780,7 @@ extern int ceph_add_cap(struct inode *inode,
|
|||||||
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
|
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
|
||||||
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
|
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
|
||||||
struct ceph_cap *cap);
|
struct ceph_cap *cap);
|
||||||
|
extern int ceph_is_any_caps(struct inode *inode);
|
||||||
|
|
||||||
extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
|
extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
|
||||||
u64 cap_id, u32 migrate_seq, u32 issue_seq);
|
u64 cap_id, u32 migrate_seq, u32 issue_seq);
|
||||||
|
@ -11,11 +11,24 @@
|
|||||||
#define XATTR_CEPH_PREFIX "ceph."
|
#define XATTR_CEPH_PREFIX "ceph."
|
||||||
#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
|
#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* List of handlers for synthetic system.* attributes. Other
|
||||||
|
* attributes are handled directly.
|
||||||
|
*/
|
||||||
|
const struct xattr_handler *ceph_xattr_handlers[] = {
|
||||||
|
#ifdef CONFIG_CEPH_FS_POSIX_ACL
|
||||||
|
&ceph_xattr_acl_access_handler,
|
||||||
|
&ceph_xattr_acl_default_handler,
|
||||||
|
#endif
|
||||||
|
NULL,
|
||||||
|
};
|
||||||
|
|
||||||
static bool ceph_is_valid_xattr(const char *name)
|
static bool ceph_is_valid_xattr(const char *name)
|
||||||
{
|
{
|
||||||
return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
|
return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
|
||||||
!strncmp(name, XATTR_SECURITY_PREFIX,
|
!strncmp(name, XATTR_SECURITY_PREFIX,
|
||||||
XATTR_SECURITY_PREFIX_LEN) ||
|
XATTR_SECURITY_PREFIX_LEN) ||
|
||||||
|
!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
|
||||||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
|
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
|
||||||
!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
|
!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
|
||||||
}
|
}
|
||||||
@ -663,10 +676,9 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
|
ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
|
||||||
size_t size)
|
size_t size)
|
||||||
{
|
{
|
||||||
struct inode *inode = dentry->d_inode;
|
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
int err;
|
int err;
|
||||||
struct ceph_inode_xattr *xattr;
|
struct ceph_inode_xattr *xattr;
|
||||||
@ -675,7 +687,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
|
|||||||
if (!ceph_is_valid_xattr(name))
|
if (!ceph_is_valid_xattr(name))
|
||||||
return -ENODATA;
|
return -ENODATA;
|
||||||
|
|
||||||
|
|
||||||
/* let's see if a virtual xattr was requested */
|
/* let's see if a virtual xattr was requested */
|
||||||
vxattr = ceph_match_vxattr(inode, name);
|
vxattr = ceph_match_vxattr(inode, name);
|
||||||
if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
|
if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
|
||||||
@ -725,6 +736,15 @@ out:
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
|
||||||
|
size_t size)
|
||||||
|
{
|
||||||
|
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
|
||||||
|
return generic_getxattr(dentry, name, value, size);
|
||||||
|
|
||||||
|
return __ceph_getxattr(dentry->d_inode, name, value, size);
|
||||||
|
}
|
||||||
|
|
||||||
ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
|
ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
|
||||||
{
|
{
|
||||||
struct inode *inode = dentry->d_inode;
|
struct inode *inode = dentry->d_inode;
|
||||||
@ -863,8 +883,8 @@ out:
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ceph_setxattr(struct dentry *dentry, const char *name,
|
int __ceph_setxattr(struct dentry *dentry, const char *name,
|
||||||
const void *value, size_t size, int flags)
|
const void *value, size_t size, int flags)
|
||||||
{
|
{
|
||||||
struct inode *inode = dentry->d_inode;
|
struct inode *inode = dentry->d_inode;
|
||||||
struct ceph_vxattr *vxattr;
|
struct ceph_vxattr *vxattr;
|
||||||
@ -879,9 +899,6 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
|
|||||||
struct ceph_inode_xattr *xattr = NULL;
|
struct ceph_inode_xattr *xattr = NULL;
|
||||||
int required_blob_size;
|
int required_blob_size;
|
||||||
|
|
||||||
if (ceph_snap(inode) != CEPH_NOSNAP)
|
|
||||||
return -EROFS;
|
|
||||||
|
|
||||||
if (!ceph_is_valid_xattr(name))
|
if (!ceph_is_valid_xattr(name))
|
||||||
return -EOPNOTSUPP;
|
return -EOPNOTSUPP;
|
||||||
|
|
||||||
@ -958,6 +975,18 @@ out:
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ceph_setxattr(struct dentry *dentry, const char *name,
|
||||||
|
const void *value, size_t size, int flags)
|
||||||
|
{
|
||||||
|
if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
|
||||||
|
return -EROFS;
|
||||||
|
|
||||||
|
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
|
||||||
|
return generic_setxattr(dentry, name, value, size, flags);
|
||||||
|
|
||||||
|
return __ceph_setxattr(dentry, name, value, size, flags);
|
||||||
|
}
|
||||||
|
|
||||||
static int ceph_send_removexattr(struct dentry *dentry, const char *name)
|
static int ceph_send_removexattr(struct dentry *dentry, const char *name)
|
||||||
{
|
{
|
||||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
|
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
|
||||||
@ -984,7 +1013,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ceph_removexattr(struct dentry *dentry, const char *name)
|
int __ceph_removexattr(struct dentry *dentry, const char *name)
|
||||||
{
|
{
|
||||||
struct inode *inode = dentry->d_inode;
|
struct inode *inode = dentry->d_inode;
|
||||||
struct ceph_vxattr *vxattr;
|
struct ceph_vxattr *vxattr;
|
||||||
@ -994,9 +1023,6 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
|
|||||||
int required_blob_size;
|
int required_blob_size;
|
||||||
int dirty;
|
int dirty;
|
||||||
|
|
||||||
if (ceph_snap(inode) != CEPH_NOSNAP)
|
|
||||||
return -EROFS;
|
|
||||||
|
|
||||||
if (!ceph_is_valid_xattr(name))
|
if (!ceph_is_valid_xattr(name))
|
||||||
return -EOPNOTSUPP;
|
return -EOPNOTSUPP;
|
||||||
|
|
||||||
@ -1053,3 +1079,13 @@ out:
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ceph_removexattr(struct dentry *dentry, const char *name)
|
||||||
|
{
|
||||||
|
if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
|
||||||
|
return -EROFS;
|
||||||
|
|
||||||
|
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
|
||||||
|
return generic_removexattr(dentry, name);
|
||||||
|
|
||||||
|
return __ceph_removexattr(dentry, name);
|
||||||
|
}
|
||||||
|
@ -17,7 +17,6 @@ struct ceph_buffer {
|
|||||||
struct kref kref;
|
struct kref kref;
|
||||||
struct kvec vec;
|
struct kvec vec;
|
||||||
size_t alloc_len;
|
size_t alloc_len;
|
||||||
bool is_vmalloc;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
|
extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
|
||||||
|
@ -4,42 +4,73 @@
|
|||||||
/*
|
/*
|
||||||
* feature bits
|
* feature bits
|
||||||
*/
|
*/
|
||||||
#define CEPH_FEATURE_UID (1<<0)
|
#define CEPH_FEATURE_UID (1ULL<<0)
|
||||||
#define CEPH_FEATURE_NOSRCADDR (1<<1)
|
#define CEPH_FEATURE_NOSRCADDR (1ULL<<1)
|
||||||
#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
|
#define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2)
|
||||||
#define CEPH_FEATURE_FLOCK (1<<3)
|
#define CEPH_FEATURE_FLOCK (1ULL<<3)
|
||||||
#define CEPH_FEATURE_SUBSCRIBE2 (1<<4)
|
#define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4)
|
||||||
#define CEPH_FEATURE_MONNAMES (1<<5)
|
#define CEPH_FEATURE_MONNAMES (1ULL<<5)
|
||||||
#define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
|
#define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6)
|
||||||
#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
|
#define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7)
|
||||||
#define CEPH_FEATURE_OBJECTLOCATOR (1<<8)
|
#define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8)
|
||||||
#define CEPH_FEATURE_PGID64 (1<<9)
|
#define CEPH_FEATURE_PGID64 (1ULL<<9)
|
||||||
#define CEPH_FEATURE_INCSUBOSDMAP (1<<10)
|
#define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10)
|
||||||
#define CEPH_FEATURE_PGPOOL3 (1<<11)
|
#define CEPH_FEATURE_PGPOOL3 (1ULL<<11)
|
||||||
#define CEPH_FEATURE_OSDREPLYMUX (1<<12)
|
#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12)
|
||||||
#define CEPH_FEATURE_OSDENC (1<<13)
|
#define CEPH_FEATURE_OSDENC (1ULL<<13)
|
||||||
#define CEPH_FEATURE_OMAP (1<<14)
|
#define CEPH_FEATURE_OMAP (1ULL<<14)
|
||||||
#define CEPH_FEATURE_MONENC (1<<15)
|
#define CEPH_FEATURE_MONENC (1ULL<<15)
|
||||||
#define CEPH_FEATURE_QUERY_T (1<<16)
|
#define CEPH_FEATURE_QUERY_T (1ULL<<16)
|
||||||
#define CEPH_FEATURE_INDEP_PG_MAP (1<<17)
|
#define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17)
|
||||||
#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
|
#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18)
|
||||||
#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19)
|
#define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19)
|
||||||
#define CEPH_FEATURE_MON_NULLROUTE (1<<20)
|
#define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20)
|
||||||
#define CEPH_FEATURE_MON_GV (1<<21)
|
#define CEPH_FEATURE_MON_GV (1ULL<<21)
|
||||||
#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22)
|
#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22)
|
||||||
#define CEPH_FEATURE_MSG_AUTH (1<<23)
|
#define CEPH_FEATURE_MSG_AUTH (1ULL<<23)
|
||||||
#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24)
|
#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24)
|
||||||
#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25)
|
#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25)
|
||||||
#define CEPH_FEATURE_CREATEPOOLID (1<<26)
|
#define CEPH_FEATURE_CREATEPOOLID (1ULL<<26)
|
||||||
#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
|
#define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27)
|
||||||
#define CEPH_FEATURE_OSD_HBMSGS (1<<28)
|
#define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28)
|
||||||
#define CEPH_FEATURE_MDSENC (1<<29)
|
#define CEPH_FEATURE_MDSENC (1ULL<<29)
|
||||||
#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30)
|
#define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30)
|
||||||
|
#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31)
|
||||||
|
#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32)
|
||||||
|
#define CEPH_FEATURE_MON_SCRUB (1ULL<<33)
|
||||||
|
#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34)
|
||||||
|
#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35)
|
||||||
|
#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */
|
||||||
|
#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37)
|
||||||
|
#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
|
||||||
|
* vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63
|
||||||
|
* to mean 33 bit ~0, and introduce a helper below to do the
|
||||||
|
* translation.
|
||||||
|
*
|
||||||
|
* This was introduced by ceph.git commit
|
||||||
|
* 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8
|
||||||
|
* and fixed by ceph.git commit
|
||||||
|
* 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c
|
||||||
|
*/
|
||||||
|
#define CEPH_FEATURE_RESERVED (1ULL<<63)
|
||||||
|
|
||||||
|
static inline u64 ceph_sanitize_features(u64 features)
|
||||||
|
{
|
||||||
|
if (features & CEPH_FEATURE_RESERVED) {
|
||||||
|
/* everything through OSD_SNAPMAPPER */
|
||||||
|
return 0x1ffffffffull;
|
||||||
|
} else {
|
||||||
|
return features;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Features supported.
|
* Features supported.
|
||||||
*/
|
*/
|
||||||
#define CEPH_FEATURES_SUPPORTED_DEFAULT \
|
#define CEPH_FEATURES_SUPPORTED_DEFAULT \
|
||||||
(CEPH_FEATURE_NOSRCADDR | \
|
(CEPH_FEATURE_NOSRCADDR | \
|
||||||
CEPH_FEATURE_RECONNECT_SEQ | \
|
CEPH_FEATURE_RECONNECT_SEQ | \
|
||||||
CEPH_FEATURE_PGID64 | \
|
CEPH_FEATURE_PGID64 | \
|
||||||
@ -48,7 +79,10 @@
|
|||||||
CEPH_FEATURE_CRUSH_TUNABLES | \
|
CEPH_FEATURE_CRUSH_TUNABLES | \
|
||||||
CEPH_FEATURE_CRUSH_TUNABLES2 | \
|
CEPH_FEATURE_CRUSH_TUNABLES2 | \
|
||||||
CEPH_FEATURE_REPLY_CREATE_INODE | \
|
CEPH_FEATURE_REPLY_CREATE_INODE | \
|
||||||
CEPH_FEATURE_OSDHASHPSPOOL)
|
CEPH_FEATURE_OSDHASHPSPOOL | \
|
||||||
|
CEPH_FEATURE_OSD_CACHEPOOL | \
|
||||||
|
CEPH_FEATURE_CRUSH_V2 | \
|
||||||
|
CEPH_FEATURE_EXPORT_PEER)
|
||||||
|
|
||||||
#define CEPH_FEATURES_REQUIRED_DEFAULT \
|
#define CEPH_FEATURES_REQUIRED_DEFAULT \
|
||||||
(CEPH_FEATURE_NOSRCADDR | \
|
(CEPH_FEATURE_NOSRCADDR | \
|
||||||
@ -56,4 +90,5 @@
|
|||||||
CEPH_FEATURE_PGID64 | \
|
CEPH_FEATURE_PGID64 | \
|
||||||
CEPH_FEATURE_PGPOOL3 | \
|
CEPH_FEATURE_PGPOOL3 | \
|
||||||
CEPH_FEATURE_OSDENC)
|
CEPH_FEATURE_OSDENC)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -53,6 +53,29 @@ struct ceph_file_layout {
|
|||||||
__le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
|
__le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
|
||||||
} __attribute__ ((packed));
|
} __attribute__ ((packed));
|
||||||
|
|
||||||
|
#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
|
||||||
|
#define ceph_file_layout_stripe_count(l) \
|
||||||
|
((__s32)le32_to_cpu((l).fl_stripe_count))
|
||||||
|
#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
|
||||||
|
#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
|
||||||
|
#define ceph_file_layout_object_su(l) \
|
||||||
|
((__s32)le32_to_cpu((l).fl_object_stripe_unit))
|
||||||
|
#define ceph_file_layout_pg_pool(l) \
|
||||||
|
((__s32)le32_to_cpu((l).fl_pg_pool))
|
||||||
|
|
||||||
|
static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
|
||||||
|
{
|
||||||
|
return le32_to_cpu(l->fl_stripe_unit) *
|
||||||
|
le32_to_cpu(l->fl_stripe_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* "period" == bytes before i start on a new set of objects */
|
||||||
|
static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
|
||||||
|
{
|
||||||
|
return le32_to_cpu(l->fl_object_size) *
|
||||||
|
le32_to_cpu(l->fl_stripe_count);
|
||||||
|
}
|
||||||
|
|
||||||
#define CEPH_MIN_STRIPE_UNIT 65536
|
#define CEPH_MIN_STRIPE_UNIT 65536
|
||||||
|
|
||||||
int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
|
int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
|
||||||
@ -282,6 +305,8 @@ enum {
|
|||||||
CEPH_SESSION_RENEWCAPS,
|
CEPH_SESSION_RENEWCAPS,
|
||||||
CEPH_SESSION_STALE,
|
CEPH_SESSION_STALE,
|
||||||
CEPH_SESSION_RECALL_STATE,
|
CEPH_SESSION_RECALL_STATE,
|
||||||
|
CEPH_SESSION_FLUSHMSG,
|
||||||
|
CEPH_SESSION_FLUSHMSG_ACK,
|
||||||
};
|
};
|
||||||
|
|
||||||
extern const char *ceph_session_op_name(int op);
|
extern const char *ceph_session_op_name(int op);
|
||||||
@ -457,7 +482,8 @@ struct ceph_mds_reply_cap {
|
|||||||
__u8 flags; /* CEPH_CAP_FLAG_* */
|
__u8 flags; /* CEPH_CAP_FLAG_* */
|
||||||
} __attribute__ ((packed));
|
} __attribute__ ((packed));
|
||||||
|
|
||||||
#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
|
#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */
|
||||||
|
#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* release the cap */
|
||||||
|
|
||||||
/* inode record, for bundling with mds reply */
|
/* inode record, for bundling with mds reply */
|
||||||
struct ceph_mds_reply_inode {
|
struct ceph_mds_reply_inode {
|
||||||
@ -658,6 +684,14 @@ struct ceph_mds_caps {
|
|||||||
__le32 time_warp_seq;
|
__le32 time_warp_seq;
|
||||||
} __attribute__ ((packed));
|
} __attribute__ ((packed));
|
||||||
|
|
||||||
|
struct ceph_mds_cap_peer {
|
||||||
|
__le64 cap_id;
|
||||||
|
__le32 seq;
|
||||||
|
__le32 mseq;
|
||||||
|
__le32 mds;
|
||||||
|
__u8 flags;
|
||||||
|
} __attribute__ ((packed));
|
||||||
|
|
||||||
/* cap release msg head */
|
/* cap release msg head */
|
||||||
struct ceph_mds_cap_release {
|
struct ceph_mds_cap_release {
|
||||||
__le32 num; /* number of cap_items that follow */
|
__le32 num; /* number of cap_items that follow */
|
||||||
|
@ -122,8 +122,8 @@ struct ceph_client {
|
|||||||
|
|
||||||
int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
|
int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
|
||||||
|
|
||||||
u32 supported_features;
|
u64 supported_features;
|
||||||
u32 required_features;
|
u64 required_features;
|
||||||
|
|
||||||
struct ceph_messenger msgr; /* messenger instance */
|
struct ceph_messenger msgr; /* messenger instance */
|
||||||
struct ceph_mon_client monc;
|
struct ceph_mon_client monc;
|
||||||
@ -173,15 +173,18 @@ static inline int calc_pages_for(u64 off, u64 len)
|
|||||||
(off >> PAGE_CACHE_SHIFT);
|
(off >> PAGE_CACHE_SHIFT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern struct kmem_cache *ceph_inode_cachep;
|
||||||
|
extern struct kmem_cache *ceph_cap_cachep;
|
||||||
|
extern struct kmem_cache *ceph_dentry_cachep;
|
||||||
|
extern struct kmem_cache *ceph_file_cachep;
|
||||||
|
|
||||||
/* ceph_common.c */
|
/* ceph_common.c */
|
||||||
extern bool libceph_compatible(void *data);
|
extern bool libceph_compatible(void *data);
|
||||||
|
|
||||||
extern const char *ceph_msg_type_name(int type);
|
extern const char *ceph_msg_type_name(int type);
|
||||||
extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
|
extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
|
||||||
extern struct kmem_cache *ceph_inode_cachep;
|
extern void *ceph_kvmalloc(size_t size, gfp_t flags);
|
||||||
extern struct kmem_cache *ceph_cap_cachep;
|
extern void ceph_kvfree(const void *ptr);
|
||||||
extern struct kmem_cache *ceph_dentry_cachep;
|
|
||||||
extern struct kmem_cache *ceph_file_cachep;
|
|
||||||
|
|
||||||
extern struct ceph_options *ceph_parse_options(char *options,
|
extern struct ceph_options *ceph_parse_options(char *options,
|
||||||
const char *dev_name, const char *dev_name_end,
|
const char *dev_name, const char *dev_name_end,
|
||||||
@ -192,8 +195,8 @@ extern int ceph_compare_options(struct ceph_options *new_opt,
|
|||||||
struct ceph_client *client);
|
struct ceph_client *client);
|
||||||
extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
|
extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
|
||||||
void *private,
|
void *private,
|
||||||
unsigned supported_features,
|
u64 supported_features,
|
||||||
unsigned required_features);
|
u64 required_features);
|
||||||
extern u64 ceph_client_id(struct ceph_client *client);
|
extern u64 ceph_client_id(struct ceph_client *client);
|
||||||
extern void ceph_destroy_client(struct ceph_client *client);
|
extern void ceph_destroy_client(struct ceph_client *client);
|
||||||
extern int __ceph_open_session(struct ceph_client *client,
|
extern int __ceph_open_session(struct ceph_client *client,
|
||||||
|
@ -60,8 +60,8 @@ struct ceph_messenger {
|
|||||||
u32 global_seq;
|
u32 global_seq;
|
||||||
spinlock_t global_seq_lock;
|
spinlock_t global_seq_lock;
|
||||||
|
|
||||||
u32 supported_features;
|
u64 supported_features;
|
||||||
u32 required_features;
|
u64 required_features;
|
||||||
};
|
};
|
||||||
|
|
||||||
enum ceph_msg_data_type {
|
enum ceph_msg_data_type {
|
||||||
@ -154,10 +154,9 @@ struct ceph_msg {
|
|||||||
struct list_head list_head; /* links for connection lists */
|
struct list_head list_head; /* links for connection lists */
|
||||||
|
|
||||||
struct kref kref;
|
struct kref kref;
|
||||||
bool front_is_vmalloc;
|
|
||||||
bool more_to_follow;
|
bool more_to_follow;
|
||||||
bool needs_out_seq;
|
bool needs_out_seq;
|
||||||
int front_max;
|
int front_alloc_len;
|
||||||
unsigned long ack_stamp; /* tx: when we were acked */
|
unsigned long ack_stamp; /* tx: when we were acked */
|
||||||
|
|
||||||
struct ceph_msgpool *pool;
|
struct ceph_msgpool *pool;
|
||||||
@ -192,7 +191,7 @@ struct ceph_connection {
|
|||||||
|
|
||||||
struct ceph_entity_name peer_name; /* peer name */
|
struct ceph_entity_name peer_name; /* peer name */
|
||||||
|
|
||||||
unsigned peer_features;
|
u64 peer_features;
|
||||||
u32 connect_seq; /* identify the most recent connection
|
u32 connect_seq; /* identify the most recent connection
|
||||||
attempt for this connection, client */
|
attempt for this connection, client */
|
||||||
u32 peer_global_seq; /* peer's global seq for this connection */
|
u32 peer_global_seq; /* peer's global seq for this connection */
|
||||||
@ -256,8 +255,8 @@ extern void ceph_msgr_flush(void);
|
|||||||
|
|
||||||
extern void ceph_messenger_init(struct ceph_messenger *msgr,
|
extern void ceph_messenger_init(struct ceph_messenger *msgr,
|
||||||
struct ceph_entity_addr *myaddr,
|
struct ceph_entity_addr *myaddr,
|
||||||
u32 supported_features,
|
u64 supported_features,
|
||||||
u32 required_features,
|
u64 required_features,
|
||||||
bool nocrc);
|
bool nocrc);
|
||||||
|
|
||||||
extern void ceph_con_init(struct ceph_connection *con, void *private,
|
extern void ceph_con_init(struct ceph_connection *con, void *private,
|
||||||
|
@ -12,12 +12,6 @@
|
|||||||
#include <linux/ceph/auth.h>
|
#include <linux/ceph/auth.h>
|
||||||
#include <linux/ceph/pagelist.h>
|
#include <linux/ceph/pagelist.h>
|
||||||
|
|
||||||
/*
|
|
||||||
* Maximum object name size
|
|
||||||
* (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100)
|
|
||||||
*/
|
|
||||||
#define MAX_OBJ_NAME_SIZE 100
|
|
||||||
|
|
||||||
struct ceph_msg;
|
struct ceph_msg;
|
||||||
struct ceph_snap_context;
|
struct ceph_snap_context;
|
||||||
struct ceph_osd_request;
|
struct ceph_osd_request;
|
||||||
@ -138,6 +132,7 @@ struct ceph_osd_request {
|
|||||||
__le64 *r_request_pool;
|
__le64 *r_request_pool;
|
||||||
void *r_request_pgid;
|
void *r_request_pgid;
|
||||||
__le32 *r_request_attempts;
|
__le32 *r_request_attempts;
|
||||||
|
bool r_paused;
|
||||||
struct ceph_eversion *r_request_reassert_version;
|
struct ceph_eversion *r_request_reassert_version;
|
||||||
|
|
||||||
int r_result;
|
int r_result;
|
||||||
@ -158,15 +153,21 @@ struct ceph_osd_request {
|
|||||||
struct inode *r_inode; /* for use by callbacks */
|
struct inode *r_inode; /* for use by callbacks */
|
||||||
void *r_priv; /* ditto */
|
void *r_priv; /* ditto */
|
||||||
|
|
||||||
char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */
|
struct ceph_object_locator r_base_oloc;
|
||||||
int r_oid_len;
|
struct ceph_object_id r_base_oid;
|
||||||
|
struct ceph_object_locator r_target_oloc;
|
||||||
|
struct ceph_object_id r_target_oid;
|
||||||
|
|
||||||
u64 r_snapid;
|
u64 r_snapid;
|
||||||
unsigned long r_stamp; /* send OR check time */
|
unsigned long r_stamp; /* send OR check time */
|
||||||
|
|
||||||
struct ceph_file_layout r_file_layout;
|
|
||||||
struct ceph_snap_context *r_snapc; /* snap context for writes */
|
struct ceph_snap_context *r_snapc; /* snap context for writes */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct ceph_request_redirect {
|
||||||
|
struct ceph_object_locator oloc;
|
||||||
|
};
|
||||||
|
|
||||||
struct ceph_osd_event {
|
struct ceph_osd_event {
|
||||||
u64 cookie;
|
u64 cookie;
|
||||||
int one_shot;
|
int one_shot;
|
||||||
|
@ -35,13 +35,26 @@ struct ceph_pg_pool_info {
|
|||||||
u8 object_hash;
|
u8 object_hash;
|
||||||
u32 pg_num, pgp_num;
|
u32 pg_num, pgp_num;
|
||||||
int pg_num_mask, pgp_num_mask;
|
int pg_num_mask, pgp_num_mask;
|
||||||
|
s64 read_tier;
|
||||||
|
s64 write_tier; /* wins for read+write ops */
|
||||||
u64 flags;
|
u64 flags;
|
||||||
char *name;
|
char *name;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ceph_object_locator {
|
struct ceph_object_locator {
|
||||||
uint64_t pool;
|
s64 pool;
|
||||||
char *key;
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Maximum supported by kernel client object name length
|
||||||
|
*
|
||||||
|
* (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
|
||||||
|
*/
|
||||||
|
#define CEPH_MAX_OID_NAME_LEN 100
|
||||||
|
|
||||||
|
struct ceph_object_id {
|
||||||
|
char name[CEPH_MAX_OID_NAME_LEN];
|
||||||
|
int name_len;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ceph_pg_mapping {
|
struct ceph_pg_mapping {
|
||||||
@ -73,33 +86,30 @@ struct ceph_osdmap {
|
|||||||
struct crush_map *crush;
|
struct crush_map *crush;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
static inline void ceph_oid_set_name(struct ceph_object_id *oid,
|
||||||
* file layout helpers
|
const char *name)
|
||||||
*/
|
|
||||||
#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
|
|
||||||
#define ceph_file_layout_stripe_count(l) \
|
|
||||||
((__s32)le32_to_cpu((l).fl_stripe_count))
|
|
||||||
#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
|
|
||||||
#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
|
|
||||||
#define ceph_file_layout_object_su(l) \
|
|
||||||
((__s32)le32_to_cpu((l).fl_object_stripe_unit))
|
|
||||||
#define ceph_file_layout_pg_pool(l) \
|
|
||||||
((__s32)le32_to_cpu((l).fl_pg_pool))
|
|
||||||
|
|
||||||
static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
|
|
||||||
{
|
{
|
||||||
return le32_to_cpu(l->fl_stripe_unit) *
|
int len;
|
||||||
le32_to_cpu(l->fl_stripe_count);
|
|
||||||
|
len = strlen(name);
|
||||||
|
if (len > sizeof(oid->name)) {
|
||||||
|
WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
|
||||||
|
name, len, sizeof(oid->name));
|
||||||
|
len = sizeof(oid->name);
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(oid->name, name, len);
|
||||||
|
oid->name_len = len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* "period" == bytes before i start on a new set of objects */
|
static inline void ceph_oid_copy(struct ceph_object_id *dest,
|
||||||
static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
|
struct ceph_object_id *src)
|
||||||
{
|
{
|
||||||
return le32_to_cpu(l->fl_object_size) *
|
BUG_ON(src->name_len > sizeof(dest->name));
|
||||||
le32_to_cpu(l->fl_stripe_count);
|
memcpy(dest->name, src->name, src->name_len);
|
||||||
|
dest->name_len = src->name_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
|
static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
|
||||||
{
|
{
|
||||||
return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
|
return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
|
||||||
@ -155,14 +165,20 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
|
|||||||
u64 *bno, u64 *oxoff, u64 *oxlen);
|
u64 *bno, u64 *oxoff, u64 *oxlen);
|
||||||
|
|
||||||
/* calculate mapping of object to a placement group */
|
/* calculate mapping of object to a placement group */
|
||||||
extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid,
|
extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
|
||||||
struct ceph_osdmap *osdmap, uint64_t pool);
|
struct ceph_object_locator *oloc,
|
||||||
|
struct ceph_object_id *oid,
|
||||||
|
struct ceph_pg *pg_out);
|
||||||
|
|
||||||
extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
|
extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
|
||||||
struct ceph_pg pgid,
|
struct ceph_pg pgid,
|
||||||
int *acting);
|
int *acting);
|
||||||
extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
|
extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
|
||||||
struct ceph_pg pgid);
|
struct ceph_pg pgid);
|
||||||
|
|
||||||
|
extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
|
||||||
|
u64 id);
|
||||||
|
|
||||||
extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
|
extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
|
||||||
extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
|
extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
|
||||||
|
|
||||||
|
@ -344,6 +344,10 @@ enum {
|
|||||||
CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
|
CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
|
||||||
CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
|
CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
|
||||||
CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
|
CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
|
||||||
|
CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */
|
||||||
|
CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
|
||||||
|
CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
|
||||||
|
CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
|
||||||
};
|
};
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
|
@ -19,11 +19,12 @@
|
|||||||
|
|
||||||
#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
|
#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
|
||||||
|
|
||||||
|
|
||||||
#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
|
#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
|
||||||
#define CRUSH_MAX_SET 10 /* max size of a mapping result */
|
|
||||||
|
|
||||||
|
|
||||||
|
#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
|
||||||
|
#define CRUSH_ITEM_NONE 0x7fffffff /* no result */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* CRUSH uses user-defined "rules" to describe how inputs should be
|
* CRUSH uses user-defined "rules" to describe how inputs should be
|
||||||
* mapped to devices. A rule consists of sequence of steps to perform
|
* mapped to devices. A rule consists of sequence of steps to perform
|
||||||
@ -43,8 +44,13 @@ enum {
|
|||||||
/* arg2 = type */
|
/* arg2 = type */
|
||||||
CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
|
CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
|
||||||
CRUSH_RULE_EMIT = 4, /* no args */
|
CRUSH_RULE_EMIT = 4, /* no args */
|
||||||
CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
|
CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
|
||||||
CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
|
CRUSH_RULE_CHOOSELEAF_INDEP = 7,
|
||||||
|
|
||||||
|
CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
|
||||||
|
CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
|
||||||
|
CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
|
||||||
|
CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -162,7 +168,10 @@ struct crush_map {
|
|||||||
__u32 choose_local_fallback_tries;
|
__u32 choose_local_fallback_tries;
|
||||||
/* choose attempts before giving up */
|
/* choose attempts before giving up */
|
||||||
__u32 choose_total_tries;
|
__u32 choose_total_tries;
|
||||||
/* attempt chooseleaf inner descent once; on failure retry outer descent */
|
/* attempt chooseleaf inner descent once for firstn mode; on
|
||||||
|
* reject retry outer descent. Note that this does *not*
|
||||||
|
* apply to a collision: in that case we will retry as we used
|
||||||
|
* to. */
|
||||||
__u32 chooseleaf_descend_once;
|
__u32 chooseleaf_descend_once;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -174,6 +183,7 @@ extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
|
|||||||
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
|
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
|
||||||
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
|
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
|
||||||
extern void crush_destroy_bucket(struct crush_bucket *b);
|
extern void crush_destroy_bucket(struct crush_bucket *b);
|
||||||
|
extern void crush_destroy_rule(struct crush_rule *r);
|
||||||
extern void crush_destroy(struct crush_map *map);
|
extern void crush_destroy(struct crush_map *map);
|
||||||
|
|
||||||
static inline int crush_calc_tree_node(int i)
|
static inline int crush_calc_tree_node(int i)
|
||||||
|
@ -14,6 +14,7 @@ extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, i
|
|||||||
extern int crush_do_rule(const struct crush_map *map,
|
extern int crush_do_rule(const struct crush_map *map,
|
||||||
int ruleno,
|
int ruleno,
|
||||||
int x, int *result, int result_max,
|
int x, int *result, int result_max,
|
||||||
const __u32 *weights);
|
const __u32 *weights, int weight_max,
|
||||||
|
int *scratch);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
#include <linux/ceph/buffer.h>
|
#include <linux/ceph/buffer.h>
|
||||||
#include <linux/ceph/decode.h>
|
#include <linux/ceph/decode.h>
|
||||||
|
#include <linux/ceph/libceph.h> /* for ceph_kv{malloc,free} */
|
||||||
|
|
||||||
struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
|
struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
|
||||||
{
|
{
|
||||||
@ -15,16 +16,10 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
|
|||||||
if (!b)
|
if (!b)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
|
b->vec.iov_base = ceph_kvmalloc(len, gfp);
|
||||||
if (b->vec.iov_base) {
|
if (!b->vec.iov_base) {
|
||||||
b->is_vmalloc = false;
|
kfree(b);
|
||||||
} else {
|
return NULL;
|
||||||
b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL);
|
|
||||||
if (!b->vec.iov_base) {
|
|
||||||
kfree(b);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
b->is_vmalloc = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
kref_init(&b->kref);
|
kref_init(&b->kref);
|
||||||
@ -40,12 +35,7 @@ void ceph_buffer_release(struct kref *kref)
|
|||||||
struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
|
struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
|
||||||
|
|
||||||
dout("buffer_release %p\n", b);
|
dout("buffer_release %p\n", b);
|
||||||
if (b->vec.iov_base) {
|
ceph_kvfree(b->vec.iov_base);
|
||||||
if (b->is_vmalloc)
|
|
||||||
vfree(b->vec.iov_base);
|
|
||||||
else
|
|
||||||
kfree(b->vec.iov_base);
|
|
||||||
}
|
|
||||||
kfree(b);
|
kfree(b);
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(ceph_buffer_release);
|
EXPORT_SYMBOL(ceph_buffer_release);
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
#include <linux/statfs.h>
|
#include <linux/statfs.h>
|
||||||
#include <linux/string.h>
|
#include <linux/string.h>
|
||||||
|
#include <linux/vmalloc.h>
|
||||||
#include <linux/nsproxy.h>
|
#include <linux/nsproxy.h>
|
||||||
#include <net/net_namespace.h>
|
#include <net/net_namespace.h>
|
||||||
|
|
||||||
@ -170,6 +171,25 @@ int ceph_compare_options(struct ceph_options *new_opt,
|
|||||||
}
|
}
|
||||||
EXPORT_SYMBOL(ceph_compare_options);
|
EXPORT_SYMBOL(ceph_compare_options);
|
||||||
|
|
||||||
|
void *ceph_kvmalloc(size_t size, gfp_t flags)
|
||||||
|
{
|
||||||
|
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
|
||||||
|
void *ptr = kmalloc(size, flags | __GFP_NOWARN);
|
||||||
|
if (ptr)
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ceph_kvfree(const void *ptr)
|
||||||
|
{
|
||||||
|
if (is_vmalloc_addr(ptr))
|
||||||
|
vfree(ptr);
|
||||||
|
else
|
||||||
|
kfree(ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static int parse_fsid(const char *str, struct ceph_fsid *fsid)
|
static int parse_fsid(const char *str, struct ceph_fsid *fsid)
|
||||||
{
|
{
|
||||||
@ -461,8 +481,8 @@ EXPORT_SYMBOL(ceph_client_id);
|
|||||||
* create a fresh client instance
|
* create a fresh client instance
|
||||||
*/
|
*/
|
||||||
struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
|
struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
|
||||||
unsigned int supported_features,
|
u64 supported_features,
|
||||||
unsigned int required_features)
|
u64 required_features)
|
||||||
{
|
{
|
||||||
struct ceph_client *client;
|
struct ceph_client *client;
|
||||||
struct ceph_entity_addr *myaddr = NULL;
|
struct ceph_entity_addr *myaddr = NULL;
|
||||||
|
@ -116,11 +116,14 @@ void crush_destroy(struct crush_map *map)
|
|||||||
if (map->rules) {
|
if (map->rules) {
|
||||||
__u32 b;
|
__u32 b;
|
||||||
for (b = 0; b < map->max_rules; b++)
|
for (b = 0; b < map->max_rules; b++)
|
||||||
kfree(map->rules[b]);
|
crush_destroy_rule(map->rules[b]);
|
||||||
kfree(map->rules);
|
kfree(map->rules);
|
||||||
}
|
}
|
||||||
|
|
||||||
kfree(map);
|
kfree(map);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void crush_destroy_rule(struct crush_rule *rule)
|
||||||
|
{
|
||||||
|
kfree(rule);
|
||||||
|
}
|
||||||
|
@ -189,7 +189,7 @@ static int terminal(int x)
|
|||||||
static int bucket_tree_choose(struct crush_bucket_tree *bucket,
|
static int bucket_tree_choose(struct crush_bucket_tree *bucket,
|
||||||
int x, int r)
|
int x, int r)
|
||||||
{
|
{
|
||||||
int n, l;
|
int n;
|
||||||
__u32 w;
|
__u32 w;
|
||||||
__u64 t;
|
__u64 t;
|
||||||
|
|
||||||
@ -197,6 +197,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
|
|||||||
n = bucket->num_nodes >> 1;
|
n = bucket->num_nodes >> 1;
|
||||||
|
|
||||||
while (!terminal(n)) {
|
while (!terminal(n)) {
|
||||||
|
int l;
|
||||||
/* pick point in [0, w) */
|
/* pick point in [0, w) */
|
||||||
w = bucket->node_weights[n];
|
w = bucket->node_weights[n];
|
||||||
t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
|
t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
|
||||||
@ -264,8 +265,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
|
|||||||
* true if device is marked "out" (failed, fully offloaded)
|
* true if device is marked "out" (failed, fully offloaded)
|
||||||
* of the cluster
|
* of the cluster
|
||||||
*/
|
*/
|
||||||
static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x)
|
static int is_out(const struct crush_map *map,
|
||||||
|
const __u32 *weight, int weight_max,
|
||||||
|
int item, int x)
|
||||||
{
|
{
|
||||||
|
if (item >= weight_max)
|
||||||
|
return 1;
|
||||||
if (weight[item] >= 0x10000)
|
if (weight[item] >= 0x10000)
|
||||||
return 0;
|
return 0;
|
||||||
if (weight[item] == 0)
|
if (weight[item] == 0)
|
||||||
@ -277,7 +282,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* crush_choose - choose numrep distinct items of given type
|
* crush_choose_firstn - choose numrep distinct items of given type
|
||||||
* @map: the crush_map
|
* @map: the crush_map
|
||||||
* @bucket: the bucket we are choose an item from
|
* @bucket: the bucket we are choose an item from
|
||||||
* @x: crush input value
|
* @x: crush input value
|
||||||
@ -285,18 +290,24 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
|
|||||||
* @type: the type of item to choose
|
* @type: the type of item to choose
|
||||||
* @out: pointer to output vector
|
* @out: pointer to output vector
|
||||||
* @outpos: our position in that vector
|
* @outpos: our position in that vector
|
||||||
* @firstn: true if choosing "first n" items, false if choosing "indep"
|
* @tries: number of attempts to make
|
||||||
* @recurse_to_leaf: true if we want one device under each item of given type
|
* @recurse_tries: number of attempts to have recursive chooseleaf make
|
||||||
* @descend_once: true if we should only try one descent before giving up
|
* @local_tries: localized retries
|
||||||
|
* @local_fallback_tries: localized fallback retries
|
||||||
|
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
|
||||||
* @out2: second output vector for leaf items (if @recurse_to_leaf)
|
* @out2: second output vector for leaf items (if @recurse_to_leaf)
|
||||||
*/
|
*/
|
||||||
static int crush_choose(const struct crush_map *map,
|
static int crush_choose_firstn(const struct crush_map *map,
|
||||||
struct crush_bucket *bucket,
|
struct crush_bucket *bucket,
|
||||||
const __u32 *weight,
|
const __u32 *weight, int weight_max,
|
||||||
int x, int numrep, int type,
|
int x, int numrep, int type,
|
||||||
int *out, int outpos,
|
int *out, int outpos,
|
||||||
int firstn, int recurse_to_leaf,
|
unsigned int tries,
|
||||||
int descend_once, int *out2)
|
unsigned int recurse_tries,
|
||||||
|
unsigned int local_tries,
|
||||||
|
unsigned int local_fallback_tries,
|
||||||
|
int recurse_to_leaf,
|
||||||
|
int *out2)
|
||||||
{
|
{
|
||||||
int rep;
|
int rep;
|
||||||
unsigned int ftotal, flocal;
|
unsigned int ftotal, flocal;
|
||||||
@ -325,35 +336,17 @@ static int crush_choose(const struct crush_map *map,
|
|||||||
collide = 0;
|
collide = 0;
|
||||||
retry_bucket = 0;
|
retry_bucket = 0;
|
||||||
r = rep;
|
r = rep;
|
||||||
if (in->alg == CRUSH_BUCKET_UNIFORM) {
|
/* r' = r + f_total */
|
||||||
/* be careful */
|
r += ftotal;
|
||||||
if (firstn || (__u32)numrep >= in->size)
|
|
||||||
/* r' = r + f_total */
|
|
||||||
r += ftotal;
|
|
||||||
else if (in->size % numrep == 0)
|
|
||||||
/* r'=r+(n+1)*f_local */
|
|
||||||
r += (numrep+1) *
|
|
||||||
(flocal+ftotal);
|
|
||||||
else
|
|
||||||
/* r' = r + n*f_local */
|
|
||||||
r += numrep * (flocal+ftotal);
|
|
||||||
} else {
|
|
||||||
if (firstn)
|
|
||||||
/* r' = r + f_total */
|
|
||||||
r += ftotal;
|
|
||||||
else
|
|
||||||
/* r' = r + n*f_local */
|
|
||||||
r += numrep * (flocal+ftotal);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* bucket choose */
|
/* bucket choose */
|
||||||
if (in->size == 0) {
|
if (in->size == 0) {
|
||||||
reject = 1;
|
reject = 1;
|
||||||
goto reject;
|
goto reject;
|
||||||
}
|
}
|
||||||
if (map->choose_local_fallback_tries > 0 &&
|
if (local_fallback_tries > 0 &&
|
||||||
flocal >= (in->size>>1) &&
|
flocal >= (in->size>>1) &&
|
||||||
flocal > map->choose_local_fallback_tries)
|
flocal > local_fallback_tries)
|
||||||
item = bucket_perm_choose(in, x, r);
|
item = bucket_perm_choose(in, x, r);
|
||||||
else
|
else
|
||||||
item = crush_bucket_choose(in, x, r);
|
item = crush_bucket_choose(in, x, r);
|
||||||
@ -394,13 +387,15 @@ static int crush_choose(const struct crush_map *map,
|
|||||||
reject = 0;
|
reject = 0;
|
||||||
if (!collide && recurse_to_leaf) {
|
if (!collide && recurse_to_leaf) {
|
||||||
if (item < 0) {
|
if (item < 0) {
|
||||||
if (crush_choose(map,
|
if (crush_choose_firstn(map,
|
||||||
map->buckets[-1-item],
|
map->buckets[-1-item],
|
||||||
weight,
|
weight, weight_max,
|
||||||
x, outpos+1, 0,
|
x, outpos+1, 0,
|
||||||
out2, outpos,
|
out2, outpos,
|
||||||
firstn, 0,
|
recurse_tries, 0,
|
||||||
map->chooseleaf_descend_once,
|
local_tries,
|
||||||
|
local_fallback_tries,
|
||||||
|
0,
|
||||||
NULL) <= outpos)
|
NULL) <= outpos)
|
||||||
/* didn't get leaf */
|
/* didn't get leaf */
|
||||||
reject = 1;
|
reject = 1;
|
||||||
@ -414,6 +409,7 @@ static int crush_choose(const struct crush_map *map,
|
|||||||
/* out? */
|
/* out? */
|
||||||
if (itemtype == 0)
|
if (itemtype == 0)
|
||||||
reject = is_out(map, weight,
|
reject = is_out(map, weight,
|
||||||
|
weight_max,
|
||||||
item, x);
|
item, x);
|
||||||
else
|
else
|
||||||
reject = 0;
|
reject = 0;
|
||||||
@ -424,17 +420,14 @@ reject:
|
|||||||
ftotal++;
|
ftotal++;
|
||||||
flocal++;
|
flocal++;
|
||||||
|
|
||||||
if (reject && descend_once)
|
if (collide && flocal <= local_tries)
|
||||||
/* let outer call try again */
|
|
||||||
skip_rep = 1;
|
|
||||||
else if (collide && flocal <= map->choose_local_tries)
|
|
||||||
/* retry locally a few times */
|
/* retry locally a few times */
|
||||||
retry_bucket = 1;
|
retry_bucket = 1;
|
||||||
else if (map->choose_local_fallback_tries > 0 &&
|
else if (local_fallback_tries > 0 &&
|
||||||
flocal <= in->size + map->choose_local_fallback_tries)
|
flocal <= in->size + local_fallback_tries)
|
||||||
/* exhaustive bucket search */
|
/* exhaustive bucket search */
|
||||||
retry_bucket = 1;
|
retry_bucket = 1;
|
||||||
else if (ftotal <= map->choose_total_tries)
|
else if (ftotal <= tries)
|
||||||
/* then retry descent */
|
/* then retry descent */
|
||||||
retry_descent = 1;
|
retry_descent = 1;
|
||||||
else
|
else
|
||||||
@ -463,6 +456,160 @@ reject:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* crush_choose_indep: alternative breadth-first positionally stable mapping
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
static void crush_choose_indep(const struct crush_map *map,
|
||||||
|
struct crush_bucket *bucket,
|
||||||
|
const __u32 *weight, int weight_max,
|
||||||
|
int x, int left, int numrep, int type,
|
||||||
|
int *out, int outpos,
|
||||||
|
unsigned int tries,
|
||||||
|
unsigned int recurse_tries,
|
||||||
|
int recurse_to_leaf,
|
||||||
|
int *out2,
|
||||||
|
int parent_r)
|
||||||
|
{
|
||||||
|
struct crush_bucket *in = bucket;
|
||||||
|
int endpos = outpos + left;
|
||||||
|
int rep;
|
||||||
|
unsigned int ftotal;
|
||||||
|
int r;
|
||||||
|
int i;
|
||||||
|
int item = 0;
|
||||||
|
int itemtype;
|
||||||
|
int collide;
|
||||||
|
|
||||||
|
dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
|
||||||
|
bucket->id, x, outpos, numrep);
|
||||||
|
|
||||||
|
/* initially my result is undefined */
|
||||||
|
for (rep = outpos; rep < endpos; rep++) {
|
||||||
|
out[rep] = CRUSH_ITEM_UNDEF;
|
||||||
|
if (out2)
|
||||||
|
out2[rep] = CRUSH_ITEM_UNDEF;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
|
||||||
|
for (rep = outpos; rep < endpos; rep++) {
|
||||||
|
if (out[rep] != CRUSH_ITEM_UNDEF)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
in = bucket; /* initial bucket */
|
||||||
|
|
||||||
|
/* choose through intervening buckets */
|
||||||
|
for (;;) {
|
||||||
|
/* note: we base the choice on the position
|
||||||
|
* even in the nested call. that means that
|
||||||
|
* if the first layer chooses the same bucket
|
||||||
|
* in a different position, we will tend to
|
||||||
|
* choose a different item in that bucket.
|
||||||
|
* this will involve more devices in data
|
||||||
|
* movement and tend to distribute the load.
|
||||||
|
*/
|
||||||
|
r = rep + parent_r;
|
||||||
|
|
||||||
|
/* be careful */
|
||||||
|
if (in->alg == CRUSH_BUCKET_UNIFORM &&
|
||||||
|
in->size % numrep == 0)
|
||||||
|
/* r'=r+(n+1)*f_total */
|
||||||
|
r += (numrep+1) * ftotal;
|
||||||
|
else
|
||||||
|
/* r' = r + n*f_total */
|
||||||
|
r += numrep * ftotal;
|
||||||
|
|
||||||
|
/* bucket choose */
|
||||||
|
if (in->size == 0) {
|
||||||
|
dprintk(" empty bucket\n");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
item = crush_bucket_choose(in, x, r);
|
||||||
|
if (item >= map->max_devices) {
|
||||||
|
dprintk(" bad item %d\n", item);
|
||||||
|
out[rep] = CRUSH_ITEM_NONE;
|
||||||
|
if (out2)
|
||||||
|
out2[rep] = CRUSH_ITEM_NONE;
|
||||||
|
left--;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* desired type? */
|
||||||
|
if (item < 0)
|
||||||
|
itemtype = map->buckets[-1-item]->type;
|
||||||
|
else
|
||||||
|
itemtype = 0;
|
||||||
|
dprintk(" item %d type %d\n", item, itemtype);
|
||||||
|
|
||||||
|
/* keep going? */
|
||||||
|
if (itemtype != type) {
|
||||||
|
if (item >= 0 ||
|
||||||
|
(-1-item) >= map->max_buckets) {
|
||||||
|
dprintk(" bad item type %d\n", type);
|
||||||
|
out[rep] = CRUSH_ITEM_NONE;
|
||||||
|
if (out2)
|
||||||
|
out2[rep] =
|
||||||
|
CRUSH_ITEM_NONE;
|
||||||
|
left--;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
in = map->buckets[-1-item];
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* collision? */
|
||||||
|
collide = 0;
|
||||||
|
for (i = outpos; i < endpos; i++) {
|
||||||
|
if (out[i] == item) {
|
||||||
|
collide = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (collide)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (recurse_to_leaf) {
|
||||||
|
if (item < 0) {
|
||||||
|
crush_choose_indep(map,
|
||||||
|
map->buckets[-1-item],
|
||||||
|
weight, weight_max,
|
||||||
|
x, 1, numrep, 0,
|
||||||
|
out2, rep,
|
||||||
|
recurse_tries, 0,
|
||||||
|
0, NULL, r);
|
||||||
|
if (out2[rep] == CRUSH_ITEM_NONE) {
|
||||||
|
/* placed nothing; no leaf */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* we already have a leaf! */
|
||||||
|
out2[rep] = item;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* out? */
|
||||||
|
if (itemtype == 0 &&
|
||||||
|
is_out(map, weight, weight_max, item, x))
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* yay! */
|
||||||
|
out[rep] = item;
|
||||||
|
left--;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (rep = outpos; rep < endpos; rep++) {
|
||||||
|
if (out[rep] == CRUSH_ITEM_UNDEF) {
|
||||||
|
out[rep] = CRUSH_ITEM_NONE;
|
||||||
|
}
|
||||||
|
if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
|
||||||
|
out2[rep] = CRUSH_ITEM_NONE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* crush_do_rule - calculate a mapping with the given input and rule
|
* crush_do_rule - calculate a mapping with the given input and rule
|
||||||
* @map: the crush_map
|
* @map: the crush_map
|
||||||
@ -470,15 +617,19 @@ reject:
|
|||||||
* @x: hash input
|
* @x: hash input
|
||||||
* @result: pointer to result vector
|
* @result: pointer to result vector
|
||||||
* @result_max: maximum result size
|
* @result_max: maximum result size
|
||||||
|
* @weight: weight vector (for map leaves)
|
||||||
|
* @weight_max: size of weight vector
|
||||||
|
* @scratch: scratch vector for private use; must be >= 3 * result_max
|
||||||
*/
|
*/
|
||||||
int crush_do_rule(const struct crush_map *map,
|
int crush_do_rule(const struct crush_map *map,
|
||||||
int ruleno, int x, int *result, int result_max,
|
int ruleno, int x, int *result, int result_max,
|
||||||
const __u32 *weight)
|
const __u32 *weight, int weight_max,
|
||||||
|
int *scratch)
|
||||||
{
|
{
|
||||||
int result_len;
|
int result_len;
|
||||||
int a[CRUSH_MAX_SET];
|
int *a = scratch;
|
||||||
int b[CRUSH_MAX_SET];
|
int *b = scratch + result_max;
|
||||||
int c[CRUSH_MAX_SET];
|
int *c = scratch + result_max*2;
|
||||||
int recurse_to_leaf;
|
int recurse_to_leaf;
|
||||||
int *w;
|
int *w;
|
||||||
int wsize = 0;
|
int wsize = 0;
|
||||||
@ -489,8 +640,10 @@ int crush_do_rule(const struct crush_map *map,
|
|||||||
__u32 step;
|
__u32 step;
|
||||||
int i, j;
|
int i, j;
|
||||||
int numrep;
|
int numrep;
|
||||||
int firstn;
|
int choose_tries = map->choose_total_tries;
|
||||||
const int descend_once = 0;
|
int choose_local_tries = map->choose_local_tries;
|
||||||
|
int choose_local_fallback_tries = map->choose_local_fallback_tries;
|
||||||
|
int choose_leaf_tries = 0;
|
||||||
|
|
||||||
if ((__u32)ruleno >= map->max_rules) {
|
if ((__u32)ruleno >= map->max_rules) {
|
||||||
dprintk(" bad ruleno %d\n", ruleno);
|
dprintk(" bad ruleno %d\n", ruleno);
|
||||||
@ -503,29 +656,49 @@ int crush_do_rule(const struct crush_map *map,
|
|||||||
o = b;
|
o = b;
|
||||||
|
|
||||||
for (step = 0; step < rule->len; step++) {
|
for (step = 0; step < rule->len; step++) {
|
||||||
|
int firstn = 0;
|
||||||
struct crush_rule_step *curstep = &rule->steps[step];
|
struct crush_rule_step *curstep = &rule->steps[step];
|
||||||
|
|
||||||
firstn = 0;
|
|
||||||
switch (curstep->op) {
|
switch (curstep->op) {
|
||||||
case CRUSH_RULE_TAKE:
|
case CRUSH_RULE_TAKE:
|
||||||
w[0] = curstep->arg1;
|
w[0] = curstep->arg1;
|
||||||
wsize = 1;
|
wsize = 1;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
|
case CRUSH_RULE_SET_CHOOSE_TRIES:
|
||||||
|
if (curstep->arg1 > 0)
|
||||||
|
choose_tries = curstep->arg1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
|
||||||
|
if (curstep->arg1 > 0)
|
||||||
|
choose_leaf_tries = curstep->arg1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
|
||||||
|
if (curstep->arg1 > 0)
|
||||||
|
choose_local_tries = curstep->arg1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
|
||||||
|
if (curstep->arg1 > 0)
|
||||||
|
choose_local_fallback_tries = curstep->arg1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CRUSH_RULE_CHOOSELEAF_FIRSTN:
|
||||||
case CRUSH_RULE_CHOOSE_FIRSTN:
|
case CRUSH_RULE_CHOOSE_FIRSTN:
|
||||||
firstn = 1;
|
firstn = 1;
|
||||||
/* fall through */
|
/* fall through */
|
||||||
case CRUSH_RULE_CHOOSE_LEAF_INDEP:
|
case CRUSH_RULE_CHOOSELEAF_INDEP:
|
||||||
case CRUSH_RULE_CHOOSE_INDEP:
|
case CRUSH_RULE_CHOOSE_INDEP:
|
||||||
if (wsize == 0)
|
if (wsize == 0)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
recurse_to_leaf =
|
recurse_to_leaf =
|
||||||
curstep->op ==
|
curstep->op ==
|
||||||
CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
|
CRUSH_RULE_CHOOSELEAF_FIRSTN ||
|
||||||
curstep->op ==
|
curstep->op ==
|
||||||
CRUSH_RULE_CHOOSE_LEAF_INDEP;
|
CRUSH_RULE_CHOOSELEAF_INDEP;
|
||||||
|
|
||||||
/* reset output */
|
/* reset output */
|
||||||
osize = 0;
|
osize = 0;
|
||||||
@ -543,22 +716,51 @@ int crush_do_rule(const struct crush_map *map,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
j = 0;
|
j = 0;
|
||||||
osize += crush_choose(map,
|
if (firstn) {
|
||||||
map->buckets[-1-w[i]],
|
int recurse_tries;
|
||||||
weight,
|
if (choose_leaf_tries)
|
||||||
x, numrep,
|
recurse_tries =
|
||||||
curstep->arg2,
|
choose_leaf_tries;
|
||||||
o+osize, j,
|
else if (map->chooseleaf_descend_once)
|
||||||
firstn,
|
recurse_tries = 1;
|
||||||
recurse_to_leaf,
|
else
|
||||||
descend_once, c+osize);
|
recurse_tries = choose_tries;
|
||||||
|
osize += crush_choose_firstn(
|
||||||
|
map,
|
||||||
|
map->buckets[-1-w[i]],
|
||||||
|
weight, weight_max,
|
||||||
|
x, numrep,
|
||||||
|
curstep->arg2,
|
||||||
|
o+osize, j,
|
||||||
|
choose_tries,
|
||||||
|
recurse_tries,
|
||||||
|
choose_local_tries,
|
||||||
|
choose_local_fallback_tries,
|
||||||
|
recurse_to_leaf,
|
||||||
|
c+osize);
|
||||||
|
} else {
|
||||||
|
crush_choose_indep(
|
||||||
|
map,
|
||||||
|
map->buckets[-1-w[i]],
|
||||||
|
weight, weight_max,
|
||||||
|
x, numrep, numrep,
|
||||||
|
curstep->arg2,
|
||||||
|
o+osize, j,
|
||||||
|
choose_tries,
|
||||||
|
choose_leaf_tries ?
|
||||||
|
choose_leaf_tries : 1,
|
||||||
|
recurse_to_leaf,
|
||||||
|
c+osize,
|
||||||
|
0);
|
||||||
|
osize += numrep;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (recurse_to_leaf)
|
if (recurse_to_leaf)
|
||||||
/* copy final _leaf_ values to output set */
|
/* copy final _leaf_ values to output set */
|
||||||
memcpy(o, c, osize*sizeof(*o));
|
memcpy(o, c, osize*sizeof(*o));
|
||||||
|
|
||||||
/* swap t and w arrays */
|
/* swap o and w arrays */
|
||||||
tmp = o;
|
tmp = o;
|
||||||
o = w;
|
o = w;
|
||||||
w = tmp;
|
w = tmp;
|
||||||
|
@ -132,7 +132,8 @@ static int osdc_show(struct seq_file *s, void *pp)
|
|||||||
req->r_osd ? req->r_osd->o_osd : -1,
|
req->r_osd ? req->r_osd->o_osd : -1,
|
||||||
req->r_pgid.pool, req->r_pgid.seed);
|
req->r_pgid.pool, req->r_pgid.seed);
|
||||||
|
|
||||||
seq_printf(s, "%.*s", req->r_oid_len, req->r_oid);
|
seq_printf(s, "%.*s", req->r_base_oid.name_len,
|
||||||
|
req->r_base_oid.name);
|
||||||
|
|
||||||
if (req->r_reassert_version.epoch)
|
if (req->r_reassert_version.epoch)
|
||||||
seq_printf(s, "\t%u'%llu",
|
seq_printf(s, "\t%u'%llu",
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
#include <linux/dns_resolver.h>
|
#include <linux/dns_resolver.h>
|
||||||
#include <net/tcp.h>
|
#include <net/tcp.h>
|
||||||
|
|
||||||
|
#include <linux/ceph/ceph_features.h>
|
||||||
#include <linux/ceph/libceph.h>
|
#include <linux/ceph/libceph.h>
|
||||||
#include <linux/ceph/messenger.h>
|
#include <linux/ceph/messenger.h>
|
||||||
#include <linux/ceph/decode.h>
|
#include <linux/ceph/decode.h>
|
||||||
@ -1865,7 +1866,9 @@ int ceph_parse_ips(const char *c, const char *end,
|
|||||||
port = (port * 10) + (*p - '0');
|
port = (port * 10) + (*p - '0');
|
||||||
p++;
|
p++;
|
||||||
}
|
}
|
||||||
if (port > 65535 || port == 0)
|
if (port == 0)
|
||||||
|
port = CEPH_MON_PORT;
|
||||||
|
else if (port > 65535)
|
||||||
goto bad;
|
goto bad;
|
||||||
} else {
|
} else {
|
||||||
port = CEPH_MON_PORT;
|
port = CEPH_MON_PORT;
|
||||||
@ -1945,7 +1948,8 @@ static int process_connect(struct ceph_connection *con)
|
|||||||
{
|
{
|
||||||
u64 sup_feat = con->msgr->supported_features;
|
u64 sup_feat = con->msgr->supported_features;
|
||||||
u64 req_feat = con->msgr->required_features;
|
u64 req_feat = con->msgr->required_features;
|
||||||
u64 server_feat = le64_to_cpu(con->in_reply.features);
|
u64 server_feat = ceph_sanitize_features(
|
||||||
|
le64_to_cpu(con->in_reply.features));
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
|
dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
|
||||||
@ -2853,8 +2857,8 @@ static void con_fault(struct ceph_connection *con)
|
|||||||
*/
|
*/
|
||||||
void ceph_messenger_init(struct ceph_messenger *msgr,
|
void ceph_messenger_init(struct ceph_messenger *msgr,
|
||||||
struct ceph_entity_addr *myaddr,
|
struct ceph_entity_addr *myaddr,
|
||||||
u32 supported_features,
|
u64 supported_features,
|
||||||
u32 required_features,
|
u64 required_features,
|
||||||
bool nocrc)
|
bool nocrc)
|
||||||
{
|
{
|
||||||
msgr->supported_features = supported_features;
|
msgr->supported_features = supported_features;
|
||||||
@ -3126,15 +3130,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
|
|||||||
INIT_LIST_HEAD(&m->data);
|
INIT_LIST_HEAD(&m->data);
|
||||||
|
|
||||||
/* front */
|
/* front */
|
||||||
m->front_max = front_len;
|
|
||||||
if (front_len) {
|
if (front_len) {
|
||||||
if (front_len > PAGE_CACHE_SIZE) {
|
m->front.iov_base = ceph_kvmalloc(front_len, flags);
|
||||||
m->front.iov_base = __vmalloc(front_len, flags,
|
|
||||||
PAGE_KERNEL);
|
|
||||||
m->front_is_vmalloc = true;
|
|
||||||
} else {
|
|
||||||
m->front.iov_base = kmalloc(front_len, flags);
|
|
||||||
}
|
|
||||||
if (m->front.iov_base == NULL) {
|
if (m->front.iov_base == NULL) {
|
||||||
dout("ceph_msg_new can't allocate %d bytes\n",
|
dout("ceph_msg_new can't allocate %d bytes\n",
|
||||||
front_len);
|
front_len);
|
||||||
@ -3143,7 +3140,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
|
|||||||
} else {
|
} else {
|
||||||
m->front.iov_base = NULL;
|
m->front.iov_base = NULL;
|
||||||
}
|
}
|
||||||
m->front.iov_len = front_len;
|
m->front_alloc_len = m->front.iov_len = front_len;
|
||||||
|
|
||||||
dout("ceph_msg_new %p front %d\n", m, front_len);
|
dout("ceph_msg_new %p front %d\n", m, front_len);
|
||||||
return m;
|
return m;
|
||||||
@ -3256,10 +3253,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
|
|||||||
void ceph_msg_kfree(struct ceph_msg *m)
|
void ceph_msg_kfree(struct ceph_msg *m)
|
||||||
{
|
{
|
||||||
dout("msg_kfree %p\n", m);
|
dout("msg_kfree %p\n", m);
|
||||||
if (m->front_is_vmalloc)
|
ceph_kvfree(m->front.iov_base);
|
||||||
vfree(m->front.iov_base);
|
|
||||||
else
|
|
||||||
kfree(m->front.iov_base);
|
|
||||||
kmem_cache_free(ceph_msg_cache, m);
|
kmem_cache_free(ceph_msg_cache, m);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3301,8 +3295,8 @@ EXPORT_SYMBOL(ceph_msg_last_put);
|
|||||||
|
|
||||||
void ceph_msg_dump(struct ceph_msg *msg)
|
void ceph_msg_dump(struct ceph_msg *msg)
|
||||||
{
|
{
|
||||||
pr_debug("msg_dump %p (front_max %d length %zd)\n", msg,
|
pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
|
||||||
msg->front_max, msg->data_length);
|
msg->front_alloc_len, msg->data_length);
|
||||||
print_hex_dump(KERN_DEBUG, "header: ",
|
print_hex_dump(KERN_DEBUG, "header: ",
|
||||||
DUMP_PREFIX_OFFSET, 16, 1,
|
DUMP_PREFIX_OFFSET, 16, 1,
|
||||||
&msg->hdr, sizeof(msg->hdr), true);
|
&msg->hdr, sizeof(msg->hdr), true);
|
||||||
|
@ -152,7 +152,7 @@ static int __open_session(struct ceph_mon_client *monc)
|
|||||||
/* initiatiate authentication handshake */
|
/* initiatiate authentication handshake */
|
||||||
ret = ceph_auth_build_hello(monc->auth,
|
ret = ceph_auth_build_hello(monc->auth,
|
||||||
monc->m_auth->front.iov_base,
|
monc->m_auth->front.iov_base,
|
||||||
monc->m_auth->front_max);
|
monc->m_auth->front_alloc_len);
|
||||||
__send_prepared_auth_request(monc, ret);
|
__send_prepared_auth_request(monc, ret);
|
||||||
} else {
|
} else {
|
||||||
dout("open_session mon%d already open\n", monc->cur_mon);
|
dout("open_session mon%d already open\n", monc->cur_mon);
|
||||||
@ -196,7 +196,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
|
|||||||
int num;
|
int num;
|
||||||
|
|
||||||
p = msg->front.iov_base;
|
p = msg->front.iov_base;
|
||||||
end = p + msg->front_max;
|
end = p + msg->front_alloc_len;
|
||||||
|
|
||||||
num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
|
num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
|
||||||
ceph_encode_32(&p, num);
|
ceph_encode_32(&p, num);
|
||||||
@ -897,7 +897,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
|
|||||||
ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
|
ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
|
||||||
msg->front.iov_len,
|
msg->front.iov_len,
|
||||||
monc->m_auth->front.iov_base,
|
monc->m_auth->front.iov_base,
|
||||||
monc->m_auth->front_max);
|
monc->m_auth->front_alloc_len);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
monc->client->auth_err = ret;
|
monc->client->auth_err = ret;
|
||||||
wake_up_all(&monc->client->auth_wq);
|
wake_up_all(&monc->client->auth_wq);
|
||||||
@ -939,7 +939,7 @@ static int __validate_auth(struct ceph_mon_client *monc)
|
|||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
|
ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
|
||||||
monc->m_auth->front_max);
|
monc->m_auth->front_alloc_len);
|
||||||
if (ret <= 0)
|
if (ret <= 0)
|
||||||
return ret; /* either an error, or no need to authenticate */
|
return ret; /* either an error, or no need to authenticate */
|
||||||
__send_prepared_auth_request(monc, ret);
|
__send_prepared_auth_request(monc, ret);
|
||||||
|
@ -338,7 +338,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
|
|||||||
msg_size = 4 + 4 + 8 + 8 + 4+8;
|
msg_size = 4 + 4 + 8 + 8 + 4+8;
|
||||||
msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
|
msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
|
||||||
msg_size += 1 + 8 + 4 + 4; /* pg_t */
|
msg_size += 1 + 8 + 4 + 4; /* pg_t */
|
||||||
msg_size += 4 + MAX_OBJ_NAME_SIZE;
|
msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
|
||||||
msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
|
msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
|
||||||
msg_size += 8; /* snapid */
|
msg_size += 8; /* snapid */
|
||||||
msg_size += 8; /* snap_seq */
|
msg_size += 8; /* snap_seq */
|
||||||
@ -368,6 +368,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
|
|||||||
INIT_LIST_HEAD(&req->r_req_lru_item);
|
INIT_LIST_HEAD(&req->r_req_lru_item);
|
||||||
INIT_LIST_HEAD(&req->r_osd_item);
|
INIT_LIST_HEAD(&req->r_osd_item);
|
||||||
|
|
||||||
|
req->r_base_oloc.pool = -1;
|
||||||
|
req->r_target_oloc.pool = -1;
|
||||||
|
|
||||||
/* create reply message */
|
/* create reply message */
|
||||||
if (use_mempool)
|
if (use_mempool)
|
||||||
msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
|
msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
|
||||||
@ -761,11 +764,11 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
|
|||||||
if (num_ops > 1)
|
if (num_ops > 1)
|
||||||
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
|
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
|
||||||
|
|
||||||
req->r_file_layout = *layout; /* keep a copy */
|
req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
|
||||||
|
|
||||||
snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx",
|
snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
|
||||||
vino.ino, objnum);
|
"%llx.%08llx", vino.ino, objnum);
|
||||||
req->r_oid_len = strlen(req->r_oid);
|
req->r_base_oid.name_len = strlen(req->r_base_oid.name);
|
||||||
|
|
||||||
return req;
|
return req;
|
||||||
}
|
}
|
||||||
@ -1044,8 +1047,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
|
|||||||
!ceph_con_opened(&osd->o_con)) {
|
!ceph_con_opened(&osd->o_con)) {
|
||||||
struct ceph_osd_request *req;
|
struct ceph_osd_request *req;
|
||||||
|
|
||||||
dout(" osd addr hasn't changed and connection never opened,"
|
dout("osd addr hasn't changed and connection never opened, "
|
||||||
" letting msgr retry");
|
"letting msgr retry\n");
|
||||||
/* touch each r_stamp for handle_timeout()'s benfit */
|
/* touch each r_stamp for handle_timeout()'s benfit */
|
||||||
list_for_each_entry(req, &osd->o_requests, r_osd_item)
|
list_for_each_entry(req, &osd->o_requests, r_osd_item)
|
||||||
req->r_stamp = jiffies;
|
req->r_stamp = jiffies;
|
||||||
@ -1231,6 +1234,61 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
|
|||||||
}
|
}
|
||||||
EXPORT_SYMBOL(ceph_osdc_set_request_linger);
|
EXPORT_SYMBOL(ceph_osdc_set_request_linger);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns whether a request should be blocked from being sent
|
||||||
|
* based on the current osdmap and osd_client settings.
|
||||||
|
*
|
||||||
|
* Caller should hold map_sem for read.
|
||||||
|
*/
|
||||||
|
static bool __req_should_be_paused(struct ceph_osd_client *osdc,
|
||||||
|
struct ceph_osd_request *req)
|
||||||
|
{
|
||||||
|
bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
|
||||||
|
bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
|
||||||
|
ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
|
||||||
|
return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
|
||||||
|
(req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Calculate mapping of a request to a PG. Takes tiering into account.
|
||||||
|
*/
|
||||||
|
static int __calc_request_pg(struct ceph_osdmap *osdmap,
|
||||||
|
struct ceph_osd_request *req,
|
||||||
|
struct ceph_pg *pg_out)
|
||||||
|
{
|
||||||
|
bool need_check_tiering;
|
||||||
|
|
||||||
|
need_check_tiering = false;
|
||||||
|
if (req->r_target_oloc.pool == -1) {
|
||||||
|
req->r_target_oloc = req->r_base_oloc; /* struct */
|
||||||
|
need_check_tiering = true;
|
||||||
|
}
|
||||||
|
if (req->r_target_oid.name_len == 0) {
|
||||||
|
ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
|
||||||
|
need_check_tiering = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (need_check_tiering &&
|
||||||
|
(req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
|
||||||
|
struct ceph_pg_pool_info *pi;
|
||||||
|
|
||||||
|
pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
|
||||||
|
if (pi) {
|
||||||
|
if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
|
||||||
|
pi->read_tier >= 0)
|
||||||
|
req->r_target_oloc.pool = pi->read_tier;
|
||||||
|
if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
|
||||||
|
pi->write_tier >= 0)
|
||||||
|
req->r_target_oloc.pool = pi->write_tier;
|
||||||
|
}
|
||||||
|
/* !pi is caught in ceph_oloc_oid_to_pg() */
|
||||||
|
}
|
||||||
|
|
||||||
|
return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
|
||||||
|
&req->r_target_oid, pg_out);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Pick an osd (the first 'up' osd in the pg), allocate the osd struct
|
* Pick an osd (the first 'up' osd in the pg), allocate the osd struct
|
||||||
* (as needed), and set the request r_osd appropriately. If there is
|
* (as needed), and set the request r_osd appropriately. If there is
|
||||||
@ -1248,10 +1306,11 @@ static int __map_request(struct ceph_osd_client *osdc,
|
|||||||
int acting[CEPH_PG_MAX_SIZE];
|
int acting[CEPH_PG_MAX_SIZE];
|
||||||
int o = -1, num = 0;
|
int o = -1, num = 0;
|
||||||
int err;
|
int err;
|
||||||
|
bool was_paused;
|
||||||
|
|
||||||
dout("map_request %p tid %lld\n", req, req->r_tid);
|
dout("map_request %p tid %lld\n", req, req->r_tid);
|
||||||
err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap,
|
|
||||||
ceph_file_layout_pg_pool(req->r_file_layout));
|
err = __calc_request_pg(osdc->osdmap, req, &pgid);
|
||||||
if (err) {
|
if (err) {
|
||||||
list_move(&req->r_req_lru_item, &osdc->req_notarget);
|
list_move(&req->r_req_lru_item, &osdc->req_notarget);
|
||||||
return err;
|
return err;
|
||||||
@ -1264,12 +1323,18 @@ static int __map_request(struct ceph_osd_client *osdc,
|
|||||||
num = err;
|
num = err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
was_paused = req->r_paused;
|
||||||
|
req->r_paused = __req_should_be_paused(osdc, req);
|
||||||
|
if (was_paused && !req->r_paused)
|
||||||
|
force_resend = 1;
|
||||||
|
|
||||||
if ((!force_resend &&
|
if ((!force_resend &&
|
||||||
req->r_osd && req->r_osd->o_osd == o &&
|
req->r_osd && req->r_osd->o_osd == o &&
|
||||||
req->r_sent >= req->r_osd->o_incarnation &&
|
req->r_sent >= req->r_osd->o_incarnation &&
|
||||||
req->r_num_pg_osds == num &&
|
req->r_num_pg_osds == num &&
|
||||||
memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
|
memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
|
||||||
(req->r_osd == NULL && o == -1))
|
(req->r_osd == NULL && o == -1) ||
|
||||||
|
req->r_paused)
|
||||||
return 0; /* no change */
|
return 0; /* no change */
|
||||||
|
|
||||||
dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
|
dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
|
||||||
@ -1331,7 +1396,7 @@ static void __send_request(struct ceph_osd_client *osdc,
|
|||||||
/* fill in message content that changes each time we send it */
|
/* fill in message content that changes each time we send it */
|
||||||
put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
|
put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
|
||||||
put_unaligned_le32(req->r_flags, req->r_request_flags);
|
put_unaligned_le32(req->r_flags, req->r_request_flags);
|
||||||
put_unaligned_le64(req->r_pgid.pool, req->r_request_pool);
|
put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
|
||||||
p = req->r_request_pgid;
|
p = req->r_request_pgid;
|
||||||
ceph_encode_64(&p, req->r_pgid.pool);
|
ceph_encode_64(&p, req->r_pgid.pool);
|
||||||
ceph_encode_32(&p, req->r_pgid.seed);
|
ceph_encode_32(&p, req->r_pgid.seed);
|
||||||
@ -1432,6 +1497,109 @@ static void handle_osds_timeout(struct work_struct *work)
|
|||||||
round_jiffies_relative(delay));
|
round_jiffies_relative(delay));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int ceph_oloc_decode(void **p, void *end,
|
||||||
|
struct ceph_object_locator *oloc)
|
||||||
|
{
|
||||||
|
u8 struct_v, struct_cv;
|
||||||
|
u32 len;
|
||||||
|
void *struct_end;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
|
||||||
|
struct_v = ceph_decode_8(p);
|
||||||
|
struct_cv = ceph_decode_8(p);
|
||||||
|
if (struct_v < 3) {
|
||||||
|
pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
|
||||||
|
struct_v, struct_cv);
|
||||||
|
goto e_inval;
|
||||||
|
}
|
||||||
|
if (struct_cv > 6) {
|
||||||
|
pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
|
||||||
|
struct_v, struct_cv);
|
||||||
|
goto e_inval;
|
||||||
|
}
|
||||||
|
len = ceph_decode_32(p);
|
||||||
|
ceph_decode_need(p, end, len, e_inval);
|
||||||
|
struct_end = *p + len;
|
||||||
|
|
||||||
|
oloc->pool = ceph_decode_64(p);
|
||||||
|
*p += 4; /* skip preferred */
|
||||||
|
|
||||||
|
len = ceph_decode_32(p);
|
||||||
|
if (len > 0) {
|
||||||
|
pr_warn("ceph_object_locator::key is set\n");
|
||||||
|
goto e_inval;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (struct_v >= 5) {
|
||||||
|
len = ceph_decode_32(p);
|
||||||
|
if (len > 0) {
|
||||||
|
pr_warn("ceph_object_locator::nspace is set\n");
|
||||||
|
goto e_inval;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (struct_v >= 6) {
|
||||||
|
s64 hash = ceph_decode_64(p);
|
||||||
|
if (hash != -1) {
|
||||||
|
pr_warn("ceph_object_locator::hash is set\n");
|
||||||
|
goto e_inval;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* skip the rest */
|
||||||
|
*p = struct_end;
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
e_inval:
|
||||||
|
ret = -EINVAL;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int ceph_redirect_decode(void **p, void *end,
|
||||||
|
struct ceph_request_redirect *redir)
|
||||||
|
{
|
||||||
|
u8 struct_v, struct_cv;
|
||||||
|
u32 len;
|
||||||
|
void *struct_end;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
|
||||||
|
struct_v = ceph_decode_8(p);
|
||||||
|
struct_cv = ceph_decode_8(p);
|
||||||
|
if (struct_cv > 1) {
|
||||||
|
pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
|
||||||
|
struct_v, struct_cv);
|
||||||
|
goto e_inval;
|
||||||
|
}
|
||||||
|
len = ceph_decode_32(p);
|
||||||
|
ceph_decode_need(p, end, len, e_inval);
|
||||||
|
struct_end = *p + len;
|
||||||
|
|
||||||
|
ret = ceph_oloc_decode(p, end, &redir->oloc);
|
||||||
|
if (ret)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
len = ceph_decode_32(p);
|
||||||
|
if (len > 0) {
|
||||||
|
pr_warn("ceph_request_redirect::object_name is set\n");
|
||||||
|
goto e_inval;
|
||||||
|
}
|
||||||
|
|
||||||
|
len = ceph_decode_32(p);
|
||||||
|
*p += len; /* skip osd_instructions */
|
||||||
|
|
||||||
|
/* skip the rest */
|
||||||
|
*p = struct_end;
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
e_inval:
|
||||||
|
ret = -EINVAL;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
static void complete_request(struct ceph_osd_request *req)
|
static void complete_request(struct ceph_osd_request *req)
|
||||||
{
|
{
|
||||||
complete_all(&req->r_safe_completion); /* fsync waiter */
|
complete_all(&req->r_safe_completion); /* fsync waiter */
|
||||||
@ -1446,6 +1614,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
|
|||||||
{
|
{
|
||||||
void *p, *end;
|
void *p, *end;
|
||||||
struct ceph_osd_request *req;
|
struct ceph_osd_request *req;
|
||||||
|
struct ceph_request_redirect redir;
|
||||||
u64 tid;
|
u64 tid;
|
||||||
int object_len;
|
int object_len;
|
||||||
unsigned int numops;
|
unsigned int numops;
|
||||||
@ -1525,10 +1694,41 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
|
|||||||
for (i = 0; i < numops; i++)
|
for (i = 0; i < numops; i++)
|
||||||
req->r_reply_op_result[i] = ceph_decode_32(&p);
|
req->r_reply_op_result[i] = ceph_decode_32(&p);
|
||||||
|
|
||||||
|
if (le16_to_cpu(msg->hdr.version) >= 6) {
|
||||||
|
p += 8 + 4; /* skip replay_version */
|
||||||
|
p += 8; /* skip user_version */
|
||||||
|
|
||||||
|
err = ceph_redirect_decode(&p, end, &redir);
|
||||||
|
if (err)
|
||||||
|
goto bad_put;
|
||||||
|
} else {
|
||||||
|
redir.oloc.pool = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (redir.oloc.pool != -1) {
|
||||||
|
dout("redirect pool %lld\n", redir.oloc.pool);
|
||||||
|
|
||||||
|
__unregister_request(osdc, req);
|
||||||
|
mutex_unlock(&osdc->request_mutex);
|
||||||
|
|
||||||
|
req->r_target_oloc = redir.oloc; /* struct */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Start redirect requests with nofail=true. If
|
||||||
|
* mapping fails, request will end up on the notarget
|
||||||
|
* list, waiting for the new osdmap (which can take
|
||||||
|
* a while), even though the original request mapped
|
||||||
|
* successfully. In the future we might want to follow
|
||||||
|
* original request's nofail setting here.
|
||||||
|
*/
|
||||||
|
err = ceph_osdc_start_request(osdc, req, true);
|
||||||
|
BUG_ON(err);
|
||||||
|
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
|
||||||
already_completed = req->r_got_reply;
|
already_completed = req->r_got_reply;
|
||||||
|
|
||||||
if (!req->r_got_reply) {
|
if (!req->r_got_reply) {
|
||||||
|
|
||||||
req->r_result = result;
|
req->r_result = result;
|
||||||
dout("handle_reply result %d bytes %d\n", req->r_result,
|
dout("handle_reply result %d bytes %d\n", req->r_result,
|
||||||
bytes);
|
bytes);
|
||||||
@ -1581,6 +1781,13 @@ done:
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
bad_put:
|
bad_put:
|
||||||
|
req->r_result = -EIO;
|
||||||
|
__unregister_request(osdc, req);
|
||||||
|
if (req->r_callback)
|
||||||
|
req->r_callback(req, msg);
|
||||||
|
else
|
||||||
|
complete_all(&req->r_completion);
|
||||||
|
complete_request(req);
|
||||||
ceph_osdc_put_request(req);
|
ceph_osdc_put_request(req);
|
||||||
bad_mutex:
|
bad_mutex:
|
||||||
mutex_unlock(&osdc->request_mutex);
|
mutex_unlock(&osdc->request_mutex);
|
||||||
@ -1613,14 +1820,17 @@ static void reset_changed_osds(struct ceph_osd_client *osdc)
|
|||||||
*
|
*
|
||||||
* Caller should hold map_sem for read.
|
* Caller should hold map_sem for read.
|
||||||
*/
|
*/
|
||||||
static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
|
static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
|
||||||
|
bool force_resend_writes)
|
||||||
{
|
{
|
||||||
struct ceph_osd_request *req, *nreq;
|
struct ceph_osd_request *req, *nreq;
|
||||||
struct rb_node *p;
|
struct rb_node *p;
|
||||||
int needmap = 0;
|
int needmap = 0;
|
||||||
int err;
|
int err;
|
||||||
|
bool force_resend_req;
|
||||||
|
|
||||||
dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
|
dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
|
||||||
|
force_resend_writes ? " (force resend writes)" : "");
|
||||||
mutex_lock(&osdc->request_mutex);
|
mutex_lock(&osdc->request_mutex);
|
||||||
for (p = rb_first(&osdc->requests); p; ) {
|
for (p = rb_first(&osdc->requests); p; ) {
|
||||||
req = rb_entry(p, struct ceph_osd_request, r_node);
|
req = rb_entry(p, struct ceph_osd_request, r_node);
|
||||||
@ -1645,7 +1855,10 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
err = __map_request(osdc, req, force_resend);
|
force_resend_req = force_resend ||
|
||||||
|
(force_resend_writes &&
|
||||||
|
req->r_flags & CEPH_OSD_FLAG_WRITE);
|
||||||
|
err = __map_request(osdc, req, force_resend_req);
|
||||||
if (err < 0)
|
if (err < 0)
|
||||||
continue; /* error */
|
continue; /* error */
|
||||||
if (req->r_osd == NULL) {
|
if (req->r_osd == NULL) {
|
||||||
@ -1665,7 +1878,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
|
|||||||
r_linger_item) {
|
r_linger_item) {
|
||||||
dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
|
dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
|
||||||
|
|
||||||
err = __map_request(osdc, req, force_resend);
|
err = __map_request(osdc, req,
|
||||||
|
force_resend || force_resend_writes);
|
||||||
dout("__map_request returned %d\n", err);
|
dout("__map_request returned %d\n", err);
|
||||||
if (err == 0)
|
if (err == 0)
|
||||||
continue; /* no change and no osd was specified */
|
continue; /* no change and no osd was specified */
|
||||||
@ -1707,6 +1921,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
|
|||||||
struct ceph_osdmap *newmap = NULL, *oldmap;
|
struct ceph_osdmap *newmap = NULL, *oldmap;
|
||||||
int err;
|
int err;
|
||||||
struct ceph_fsid fsid;
|
struct ceph_fsid fsid;
|
||||||
|
bool was_full;
|
||||||
|
|
||||||
dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
|
dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
|
||||||
p = msg->front.iov_base;
|
p = msg->front.iov_base;
|
||||||
@ -1720,6 +1935,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
|
|||||||
|
|
||||||
down_write(&osdc->map_sem);
|
down_write(&osdc->map_sem);
|
||||||
|
|
||||||
|
was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
|
||||||
|
|
||||||
/* incremental maps */
|
/* incremental maps */
|
||||||
ceph_decode_32_safe(&p, end, nr_maps, bad);
|
ceph_decode_32_safe(&p, end, nr_maps, bad);
|
||||||
dout(" %d inc maps\n", nr_maps);
|
dout(" %d inc maps\n", nr_maps);
|
||||||
@ -1744,7 +1961,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
|
|||||||
ceph_osdmap_destroy(osdc->osdmap);
|
ceph_osdmap_destroy(osdc->osdmap);
|
||||||
osdc->osdmap = newmap;
|
osdc->osdmap = newmap;
|
||||||
}
|
}
|
||||||
kick_requests(osdc, 0);
|
was_full = was_full ||
|
||||||
|
ceph_osdmap_flag(osdc->osdmap,
|
||||||
|
CEPH_OSDMAP_FULL);
|
||||||
|
kick_requests(osdc, 0, was_full);
|
||||||
} else {
|
} else {
|
||||||
dout("ignoring incremental map %u len %d\n",
|
dout("ignoring incremental map %u len %d\n",
|
||||||
epoch, maplen);
|
epoch, maplen);
|
||||||
@ -1787,7 +2007,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
|
|||||||
skipped_map = 1;
|
skipped_map = 1;
|
||||||
ceph_osdmap_destroy(oldmap);
|
ceph_osdmap_destroy(oldmap);
|
||||||
}
|
}
|
||||||
kick_requests(osdc, skipped_map);
|
was_full = was_full ||
|
||||||
|
ceph_osdmap_flag(osdc->osdmap,
|
||||||
|
CEPH_OSDMAP_FULL);
|
||||||
|
kick_requests(osdc, skipped_map, was_full);
|
||||||
}
|
}
|
||||||
p += maplen;
|
p += maplen;
|
||||||
nr_maps--;
|
nr_maps--;
|
||||||
@ -1804,7 +2027,9 @@ done:
|
|||||||
* we find out when we are no longer full and stop returning
|
* we find out when we are no longer full and stop returning
|
||||||
* ENOSPC.
|
* ENOSPC.
|
||||||
*/
|
*/
|
||||||
if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
|
if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
|
||||||
|
ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
|
||||||
|
ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
|
||||||
ceph_monc_request_next_osdmap(&osdc->client->monc);
|
ceph_monc_request_next_osdmap(&osdc->client->monc);
|
||||||
|
|
||||||
mutex_lock(&osdc->request_mutex);
|
mutex_lock(&osdc->request_mutex);
|
||||||
@ -2068,10 +2293,11 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
|
|||||||
ceph_encode_32(&p, -1); /* preferred */
|
ceph_encode_32(&p, -1); /* preferred */
|
||||||
|
|
||||||
/* oid */
|
/* oid */
|
||||||
ceph_encode_32(&p, req->r_oid_len);
|
ceph_encode_32(&p, req->r_base_oid.name_len);
|
||||||
memcpy(p, req->r_oid, req->r_oid_len);
|
memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
|
||||||
dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
|
dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
|
||||||
p += req->r_oid_len;
|
req->r_base_oid.name, req->r_base_oid.name_len);
|
||||||
|
p += req->r_base_oid.name_len;
|
||||||
|
|
||||||
/* ops--can imply data */
|
/* ops--can imply data */
|
||||||
ceph_encode_16(&p, (u16)req->r_num_ops);
|
ceph_encode_16(&p, (u16)req->r_num_ops);
|
||||||
@ -2454,7 +2680,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
|
|||||||
struct ceph_osd_client *osdc = osd->o_osdc;
|
struct ceph_osd_client *osdc = osd->o_osdc;
|
||||||
struct ceph_msg *m;
|
struct ceph_msg *m;
|
||||||
struct ceph_osd_request *req;
|
struct ceph_osd_request *req;
|
||||||
int front = le32_to_cpu(hdr->front_len);
|
int front_len = le32_to_cpu(hdr->front_len);
|
||||||
int data_len = le32_to_cpu(hdr->data_len);
|
int data_len = le32_to_cpu(hdr->data_len);
|
||||||
u64 tid;
|
u64 tid;
|
||||||
|
|
||||||
@ -2474,12 +2700,13 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
|
|||||||
req->r_reply, req->r_reply->con);
|
req->r_reply, req->r_reply->con);
|
||||||
ceph_msg_revoke_incoming(req->r_reply);
|
ceph_msg_revoke_incoming(req->r_reply);
|
||||||
|
|
||||||
if (front > req->r_reply->front.iov_len) {
|
if (front_len > req->r_reply->front_alloc_len) {
|
||||||
pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n",
|
pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n",
|
||||||
front, (int)req->r_reply->front.iov_len,
|
front_len, req->r_reply->front_alloc_len,
|
||||||
(unsigned int)con->peer_name.type,
|
(unsigned int)con->peer_name.type,
|
||||||
le64_to_cpu(con->peer_name.num));
|
le64_to_cpu(con->peer_name.num));
|
||||||
m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false);
|
m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
|
||||||
|
false);
|
||||||
if (!m)
|
if (!m)
|
||||||
goto out;
|
goto out;
|
||||||
ceph_msg_put(req->r_reply);
|
ceph_msg_put(req->r_reply);
|
||||||
|
@ -464,6 +464,11 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
|
||||||
|
{
|
||||||
|
return __lookup_pg_pool(&map->pg_pools, id);
|
||||||
|
}
|
||||||
|
|
||||||
const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
|
const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
|
||||||
{
|
{
|
||||||
struct ceph_pg_pool_info *pi;
|
struct ceph_pg_pool_info *pi;
|
||||||
@ -514,8 +519,8 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
|
|||||||
pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
|
pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
if (cv > 7) {
|
if (cv > 9) {
|
||||||
pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv);
|
pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
len = ceph_decode_32(p);
|
len = ceph_decode_32(p);
|
||||||
@ -543,12 +548,34 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
|
|||||||
*p += len;
|
*p += len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* skip removed snaps */
|
/* skip removed_snaps */
|
||||||
num = ceph_decode_32(p);
|
num = ceph_decode_32(p);
|
||||||
*p += num * (8 + 8);
|
*p += num * (8 + 8);
|
||||||
|
|
||||||
*p += 8; /* skip auid */
|
*p += 8; /* skip auid */
|
||||||
pi->flags = ceph_decode_64(p);
|
pi->flags = ceph_decode_64(p);
|
||||||
|
*p += 4; /* skip crash_replay_interval */
|
||||||
|
|
||||||
|
if (ev >= 7)
|
||||||
|
*p += 1; /* skip min_size */
|
||||||
|
|
||||||
|
if (ev >= 8)
|
||||||
|
*p += 8 + 8; /* skip quota_max_* */
|
||||||
|
|
||||||
|
if (ev >= 9) {
|
||||||
|
/* skip tiers */
|
||||||
|
num = ceph_decode_32(p);
|
||||||
|
*p += num * 8;
|
||||||
|
|
||||||
|
*p += 8; /* skip tier_of */
|
||||||
|
*p += 1; /* skip cache_mode */
|
||||||
|
|
||||||
|
pi->read_tier = ceph_decode_64(p);
|
||||||
|
pi->write_tier = ceph_decode_64(p);
|
||||||
|
} else {
|
||||||
|
pi->read_tier = -1;
|
||||||
|
pi->write_tier = -1;
|
||||||
|
}
|
||||||
|
|
||||||
/* ignore the rest */
|
/* ignore the rest */
|
||||||
|
|
||||||
@ -1090,25 +1117,40 @@ invalid:
|
|||||||
EXPORT_SYMBOL(ceph_calc_file_object_mapping);
|
EXPORT_SYMBOL(ceph_calc_file_object_mapping);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* calculate an object layout (i.e. pgid) from an oid,
|
* Calculate mapping of a (oloc, oid) pair to a PG. Should only be
|
||||||
* file_layout, and osdmap
|
* called with target's (oloc, oid), since tiering isn't taken into
|
||||||
|
* account.
|
||||||
*/
|
*/
|
||||||
int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid,
|
int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
|
||||||
struct ceph_osdmap *osdmap, uint64_t pool)
|
struct ceph_object_locator *oloc,
|
||||||
|
struct ceph_object_id *oid,
|
||||||
|
struct ceph_pg *pg_out)
|
||||||
{
|
{
|
||||||
struct ceph_pg_pool_info *pool_info;
|
struct ceph_pg_pool_info *pi;
|
||||||
|
|
||||||
BUG_ON(!osdmap);
|
pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
|
||||||
pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool);
|
if (!pi)
|
||||||
if (!pool_info)
|
|
||||||
return -EIO;
|
return -EIO;
|
||||||
pg->pool = pool;
|
|
||||||
pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid));
|
|
||||||
|
|
||||||
dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed);
|
pg_out->pool = oloc->pool;
|
||||||
|
pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
|
||||||
|
oid->name_len);
|
||||||
|
|
||||||
|
dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
|
||||||
|
pg_out->pool, pg_out->seed);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(ceph_calc_ceph_pg);
|
EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
|
||||||
|
|
||||||
|
static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x,
|
||||||
|
int *result, int result_max,
|
||||||
|
const __u32 *weight, int weight_max)
|
||||||
|
{
|
||||||
|
int scratch[result_max * 3];
|
||||||
|
|
||||||
|
return crush_do_rule(map, ruleno, x, result, result_max,
|
||||||
|
weight, weight_max, scratch);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Calculate raw osd vector for the given pgid. Return pointer to osd
|
* Calculate raw osd vector for the given pgid. Return pointer to osd
|
||||||
@ -1163,9 +1205,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
|
|||||||
pool->pgp_num_mask) +
|
pool->pgp_num_mask) +
|
||||||
(unsigned)pgid.pool;
|
(unsigned)pgid.pool;
|
||||||
}
|
}
|
||||||
r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
|
r = crush_do_rule_ary(osdmap->crush, ruleno, pps,
|
||||||
min_t(int, pool->size, *num),
|
osds, min_t(int, pool->size, *num),
|
||||||
osdmap->osd_weight);
|
osdmap->osd_weight, osdmap->max_osd);
|
||||||
if (r < 0) {
|
if (r < 0) {
|
||||||
pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
|
pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
|
||||||
" size %d\n", r, pgid.pool, pool->crush_ruleset,
|
" size %d\n", r, pgid.pool, pool->crush_ruleset,
|
||||||
|
Loading…
Reference in New Issue
Block a user