linux/net/ceph/osdmap.c

2717 lines
62 KiB
C
Raw Normal View History

#include <linux/ceph/ceph_debug.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 08:04:11 +00:00
#include <linux/module.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 08:04:11 +00:00
#include <linux/slab.h>
#include <asm/div64.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/osdmap.h>
#include <linux/ceph/decode.h>
#include <linux/crush/hash.h>
#include <linux/crush/mapper.h>
char *ceph_osdmap_state_str(char *str, int len, u32 state)
{
if (!len)
return str;
if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
snprintf(str, len, "exists, up");
else if (state & CEPH_OSD_EXISTS)
snprintf(str, len, "exists");
else if (state & CEPH_OSD_UP)
snprintf(str, len, "up");
else
snprintf(str, len, "doesn't exist");
return str;
}
/* maps */
static int calc_bits_of(unsigned int t)
{
int b = 0;
while (t) {
t = t >> 1;
b++;
}
return b;
}
/*
* the foo_mask is the smallest value 2^n-1 that is >= foo.
*/
static void calc_pg_masks(struct ceph_pg_pool_info *pi)
{
pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
}
/*
* decode crush map
*/
static int crush_decode_uniform_bucket(void **p, void *end,
struct crush_bucket_uniform *b)
{
dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
b->item_weight = ceph_decode_32(p);
return 0;
bad:
return -EINVAL;
}
static int crush_decode_list_bucket(void **p, void *end,
struct crush_bucket_list *b)
{
int j;
dout("crush_decode_list_bucket %p to %p\n", *p, end);
b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
if (b->item_weights == NULL)
return -ENOMEM;
b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
if (b->sum_weights == NULL)
return -ENOMEM;
ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
for (j = 0; j < b->h.size; j++) {
b->item_weights[j] = ceph_decode_32(p);
b->sum_weights[j] = ceph_decode_32(p);
}
return 0;
bad:
return -EINVAL;
}
static int crush_decode_tree_bucket(void **p, void *end,
struct crush_bucket_tree *b)
{
int j;
dout("crush_decode_tree_bucket %p to %p\n", *p, end);
ceph_decode_8_safe(p, end, b->num_nodes, bad);
b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
if (b->node_weights == NULL)
return -ENOMEM;
ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
for (j = 0; j < b->num_nodes; j++)
b->node_weights[j] = ceph_decode_32(p);
return 0;
bad:
return -EINVAL;
}
static int crush_decode_straw_bucket(void **p, void *end,
struct crush_bucket_straw *b)
{
int j;
dout("crush_decode_straw_bucket %p to %p\n", *p, end);
b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
if (b->item_weights == NULL)
return -ENOMEM;
b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
if (b->straws == NULL)
return -ENOMEM;
ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
for (j = 0; j < b->h.size; j++) {
b->item_weights[j] = ceph_decode_32(p);
b->straws[j] = ceph_decode_32(p);
}
return 0;
bad:
return -EINVAL;
}
static int crush_decode_straw2_bucket(void **p, void *end,
struct crush_bucket_straw2 *b)
{
int j;
dout("crush_decode_straw2_bucket %p to %p\n", *p, end);
b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
if (b->item_weights == NULL)
return -ENOMEM;
ceph_decode_need(p, end, b->h.size * sizeof(u32), bad);
for (j = 0; j < b->h.size; j++)
b->item_weights[j] = ceph_decode_32(p);
return 0;
bad:
return -EINVAL;
}
static struct crush_choose_arg_map *alloc_choose_arg_map(void)
{
struct crush_choose_arg_map *arg_map;
arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO);
if (!arg_map)
return NULL;
RB_CLEAR_NODE(&arg_map->node);
return arg_map;
}
static void free_choose_arg_map(struct crush_choose_arg_map *arg_map)
{
if (arg_map) {
int i, j;
WARN_ON(!RB_EMPTY_NODE(&arg_map->node));
for (i = 0; i < arg_map->size; i++) {
struct crush_choose_arg *arg = &arg_map->args[i];
for (j = 0; j < arg->weight_set_size; j++)
kfree(arg->weight_set[j].weights);
kfree(arg->weight_set);
kfree(arg->ids);
}
kfree(arg_map->args);
kfree(arg_map);
}
}
DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index,
node);
void clear_choose_args(struct crush_map *c)
{
while (!RB_EMPTY_ROOT(&c->choose_args)) {
struct crush_choose_arg_map *arg_map =
rb_entry(rb_first(&c->choose_args),
struct crush_choose_arg_map, node);
erase_choose_arg_map(&c->choose_args, arg_map);
free_choose_arg_map(arg_map);
}
}
static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen)
{
u32 *a = NULL;
u32 len;
int ret;
ceph_decode_32_safe(p, end, len, e_inval);
if (len) {
u32 i;
a = kmalloc_array(len, sizeof(u32), GFP_NOIO);
if (!a) {
ret = -ENOMEM;
goto fail;
}
ceph_decode_need(p, end, len * sizeof(u32), e_inval);
for (i = 0; i < len; i++)
a[i] = ceph_decode_32(p);
}
*plen = len;
return a;
e_inval:
ret = -EINVAL;
fail:
kfree(a);
return ERR_PTR(ret);
}
/*
* Assumes @arg is zero-initialized.
*/
static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg)
{
int ret;
ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval);
if (arg->weight_set_size) {
u32 i;
arg->weight_set = kmalloc_array(arg->weight_set_size,
sizeof(*arg->weight_set),
GFP_NOIO);
if (!arg->weight_set)
return -ENOMEM;
for (i = 0; i < arg->weight_set_size; i++) {
struct crush_weight_set *w = &arg->weight_set[i];
w->weights = decode_array_32_alloc(p, end, &w->size);
if (IS_ERR(w->weights)) {
ret = PTR_ERR(w->weights);
w->weights = NULL;
return ret;
}
}
}
arg->ids = decode_array_32_alloc(p, end, &arg->ids_size);
if (IS_ERR(arg->ids)) {
ret = PTR_ERR(arg->ids);
arg->ids = NULL;
return ret;
}
return 0;
e_inval:
return -EINVAL;
}
static int decode_choose_args(void **p, void *end, struct crush_map *c)
{
struct crush_choose_arg_map *arg_map = NULL;
u32 num_choose_arg_maps, num_buckets;
int ret;
ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval);
while (num_choose_arg_maps--) {
arg_map = alloc_choose_arg_map();
if (!arg_map) {
ret = -ENOMEM;
goto fail;
}
ceph_decode_64_safe(p, end, arg_map->choose_args_index,
e_inval);
arg_map->size = c->max_buckets;
arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args),
GFP_NOIO);
if (!arg_map->args) {
ret = -ENOMEM;
goto fail;
}
ceph_decode_32_safe(p, end, num_buckets, e_inval);
while (num_buckets--) {
struct crush_choose_arg *arg;
u32 bucket_index;
ceph_decode_32_safe(p, end, bucket_index, e_inval);
if (bucket_index >= arg_map->size)
goto e_inval;
arg = &arg_map->args[bucket_index];
ret = decode_choose_arg(p, end, arg);
if (ret)
goto fail;
if (arg->ids_size &&
arg->ids_size != c->buckets[bucket_index]->size)
goto e_inval;
}
insert_choose_arg_map(&c->choose_args, arg_map);
}
return 0;
e_inval:
ret = -EINVAL;
fail:
free_choose_arg_map(arg_map);
return ret;
}
static void crush_finalize(struct crush_map *c)
{
__s32 b;
/* Space for the array of pointers to per-bucket workspace */
c->working_size = sizeof(struct crush_work) +
c->max_buckets * sizeof(struct crush_work_bucket *);
for (b = 0; b < c->max_buckets; b++) {
if (!c->buckets[b])
continue;
switch (c->buckets[b]->alg) {
default:
/*
* The base case, permutation variables and
* the pointer to the permutation array.
*/
c->working_size += sizeof(struct crush_work_bucket);
break;
}
/* Every bucket has a permutation array. */
c->working_size += c->buckets[b]->size * sizeof(__u32);
}
}
static struct crush_map *crush_decode(void *pbyval, void *end)
{
struct crush_map *c;
int err;
int i, j;
void **p = &pbyval;
void *start = pbyval;
u32 magic;
dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
c = kzalloc(sizeof(*c), GFP_NOFS);
if (c == NULL)
return ERR_PTR(-ENOMEM);
c->choose_args = RB_ROOT;
/* set tunables to default values */
c->choose_local_tries = 2;
c->choose_local_fallback_tries = 5;
c->choose_total_tries = 19;
libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed Add libceph support for a new CRUSH tunable recently added to Ceph servers. Consider the CRUSH rule step chooseleaf firstn 0 type <node_type> This rule means that <n> replicas will be chosen in a manner such that each chosen leaf's branch will contain a unique instance of <node_type>. When an object is re-replicated after a leaf failure, if the CRUSH map uses a chooseleaf rule the remapped replica ends up under the <node_type> bucket that held the failed leaf. This causes uneven data distribution across the storage cluster, to the point that when all the leaves but one fail under a particular <node_type> bucket, that remaining leaf holds all the data from its failed peers. This behavior also limits the number of peers that can participate in the re-replication of the data held by the failed leaf, which increases the time required to re-replicate after a failure. For a chooseleaf CRUSH rule, the tree descent has two steps: call them the inner and outer descents. If the tree descent down to <node_type> is the outer descent, and the descent from <node_type> down to a leaf is the inner descent, the issue is that a down leaf is detected on the inner descent, so only the inner descent is retried. In order to disperse re-replicated data as widely as possible across a storage cluster after a failure, we want to retry the outer descent. So, fix up crush_choose() to allow the inner descent to return immediately on choosing a failed leaf. Wire this up as a new CRUSH tunable. Note that after this change, for a chooseleaf rule, if the primary OSD in a placement group has failed, choosing a replacement may result in one of the other OSDs in the PG colliding with the new primary. This requires that OSD's data for that PG to need moving as well. This seems unavoidable but should be relatively rare. This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc. Signed-off-by: Jim Schutt <jaschut@sandia.gov> Reviewed-by: Sage Weil <sage@inktank.com>
2012-11-30 16:15:25 +00:00
c->chooseleaf_descend_once = 0;
ceph_decode_need(p, end, 4*sizeof(u32), bad);
magic = ceph_decode_32(p);
if (magic != CRUSH_MAGIC) {
pr_err("crush_decode magic %x != current %x\n",
(unsigned int)magic, (unsigned int)CRUSH_MAGIC);
goto bad;
}
c->max_buckets = ceph_decode_32(p);
c->max_rules = ceph_decode_32(p);
c->max_devices = ceph_decode_32(p);
c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
if (c->buckets == NULL)
goto badmem;
c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
if (c->rules == NULL)
goto badmem;
/* buckets */
for (i = 0; i < c->max_buckets; i++) {
int size = 0;
u32 alg;
struct crush_bucket *b;
ceph_decode_32_safe(p, end, alg, bad);
if (alg == 0) {
c->buckets[i] = NULL;
continue;
}
dout("crush_decode bucket %d off %x %p to %p\n",
i, (int)(*p-start), *p, end);
switch (alg) {
case CRUSH_BUCKET_UNIFORM:
size = sizeof(struct crush_bucket_uniform);
break;
case CRUSH_BUCKET_LIST:
size = sizeof(struct crush_bucket_list);
break;
case CRUSH_BUCKET_TREE:
size = sizeof(struct crush_bucket_tree);
break;
case CRUSH_BUCKET_STRAW:
size = sizeof(struct crush_bucket_straw);
break;
case CRUSH_BUCKET_STRAW2:
size = sizeof(struct crush_bucket_straw2);
break;
default:
goto bad;
}
BUG_ON(size == 0);
b = c->buckets[i] = kzalloc(size, GFP_NOFS);
if (b == NULL)
goto badmem;
ceph_decode_need(p, end, 4*sizeof(u32), bad);
b->id = ceph_decode_32(p);
b->type = ceph_decode_16(p);
b->alg = ceph_decode_8(p);
b->hash = ceph_decode_8(p);
b->weight = ceph_decode_32(p);
b->size = ceph_decode_32(p);
dout("crush_decode bucket size %d off %x %p to %p\n",
b->size, (int)(*p-start), *p, end);
b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
if (b->items == NULL)
goto badmem;
ceph_decode_need(p, end, b->size*sizeof(u32), bad);
for (j = 0; j < b->size; j++)
b->items[j] = ceph_decode_32(p);
switch (b->alg) {
case CRUSH_BUCKET_UNIFORM:
err = crush_decode_uniform_bucket(p, end,
(struct crush_bucket_uniform *)b);
if (err < 0)
goto fail;
break;
case CRUSH_BUCKET_LIST:
err = crush_decode_list_bucket(p, end,
(struct crush_bucket_list *)b);
if (err < 0)
goto fail;
break;
case CRUSH_BUCKET_TREE:
err = crush_decode_tree_bucket(p, end,
(struct crush_bucket_tree *)b);
if (err < 0)
goto fail;
break;
case CRUSH_BUCKET_STRAW:
err = crush_decode_straw_bucket(p, end,
(struct crush_bucket_straw *)b);
if (err < 0)
goto fail;
break;
case CRUSH_BUCKET_STRAW2:
err = crush_decode_straw2_bucket(p, end,
(struct crush_bucket_straw2 *)b);
if (err < 0)
goto fail;
break;
}
}
/* rules */
dout("rule vec is %p\n", c->rules);
for (i = 0; i < c->max_rules; i++) {
u32 yes;
struct crush_rule *r;
ceph_decode_32_safe(p, end, yes, bad);
if (!yes) {
dout("crush_decode NO rule %d off %x %p to %p\n",
i, (int)(*p-start), *p, end);
c->rules[i] = NULL;
continue;
}
dout("crush_decode rule %d off %x %p to %p\n",
i, (int)(*p-start), *p, end);
/* len */
ceph_decode_32_safe(p, end, yes, bad);
#if BITS_PER_LONG == 32
if (yes > (ULONG_MAX - sizeof(*r))
/ sizeof(struct crush_rule_step))
goto bad;
#endif
r = c->rules[i] = kmalloc(sizeof(*r) +
yes*sizeof(struct crush_rule_step),
GFP_NOFS);
if (r == NULL)
goto badmem;
dout(" rule %d is at %p\n", i, r);
r->len = yes;
ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
for (j = 0; j < r->len; j++) {
r->steps[j].op = ceph_decode_32(p);
r->steps[j].arg1 = ceph_decode_32(p);
r->steps[j].arg2 = ceph_decode_32(p);
}
}
ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */
ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */
ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
/* tunables */
ceph_decode_need(p, end, 3*sizeof(u32), done);
c->choose_local_tries = ceph_decode_32(p);
c->choose_local_fallback_tries = ceph_decode_32(p);
c->choose_total_tries = ceph_decode_32(p);
dout("crush decode tunable choose_local_tries = %d\n",
c->choose_local_tries);
dout("crush decode tunable choose_local_fallback_tries = %d\n",
c->choose_local_fallback_tries);
dout("crush decode tunable choose_total_tries = %d\n",
c->choose_total_tries);
libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed Add libceph support for a new CRUSH tunable recently added to Ceph servers. Consider the CRUSH rule step chooseleaf firstn 0 type <node_type> This rule means that <n> replicas will be chosen in a manner such that each chosen leaf's branch will contain a unique instance of <node_type>. When an object is re-replicated after a leaf failure, if the CRUSH map uses a chooseleaf rule the remapped replica ends up under the <node_type> bucket that held the failed leaf. This causes uneven data distribution across the storage cluster, to the point that when all the leaves but one fail under a particular <node_type> bucket, that remaining leaf holds all the data from its failed peers. This behavior also limits the number of peers that can participate in the re-replication of the data held by the failed leaf, which increases the time required to re-replicate after a failure. For a chooseleaf CRUSH rule, the tree descent has two steps: call them the inner and outer descents. If the tree descent down to <node_type> is the outer descent, and the descent from <node_type> down to a leaf is the inner descent, the issue is that a down leaf is detected on the inner descent, so only the inner descent is retried. In order to disperse re-replicated data as widely as possible across a storage cluster after a failure, we want to retry the outer descent. So, fix up crush_choose() to allow the inner descent to return immediately on choosing a failed leaf. Wire this up as a new CRUSH tunable. Note that after this change, for a chooseleaf rule, if the primary OSD in a placement group has failed, choosing a replacement may result in one of the other OSDs in the PG colliding with the new primary. This requires that OSD's data for that PG to need moving as well. This seems unavoidable but should be relatively rare. This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc. Signed-off-by: Jim Schutt <jaschut@sandia.gov> Reviewed-by: Sage Weil <sage@inktank.com>
2012-11-30 16:15:25 +00:00
ceph_decode_need(p, end, sizeof(u32), done);
c->chooseleaf_descend_once = ceph_decode_32(p);
dout("crush decode tunable chooseleaf_descend_once = %d\n",
libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed Add libceph support for a new CRUSH tunable recently added to Ceph servers. Consider the CRUSH rule step chooseleaf firstn 0 type <node_type> This rule means that <n> replicas will be chosen in a manner such that each chosen leaf's branch will contain a unique instance of <node_type>. When an object is re-replicated after a leaf failure, if the CRUSH map uses a chooseleaf rule the remapped replica ends up under the <node_type> bucket that held the failed leaf. This causes uneven data distribution across the storage cluster, to the point that when all the leaves but one fail under a particular <node_type> bucket, that remaining leaf holds all the data from its failed peers. This behavior also limits the number of peers that can participate in the re-replication of the data held by the failed leaf, which increases the time required to re-replicate after a failure. For a chooseleaf CRUSH rule, the tree descent has two steps: call them the inner and outer descents. If the tree descent down to <node_type> is the outer descent, and the descent from <node_type> down to a leaf is the inner descent, the issue is that a down leaf is detected on the inner descent, so only the inner descent is retried. In order to disperse re-replicated data as widely as possible across a storage cluster after a failure, we want to retry the outer descent. So, fix up crush_choose() to allow the inner descent to return immediately on choosing a failed leaf. Wire this up as a new CRUSH tunable. Note that after this change, for a chooseleaf rule, if the primary OSD in a placement group has failed, choosing a replacement may result in one of the other OSDs in the PG colliding with the new primary. This requires that OSD's data for that PG to need moving as well. This seems unavoidable but should be relatively rare. This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc. Signed-off-by: Jim Schutt <jaschut@sandia.gov> Reviewed-by: Sage Weil <sage@inktank.com>
2012-11-30 16:15:25 +00:00
c->chooseleaf_descend_once);
ceph_decode_need(p, end, sizeof(u8), done);
c->chooseleaf_vary_r = ceph_decode_8(p);
dout("crush decode tunable chooseleaf_vary_r = %d\n",
c->chooseleaf_vary_r);
/* skip straw_calc_version, allowed_bucket_algs */
ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
*p += sizeof(u8) + sizeof(u32);
ceph_decode_need(p, end, sizeof(u8), done);
c->chooseleaf_stable = ceph_decode_8(p);
dout("crush decode tunable chooseleaf_stable = %d\n",
c->chooseleaf_stable);
if (*p != end) {
/* class_map */
ceph_decode_skip_map(p, end, 32, 32, bad);
/* class_name */
ceph_decode_skip_map(p, end, 32, string, bad);
/* class_bucket */
ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad);
}
if (*p != end) {
err = decode_choose_args(p, end, c);
if (err)
goto fail;
}
done:
crush_finalize(c);
dout("crush_decode success\n");
return c;
badmem:
err = -ENOMEM;
fail:
dout("crush_decode fail %d\n", err);
crush_destroy(c);
return ERR_PTR(err);
bad:
err = -EINVAL;
goto fail;
}
int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
{
if (lhs->pool < rhs->pool)
return -1;
if (lhs->pool > rhs->pool)
return 1;
if (lhs->seed < rhs->seed)
return -1;
if (lhs->seed > rhs->seed)
return 1;
return 0;
}
int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs)
{
int ret;
ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid);
if (ret)
return ret;
if (lhs->shard < rhs->shard)
return -1;
if (lhs->shard > rhs->shard)
return 1;
return 0;
}
static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len)
{
struct ceph_pg_mapping *pg;
pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO);
if (!pg)
return NULL;
RB_CLEAR_NODE(&pg->node);
return pg;
}
static void free_pg_mapping(struct ceph_pg_mapping *pg)
{
WARN_ON(!RB_EMPTY_NODE(&pg->node));
kfree(pg);
}
/*
* rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
* to a set of osds) and primary_temp (explicit primary setting)
*/
DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
RB_BYPTR, const struct ceph_pg *, node)
/*
* rbtree of pg pool info
*/
static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct ceph_pg_pool_info *pi = NULL;
while (*p) {
parent = *p;
pi = rb_entry(parent, struct ceph_pg_pool_info, node);
if (new->id < pi->id)
p = &(*p)->rb_left;
else if (new->id > pi->id)
p = &(*p)->rb_right;
else
return -EEXIST;
}
rb_link_node(&new->node, parent, p);
rb_insert_color(&new->node, root);
return 0;
}
static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
{
struct ceph_pg_pool_info *pi;
struct rb_node *n = root->rb_node;
while (n) {
pi = rb_entry(n, struct ceph_pg_pool_info, node);
if (id < pi->id)
n = n->rb_left;
else if (id > pi->id)
n = n->rb_right;
else
return pi;
}
return NULL;
}
struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
{
return __lookup_pg_pool(&map->pg_pools, id);
}
const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
{
struct ceph_pg_pool_info *pi;
if (id == CEPH_NOPOOL)
return NULL;
if (WARN_ON_ONCE(id > (u64) INT_MAX))
return NULL;
pi = __lookup_pg_pool(&map->pg_pools, (int) id);
return pi ? pi->name : NULL;
}
EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
{
struct rb_node *rbp;
for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
struct ceph_pg_pool_info *pi =
rb_entry(rbp, struct ceph_pg_pool_info, node);
if (pi->name && strcmp(pi->name, name) == 0)
return pi->id;
}
return -ENOENT;
}
EXPORT_SYMBOL(ceph_pg_poolid_by_name);
static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
{
rb_erase(&pi->node, root);
kfree(pi->name);
kfree(pi);
}
static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
{
u8 ev, cv;
unsigned len, num;
void *pool_end;
ceph_decode_need(p, end, 2 + 4, bad);
ev = ceph_decode_8(p); /* encoding version */
cv = ceph_decode_8(p); /* compat version */
if (ev < 5) {
pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
return -EINVAL;
}
if (cv > 9) {
pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
return -EINVAL;
}
len = ceph_decode_32(p);
ceph_decode_need(p, end, len, bad);
pool_end = *p + len;
pi->type = ceph_decode_8(p);
pi->size = ceph_decode_8(p);
pi->crush_ruleset = ceph_decode_8(p);
pi->object_hash = ceph_decode_8(p);
pi->pg_num = ceph_decode_32(p);
pi->pgp_num = ceph_decode_32(p);
*p += 4 + 4; /* skip lpg* */
*p += 4; /* skip last_change */
*p += 8 + 4; /* skip snap_seq, snap_epoch */
/* skip snaps */
num = ceph_decode_32(p);
while (num--) {
*p += 8; /* snapid key */
*p += 1 + 1; /* versions */
len = ceph_decode_32(p);
*p += len;
}
/* skip removed_snaps */
num = ceph_decode_32(p);
*p += num * (8 + 8);
*p += 8; /* skip auid */
pi->flags = ceph_decode_64(p);
*p += 4; /* skip crash_replay_interval */
if (ev >= 7)
pi->min_size = ceph_decode_8(p);
else
pi->min_size = pi->size - pi->size / 2;
if (ev >= 8)
*p += 8 + 8; /* skip quota_max_* */
if (ev >= 9) {
/* skip tiers */
num = ceph_decode_32(p);
*p += num * 8;
*p += 8; /* skip tier_of */
*p += 1; /* skip cache_mode */
pi->read_tier = ceph_decode_64(p);
pi->write_tier = ceph_decode_64(p);
} else {
pi->read_tier = -1;
pi->write_tier = -1;
}
if (ev >= 10) {
/* skip properties */
num = ceph_decode_32(p);
while (num--) {
len = ceph_decode_32(p);
*p += len; /* key */
len = ceph_decode_32(p);
*p += len; /* val */
}
}
if (ev >= 11) {
/* skip hit_set_params */
*p += 1 + 1; /* versions */
len = ceph_decode_32(p);
*p += len;
*p += 4; /* skip hit_set_period */
*p += 4; /* skip hit_set_count */
}
if (ev >= 12)
*p += 4; /* skip stripe_width */
if (ev >= 13) {
*p += 8; /* skip target_max_bytes */
*p += 8; /* skip target_max_objects */
*p += 4; /* skip cache_target_dirty_ratio_micro */
*p += 4; /* skip cache_target_full_ratio_micro */
*p += 4; /* skip cache_min_flush_age */
*p += 4; /* skip cache_min_evict_age */
}
if (ev >= 14) {
/* skip erasure_code_profile */
len = ceph_decode_32(p);
*p += len;
}
/*
* last_force_op_resend_preluminous, will be overridden if the
* map was encoded with RESEND_ON_SPLIT
*/
if (ev >= 15)
pi->last_force_request_resend = ceph_decode_32(p);
else
pi->last_force_request_resend = 0;
if (ev >= 16)
*p += 4; /* skip min_read_recency_for_promote */
if (ev >= 17)
*p += 8; /* skip expected_num_objects */
if (ev >= 19)
*p += 4; /* skip cache_target_dirty_high_ratio_micro */
if (ev >= 20)
*p += 4; /* skip min_write_recency_for_promote */
if (ev >= 21)
*p += 1; /* skip use_gmt_hitset */
if (ev >= 22)
*p += 1; /* skip fast_read */
if (ev >= 23) {
*p += 4; /* skip hit_set_grade_decay_rate */
*p += 4; /* skip hit_set_search_last_n */
}
if (ev >= 24) {
/* skip opts */
*p += 1 + 1; /* versions */
len = ceph_decode_32(p);
*p += len;
}
if (ev >= 25)
pi->last_force_request_resend = ceph_decode_32(p);
/* ignore the rest */
*p = pool_end;
calc_pg_masks(pi);
return 0;
bad:
return -EINVAL;
}
static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
{
struct ceph_pg_pool_info *pi;
u32 num, len;
u64 pool;
ceph_decode_32_safe(p, end, num, bad);
dout(" %d pool names\n", num);
while (num--) {
ceph_decode_64_safe(p, end, pool, bad);
ceph_decode_32_safe(p, end, len, bad);
dout(" pool %llu len %d\n", pool, len);
ceph_decode_need(p, end, len, bad);
pi = __lookup_pg_pool(&map->pg_pools, pool);
if (pi) {
char *name = kstrndup(*p, len, GFP_NOFS);
if (!name)
return -ENOMEM;
kfree(pi->name);
pi->name = name;
dout(" name is %s\n", pi->name);
}
*p += len;
}
return 0;
bad:
return -EINVAL;
}
/*
* osd map
*/
struct ceph_osdmap *ceph_osdmap_alloc(void)
{
struct ceph_osdmap *map;
map = kzalloc(sizeof(*map), GFP_NOIO);
if (!map)
return NULL;
map->pg_pools = RB_ROOT;
map->pool_max = -1;
map->pg_temp = RB_ROOT;
map->primary_temp = RB_ROOT;
map->pg_upmap = RB_ROOT;
map->pg_upmap_items = RB_ROOT;
mutex_init(&map->crush_workspace_mutex);
return map;
}
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
dout("osdmap_destroy %p\n", map);
if (map->crush)
crush_destroy(map->crush);
while (!RB_EMPTY_ROOT(&map->pg_temp)) {
struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->pg_temp),
struct ceph_pg_mapping, node);
erase_pg_mapping(&map->pg_temp, pg);
free_pg_mapping(pg);
}
while (!RB_EMPTY_ROOT(&map->primary_temp)) {
struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->primary_temp),
struct ceph_pg_mapping, node);
erase_pg_mapping(&map->primary_temp, pg);
free_pg_mapping(pg);
}
while (!RB_EMPTY_ROOT(&map->pg_upmap)) {
struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->pg_upmap),
struct ceph_pg_mapping, node);
rb_erase(&pg->node, &map->pg_upmap);
kfree(pg);
}
while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) {
struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->pg_upmap_items),
struct ceph_pg_mapping, node);
rb_erase(&pg->node, &map->pg_upmap_items);
kfree(pg);
}
while (!RB_EMPTY_ROOT(&map->pg_pools)) {
struct ceph_pg_pool_info *pi =
rb_entry(rb_first(&map->pg_pools),
struct ceph_pg_pool_info, node);
__remove_pg_pool(&map->pg_pools, pi);
}
kfree(map->osd_state);
kfree(map->osd_weight);
kfree(map->osd_addr);
kfree(map->osd_primary_affinity);
kfree(map->crush_workspace);
kfree(map);
}
/*
* Adjust max_osd value, (re)allocate arrays.
*
* The new elements are properly initialized.
*/
static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
{
u32 *state;
u32 *weight;
struct ceph_entity_addr *addr;
int i;
state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
if (!state)
return -ENOMEM;
map->osd_state = state;
weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
if (!weight)
return -ENOMEM;
map->osd_weight = weight;
addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
if (!addr)
return -ENOMEM;
map->osd_addr = addr;
for (i = map->max_osd; i < max; i++) {
map->osd_state[i] = 0;
map->osd_weight[i] = CEPH_OSD_OUT;
memset(map->osd_addr + i, 0, sizeof(*map->osd_addr));
}
if (map->osd_primary_affinity) {
u32 *affinity;
affinity = krealloc(map->osd_primary_affinity,
max*sizeof(*affinity), GFP_NOFS);
if (!affinity)
return -ENOMEM;
map->osd_primary_affinity = affinity;
for (i = map->max_osd; i < max; i++)
map->osd_primary_affinity[i] =
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
}
map->max_osd = max;
return 0;
}
static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
{
void *workspace;
size_t work_size;
if (IS_ERR(crush))
return PTR_ERR(crush);
work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
dout("%s work_size %zu bytes\n", __func__, work_size);
workspace = kmalloc(work_size, GFP_NOIO);
if (!workspace) {
crush_destroy(crush);
return -ENOMEM;
}
crush_init_workspace(crush, workspace);
if (map->crush)
crush_destroy(map->crush);
kfree(map->crush_workspace);
map->crush = crush;
map->crush_workspace = workspace;
return 0;
}
#define OSDMAP_WRAPPER_COMPAT_VER 7
#define OSDMAP_CLIENT_DATA_COMPAT_VER 1
/*
* Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps,
* to struct_v of the client_data section for new (v7 and above)
* osdmaps.
*/
static int get_osdmap_client_data_v(void **p, void *end,
const char *prefix, u8 *v)
{
u8 struct_v;
ceph_decode_8_safe(p, end, struct_v, e_inval);
if (struct_v >= 7) {
u8 struct_compat;
ceph_decode_8_safe(p, end, struct_compat, e_inval);
if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n",
struct_v, struct_compat,
OSDMAP_WRAPPER_COMPAT_VER, prefix);
return -EINVAL;
}
*p += 4; /* ignore wrapper struct_len */
ceph_decode_8_safe(p, end, struct_v, e_inval);
ceph_decode_8_safe(p, end, struct_compat, e_inval);
if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n",
struct_v, struct_compat,
OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
return -EINVAL;
}
*p += 4; /* ignore client data struct_len */
} else {
u16 version;
*p -= 1;
ceph_decode_16_safe(p, end, version, e_inval);
if (version < 6) {
pr_warn("got v %d < 6 of %s ceph_osdmap\n",
version, prefix);
return -EINVAL;
}
/* old osdmap enconding */
struct_v = 0;
}
*v = struct_v;
return 0;
e_inval:
return -EINVAL;
}
static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
bool incremental)
{
u32 n;
ceph_decode_32_safe(p, end, n, e_inval);
while (n--) {
struct ceph_pg_pool_info *pi;
u64 pool;
int ret;
ceph_decode_64_safe(p, end, pool, e_inval);
pi = __lookup_pg_pool(&map->pg_pools, pool);
if (!incremental || !pi) {
pi = kzalloc(sizeof(*pi), GFP_NOFS);
if (!pi)
return -ENOMEM;
pi->id = pool;
ret = __insert_pg_pool(&map->pg_pools, pi);
if (ret) {
kfree(pi);
return ret;
}
}
ret = decode_pool(p, end, pi);
if (ret)
return ret;
}
return 0;
e_inval:
return -EINVAL;
}
static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
{
return __decode_pools(p, end, map, false);
}
static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
{
return __decode_pools(p, end, map, true);
}
typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool);
static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root,
decode_mapping_fn_t fn, bool incremental)
{
u32 n;
WARN_ON(!incremental && !fn);
ceph_decode_32_safe(p, end, n, e_inval);
while (n--) {
struct ceph_pg_mapping *pg;
struct ceph_pg pgid;
int ret;
ret = ceph_decode_pgid(p, end, &pgid);
if (ret)
return ret;
pg = lookup_pg_mapping(mapping_root, &pgid);
if (pg) {
WARN_ON(!incremental);
erase_pg_mapping(mapping_root, pg);
free_pg_mapping(pg);
}
if (fn) {
pg = fn(p, end, incremental);
if (IS_ERR(pg))
return PTR_ERR(pg);
if (pg) {
pg->pgid = pgid; /* struct */
insert_pg_mapping(mapping_root, pg);
}
}
}
return 0;
e_inval:
return -EINVAL;
}
static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end,
bool incremental)
{
struct ceph_pg_mapping *pg;
u32 len, i;
ceph_decode_32_safe(p, end, len, e_inval);
if (len == 0 && incremental)
return NULL; /* new_pg_temp: [] to remove */
if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32))
return ERR_PTR(-EINVAL);
ceph_decode_need(p, end, len * sizeof(u32), e_inval);
pg = alloc_pg_mapping(len * sizeof(u32));
if (!pg)
return ERR_PTR(-ENOMEM);
pg->pg_temp.len = len;
for (i = 0; i < len; i++)
pg->pg_temp.osds[i] = ceph_decode_32(p);
return pg;
e_inval:
return ERR_PTR(-EINVAL);
}
static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
false);
}
static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
true);
}
static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end,
bool incremental)
{
struct ceph_pg_mapping *pg;
u32 osd;
ceph_decode_32_safe(p, end, osd, e_inval);
if (osd == (u32)-1 && incremental)
return NULL; /* new_primary_temp: -1 to remove */
pg = alloc_pg_mapping(0);
if (!pg)
return ERR_PTR(-ENOMEM);
pg->primary_temp.osd = osd;
return pg;
e_inval:
return ERR_PTR(-EINVAL);
}
static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->primary_temp,
__decode_primary_temp, false);
}
static int decode_new_primary_temp(void **p, void *end,
struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->primary_temp,
__decode_primary_temp, true);
}
u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
{
BUG_ON(osd >= map->max_osd);
if (!map->osd_primary_affinity)
return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
return map->osd_primary_affinity[osd];
}
static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
{
BUG_ON(osd >= map->max_osd);
if (!map->osd_primary_affinity) {
int i;
map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32),
GFP_NOFS);
if (!map->osd_primary_affinity)
return -ENOMEM;
for (i = 0; i < map->max_osd; i++)
map->osd_primary_affinity[i] =
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
}
map->osd_primary_affinity[osd] = aff;
return 0;
}
static int decode_primary_affinity(void **p, void *end,
struct ceph_osdmap *map)
{
u32 len, i;
ceph_decode_32_safe(p, end, len, e_inval);
if (len == 0) {
kfree(map->osd_primary_affinity);
map->osd_primary_affinity = NULL;
return 0;
}
if (len != map->max_osd)
goto e_inval;
ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
for (i = 0; i < map->max_osd; i++) {
int ret;
ret = set_primary_affinity(map, i, ceph_decode_32(p));
if (ret)
return ret;
}
return 0;
e_inval:
return -EINVAL;
}
static int decode_new_primary_affinity(void **p, void *end,
struct ceph_osdmap *map)
{
u32 n;
ceph_decode_32_safe(p, end, n, e_inval);
while (n--) {
u32 osd, aff;
int ret;
ceph_decode_32_safe(p, end, osd, e_inval);
ceph_decode_32_safe(p, end, aff, e_inval);
ret = set_primary_affinity(map, osd, aff);
if (ret)
return ret;
pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
}
return 0;
e_inval:
return -EINVAL;
}
static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end,
bool __unused)
{
return __decode_pg_temp(p, end, false);
}
static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
false);
}
static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
true);
}
static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true);
}
static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end,
bool __unused)
{
struct ceph_pg_mapping *pg;
u32 len, i;
ceph_decode_32_safe(p, end, len, e_inval);
if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32)))
return ERR_PTR(-EINVAL);
ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval);
pg = alloc_pg_mapping(2 * len * sizeof(u32));
if (!pg)
return ERR_PTR(-ENOMEM);
pg->pg_upmap_items.len = len;
for (i = 0; i < len; i++) {
pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p);
pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p);
}
return pg;
e_inval:
return ERR_PTR(-EINVAL);
}
static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap_items,
__decode_pg_upmap_items, false);
}
static int decode_new_pg_upmap_items(void **p, void *end,
struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap_items,
__decode_pg_upmap_items, true);
}
static int decode_old_pg_upmap_items(void **p, void *end,
struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true);
}
/*
* decode a full map.
*/
static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
{
u8 struct_v;
u32 epoch = 0;
void *start = *p;
u32 max;
u32 len, i;
int err;
dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
err = get_osdmap_client_data_v(p, end, "full", &struct_v);
if (err)
goto bad;
/* fsid, epoch, created, modified */
ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
sizeof(map->created) + sizeof(map->modified), e_inval);
ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
epoch = map->epoch = ceph_decode_32(p);
ceph_decode_copy(p, &map->created, sizeof(map->created));
ceph_decode_copy(p, &map->modified, sizeof(map->modified));
/* pools */
err = decode_pools(p, end, map);
if (err)
goto bad;
/* pool_name */
err = decode_pool_names(p, end, map);
if (err)
goto bad;
ceph_decode_32_safe(p, end, map->pool_max, e_inval);
ceph_decode_32_safe(p, end, map->flags, e_inval);
/* max_osd */
ceph_decode_32_safe(p, end, max, e_inval);
/* (re)alloc osd arrays */
err = osdmap_set_max_osd(map, max);
if (err)
goto bad;
/* osd_state, osd_weight, osd_addrs->client_addr */
ceph_decode_need(p, end, 3*sizeof(u32) +
map->max_osd*((struct_v >= 5 ? sizeof(u32) :
sizeof(u8)) +
sizeof(*map->osd_weight) +
sizeof(*map->osd_addr)), e_inval);
if (ceph_decode_32(p) != map->max_osd)
goto e_inval;
if (struct_v >= 5) {
for (i = 0; i < map->max_osd; i++)
map->osd_state[i] = ceph_decode_32(p);
} else {
for (i = 0; i < map->max_osd; i++)
map->osd_state[i] = ceph_decode_8(p);
}
if (ceph_decode_32(p) != map->max_osd)
goto e_inval;
for (i = 0; i < map->max_osd; i++)
map->osd_weight[i] = ceph_decode_32(p);
if (ceph_decode_32(p) != map->max_osd)
goto e_inval;
ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
for (i = 0; i < map->max_osd; i++)
ceph_decode_addr(&map->osd_addr[i]);
/* pg_temp */
err = decode_pg_temp(p, end, map);
if (err)
goto bad;
/* primary_temp */
if (struct_v >= 1) {
err = decode_primary_temp(p, end, map);
if (err)
goto bad;
}
/* primary_affinity */
if (struct_v >= 2) {
err = decode_primary_affinity(p, end, map);
if (err)
goto bad;
} else {
WARN_ON(map->osd_primary_affinity);
}
/* crush */
ceph_decode_32_safe(p, end, len, e_inval);
err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
if (err)
goto bad;
*p += len;
if (struct_v >= 3) {
/* erasure_code_profiles */
ceph_decode_skip_map_of_map(p, end, string, string, string,
e_inval);
}
if (struct_v >= 4) {
err = decode_pg_upmap(p, end, map);
if (err)
goto bad;
err = decode_pg_upmap_items(p, end, map);
if (err)
goto bad;
} else {
WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap));
WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items));
}
/* ignore the rest */
*p = end;
dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
return 0;
e_inval:
err = -EINVAL;
bad:
pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
err, epoch, (int)(*p - start), *p, start, end);
print_hex_dump(KERN_DEBUG, "osdmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
return err;
}
/*
* Allocate and decode a full map.
*/
struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
{
struct ceph_osdmap *map;
int ret;
map = ceph_osdmap_alloc();
if (!map)
return ERR_PTR(-ENOMEM);
ret = osdmap_decode(p, end, map);
if (ret) {
ceph_osdmap_destroy(map);
return ERR_PTR(ret);
}
return map;
}
libceph: apply new_state before new_up_client on incrementals Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Josh Durgin <jdurgin@redhat.com>
2016-07-19 01:50:28 +00:00
/*
* Encoding order is (new_up_client, new_state, new_weight). Need to
* apply in the (new_weight, new_state, new_up_client) order, because
* an incremental map may look like e.g.
*
* new_up_client: { osd=6, addr=... } # set osd_state and addr
* new_state: { osd=6, xorstate=EXISTS } # clear osd_state
*/
static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
libceph: apply new_state before new_up_client on incrementals Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Josh Durgin <jdurgin@redhat.com>
2016-07-19 01:50:28 +00:00
struct ceph_osdmap *map)
{
void *new_up_client;
void *new_state;
void *new_weight_end;
u32 len;
new_up_client = *p;
ceph_decode_32_safe(p, end, len, e_inval);
len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
ceph_decode_need(p, end, len, e_inval);
*p += len;
new_state = *p;
ceph_decode_32_safe(p, end, len, e_inval);
len *= sizeof(u32) + (struct_v >= 5 ? sizeof(u32) : sizeof(u8));
libceph: apply new_state before new_up_client on incrementals Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Josh Durgin <jdurgin@redhat.com>
2016-07-19 01:50:28 +00:00
ceph_decode_need(p, end, len, e_inval);
*p += len;
/* new_weight */
ceph_decode_32_safe(p, end, len, e_inval);
while (len--) {
s32 osd;
u32 w;
ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
osd = ceph_decode_32(p);
w = ceph_decode_32(p);
BUG_ON(osd >= map->max_osd);
pr_info("osd%d weight 0x%x %s\n", osd, w,
w == CEPH_OSD_IN ? "(in)" :
(w == CEPH_OSD_OUT ? "(out)" : ""));
map->osd_weight[osd] = w;
/*
* If we are marking in, set the EXISTS, and clear the
* AUTOOUT and NEW bits.
*/
if (w) {
map->osd_state[osd] |= CEPH_OSD_EXISTS;
map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
CEPH_OSD_NEW);
}
}
new_weight_end = *p;
/* new_state (up/down) */
*p = new_state;
len = ceph_decode_32(p);
while (len--) {
s32 osd;
u32 xorstate;
libceph: apply new_state before new_up_client on incrementals Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Josh Durgin <jdurgin@redhat.com>
2016-07-19 01:50:28 +00:00
int ret;
osd = ceph_decode_32(p);
if (struct_v >= 5)
xorstate = ceph_decode_32(p);
else
xorstate = ceph_decode_8(p);
libceph: apply new_state before new_up_client on incrementals Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Josh Durgin <jdurgin@redhat.com>
2016-07-19 01:50:28 +00:00
if (xorstate == 0)
xorstate = CEPH_OSD_UP;
BUG_ON(osd >= map->max_osd);
if ((map->osd_state[osd] & CEPH_OSD_UP) &&
(xorstate & CEPH_OSD_UP))
pr_info("osd%d down\n", osd);
if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
(xorstate & CEPH_OSD_EXISTS)) {
pr_info("osd%d does not exist\n", osd);
ret = set_primary_affinity(map, osd,
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
if (ret)
return ret;
memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
map->osd_state[osd] = 0;
} else {
map->osd_state[osd] ^= xorstate;
}
}
/* new_up_client */
*p = new_up_client;
len = ceph_decode_32(p);
while (len--) {
s32 osd;
struct ceph_entity_addr addr;
osd = ceph_decode_32(p);
ceph_decode_copy(p, &addr, sizeof(addr));
ceph_decode_addr(&addr);
BUG_ON(osd >= map->max_osd);
pr_info("osd%d up\n", osd);
map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
map->osd_addr[osd] = addr;
}
*p = new_weight_end;
return 0;
e_inval:
return -EINVAL;
}
/*
* decode and apply an incremental map update.
*/
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_osdmap *map)
{
struct ceph_fsid fsid;
u32 epoch = 0;
struct ceph_timespec modified;
s32 len;
u64 pool;
__s64 new_pool_max;
__s32 new_flags, max;
void *start = *p;
int err;
u8 struct_v;
dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
if (err)
goto bad;
/* fsid, epoch, modified, new_pool_max, new_flags */
ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
sizeof(u64) + sizeof(u32), e_inval);
ceph_decode_copy(p, &fsid, sizeof(fsid));
epoch = ceph_decode_32(p);
BUG_ON(epoch != map->epoch+1);
ceph_decode_copy(p, &modified, sizeof(modified));
new_pool_max = ceph_decode_64(p);
new_flags = ceph_decode_32(p);
/* full map? */
ceph_decode_32_safe(p, end, len, e_inval);
if (len > 0) {
dout("apply_incremental full map len %d, %p to %p\n",
len, *p, end);
return ceph_osdmap_decode(p, min(*p+len, end));
}
/* new crush? */
ceph_decode_32_safe(p, end, len, e_inval);
if (len > 0) {
err = osdmap_set_crush(map,
crush_decode(*p, min(*p + len, end)));
if (err)
goto bad;
*p += len;
}
/* new flags? */
if (new_flags >= 0)
map->flags = new_flags;
if (new_pool_max >= 0)
map->pool_max = new_pool_max;
/* new max? */
ceph_decode_32_safe(p, end, max, e_inval);
if (max >= 0) {
err = osdmap_set_max_osd(map, max);
if (err)
goto bad;
}
map->epoch++;
map->modified = modified;
/* new_pools */
err = decode_new_pools(p, end, map);
if (err)
goto bad;
/* new_pool_names */
err = decode_pool_names(p, end, map);
if (err)
goto bad;
/* old_pool */
ceph_decode_32_safe(p, end, len, e_inval);
while (len--) {
struct ceph_pg_pool_info *pi;
ceph_decode_64_safe(p, end, pool, e_inval);
pi = __lookup_pg_pool(&map->pg_pools, pool);
if (pi)
__remove_pg_pool(&map->pg_pools, pi);
}
libceph: apply new_state before new_up_client on incrementals Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Josh Durgin <jdurgin@redhat.com>
2016-07-19 01:50:28 +00:00
/* new_up_client, new_state, new_weight */
err = decode_new_up_state_weight(p, end, struct_v, map);
libceph: apply new_state before new_up_client on incrementals Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Josh Durgin <jdurgin@redhat.com>
2016-07-19 01:50:28 +00:00
if (err)
goto bad;
/* new_pg_temp */
err = decode_new_pg_temp(p, end, map);
if (err)
goto bad;
/* new_primary_temp */
if (struct_v >= 1) {
err = decode_new_primary_temp(p, end, map);
if (err)
goto bad;
}
/* new_primary_affinity */
if (struct_v >= 2) {
err = decode_new_primary_affinity(p, end, map);
if (err)
goto bad;
}
if (struct_v >= 3) {
/* new_erasure_code_profiles */
ceph_decode_skip_map_of_map(p, end, string, string, string,
e_inval);
/* old_erasure_code_profiles */
ceph_decode_skip_set(p, end, string, e_inval);
}
if (struct_v >= 4) {
err = decode_new_pg_upmap(p, end, map);
if (err)
goto bad;
err = decode_old_pg_upmap(p, end, map);
if (err)
goto bad;
err = decode_new_pg_upmap_items(p, end, map);
if (err)
goto bad;
err = decode_old_pg_upmap_items(p, end, map);
if (err)
goto bad;
}
/* ignore the rest */
*p = end;
dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
return map;
e_inval:
err = -EINVAL;
bad:
pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
err, epoch, (int)(*p - start), *p, start, end);
print_hex_dump(KERN_DEBUG, "osdmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
return ERR_PTR(err);
}
void ceph_oloc_copy(struct ceph_object_locator *dest,
const struct ceph_object_locator *src)
{
ceph_oloc_destroy(dest);
dest->pool = src->pool;
if (src->pool_ns)
dest->pool_ns = ceph_get_string(src->pool_ns);
else
dest->pool_ns = NULL;
}
EXPORT_SYMBOL(ceph_oloc_copy);
void ceph_oloc_destroy(struct ceph_object_locator *oloc)
{
ceph_put_string(oloc->pool_ns);
}
EXPORT_SYMBOL(ceph_oloc_destroy);
void ceph_oid_copy(struct ceph_object_id *dest,
const struct ceph_object_id *src)
{
ceph_oid_destroy(dest);
if (src->name != src->inline_name) {
/* very rare, see ceph_object_id definition */
dest->name = kmalloc(src->name_len + 1,
GFP_NOIO | __GFP_NOFAIL);
} else {
dest->name = dest->inline_name;
}
memcpy(dest->name, src->name, src->name_len + 1);
dest->name_len = src->name_len;
}
EXPORT_SYMBOL(ceph_oid_copy);
static __printf(2, 0)
int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
{
int len;
WARN_ON(!ceph_oid_empty(oid));
len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
if (len >= sizeof(oid->inline_name))
return len;
oid->name_len = len;
return 0;
}
/*
* If oid doesn't fit into inline buffer, BUG.
*/
void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
BUG_ON(oid_printf_vargs(oid, fmt, ap));
va_end(ap);
}
EXPORT_SYMBOL(ceph_oid_printf);
static __printf(3, 0)
int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
const char *fmt, va_list ap)
{
va_list aq;
int len;
va_copy(aq, ap);
len = oid_printf_vargs(oid, fmt, aq);
va_end(aq);
if (len) {
char *external_name;
external_name = kmalloc(len + 1, gfp);
if (!external_name)
return -ENOMEM;
oid->name = external_name;
WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
oid->name_len = len;
}
return 0;
}
/*
* If oid doesn't fit into inline buffer, allocate.
*/
int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
const char *fmt, ...)
{
va_list ap;
int ret;
va_start(ap, fmt);
ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
va_end(ap);
return ret;
}
EXPORT_SYMBOL(ceph_oid_aprintf);
void ceph_oid_destroy(struct ceph_object_id *oid)
{
if (oid->name != oid->inline_name)
kfree(oid->name);
}
EXPORT_SYMBOL(ceph_oid_destroy);
/*
* osds only
*/
static bool __osds_equal(const struct ceph_osds *lhs,
const struct ceph_osds *rhs)
{
if (lhs->size == rhs->size &&
!memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
return true;
return false;
}
/*
* osds + primary
*/
static bool osds_equal(const struct ceph_osds *lhs,
const struct ceph_osds *rhs)
{
if (__osds_equal(lhs, rhs) &&
lhs->primary == rhs->primary)
return true;
return false;
}
static bool osds_valid(const struct ceph_osds *set)
{
/* non-empty set */
if (set->size > 0 && set->primary >= 0)
return true;
/* empty can_shift_osds set */
if (!set->size && set->primary == -1)
return true;
/* empty !can_shift_osds set - all NONE */
if (set->size > 0 && set->primary == -1) {
int i;
for (i = 0; i < set->size; i++) {
if (set->osds[i] != CRUSH_ITEM_NONE)
break;
}
if (i == set->size)
return true;
}
return false;
}
void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
{
memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
dest->size = src->size;
dest->primary = src->primary;
}
bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
u32 new_pg_num)
{
int old_bits = calc_bits_of(old_pg_num);
int old_mask = (1 << old_bits) - 1;
int n;
WARN_ON(pgid->seed >= old_pg_num);
if (new_pg_num <= old_pg_num)
return false;
for (n = 1; ; n++) {
int next_bit = n << (old_bits - 1);
u32 s = next_bit | pgid->seed;
if (s < old_pg_num || s == pgid->seed)
continue;
if (s >= new_pg_num)
break;
s = ceph_stable_mod(s, old_pg_num, old_mask);
if (s == pgid->seed)
return true;
}
return false;
}
bool ceph_is_new_interval(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting,
const struct ceph_osds *old_up,
const struct ceph_osds *new_up,
int old_size,
int new_size,
int old_min_size,
int new_min_size,
u32 old_pg_num,
u32 new_pg_num,
bool old_sort_bitwise,
bool new_sort_bitwise,
bool old_recovery_deletes,
bool new_recovery_deletes,
const struct ceph_pg *pgid)
{
return !osds_equal(old_acting, new_acting) ||
!osds_equal(old_up, new_up) ||
old_size != new_size ||
old_min_size != new_min_size ||
ceph_pg_is_split(pgid, old_pg_num, new_pg_num) ||
old_sort_bitwise != new_sort_bitwise ||
old_recovery_deletes != new_recovery_deletes;
}
static int calc_pg_rank(int osd, const struct ceph_osds *acting)
{
int i;
for (i = 0; i < acting->size; i++) {
if (acting->osds[i] == osd)
return i;
}
return -1;
}
static bool primary_changed(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting)
{
if (!old_acting->size && !new_acting->size)
return false; /* both still empty */
if (!old_acting->size ^ !new_acting->size)
return true; /* was empty, now not, or vice versa */
if (old_acting->primary != new_acting->primary)
return true; /* primary changed */
if (calc_pg_rank(old_acting->primary, old_acting) !=
calc_pg_rank(new_acting->primary, new_acting))
return true;
return false; /* same primary (tho replicas may have changed) */
}
bool ceph_osds_changed(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting,
bool any_change)
{
if (primary_changed(old_acting, new_acting))
return true;
if (any_change && !__osds_equal(old_acting, new_acting))
return true;
return false;
}
/*
* calculate file layout from given offset, length.
* fill in correct oid, logical length, and object extent
* offset, length.
*
* for now, we write only a single su, until we can
* pass a stride back to the caller.
*/
int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 off, u64 len,
u64 *ono,
u64 *oxoff, u64 *oxlen)
{
u32 osize = layout->object_size;
u32 su = layout->stripe_unit;
u32 sc = layout->stripe_count;
u32 bl, stripeno, stripepos, objsetno;
u32 su_per_object;
u64 t, su_offset;
dout("mapping %llu~%llu osize %u fl_su %u\n", off, len,
osize, su);
if (su == 0 || sc == 0)
goto invalid;
su_per_object = osize / su;
if (su_per_object == 0)
goto invalid;
dout("osize %u / su %u = su_per_object %u\n", osize, su,
su_per_object);
if ((su & ~PAGE_MASK) != 0)
goto invalid;
/* bl = *off / su; */
t = off;
do_div(t, su);
bl = t;
dout("off %llu / su %u = bl %u\n", off, su, bl);
stripeno = bl / sc;
stripepos = bl % sc;
objsetno = stripeno / su_per_object;
*ono = objsetno * sc + stripepos;
dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono);
/* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
t = off;
su_offset = do_div(t, su);
*oxoff = su_offset + (stripeno % su_per_object) * su;
/*
* Calculate the length of the extent being written to the selected
* object. This is the minimum of the full length requested (len) or
* the remainder of the current stripe being written to.
*/
*oxlen = min_t(u64, len, su - su_offset);
dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
return 0;
invalid:
dout(" invalid layout\n");
*ono = 0;
*oxoff = 0;
*oxlen = 0;
return -EINVAL;
}
EXPORT_SYMBOL(ceph_calc_file_object_mapping);
/*
* Map an object into a PG.
*
* Should only be called with target_oid and target_oloc (as opposed to
* base_oid and base_oloc), since tiering isn't taken into account.
*/
int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
const struct ceph_object_id *oid,
const struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid)
{
WARN_ON(pi->id != oloc->pool);
if (!oloc->pool_ns) {
raw_pgid->pool = oloc->pool;
raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
oid->name_len);
dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
raw_pgid->pool, raw_pgid->seed);
} else {
char stack_buf[256];
char *buf = stack_buf;
int nsl = oloc->pool_ns->len;
size_t total = nsl + 1 + oid->name_len;
if (total > sizeof(stack_buf)) {
buf = kmalloc(total, GFP_NOIO);
if (!buf)
return -ENOMEM;
}
memcpy(buf, oloc->pool_ns->str, nsl);
buf[nsl] = '\037';
memcpy(buf + nsl + 1, oid->name, oid->name_len);
raw_pgid->pool = oloc->pool;
raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total);
if (buf != stack_buf)
kfree(buf);
dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__,
oid->name, nsl, oloc->pool_ns->str,
raw_pgid->pool, raw_pgid->seed);
}
return 0;
}
int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
const struct ceph_object_id *oid,
const struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid)
{
struct ceph_pg_pool_info *pi;
pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
if (!pi)
return -ENOENT;
return __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid);
}
EXPORT_SYMBOL(ceph_object_locator_to_pg);
/*
* Map a raw PG (full precision ps) into an actual PG.
*/
static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid,
struct ceph_pg *pgid)
{
pgid->pool = raw_pgid->pool;
pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
pi->pg_num_mask);
}
/*
* Map a raw PG (full precision ps) into a placement ps (placement
* seed). Include pool id in that value so that different pools don't
* use the same seeds.
*/
static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid)
{
if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
/* hash pool id and seed so that pool PGs do not overlap */
return crush_hash32_2(CRUSH_HASH_RJENKINS1,
ceph_stable_mod(raw_pgid->seed,
pi->pgp_num,
pi->pgp_num_mask),
raw_pgid->pool);
} else {
/*
* legacy behavior: add ps and pool together. this is
* not a great approach because the PGs from each pool
* will overlap on top of each other: 0.5 == 1.4 ==
* 2.3 == ...
*/
return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
pi->pgp_num_mask) +
(unsigned)raw_pgid->pool;
}
}
/*
* Magic value used for a "default" fallback choose_args, used if the
* crush_choose_arg_map passed to do_crush() does not exist. If this
* also doesn't exist, fall back to canonical weights.
*/
#define CEPH_DEFAULT_CHOOSE_ARGS -1
static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
int *result, int result_max,
const __u32 *weight, int weight_max,
s64 choose_args_index)
{
struct crush_choose_arg_map *arg_map;
int r;
BUG_ON(result_max > CEPH_PG_MAX_SIZE);
arg_map = lookup_choose_arg_map(&map->crush->choose_args,
choose_args_index);
if (!arg_map)
arg_map = lookup_choose_arg_map(&map->crush->choose_args,
CEPH_DEFAULT_CHOOSE_ARGS);
mutex_lock(&map->crush_workspace_mutex);
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
weight, weight_max, map->crush_workspace,
arg_map ? arg_map->args : NULL);
mutex_unlock(&map->crush_workspace_mutex);
return r;
}
static void remove_nonexistent_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
struct ceph_osds *set)
{
int i;
if (ceph_can_shift_osds(pi)) {
int removed = 0;
/* shift left */
for (i = 0; i < set->size; i++) {
if (!ceph_osd_exists(osdmap, set->osds[i])) {
removed++;
continue;
}
if (removed)
set->osds[i - removed] = set->osds[i];
}
set->size -= removed;
} else {
/* set dne devices to NONE */
for (i = 0; i < set->size; i++) {
if (!ceph_osd_exists(osdmap, set->osds[i]))
set->osds[i] = CRUSH_ITEM_NONE;
}
}
}
/*
* Calculate raw set (CRUSH output) for given PG and filter out
* nonexistent OSDs. ->primary is undefined for a raw set.
*
* Placement seed (CRUSH input) is returned through @ppps.
*/
static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid,
struct ceph_osds *raw,
u32 *ppps)
{
u32 pps = raw_pg_to_pps(pi, raw_pgid);
int ruleno;
int len;
ceph_osds_init(raw);
if (ppps)
*ppps = pps;
ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
pi->size);
if (ruleno < 0) {
pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
pi->id, pi->crush_ruleset, pi->type, pi->size);
return;
}
if (pi->size > ARRAY_SIZE(raw->osds)) {
pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
pi->id, pi->crush_ruleset, pi->type, pi->size,
ARRAY_SIZE(raw->osds));
return;
}
len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
osdmap->osd_weight, osdmap->max_osd, pi->id);
if (len < 0) {
pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
len, ruleno, pi->id, pi->crush_ruleset, pi->type,
pi->size);
return;
}
raw->size = len;
remove_nonexistent_osds(osdmap, pi, raw);
}
/* apply pg_upmap[_items] mappings */
static void apply_upmap(struct ceph_osdmap *osdmap,
const struct ceph_pg *pgid,
struct ceph_osds *raw)
{
struct ceph_pg_mapping *pg;
int i, j;
pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid);
if (pg) {
/* make sure targets aren't marked out */
for (i = 0; i < pg->pg_upmap.len; i++) {
int osd = pg->pg_upmap.osds[i];
if (osd != CRUSH_ITEM_NONE &&
osd < osdmap->max_osd &&
osdmap->osd_weight[osd] == 0) {
/* reject/ignore explicit mapping */
return;
}
}
for (i = 0; i < pg->pg_upmap.len; i++)
raw->osds[i] = pg->pg_upmap.osds[i];
raw->size = pg->pg_upmap.len;
/* check and apply pg_upmap_items, if any */
}
pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
if (pg) {
/*
* Note: this approach does not allow a bidirectional swap,
* e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
*/
for (i = 0; i < pg->pg_upmap_items.len; i++) {
int from = pg->pg_upmap_items.from_to[i][0];
int to = pg->pg_upmap_items.from_to[i][1];
int pos = -1;
bool exists = false;
/* make sure replacement doesn't already appear */
for (j = 0; j < raw->size; j++) {
int osd = raw->osds[j];
if (osd == to) {
exists = true;
break;
}
/* ignore mapping if target is marked out */
if (osd == from && pos < 0 &&
!(to != CRUSH_ITEM_NONE &&
to < osdmap->max_osd &&
osdmap->osd_weight[to] == 0)) {
pos = j;
}
}
if (!exists && pos >= 0)
raw->osds[pos] = to;
}
}
}
/*
* Given raw set, calculate up set and up primary. By definition of an
* up set, the result won't contain nonexistent or down OSDs.
*
* This is done in-place - on return @set is the up set. If it's
* empty, ->primary will remain undefined.
*/
static void raw_to_up_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
struct ceph_osds *set)
{
int i;
/* ->primary is undefined for a raw set */
BUG_ON(set->primary != -1);
if (ceph_can_shift_osds(pi)) {
int removed = 0;
/* shift left */
for (i = 0; i < set->size; i++) {
if (ceph_osd_is_down(osdmap, set->osds[i])) {
removed++;
continue;
}
if (removed)
set->osds[i - removed] = set->osds[i];
}
set->size -= removed;
if (set->size > 0)
set->primary = set->osds[0];
} else {
/* set down/dne devices to NONE */
for (i = set->size - 1; i >= 0; i--) {
if (ceph_osd_is_down(osdmap, set->osds[i]))
set->osds[i] = CRUSH_ITEM_NONE;
else
set->primary = set->osds[i];
}
}
}
static void apply_primary_affinity(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
u32 pps,
struct ceph_osds *up)
{
int i;
int pos = -1;
/*
* Do we have any non-default primary_affinity values for these
* osds?
*/
if (!osdmap->osd_primary_affinity)
return;
for (i = 0; i < up->size; i++) {
int osd = up->osds[i];
if (osd != CRUSH_ITEM_NONE &&
osdmap->osd_primary_affinity[osd] !=
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
break;
}
}
if (i == up->size)
return;
/*
* Pick the primary. Feed both the seed (for the pg) and the
* osd into the hash/rng so that a proportional fraction of an
* osd's pgs get rejected as primary.
*/
for (i = 0; i < up->size; i++) {
int osd = up->osds[i];
u32 aff;
if (osd == CRUSH_ITEM_NONE)
continue;
aff = osdmap->osd_primary_affinity[osd];
if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
(crush_hash32_2(CRUSH_HASH_RJENKINS1,
pps, osd) >> 16) >= aff) {
/*
* We chose not to use this primary. Note it
* anyway as a fallback in case we don't pick
* anyone else, but keep looking.
*/
if (pos < 0)
pos = i;
} else {
pos = i;
break;
}
}
if (pos < 0)
return;
up->primary = up->osds[pos];
if (ceph_can_shift_osds(pi) && pos > 0) {
/* move the new primary to the front */
for (i = pos; i > 0; i--)
up->osds[i] = up->osds[i - 1];
up->osds[0] = up->primary;
}
}
/*
* Get pg_temp and primary_temp mappings for given PG.
*
* Note that a PG may have none, only pg_temp, only primary_temp or
* both pg_temp and primary_temp mappings. This means @temp isn't
* always a valid OSD set on return: in the "only primary_temp" case,
* @temp will have its ->primary >= 0 but ->size == 0.
*/
static void get_temp_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
const struct ceph_pg *pgid,
struct ceph_osds *temp)
{
struct ceph_pg_mapping *pg;
int i;
ceph_osds_init(temp);
/* pg_temp? */
pg = lookup_pg_mapping(&osdmap->pg_temp, pgid);
if (pg) {
for (i = 0; i < pg->pg_temp.len; i++) {
if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
if (ceph_can_shift_osds(pi))
continue;
temp->osds[temp->size++] = CRUSH_ITEM_NONE;
} else {
temp->osds[temp->size++] = pg->pg_temp.osds[i];
}
}
/* apply pg_temp's primary */
for (i = 0; i < temp->size; i++) {
if (temp->osds[i] != CRUSH_ITEM_NONE) {
temp->primary = temp->osds[i];
break;
}
}
}
/* primary_temp? */
pg = lookup_pg_mapping(&osdmap->primary_temp, pgid);
if (pg)
temp->primary = pg->primary_temp.osd;
}
/*
* Map a PG to its acting set as well as its up set.
*
* Acting set is used for data mapping purposes, while up set can be
* recorded for detecting interval changes and deciding whether to
* resend a request.
*/
void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid,
struct ceph_osds *up,
struct ceph_osds *acting)
{
struct ceph_pg pgid;
u32 pps;
WARN_ON(pi->id != raw_pgid->pool);
raw_pg_to_pg(pi, raw_pgid, &pgid);
pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
apply_upmap(osdmap, &pgid, up);
raw_to_up_osds(osdmap, pi, up);
apply_primary_affinity(osdmap, pi, pps, up);
get_temp_osds(osdmap, pi, &pgid, acting);
if (!acting->size) {
memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
acting->size = up->size;
if (acting->primary == -1)
acting->primary = up->primary;
}
WARN_ON(!osds_valid(up) || !osds_valid(acting));
}
bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid,
struct ceph_spg *spgid)
{
struct ceph_pg pgid;
struct ceph_osds up, acting;
int i;
WARN_ON(pi->id != raw_pgid->pool);
raw_pg_to_pg(pi, raw_pgid, &pgid);
if (ceph_can_shift_osds(pi)) {
spgid->pgid = pgid; /* struct */
spgid->shard = CEPH_SPG_NOSHARD;
return true;
}
ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting);
for (i = 0; i < acting.size; i++) {
if (acting.osds[i] == acting.primary) {
spgid->pgid = pgid; /* struct */
spgid->shard = i;
return true;
}
}
return false;
}
/*
* Return acting primary for given PG, or -1 if none.
*/
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
const struct ceph_pg *raw_pgid)
{
struct ceph_pg_pool_info *pi;
struct ceph_osds up, acting;
pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
if (!pi)
return -1;
ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting);
return acting.primary;
}
EXPORT_SYMBOL(ceph_pg_to_acting_primary);