crush: add chooseleaf_vary_r tunable
The current crush_choose_firstn code will re-use the same 'r' value for the recursive call. That means that if we are hitting a collision or rejection for some reason (say, an OSD that is marked out) and need to retry, we will keep making the same (bad) choice in that recursive selection. Introduce a tunable that fixes that behavior by incorporating the parent 'r' value into the recursive starting point, so that a different path will be taken in subsequent placement attempts. Note that this was done from the get-go for the new crush_choose_indep algorithm. This was exposed by a user who was seeing PGs stuck in active+remapped after reweight-by-utilization because the up set mapped to a single OSD. Reflects ceph.git commit a8e6c9fbf88bad056dd05d3eb790e98a5e43451a. Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com> Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
This commit is contained in:
parent
6ed1002f36
commit
e2b149cc4b
@ -173,6 +173,12 @@ struct crush_map {
|
|||||||
* apply to a collision: in that case we will retry as we used
|
* apply to a collision: in that case we will retry as we used
|
||||||
* to. */
|
* to. */
|
||||||
__u32 chooseleaf_descend_once;
|
__u32 chooseleaf_descend_once;
|
||||||
|
|
||||||
|
/* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
|
||||||
|
* bits. a value of 1 is best for new clusters. for legacy clusters
|
||||||
|
* that want to limit reshuffling, a value of 3 or 4 will make the
|
||||||
|
* mappings line up a bit better with previous mappings. */
|
||||||
|
__u8 chooseleaf_vary_r;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -295,7 +295,9 @@ static int is_out(const struct crush_map *map,
|
|||||||
* @local_retries: localized retries
|
* @local_retries: localized retries
|
||||||
* @local_fallback_retries: localized fallback retries
|
* @local_fallback_retries: localized fallback retries
|
||||||
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
|
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
|
||||||
|
* @vary_r: pass r to recursive calls
|
||||||
* @out2: second output vector for leaf items (if @recurse_to_leaf)
|
* @out2: second output vector for leaf items (if @recurse_to_leaf)
|
||||||
|
* @parent_r: r value passed from the parent
|
||||||
*/
|
*/
|
||||||
static int crush_choose_firstn(const struct crush_map *map,
|
static int crush_choose_firstn(const struct crush_map *map,
|
||||||
struct crush_bucket *bucket,
|
struct crush_bucket *bucket,
|
||||||
@ -307,7 +309,9 @@ static int crush_choose_firstn(const struct crush_map *map,
|
|||||||
unsigned int local_retries,
|
unsigned int local_retries,
|
||||||
unsigned int local_fallback_retries,
|
unsigned int local_fallback_retries,
|
||||||
int recurse_to_leaf,
|
int recurse_to_leaf,
|
||||||
int *out2)
|
unsigned int vary_r,
|
||||||
|
int *out2,
|
||||||
|
int parent_r)
|
||||||
{
|
{
|
||||||
int rep;
|
int rep;
|
||||||
unsigned int ftotal, flocal;
|
unsigned int ftotal, flocal;
|
||||||
@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map,
|
|||||||
int itemtype;
|
int itemtype;
|
||||||
int collide, reject;
|
int collide, reject;
|
||||||
|
|
||||||
dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
|
dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
|
||||||
bucket->id, x, outpos, numrep);
|
recurse_to_leaf ? "_LEAF" : "",
|
||||||
|
bucket->id, x, outpos, numrep,
|
||||||
|
tries, recurse_tries, local_retries, local_fallback_retries,
|
||||||
|
parent_r);
|
||||||
|
|
||||||
for (rep = outpos; rep < numrep; rep++) {
|
for (rep = outpos; rep < numrep; rep++) {
|
||||||
/* keep trying until we get a non-out, non-colliding item */
|
/* keep trying until we get a non-out, non-colliding item */
|
||||||
@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map,
|
|||||||
do {
|
do {
|
||||||
collide = 0;
|
collide = 0;
|
||||||
retry_bucket = 0;
|
retry_bucket = 0;
|
||||||
r = rep;
|
r = rep + parent_r;
|
||||||
/* r' = r + f_total */
|
/* r' = r + f_total */
|
||||||
r += ftotal;
|
r += ftotal;
|
||||||
|
|
||||||
@ -387,6 +394,11 @@ static int crush_choose_firstn(const struct crush_map *map,
|
|||||||
reject = 0;
|
reject = 0;
|
||||||
if (!collide && recurse_to_leaf) {
|
if (!collide && recurse_to_leaf) {
|
||||||
if (item < 0) {
|
if (item < 0) {
|
||||||
|
int sub_r;
|
||||||
|
if (vary_r)
|
||||||
|
sub_r = r >> (vary_r-1);
|
||||||
|
else
|
||||||
|
sub_r = 0;
|
||||||
if (crush_choose_firstn(map,
|
if (crush_choose_firstn(map,
|
||||||
map->buckets[-1-item],
|
map->buckets[-1-item],
|
||||||
weight, weight_max,
|
weight, weight_max,
|
||||||
@ -396,7 +408,9 @@ static int crush_choose_firstn(const struct crush_map *map,
|
|||||||
local_retries,
|
local_retries,
|
||||||
local_fallback_retries,
|
local_fallback_retries,
|
||||||
0,
|
0,
|
||||||
NULL) <= outpos)
|
vary_r,
|
||||||
|
NULL,
|
||||||
|
sub_r) <= outpos)
|
||||||
/* didn't get leaf */
|
/* didn't get leaf */
|
||||||
reject = 1;
|
reject = 1;
|
||||||
} else {
|
} else {
|
||||||
@ -653,6 +667,8 @@ int crush_do_rule(const struct crush_map *map,
|
|||||||
int choose_local_retries = map->choose_local_tries;
|
int choose_local_retries = map->choose_local_tries;
|
||||||
int choose_local_fallback_retries = map->choose_local_fallback_tries;
|
int choose_local_fallback_retries = map->choose_local_fallback_tries;
|
||||||
|
|
||||||
|
int vary_r = map->chooseleaf_vary_r;
|
||||||
|
|
||||||
if ((__u32)ruleno >= map->max_rules) {
|
if ((__u32)ruleno >= map->max_rules) {
|
||||||
dprintk(" bad ruleno %d\n", ruleno);
|
dprintk(" bad ruleno %d\n", ruleno);
|
||||||
return 0;
|
return 0;
|
||||||
@ -745,7 +761,9 @@ int crush_do_rule(const struct crush_map *map,
|
|||||||
choose_local_retries,
|
choose_local_retries,
|
||||||
choose_local_fallback_retries,
|
choose_local_fallback_retries,
|
||||||
recurse_to_leaf,
|
recurse_to_leaf,
|
||||||
c+osize);
|
vary_r,
|
||||||
|
c+osize,
|
||||||
|
0);
|
||||||
} else {
|
} else {
|
||||||
crush_choose_indep(
|
crush_choose_indep(
|
||||||
map,
|
map,
|
||||||
|
Loading…
Reference in New Issue
Block a user