crush_do_rule中,用了一个scratch空间来完成item的搜索。
scratch空间总共有3个max_result这么大,并且按照max_result长度划分为三个部分(下图中的a、b、c,其中c只在recursive_to_leaf时用到,本文不涉及)。
a、b两个部分就用来生成result。a、b两个部分分别由o、w两个数组指针来指向,在每完成一个select step后,o、w互换指向的位置,上一次的o将变成本次的w,成为本次step遍历的对象,而上次的w将变成本次的o用于存放本次step的output items。
以Sage Weil论文中的例子来演示此过程:
take(root) ->root
select(1, row) ->row2 //图例step1
select(3, cabinet) ->cab21 cab23 cab24 //图例step2
select(1, disk) ->disk2107 disk2313 disk2437 //图例step3
emit //图例step4 (从scratch数组中将选中的item拷贝到result数组中)
简化后的骨干代码如下:
1、省略了itemid的合法性校验代码
2、仅考虑firstn的情况(即仅考虑replica策略)
3、不考虑recursive_to_leaf的情况(此为工程优化,非Sage Weil论文的核心内容)
int crush_do_rule(const struct crush_map *map,
int ruleno, int x, int *result, int result_max,
const __u32 *weight, int weight_max,
int *scratch)
{
int osize, wsize = ;
int *w, *o;
w = scratch;
o = scratch + result_max; struct crush_rule *rule = map->rules[ruleno]; for (__u32 step = ; step < rule->len; step++) {
struct crush_rule_step *curstep = &rule->steps[step]; switch (curstep->op) {
case CRUSH_RULE_TAKE:
w[] = curstep->arg1;
wsize = ;
break; //Elar:
//1. only consider fistn's situation
//2. ignore recurse_to_leaf situation
case CRUSH_RULE_CHOOSELEAF_FIRSTN:
case CRUSH_RULE_CHOOSE_FIRSTN: /* reset output */
osize = ;
for (int i = ; i < wsize; i++) {
int numrep = curstep->arg1;
int outpos = ; int type = curstep->arg2;
int bno = - - w[i];//Elar: get bucketId
struct crush_bucket *bucket = map->buckets[bno];
osize += crush_choose_firstn(map, bucket, weight, weight_max, x, numrep, type, o+osize, outpos);
} /* swap o and w arrays */
int *tmp = o; o = w; w = tmp;
wsize = osize;
break; case CRUSH_RULE_EMIT:
int i = ;
result_len = ;
for (; i < wsize && result_len < result_max; i++) {
result[result_len] = w[i];
result_len++;
}
wsize = ;
break; default:
break;
}
}
return result_len;
} static int crush_choose_firstn(const struct crush_map *map,
struct crush_bucket *bucket,
const __u32 *weight,
int weight_max,
int x,
int numrep,
int type,
int *out)
{
for (int rep = ; rep < numrep; rep++) {
/* keep trying until we get a non-out, non-colliding item */
unsigned int ftotal = ;
bool skip_rep = ;
do {
unsigned int flocal = ;
bool retry_descent = false;
struct crush_bucket *in = bucket; do {
bool retry_bucket = false;
int r = rep + parent_r;
/* r' = r + f_total */
r += ftotal; /* bucket choose */
int item = crush_bucket_choose(in, x, r); int itemtype;
if (item < )//Elar: if item is a bucket, then get its type
itemtype = map->buckets[--item]->type;
else//Elar: if item is a device, then its type=0
itemtype = ; //Elar: if this item's type is not what we expected, then keep going until we get an match one!
if (itemtype != type) {
in = map->buckets[--item];
retry_bucket = ;
continue;
} // Elar: check if item has already been in the output array
bool collide = false;
for (i = ; i < outpos; i++) {
if (out[i] == item) {
collide = true;
break;
}
} bool reject = false;
if (itemtype == ){
//Elar: check if this item has been marked as "out"
reject = is_out(map, weight,weight_max,item, x);
}else{
reject = false;
} reject:
if (reject || collide) {
ftotal++;
flocal++; if still can try locally(with in the same bucket, try other items)
retry_bucket = true;
else if still can try descent(parent's or grandparent's sibling buckets)
/* then retry descent */
retry_descent = true;
else
/* else give up */
skip_rep = true;
}
} while (true == retry_bucket);
} while (true == retry_descent); if (true == skip_rep) {
continue;
} out[outpos] = item;
outpos++;
}
return outpos;
}
完整代码请移步git: