diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index bfd765cf0eb7..b6d2b16358be 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -91,6 +91,9 @@ enum { RTTDSCALE = 3, RTTAVG_INIT = USEC_PER_SEC / 4 << RTTSCALE, RTTDEV_INIT = RTTAVG_INIT / 4, + + HARD_SCORN_SECS = 10, /* try another remote port after this */ + MAX_TAINT = 1000, /* cap on aoetgt taint */ }; struct buf { @@ -103,6 +106,10 @@ struct buf { struct request *rq; }; +enum frame_flags { + FFL_PROBE = 1, /* set by probe() on the frames it sends */ +}; + struct frame { struct list_head head; u32 tag; @@ -118,6 +125,7 @@ struct frame { struct bio_vec *bv; ulong bcnt; ulong bv_off; + char flags; /* enum frame_flags bits */ }; struct aoeif { @@ -138,8 +146,10 @@ struct aoetgt { ushort next_cwnd; /* incr maxout after decrementing to zero */ ushort ssthresh; /* slow start threshold */ ulong falloc; /* number of allocated frames */ + int taint; /* how much we want to avoid this aoetgt */ int minbcnt; int wpkts, rpkts; + char nout_probes; /* number of probe frames among the nout outstanding frames */ }; struct aoedev { @@ -174,7 +184,6 @@ struct aoedev { struct list_head rexmitq; /* deferred retransmissions */ struct aoetgt *targets[NTARGETS]; struct aoetgt **tgt; /* target in use when working */ - struct aoetgt *htgt; /* target needing rexmit assistance */ ulong ntargets; ulong kicked; char ident[512]; diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 391dd8ee2009..000f7fb48841 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -22,6 +22,7 @@ #define MAXIOC (8192) /* default meant to avoid most soft lockups */ static void ktcomplete(struct frame *, struct sk_buff *); +static int count_targets(struct aoedev *d, int *untainted); static struct buf *nextbuf(struct aoedev *); @@ -43,6 +44,8 @@ static struct { spinlock_t lock; } iocq; +static struct page *empty_page; /* zeroed page from aoecmd_init(); data for probe frames */ + static struct sk_buff * new_skb(ulong len) { @@ -179,8 +182,10 @@ aoe_freetframe(struct frame *f) t = f->t; f->buf = NULL; + f->lba = 0; f->bv = NULL; f->r_skb = NULL; + f->flags = 0; list_add(&f->head, 
&t->ffree); } @@ -234,20 +239,25 @@ newframe(struct aoedev *d) struct frame *f; struct aoetgt *t, **tt; int totout = 0; + int use_tainted; /* set for a second pass that accepts tainted targets */ + int has_untainted; /* set once a target with taint == 0 is seen */ if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */ printk(KERN_ERR "aoe: NULL TARGETS!\n"); return NULL; } tt = d->tgt; /* last used target */ - for (;;) { + for (use_tainted = 0, has_untainted = 0;;) { tt++; if (tt >= &d->targets[NTARGETS] || !*tt) tt = d->targets; t = *tt; - totout += t->nout; + if (!t->taint) { + has_untainted = 1; + totout += t->nout; + } if (t->nout < t->maxout - && t != d->htgt + && (use_tainted || !t->taint) && t->ifp->nd) { f = newtframe(d, t); if (f) { @@ -256,8 +266,12 @@ newframe(struct aoedev *d) return f; } } - if (tt == d->tgt) /* we've looped and found nada */ - break; + if (tt == d->tgt) { /* we've looped and found nada */ + if (!use_tainted && !has_untainted) + use_tainted = 1; /* every target is tainted: go around again */ + else + break; + } } if (totout == 0) { d->kicked++; @@ -294,21 +308,68 @@ fhash(struct frame *f) list_add_tail(&f->head, &d->factive[n]); } +static void +ata_rw_frameinit(struct frame *f) /* reads f->skb, f->t, f->bcnt and, when f->buf is set, f->bv and f->bv_off */ +{ + struct aoetgt *t; + struct aoe_hdr *h; + struct aoe_atahdr *ah; + struct sk_buff *skb; + char writebit, extbit; + + skb = f->skb; + h = (struct aoe_hdr *) skb_mac_header(skb); + ah = (struct aoe_atahdr *) (h + 1); + skb_put(skb, sizeof(*h) + sizeof(*ah)); + memset(h, 0, skb->len); + + writebit = 0x10; + extbit = 0x4; + + t = f->t; + f->tag = aoehdr_atainit(t->d, t, h); + fhash(f); + t->nout++; + f->waited = 0; + f->waited_total = 0; + if (f->buf) + f->lba = f->buf->sector; /* without a buf, f->lba is left unchanged */ + + /* set up ata header */ + ah->scnt = f->bcnt >> 9; + put_lba(ah, f->lba); + if (t->d->flags & DEVFL_EXT) { + ah->aflags |= AOEAFL_EXT; + } else { + extbit = 0; + ah->lba3 &= 0x0f; + ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */ + } + if (f->buf && bio_data_dir(f->buf->bio) == WRITE) { + skb_fillup(skb, f->bv, f->bv_off, f->bcnt); + ah->aflags |= AOEAFL_WRITE; + skb->len += f->bcnt; + skb->data_len = f->bcnt; + 
skb->truesize += f->bcnt; + t->wpkts++; + } else { + t->rpkts++; + writebit = 0; /* not a write: cmdstat stays a read command */ + } + + ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit; + skb->dev = t->ifp->nd; +} + static int aoecmd_ata_rw(struct aoedev *d) { struct frame *f; - struct aoe_hdr *h; - struct aoe_atahdr *ah; struct buf *buf; struct aoetgt *t; struct sk_buff *skb; struct sk_buff_head queue; ulong bcnt, fbcnt; - char writebit, extbit; - - writebit = 0x10; - extbit = 0x4; buf = nextbuf(d); if (buf == NULL) @@ -343,50 +404,15 @@ aoecmd_ata_rw(struct aoedev *d) } while (fbcnt); /* initialize the headers & frame */ - skb = f->skb; - h = (struct aoe_hdr *) skb_mac_header(skb); - ah = (struct aoe_atahdr *) (h+1); - skb_put(skb, sizeof *h + sizeof *ah); - memset(h, 0, skb->len); - f->tag = aoehdr_atainit(d, t, h); - fhash(f); - t->nout++; - f->waited = 0; - f->waited_total = 0; f->buf = buf; f->bcnt = bcnt; - f->lba = buf->sector; - - /* set up ata header */ - ah->scnt = bcnt >> 9; - put_lba(ah, buf->sector); - if (d->flags & DEVFL_EXT) { - ah->aflags |= AOEAFL_EXT; - } else { - extbit = 0; - ah->lba3 &= 0x0f; - ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */ - } - if (bio_data_dir(buf->bio) == WRITE) { - skb_fillup(skb, f->bv, f->bv_off, bcnt); - ah->aflags |= AOEAFL_WRITE; - skb->len += bcnt; - skb->data_len = bcnt; - skb->truesize += bcnt; - t->wpkts++; - } else { - t->rpkts++; - writebit = 0; - } - - ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit; + ata_rw_frameinit(f); /* uses the f->buf and f->bcnt set just above */ /* mark all tracking fields and load out */ buf->nframesout += 1; buf->sector += bcnt >> 9; - skb->dev = t->ifp->nd; - skb = skb_clone(skb, GFP_ATOMIC); + skb = skb_clone(f->skb, GFP_ATOMIC); if (skb) { do_gettimeofday(&f->sent); f->sent_jiffs = (u32) jiffies; @@ -462,11 +488,14 @@ resend(struct aoedev *d, struct frame *f) h = (struct aoe_hdr *) skb_mac_header(skb); ah = (struct aoe_atahdr *) (h+1); - snprintf(buf, sizeof buf, - "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n", - "retransmit", d->aoemajor, 
d->aoeminor, f->tag, jiffies, n, - h->src, h->dst, t->nout); - aoechr_error(buf); + if (!(f->flags & FFL_PROBE)) { /* no retransmit message for probe frames */ + snprintf(buf, sizeof(buf), + "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n", + "retransmit", d->aoemajor, d->aoeminor, + f->tag, jiffies, n, + h->src, h->dst, t->nout); + aoechr_error(buf); + } f->tag = n; fhash(f); @@ -558,18 +587,18 @@ ejectif(struct aoetgt *t, struct aoeif *ifp) } static struct frame * -reassign_frame(struct list_head *pos) +reassign_frame(struct frame *f) { - struct frame *f; struct frame *nf; struct sk_buff *skb; - f = list_entry(pos, struct frame, head); nf = newframe(f->t->d); if (!nf) return NULL; - - list_del(pos); + if (nf->t == f->t) { /* newframe() picked the same target: give the frame back */ + aoe_freetframe(nf); + return NULL; + } skb = nf->skb; nf->skb = f->skb; @@ -583,52 +612,67 @@ reassign_frame(struct list_head *pos) nf->sent = f->sent; nf->sent_jiffs = f->sent_jiffs; f->skb = skb; - aoe_freetframe(f); - f->t->nout--; - nf->t->nout++; return nf; } -static int -sthtith(struct aoedev *d) +static void +probe(struct aoetgt *t) /* sends one FFL_PROBE frame to t; t->nout_probes is left to the caller */ { - struct frame *f, *nf; - struct list_head *nx, *pos, *head; - struct aoetgt *ht = d->htgt; - int i; + struct aoedev *d; + struct frame *f; + struct sk_buff *skb; + struct sk_buff_head queue; + size_t n, m; + int frag; - /* look through the active and pending retransmit frames */ - for (i = 0; i < NFACTIVE; i++) { - head = &d->factive[i]; - list_for_each_safe(pos, nx, head) { - f = list_entry(pos, struct frame, head); - if (f->t != ht) - continue; - nf = reassign_frame(pos); - if (!nf) - return 0; - resend(d, nf); - } + d = t->d; + f = newtframe(d, t); + if (!f) { + pr_err("%s %pm for e%ld.%d: %s\n", + "aoe: cannot probe remote address", + t->addr, + (long) d->aoemajor, d->aoeminor, + "no frame available"); + return; } - head = &d->rexmitq; - list_for_each_safe(pos, nx, head) { - f = list_entry(pos, struct frame, head); - if (f->t != ht) - continue; - nf = reassign_frame(pos); - if (!nf) - return 0; - resend(d, nf); + f->flags |= 
FFL_PROBE; + ifrotate(t); + f->bcnt = t->d->maxbcnt ? t->d->maxbcnt : DEFAULTBCNT; /* DEFAULTBCNT when maxbcnt is 0 */ + ata_rw_frameinit(f); + skb = f->skb; + for (frag = 0, n = f->bcnt; n > 0; ++frag, n -= m) { + if (n < PAGE_SIZE) + m = n; + else + m = PAGE_SIZE; + skb_fill_page_desc(skb, frag, empty_page, 0, m); /* every fragment points at the same empty_page */ } - /* We've cleaned up the outstanding so take away his - * interfaces so he won't be used. We should remove him from - * the target array here, but cleaning up a target is - * involved. PUNT! - */ - memset(ht->ifs, 0, sizeof ht->ifs); - d->htgt = NULL; - return 1; + skb->len += f->bcnt; + skb->data_len = f->bcnt; + skb->truesize += f->bcnt; + + skb = skb_clone(f->skb, GFP_ATOMIC); + if (skb) { + do_gettimeofday(&f->sent); + f->sent_jiffs = (u32) jiffies; + __skb_queue_head_init(&queue); + __skb_queue_tail(&queue, skb); + aoenet_xmit(&queue); + } +} + +static long +rto(struct aoedev *d) /* timeout computed from d->rttavg and d->rttdev; a result of 0 is bumped to 1 */ +{ + long t; + + t = 2 * d->rttavg >> RTTSCALE; + t += 8 * d->rttdev >> RTTDSCALE; + if (t == 0) + t = 1; + + return t; } static void @@ -636,17 +680,53 @@ rexmit_deferred(struct aoedev *d) { struct aoetgt *t; struct frame *f; + struct frame *nf; struct list_head *pos, *nx, *head; int since; + int untainted; /* targets with taint == 0, filled in by count_targets() */ + + count_targets(d, &untainted); head = &d->rexmitq; list_for_each_safe(pos, nx, head) { f = list_entry(pos, struct frame, head); t = f->t; + if (t->taint) { + if (!(f->flags & FFL_PROBE)) { + nf = reassign_frame(f); + if (nf) { + if (t->nout_probes == 0 + && untainted > 0) { + probe(t); + t->nout_probes++; + } + list_replace(&f->head, &nf->head); /* nf takes over the list position of f */ + pos = &nf->head; /* the list_del(pos) further down must act on nf */ + aoe_freetframe(f); + f = nf; + t = f->t; + } + } else if (untainted < 1) { + /* don't probe w/o other untainted aoetgts */ + goto stop_probe; + } else if (tsince_hr(f) < t->taint * rto(d)) { + /* reprobe slowly when taint is high */ + continue; + } + } else if (f->flags & FFL_PROBE) { +stop_probe: /* don't probe untainted aoetgts */ + list_del(pos); + aoe_freetframe(f); + /* leaving d->kicked, because this is routine */ + f->t->d->flags 
|= DEVFL_KICKME; + continue; + } if (t->nout >= t->maxout) continue; list_del(pos); t->nout++; + if (f->flags & FFL_PROBE) + t->nout_probes++; since = tsince_hr(f); f->waited += since; f->waited_total += since; @@ -654,6 +734,36 @@ rexmit_deferred(struct aoedev *d) } } +/* An aoetgt accumulates demerits quickly, and successful + * probing redeems the aoetgt slowly. + */ +static void +scorn(struct aoetgt *t) +{ + int n; + + n = t->taint++; + t->taint += t->taint * 2; /* together with the ++ above: 3 * (n + 1) */ + if (n > t->taint) /* NOTE(review): looks like a wraparound guard - confirm */ + t->taint = n; + if (t->taint > MAX_TAINT) + t->taint = MAX_TAINT; +} + +static int +count_targets(struct aoedev *d, int *untainted) /* returns the number of targets; *untainted (if non-NULL) gets how many have taint == 0 */ +{ + int i, good; + + for (i = good = 0; i < d->ntargets && d->targets[i]; ++i) + if (d->targets[i]->taint == 0) + good++; + + if (untainted) + *untainted = good; + return i; +} + static void rexmit_timer(ulong vp) { @@ -666,6 +776,7 @@ rexmit_timer(ulong vp) register long timeout; ulong flags, n; int i; + int utgts; /* number of aoetgt descriptors (not slots) */ int since; d = (struct aoedev *) vp; @@ -673,10 +784,9 @@ rexmit_timer(ulong vp) spin_lock_irqsave(&d->lock, flags); /* timeout based on observed timings and variations */ - timeout = 2 * d->rttavg >> RTTSCALE; - timeout += 8 * d->rttdev >> RTTDSCALE; - if (timeout == 0) - timeout = 1; + timeout = rto(d); + + utgts = count_targets(d, NULL); /* only the total is wanted here */ if (d->flags & DEVFL_TKILL) { spin_unlock_irqrestore(&d->lock, flags); @@ -702,7 +812,7 @@ rexmit_timer(ulong vp) since = tsince_hr(f); n = f->waited_total + since; n /= USEC_PER_SEC; - if (n > aoe_deadsecs) { + if (n > aoe_deadsecs && !(f->flags & FFL_PROBE)) { /* a stale probe alone does not fail the device */ /* Waited too long. Device failure. * Hang all frames on first hash bucket for downdev * to clean up. 
@@ -713,19 +823,26 @@ rexmit_timer(ulong vp) } t = f->t; - if (n > aoe_deadsecs/2) - d->htgt = t; /* see if another target can help */ + n = f->waited + since; + n /= USEC_PER_SEC; /* scaled by USEC_PER_SEC for the comparisons below */ + if (aoe_deadsecs && utgts > 0 + && (n > aoe_deadsecs / utgts || n > HARD_SCORN_SECS)) + scorn(t); /* avoid this target */ if (t->maxout != 1) { t->ssthresh = t->maxout / 2; t->maxout = 1; } - ifp = getif(t, f->skb->dev); - if (ifp && ++ifp->lost > (t->nframes << 1) - && (ifp != t->ifs || t->ifs[1].nd)) { - ejectif(t, ifp); - ifp = NULL; + if (f->flags & FFL_PROBE) { + t->nout_probes--; /* the timed-out probe leaves the outstanding probe count */ + } else { + ifp = getif(t, f->skb->dev); + if (ifp && ++ifp->lost > (t->nframes << 1) + && (ifp != t->ifs || t->ifs[1].nd)) { + ejectif(t, ifp); + ifp = NULL; + } } list_move_tail(pos, &d->rexmitq); t->nout--; @@ -733,7 +850,7 @@ rexmit_timer(ulong vp) rexmit_deferred(d); out: - if ((d->flags & DEVFL_KICKME || d->htgt) && d->blkq) { + if ((d->flags & DEVFL_KICKME) && d->blkq) { d->flags &= ~DEVFL_KICKME; d->blkq->request_fn(d->blkq); } @@ -854,8 +971,6 @@ nextbuf(struct aoedev *d) void aoecmd_work(struct aoedev *d) { - if (d->htgt && !sthtith(d)) - return; rexmit_deferred(d); while (aoecmd_ata_rw(d)) ; @@ -1065,19 +1180,22 @@ ktiocomplete(struct frame *f) struct aoeif *ifp; struct aoedev *d; long n; + int untainted; if (f == NULL) return; t = f->t; d = t->d; + skb = f->r_skb; + buf = f->buf; + if (f->flags & FFL_PROBE) + goto out; /* probes skip the response handling below */ + if (!skb) /* just fail the buf. */ + goto noskb; hout = (struct aoe_hdr *) skb_mac_header(f->skb); ahout = (struct aoe_atahdr *) (hout+1); - buf = f->buf; - skb = f->r_skb; - if (skb == NULL) - goto noskb; /* just fail the buf. 
*/ hin = (struct aoe_hdr *) skb->data; skb_pull(skb, sizeof(*hin)); @@ -1089,7 +1207,7 @@ ktiocomplete(struct frame *f) d->aoemajor, d->aoeminor); noskb: if (buf) clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); - goto badrsp; + goto out; } n = ahout->scnt << 9; @@ -1109,8 +1227,6 @@ noskb: if (buf) ifp = getif(t, skb->dev); if (ifp) ifp->lost = 0; - if (d->htgt == t) /* I'll help myself, thank you. */ - d->htgt = NULL; spin_unlock_irq(&d->lock); break; case ATA_CMD_ID_ATA: @@ -1131,8 +1247,17 @@ noskb: if (buf) be16_to_cpu(get_unaligned(&hin->major)), hin->minor); } -badrsp: +out: spin_lock_irq(&d->lock); + if (t->taint > 0 + && --t->taint > 0 /* each frame that reaches this point lowers a positive taint by one */ + && t->nout_probes == 0) { + count_targets(d, &untainted); + if (untainted > 0) { + probe(t); + t->nout_probes++; /* probe() itself does not count it */ + } + } aoe_freetframe(f); @@ -1261,6 +1386,8 @@ aoecmd_ata_rsp(struct sk_buff *skb) if (f) { calc_rttavg(d, f->t, tsince_hr(f)); f->t->nout--; + if (f->flags & FFL_PROBE) + f->t->nout_probes--; /* the answered frame was a probe */ } else { f = getframe_deferred(d, n); if (f) { @@ -1379,6 +1506,7 @@ addtgt(struct aoedev *d, char *addr, ulong nframes) memcpy(t->addr, addr, sizeof t->addr); t->ifp = t->ifs; aoecmd_wreset(t); + t->maxout = t->nframes / 2; /* assigned after aoecmd_wreset(t) has run */ INIT_LIST_HEAD(&t->ffree); return *tt = t; } @@ -1584,6 +1712,14 @@ aoe_flush_iocq(void) int __init aoecmd_init(void) { + void *p; + + /* get_zeroed_page returns page with ref count 1 */ + p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT); + if (!p) + return -ENOMEM; + empty_page = virt_to_page(p); /* kept until aoecmd_exit() */ + INIT_LIST_HEAD(&iocq.head); spin_lock_init(&iocq.lock); init_waitqueue_head(&ktiowq); @@ -1599,4 +1735,7 @@ aoecmd_exit(void) { aoe_ktstop(&kts); aoe_flush_iocq(); + + free_page((unsigned long) page_address(empty_page)); /* frees the page obtained in aoecmd_init() */ + empty_page = NULL; } diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index aaaea662a72a..f0c0c7416aed 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -223,7 +223,6 @@ aoedev_downdev(struct aoedev *d) /* clean out the in-process 
request (if any) */ aoe_failip(d); - d->htgt = NULL; /* fast fail all pending I/O */ if (d->blkq) {