drbd: Removed the BIO_RW_BARRIER support from the receiver/epoch code
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index fe30864..2952c12 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
 
 #include "drbd_vli.h"
 
-struct flush_work {
-       struct drbd_work w;
-       struct drbd_epoch *epoch;
-};
-
 enum finish_epoch {
        FE_STILL_LIVE,
        FE_DESTROYED,
@@ -66,16 +61,6 @@ static int drbd_do_auth(struct drbd_conf *mdev);
 static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
 static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
 
-static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
-{
-       struct drbd_epoch *prev;
-       spin_lock(&mdev->epoch_lock);
-       prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
-       if (prev == epoch || prev == mdev->current_epoch)
-               prev = NULL;
-       spin_unlock(&mdev->epoch_lock);
-       return prev;
-}
 
 #define GFP_TRY        (__GFP_HIGHMEM | __GFP_NOWARN)
 
@@ -241,7 +226,7 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
        spin_unlock_irq(&mdev->req_lock);
 
        list_for_each_entry_safe(e, t, &reclaimed, w.list)
-               drbd_free_ee(mdev, e);
+               drbd_free_net_ee(mdev, e);
 }
 
 /**
@@ -298,9 +283,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool
  * Is also used from inside another spin_lock_irq(&mdev->req_lock);
  * Either links the page chain back to the global pool,
  * or returns all pages to the system. */
-static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
+static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
 {
+       atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
        int i;
+
        if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
                i = page_chain_free(page);
        else {
@@ -311,10 +298,10 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
                drbd_pp_vacant += i;
                spin_unlock(&drbd_pp_lock);
        }
-       atomic_sub(i, &mdev->pp_in_use);
-       i = atomic_read(&mdev->pp_in_use);
+       i = atomic_sub_return(i, a);
        if (i < 0)
-               dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
+               dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
+                       is_net ? "pp_in_use_by_net" : "pp_in_use", i);
        wake_up(&drbd_pp_wait);
 }
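
The hunks above split the page-pool accounting into a normal pool and a net-held pool, selected by the new is_net argument. As a stand-alone illustration of the atomic_sub_return()-and-warn idiom used in drbd_pp_free() (a user-space sketch with illustrative names, not the driver's own types):

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative stand-ins for the two per-device counters. */
static atomic_int pp_in_use;        /* pages backing ordinary epoch entries */
static atomic_int pp_in_use_by_net; /* pages still held on behalf of the network */

/* Account 'count' freed pages against the right pool, mirroring the
 * is_net selection and the post-subtraction check in drbd_pp_free(). */
static void pp_account_free(int count, int is_net)
{
	atomic_int *a = is_net ? &pp_in_use_by_net : &pp_in_use;
	int now = atomic_fetch_sub(a, count) - count; /* value after the subtraction */

	if (now < 0)
		fprintf(stderr, "ASSERTION FAILED: %s: %d < 0\n",
			is_net ? "pp_in_use_by_net" : "pp_in_use", now);
}

int main(void)
{
	atomic_fetch_add(&pp_in_use, 4);
	pp_account_free(4, 0); /* balances back to zero */
	pp_account_free(1, 1); /* never allocated: triggers the warning path */
	return 0;
}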
 
@@ -374,11 +361,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
        return NULL;
 }
 
-void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
 {
        if (e->flags & EE_HAS_DIGEST)
                kfree(e->digest);
-       drbd_pp_free(mdev, e->pages);
+       drbd_pp_free(mdev, e->pages, is_net);
        D_ASSERT(atomic_read(&e->pending_bios) == 0);
        D_ASSERT(hlist_unhashed(&e->colision));
        mempool_free(e, drbd_ee_mempool);
@@ -389,13 +376,14 @@ int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
        LIST_HEAD(work_list);
        struct drbd_epoch_entry *e, *t;
        int count = 0;
+       int is_net = list == &mdev->net_ee;
 
        spin_lock_irq(&mdev->req_lock);
        list_splice_init(list, &work_list);
        spin_unlock_irq(&mdev->req_lock);
 
        list_for_each_entry_safe(e, t, &work_list, w.list) {
-               drbd_free_ee(mdev, e);
+               drbd_free_some_ee(mdev, e, is_net);
                count++;
        }
        return count;
@@ -424,7 +412,7 @@ static int drbd_process_done_ee(struct drbd_conf *mdev)
        spin_unlock_irq(&mdev->req_lock);
 
        list_for_each_entry_safe(e, t, &reclaimed, w.list)
-               drbd_free_ee(mdev, e);
+               drbd_free_net_ee(mdev, e);
 
        /* possible callbacks here:
         * e_end_block, and e_end_resync_block, e_send_discard_ack.
@@ -925,6 +913,11 @@ retry:
 
        drbd_thread_start(&mdev->asender);
 
+       if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
+               drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
+               put_ldev(mdev);
+       }
+
        if (!drbd_send_protocol(mdev))
                return -1;
        drbd_send_sync_param(mdev, &mdev->sync_conf);
@@ -962,9 +955,10 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi
                *cmd = be16_to_cpu(h->h95.command);
                *packet_size = be32_to_cpu(h->h95.length);
        } else {
-               dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
-                   (long)be32_to_cpu(h->h80.magic),
-                   h->h80.command, h->h80.length);
+               dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
+                   be32_to_cpu(h->h80.magic),
+                   be16_to_cpu(h->h80.command),
+                   be16_to_cpu(h->h80.length));
                return FALSE;
        }
        mdev->last_received = jiffies;
@@ -972,7 +966,7 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi
        return TRUE;
 }
 
-static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+static void drbd_flush(struct drbd_conf *mdev)
 {
        int rv;
 
@@ -988,24 +982,6 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
                }
                put_ldev(mdev);
        }
-
-       return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
-}
-
-static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
-       struct flush_work *fw = (struct flush_work *)w;
-       struct drbd_epoch *epoch = fw->epoch;
-
-       kfree(w);
-
-       if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
-               drbd_flush_after_epoch(mdev, epoch);
-
-       drbd_may_finish_epoch(mdev, epoch, EV_PUT |
-                             (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
-
-       return 1;
 }
 
 /**
@@ -1018,15 +994,13 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
                                               struct drbd_epoch *epoch,
                                               enum epoch_event ev)
 {
-       int finish, epoch_size;
+       int epoch_size;
        struct drbd_epoch *next_epoch;
-       int schedule_flush = 0;
        enum finish_epoch rv = FE_STILL_LIVE;
 
        spin_lock(&mdev->epoch_lock);
        do {
                next_epoch = NULL;
-               finish = 0;
 
                epoch_size = atomic_read(&epoch->epoch_size);
 
@@ -1036,16 +1010,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
                        break;
                case EV_GOT_BARRIER_NR:
                        set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
-
-                       /* Special case: If we just switched from WO_bio_barrier to
-                          WO_bdev_flush we should not finish the current epoch */
-                       if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
-                           mdev->write_ordering != WO_bio_barrier &&
-                           epoch == mdev->current_epoch)
-                               clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
-                       break;
-               case EV_BARRIER_DONE:
-                       set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
                        break;
                case EV_BECAME_LAST:
                        /* nothing to do*/
@@ -1054,23 +1018,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 
                if (epoch_size != 0 &&
                    atomic_read(&epoch->active) == 0 &&
-                   test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
-                   epoch->list.prev == &mdev->current_epoch->list &&
-                   !test_bit(DE_IS_FINISHING, &epoch->flags)) {
-                       /* Nearly all conditions are met to finish that epoch... */
-                       if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
-                           mdev->write_ordering == WO_none ||
-                           (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
-                           ev & EV_CLEANUP) {
-                               finish = 1;
-                               set_bit(DE_IS_FINISHING, &epoch->flags);
-                       } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
-                                mdev->write_ordering == WO_bio_barrier) {
-                               atomic_inc(&epoch->active);
-                               schedule_flush = 1;
-                       }
-               }
-               if (finish) {
+                   test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
                        if (!(ev & EV_CLEANUP)) {
                                spin_unlock(&mdev->epoch_lock);
                                drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
@@ -1093,6 +1041,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
                                /* atomic_set(&epoch->active, 0); is already zero */
                                if (rv == FE_STILL_LIVE)
                                        rv = FE_RECYCLED;
+                               wake_up(&mdev->ee_wait);
                        }
                }
 
@@ -1104,22 +1053,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 
        spin_unlock(&mdev->epoch_lock);
 
-       if (schedule_flush) {
-               struct flush_work *fw;
-               fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
-               if (fw) {
-                       fw->w.cb = w_flush;
-                       fw->epoch = epoch;
-                       drbd_queue_work(&mdev->data.work, &fw->w);
-               } else {
-                       dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
-                       set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
-                       /* That is not a recursion, only one level */
-                       drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
-                       drbd_may_finish_epoch(mdev, epoch, EV_PUT);
-               }
-       }
-
        return rv;
 }
 
@@ -1135,19 +1068,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
                [WO_none] = "none",
                [WO_drain_io] = "drain",
                [WO_bdev_flush] = "flush",
-               [WO_bio_barrier] = "barrier",
        };
 
        pwo = mdev->write_ordering;
        wo = min(pwo, wo);
-       if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
-               wo = WO_bdev_flush;
        if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
                wo = WO_drain_io;
        if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
                wo = WO_none;
        mdev->write_ordering = wo;
-       if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
+       if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
                dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
 }
 
@@ -1183,7 +1113,7 @@ next_bio:
        bio->bi_sector = sector;
        bio->bi_bdev = mdev->ldev->backing_bdev;
        /* we special case some flags in the multi-bio case, see below
-        * (REQ_UNPLUG, REQ_HARDBARRIER) */
+        * (REQ_UNPLUG) */
        bio->bi_rw = rw;
        bio->bi_private = e;
        bio->bi_end_io = drbd_endio_sec;
@@ -1217,11 +1147,6 @@ next_bio:
                        bio->bi_rw &= ~REQ_UNPLUG;
 
                drbd_generic_make_request(mdev, fault_type, bio);
-
-               /* strip off REQ_HARDBARRIER,
-                * unless it is the first or last bio */
-               if (bios && bios->bi_next)
-                       bios->bi_rw &= ~REQ_HARDBARRIER;
        } while (bios);
        maybe_kick_lo(mdev);
        return 0;
@@ -1235,45 +1160,9 @@ fail:
        return -ENOMEM;
 }
 
-/**
- * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
- * @mdev:      DRBD device.
- * @w:         work object.
- * @cancel:    The connection will be closed anyways (unused in this callback)
- */
-int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
-{
-       struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
-       /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
-          (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
-          so that we can finish that epoch in drbd_may_finish_epoch().
-          That is necessary if we already have a long chain of Epochs, before
-          we realize that REQ_HARDBARRIER is actually not supported */
-
-       /* As long as the -ENOTSUPP on the barrier is reported immediately
-          that will never trigger. If it is reported late, we will just
-          print that warning and continue correctly for all future requests
-          with WO_bdev_flush */
-       if (previous_epoch(mdev, e->epoch))
-               dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
-
-       /* we still have a local reference,
-        * get_ldev was done in receive_Data. */
-
-       e->w.cb = e_end_block;
-       if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
-               /* drbd_submit_ee fails for one reason only:
-                * if was not able to allocate sufficient bios.
-                * requeue, try again later. */
-               e->w.cb = w_e_reissue;
-               drbd_queue_work(&mdev->data.work, &e->w);
-       }
-       return 1;
-}
-
 static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
-       int rv, issue_flush;
+       int rv;
        struct p_barrier *p = &mdev->data.rbuf.barrier;
        struct drbd_epoch *epoch;
 
@@ -1291,44 +1180,40 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
         * Therefore we must send the barrier_ack after the barrier request was
         * completed. */
        switch (mdev->write_ordering) {
-       case WO_bio_barrier:
        case WO_none:
                if (rv == FE_RECYCLED)
                        return TRUE;
-               break;
+
+               /* receiver context, in the writeout path of the other node.
+                * avoid potential distributed deadlock */
+               epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+               if (epoch)
+                       break;
+               else
+                       dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+                       /* Fall through */
 
        case WO_bdev_flush:
        case WO_drain_io:
-               if (rv == FE_STILL_LIVE) {
-                       set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
-                       drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
-                       rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
-               }
-               if (rv == FE_RECYCLED)
-                       return TRUE;
-
-               /* The asender will send all the ACKs and barrier ACKs out, since
-                  all EEs moved from the active_ee to the done_ee. We need to
-                  provide a new epoch object for the EEs that come in soon */
-               break;
-       }
-
-       /* receiver context, in the writeout path of the other node.
-        * avoid potential distributed deadlock */
-       epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
-       if (!epoch) {
-               dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
-               issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
                drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
-               if (issue_flush) {
-                       rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
-                       if (rv == FE_RECYCLED)
-                               return TRUE;
+               drbd_flush(mdev);
+
+               if (atomic_read(&mdev->current_epoch->epoch_size)) {
+                       epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+                       if (epoch)
+                               break;
                }
 
-               drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
+               epoch = mdev->current_epoch;
+               wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
+
+               D_ASSERT(atomic_read(&epoch->active) == 0);
+               D_ASSERT(epoch->flags == 0);
 
                return TRUE;
+       default:
+               dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
+               return FALSE;
        }
 
        epoch->flags = 0;
@@ -1455,7 +1340,7 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
                data_size -= rr;
        }
        kunmap(page);
-       drbd_pp_free(mdev, page);
+       drbd_pp_free(mdev, page, 0);
        return rv;
 }
 
@@ -1564,6 +1449,13 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
        if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
                return TRUE;
 
+       /* drbd_submit_ee currently fails for one reason only:
+        * not being able to allocate enough bios.
+        * Is dropping the connection going to help? */
+       spin_lock_irq(&mdev->req_lock);
+       list_del(&e->w.list);
+       spin_unlock_irq(&mdev->req_lock);
+
        drbd_free_ee(mdev, e);
 fail:
        put_ldev(mdev);
@@ -1621,7 +1513,7 @@ static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, un
 
                ok = drbd_drain_block(mdev, data_size);
 
-               drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+               drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
        }
 
        atomic_add(data_size >> 9, &mdev->rs_sect_in);
@@ -1636,15 +1528,8 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 {
        struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
        sector_t sector = e->sector;
-       struct drbd_epoch *epoch;
        int ok = 1, pcmd;
 
-       if (e->flags & EE_IS_BARRIER) {
-               epoch = previous_epoch(mdev, e->epoch);
-               if (epoch)
-                       drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
-       }
-
        if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
                if (likely((e->flags & EE_WAS_ERROR) == 0)) {
                        pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
@@ -1748,6 +1633,18 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
        return ret;
 }
 
+static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
+{
+       if (mdev->agreed_pro_version >= 95)
+               return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
+                       (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
+                       (dpf & DP_FUA ? REQ_FUA : 0) |
+                       (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
+                       (dpf & DP_DISCARD ? REQ_DISCARD : 0);
+       else
+               return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
+}
+
 /* mirrored write */
 static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
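
The write_flags_to_bio() helper added above replaces the old REQ_HARDBARRIER special-casing: for protocol 95 and newer, each wire flag in dp_flags maps onto one bio request flag. A minimal user-space sketch of that mapping follows; the DP_*/REQ_* bit values below are placeholders, not the real header definitions:

#include <stdio.h>

/* Placeholder bit values, for illustration only; the real DP_* and
 * REQ_* constants live in the DRBD and block-layer headers. */
enum { DP_RW_SYNC = 1 << 1, DP_UNPLUG = 1 << 3, DP_FUA = 1 << 4,
       DP_FLUSH  = 1 << 5, DP_DISCARD = 1 << 6 };
enum { REQ_SYNC = 1 << 0, REQ_UNPLUG = 1 << 1, REQ_FUA = 1 << 2,
       REQ_FLUSH = 1 << 3, REQ_DISCARD = 1 << 4 };

/* Same shape as write_flags_to_bio() for agreed_pro_version >= 95:
 * each wire flag translates to exactly one request flag.
 * DP_FLUSH asks for a flush of the backing device. */
static unsigned long wire_to_bio_flags(unsigned dpf)
{
	return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
		(dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
		(dpf & DP_FUA ? REQ_FUA : 0) |
		(dpf & DP_FLUSH ? REQ_FLUSH : 0) |
		(dpf & DP_DISCARD ? REQ_DISCARD : 0);
}

int main(void)
{
	/* A peer asking for a flush + FUA write sets both request bits. */
	printf("0x%lx\n", wire_to_bio_flags(DP_FUA | DP_FLUSH));
	return 0;
}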
@@ -1766,7 +1663,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
                        mdev->peer_seq++;
                spin_unlock(&mdev->peer_seq_lock);
 
-               drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+               drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
                atomic_inc(&mdev->current_epoch->epoch_size);
                return drbd_drain_block(mdev, data_size);
        }
@@ -1789,36 +1686,11 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
        e->epoch = mdev->current_epoch;
        atomic_inc(&e->epoch->epoch_size);
        atomic_inc(&e->epoch->active);
-
-       if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
-               struct drbd_epoch *epoch;
-               /* Issue a barrier if we start a new epoch, and the previous epoch
-                  was not a epoch containing a single request which already was
-                  a Barrier. */
-               epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
-               if (epoch == e->epoch) {
-                       set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
-                       rw |= REQ_HARDBARRIER;
-                       e->flags |= EE_IS_BARRIER;
-               } else {
-                       if (atomic_read(&epoch->epoch_size) > 1 ||
-                           !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
-                               set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
-                               set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
-                               rw |= REQ_HARDBARRIER;
-                               e->flags |= EE_IS_BARRIER;
-                       }
-               }
-       }
        spin_unlock(&mdev->epoch_lock);
 
        dp_flags = be32_to_cpu(p->dp_flags);
-       if (dp_flags & DP_HARDBARRIER) {
-               dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
-               /* rw |= REQ_HARDBARRIER; */
-       }
-       if (dp_flags & DP_RW_SYNC)
-               rw |= REQ_SYNC | REQ_UNPLUG;
+       rw |= write_flags_to_bio(mdev, dp_flags);
+
        if (dp_flags & DP_MAY_SET_IN_SYNC)
                e->flags |= EE_MAY_SET_IN_SYNC;
 
@@ -1971,16 +1843,27 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
                break;
        }
 
-       if (mdev->state.pdsk == D_DISKLESS) {
+       if (mdev->state.pdsk < D_INCONSISTENT) {
                /* In case we have the only disk of the cluster, */
                drbd_set_out_of_sync(mdev, e->sector, e->size);
                e->flags |= EE_CALL_AL_COMPLETE_IO;
+               e->flags &= ~EE_MAY_SET_IN_SYNC;
                drbd_al_begin_io(mdev, e->sector);
        }
 
        if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
                return TRUE;
 
+       /* drbd_submit_ee currently fails for one reason only:
+        * not being able to allocate enough bios.
+        * Is dropping the connection going to help? */
+       spin_lock_irq(&mdev->req_lock);
+       list_del(&e->w.list);
+       hlist_del_init(&e->colision);
+       spin_unlock_irq(&mdev->req_lock);
+       if (e->flags & EE_CALL_AL_COMPLETE_IO)
+               drbd_al_complete_io(mdev, e->sector);
+
 out_interrupted:
        /* yes, the epoch_size now is imbalanced.
         * but we drop the connection anyways, so we don't have a chance to
@@ -2045,7 +1928,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
        const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
        struct drbd_epoch_entry *e;
        struct digest_info *di = NULL;
-       int size;
+       int size, verb;
        unsigned int fault_type;
        struct p_block_req *p = &mdev->data.rbuf.block_req;
 
@@ -2064,12 +1947,31 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
        }
 
        if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
-               if (__ratelimit(&drbd_ratelimit_state))
+               verb = 1;
+               switch (cmd) {
+               case P_DATA_REQUEST:
+                       drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
+                       break;
+               case P_RS_DATA_REQUEST:
+               case P_CSUM_RS_REQUEST:
+               case P_OV_REQUEST:
+                       drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY, p);
+                       break;
+               case P_OV_REPLY:
+                       verb = 0;
+                       dec_rs_pending(mdev);
+                       drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
+                       break;
+               default:
+                       dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
+                               cmdname(cmd));
+               }
+               if (verb && __ratelimit(&drbd_ratelimit_state))
                        dev_err(DEV, "Can not satisfy peer's read request, "
                            "no local data.\n");
-               drbd_send_ack_rp(mdev, cmd == P_DATA_REQUEST ? P_NEG_DREPLY :
-                                P_NEG_RS_DREPLY , p);
-               return TRUE;
+
+               /* drain possible payload */
+               return drbd_drain_block(mdev, digest_size);
        }
 
        /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
@@ -2122,10 +2024,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
                break;
 
        case P_OV_REQUEST:
-               if (mdev->state.conn >= C_CONNECTED &&
-                   mdev->state.conn != C_VERIFY_T)
-                       dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
-                               drbd_conn_str(mdev->state.conn));
                if (mdev->ov_start_sector == ~(sector_t)0 &&
                    mdev->agreed_pro_version >= 90) {
                        mdev->ov_start_sector = sector;
@@ -2184,6 +2082,14 @@ submit:
        if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
                return TRUE;
 
+       /* drbd_submit_ee currently fails for one reason only:
+        * not being able to allocate enough bios.
+        * Is dropping the connection going to help? */
+       spin_lock_irq(&mdev->req_lock);
+       list_del(&e->w.list);
+       spin_unlock_irq(&mdev->req_lock);
+       /* no drbd_rs_complete_io(), we are dropping the connection anyways */
+
 out_free_e:
        put_ldev(mdev);
        drbd_free_ee(mdev, e);
@@ -3071,6 +2977,8 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 
                if (mdev->agreed_pro_version < 94)
                        max_seg_s = be32_to_cpu(p->max_segment_size);
+               else if (mdev->agreed_pro_version == 94)
+                       max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
                else /* drbd 8.3.8 onwards */
                        max_seg_s = DRBD_MAX_SEGMENT_SIZE;
 
@@ -3222,8 +3130,7 @@ static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
 static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
        struct p_state *p = &mdev->data.rbuf.state;
-       enum drbd_conns nconn, oconn;
-       union drbd_state ns, peer_state;
+       union drbd_state os, ns, peer_state;
        enum drbd_disk_state real_peer_disk;
        enum chg_state_flags cs_flags;
        int rv;
@@ -3238,40 +3145,74 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 
        spin_lock_irq(&mdev->req_lock);
  retry:
-       oconn = nconn = mdev->state.conn;
+       os = ns = mdev->state;
        spin_unlock_irq(&mdev->req_lock);
 
-       if (nconn == C_WF_REPORT_PARAMS)
-               nconn = C_CONNECTED;
+       /* peer says his disk is uptodate, while we think it is inconsistent,
+        * and this happens while we think we have a sync going on. */
+       if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
+           os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
+               /* If we are (becoming) SyncSource, but peer is still in sync
+                * preparation, ignore its uptodate-ness to avoid flapping, it
+                * will change to inconsistent once the peer reaches active
+                * syncing states.
+                * It may have changed syncer-paused flags, however, so we
+                * cannot ignore this completely. */
+               if (peer_state.conn > C_CONNECTED &&
+                   peer_state.conn < C_SYNC_SOURCE)
+                       real_peer_disk = D_INCONSISTENT;
+
+               /* if peer_state changes to connected at the same time,
+                * it explicitly notifies us that it finished resync.
+                * Maybe we should finish it up, too? */
+               else if (os.conn >= C_SYNC_SOURCE &&
+                        peer_state.conn == C_CONNECTED) {
+                       if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
+                               drbd_resync_finished(mdev);
+                       return TRUE;
+               }
+       }
+
+       /* peer says his disk is inconsistent, while we think it is uptodate,
+        * and this happens while the peer still thinks we have a sync going on,
+        * but we think we are already done with the sync.
+        * We ignore this to avoid flapping pdsk.
+        * This should not happen, if the peer is a recent version of drbd. */
+       if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
+           os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
+               real_peer_disk = D_UP_TO_DATE;
+
+       if (ns.conn == C_WF_REPORT_PARAMS)
+               ns.conn = C_CONNECTED;
 
        if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
            get_ldev_if_state(mdev, D_NEGOTIATING)) {
                int cr; /* consider resync */
 
                /* if we established a new connection */
-               cr  = (oconn < C_CONNECTED);
+               cr  = (os.conn < C_CONNECTED);
                /* if we had an established connection
                 * and one of the nodes newly attaches a disk */
-               cr |= (oconn == C_CONNECTED &&
+               cr |= (os.conn == C_CONNECTED &&
                       (peer_state.disk == D_NEGOTIATING ||
-                       mdev->state.disk == D_NEGOTIATING));
+                       os.disk == D_NEGOTIATING));
                /* if we have both been inconsistent, and the peer has been
                 * forced to be UpToDate with --overwrite-data */
                cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
                /* if we had been plain connected, and the admin requested to
                 * start a sync by "invalidate" or "invalidate-remote" */
-               cr |= (oconn == C_CONNECTED &&
+               cr |= (os.conn == C_CONNECTED &&
                                (peer_state.conn >= C_STARTING_SYNC_S &&
                                 peer_state.conn <= C_WF_BITMAP_T));
 
                if (cr)
-                       nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
+                       ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
 
                put_ldev(mdev);
-               if (nconn == C_MASK) {
-                       nconn = C_CONNECTED;
+               if (ns.conn == C_MASK) {
+                       ns.conn = C_CONNECTED;
                        if (mdev->state.disk == D_NEGOTIATING) {
-                               drbd_force_state(mdev, NS(disk, D_DISKLESS));
+                               drbd_force_state(mdev, NS(disk, D_FAILED));
                        } else if (peer_state.disk == D_NEGOTIATING) {
                                dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
                                peer_state.disk = D_DISKLESS;
@@ -3279,7 +3220,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
                        } else {
                                if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
                                        return FALSE;
-                               D_ASSERT(oconn == C_WF_REPORT_PARAMS);
+                               D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
                                drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
                                return FALSE;
                        }
@@ -3287,18 +3228,16 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
        }
 
        spin_lock_irq(&mdev->req_lock);
-       if (mdev->state.conn != oconn)
+       if (mdev->state.i != os.i)
                goto retry;
        clear_bit(CONSIDER_RESYNC, &mdev->flags);
-       ns.i = mdev->state.i;
-       ns.conn = nconn;
        ns.peer = peer_state.role;
        ns.pdsk = real_peer_disk;
        ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
-       if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+       if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
                ns.disk = mdev->new_state_tmp.disk;
-       cs_flags = CS_VERBOSE + (oconn < C_CONNECTED && nconn >= C_CONNECTED ? 0 : CS_HARD);
-       if (ns.pdsk == D_CONSISTENT && ns.susp && nconn == C_CONNECTED && oconn < C_CONNECTED &&
+       cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
+       if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
            test_bit(NEW_CUR_UUID, &mdev->flags)) {
                /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
                   for temporal network outages! */
@@ -3319,8 +3258,8 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
                return FALSE;
        }
 
-       if (oconn > C_WF_REPORT_PARAMS) {
-               if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+       if (os.conn > C_WF_REPORT_PARAMS) {
+               if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
                    peer_state.disk != D_NEGOTIATING ) {
                        /* we want resync, peer has not yet decided to sync... */
                        /* Nowadays only used when forcing a node into primary role and
@@ -3402,7 +3341,7 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
        u64 tmp;
        unsigned long s = c->bit_offset;
        unsigned long e;
-       int len = p->head.length - (sizeof(*p) - sizeof(p->head));
+       int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
        int toggle = DCBP_get_start(p);
        int have;
        int bits;
@@ -3551,8 +3490,8 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne
                        memcpy(p, h, sizeof(*h));
                        if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
                                goto out;
-                       if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
-                               dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
+                       if (data_size <= (sizeof(*p) - sizeof(p->head))) {
+                               dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
                                return FAILED;
                        }
                        ret = decode_bitmap_c(mdev, p, &c);
@@ -3714,6 +3653,9 @@ static void drbdd(struct drbd_conf *mdev)
        err_out:
                drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
        }
+       /* If we leave here, we probably want to update at least the
+        * "Connected" indicator on stable storage. Do so explicitly here. */
+       drbd_md_sync(mdev);
 }
 
 void drbd_flush_workqueue(struct drbd_conf *mdev)
@@ -3798,7 +3740,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 
        /* make sure syncer is stopped and w_resume_next_sg queued */
        del_timer_sync(&mdev->resync_timer);
-       set_bit(STOP_SYNC_TIMER, &mdev->flags);
        resync_timer_fn((unsigned long)mdev);
 
        /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
@@ -3813,7 +3754,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
        kfree(mdev->p_uuid);
        mdev->p_uuid = NULL;
 
-       if (!mdev->state.susp)
+       if (!is_susp(mdev->state))
                tl_clear(mdev);
 
        dev_info(DEV, "Connection closed\n");
@@ -3842,7 +3783,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
        if (os.conn == C_DISCONNECTING) {
                wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
 
-               if (!mdev->state.susp) {
+               if (!is_susp(mdev->state)) {
                        /* we must not free the tl_hash
                         * while application io is still on the fly */
                        wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
@@ -3867,6 +3808,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
        i = drbd_release_ee(mdev, &mdev->net_ee);
        if (i)
                dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
+       i = atomic_read(&mdev->pp_in_use_by_net);
+       if (i)
+               dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
        i = atomic_read(&mdev->pp_in_use);
        if (i)
                dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
@@ -4223,10 +4167,13 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
 
        update_peer_seq(mdev, be32_to_cpu(p->seq_num));
 
-       drbd_rs_complete_io(mdev, sector);
-       drbd_set_in_sync(mdev, sector, blksize);
-       /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
-       mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+       if (get_ldev(mdev)) {
+               drbd_rs_complete_io(mdev, sector);
+               drbd_set_in_sync(mdev, sector, blksize);
+               /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+               mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+               put_ldev(mdev);
+       }
        dec_rs_pending(mdev);
        atomic_add(blksize >> 9, &mdev->rs_sect_in);
 
@@ -4405,6 +4352,9 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
        else
                ov_oos_print(mdev);
 
+       if (!get_ldev(mdev))
+               return TRUE;
+
        drbd_rs_complete_io(mdev, sector);
        dec_rs_pending(mdev);
 
@@ -4419,6 +4369,7 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
                        drbd_resync_finished(mdev);
                }
        }
+       put_ldev(mdev);
        return TRUE;
 }
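
The got_IsInSync() and got_OVResult() hunks above now take a reference on the local disk via get_ldev() before touching resync state and drop it with put_ldev() afterwards. A small user-space model of that guard pattern (illustrative semantics only, not the driver's implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy model of the get_ldev()/put_ldev() guard: local-disk bookkeeping
 * only runs while a reference on the backing device could be taken,
 * and the reference is always dropped again on the way out. */
static atomic_int ldev_refs;
static bool disk_attached = true;

static bool get_ldev(void)
{
	if (!disk_attached)
		return false;            /* detached: skip local-disk work */
	atomic_fetch_add(&ldev_refs, 1);
	return true;
}

static void put_ldev(void)
{
	atomic_fetch_sub(&ldev_refs, 1);
}

static void handle_in_sync_ack(void)
{
	if (get_ldev()) {
		/* ... resync bookkeeping that touches the local disk ... */
		put_ldev();
	}
	/* accounting that does not need the disk continues either way */
}

int main(void)
{
	handle_in_sync_ack();
	printf("ldev_refs after: %d\n", atomic_load(&ldev_refs));
	return 0;
}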
 
@@ -4492,10 +4443,8 @@ int drbd_asender(struct drbd_thread *thi)
                while (1) {
                        clear_bit(SIGNAL_ASENDER, &mdev->flags);
                        flush_signals(current);
-                       if (!drbd_process_done_ee(mdev)) {
-                               dev_err(DEV, "process_done_ee() = NOT_OK\n");
+                       if (!drbd_process_done_ee(mdev))
                                goto reconnect;
-                       }
                        /* to avoid race with newly queued ACKs */
                        set_bit(SIGNAL_ASENDER, &mdev->flags);
                        spin_lock_irq(&mdev->req_lock);
@@ -4554,17 +4503,19 @@ int drbd_asender(struct drbd_thread *thi)
 
                if (received == expect && cmd == NULL) {
                        if (unlikely(h->magic != BE_DRBD_MAGIC)) {
-                               dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
-                                   (long)be32_to_cpu(h->magic),
-                                   h->command, h->length);
+                               dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
+                                   be32_to_cpu(h->magic),
+                                   be16_to_cpu(h->command),
+                                   be16_to_cpu(h->length));
                                goto reconnect;
                        }
                        cmd = get_asender_cmd(be16_to_cpu(h->command));
                        len = be16_to_cpu(h->length);
                        if (unlikely(cmd == NULL)) {
-                               dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
-                                   (long)be32_to_cpu(h->magic),
-                                   h->command, h->length);
+                               dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
+                                   be32_to_cpu(h->magic),
+                                   be16_to_cpu(h->command),
+                                   be16_to_cpu(h->length));
                                goto disconnect;
                        }
                        expect = cmd->pkt_size;
@@ -4586,10 +4537,12 @@ int drbd_asender(struct drbd_thread *thi)
        if (0) {
 reconnect:
                drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+               drbd_md_sync(mdev);
        }
        if (0) {
 disconnect:
                drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+               drbd_md_sync(mdev);
        }
        clear_bit(SIGNAL_ASENDER, &mdev->flags);