Merge commit 'v2.6.39-rc3' into for-2.6.39
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index ad3fc62..5c0c8be 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -140,9 +140,14 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev,
        struct hlist_node *n;
        struct hlist_head *slot;
 
-       /* before we can signal completion to the upper layers,
-        * we may need to close the current epoch */
+       /* Before we can signal completion to the upper layers,
+        * we may need to close the current epoch.
+        * We can skip this if the request has not even been sent: we did not
+        * have a fully established connection yet/anymore, e.g. during
+        * bitmap exchange, or while we are C_AHEAD due to the congestion policy.
+        */
        if (mdev->state.conn >= C_CONNECTED &&
+           (s & RQ_NET_SENT) != 0 &&
            req->epoch == mdev->newest_tle->br_number)
                queue_barrier(mdev);
 
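The `s` consulted by the new RQ_NET_SENT test is a snapshot of the request
state, presumably taken at the top of _about_to_complete_local_write(), along
the lines of:

        const unsigned long s = req->rq_state;

A request that never reached the wire must not close the epoch: queueing a
barrier for data the peer never received would break the write ordering the
barrier is meant to guarantee.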
@@ -440,7 +445,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                req->rq_state |= RQ_LOCAL_COMPLETED;
                req->rq_state &= ~RQ_LOCAL_PENDING;
 
-               __drbd_chk_io_error(mdev, FALSE);
+               __drbd_chk_io_error(mdev, false);
                _req_may_be_done_not_susp(req, m);
                put_ldev(mdev);
                break;
@@ -461,7 +466,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
                D_ASSERT(!(req->rq_state & RQ_NET_MASK));
 
-               __drbd_chk_io_error(mdev, FALSE);
+               __drbd_chk_io_error(mdev, false);
                put_ldev(mdev);
 
                /* no point in retrying if there is no good remote data,
@@ -545,6 +550,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
                break;
 
+       case queue_for_send_oos:
+               req->rq_state |= RQ_NET_QUEUED;
+               req->w.cb = w_send_oos;
+               drbd_queue_work(&mdev->data.work, &req->w);
+               break;
+
+       case oos_handed_to_network:
+               /* actually the same */
        case send_canceled:
                /* treat it the same */
        case send_failed:
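The new queue_for_send_oos event hands the request to the asynchronous worker
with w_send_oos as its callback. A sketch of what that callback is expected to
do, modeled on the drbd_worker.c side of this series (drbd_send_oos() only
tells the peer which sector range is now out of sync; it carries no payload):

        int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
        {
                struct drbd_request *req = container_of(w, struct drbd_request, w);
                int ok;

                if (unlikely(cancel)) {
                        req_mod(req, send_canceled);
                        return 1;
                }

                /* no data, just "these sectors are now out of sync" */
                ok = drbd_send_oos(mdev, req);
                req_mod(req, oos_handed_to_network);

                return ok;
        }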
@@ -558,6 +571,9 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
        case handed_over_to_network:
                /* assert something? */
+               if (bio_data_dir(req->master_bio) == WRITE)
+                       atomic_add(req->size>>9, &mdev->ap_in_flight);
+
                if (bio_data_dir(req->master_bio) == WRITE &&
                    mdev->net_conf->wire_protocol == DRBD_PROT_A) {
                        /* this is what is dangerous about protocol A:
@@ -591,6 +607,9 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                        dec_ap_pending(mdev);
                req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
                req->rq_state |= RQ_NET_DONE;
+               if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE)
+                       atomic_sub(req->size>>9, &mdev->ap_in_flight);
+
                /* if it is still queued, we may not complete it here.
                 * it will be canceled soon. */
                if (!(req->rq_state & RQ_NET_QUEUED))
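ap_in_flight is accounted in 512-byte sectors, hence the req->size>>9
conversions (a 4096-byte write contributes 4096 >> 9 == 8). The invariant
threaded through __req_mod() by this and the following hunks is one add when a
write goes out, balanced by exactly one sub on whichever event terminates it:

        atomic_add(req->size >> 9, &mdev->ap_in_flight); /* handed_over_to_network */
        atomic_sub(req->size >> 9, &mdev->ap_in_flight); /* ack, neg-ack, connection
                                                            loss while sent, or
                                                            barrier ack (protocol A) */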
@@ -628,14 +647,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                req->rq_state |= RQ_NET_OK;
                D_ASSERT(req->rq_state & RQ_NET_PENDING);
                dec_ap_pending(mdev);
+               atomic_sub(req->size>>9, &mdev->ap_in_flight);
                req->rq_state &= ~RQ_NET_PENDING;
                _req_may_be_done_not_susp(req, m);
                break;
 
        case neg_acked:
                /* assert something? */
-               if (req->rq_state & RQ_NET_PENDING)
+               if (req->rq_state & RQ_NET_PENDING) {
                        dec_ap_pending(mdev);
+                       atomic_sub(req->size>>9, &mdev->ap_in_flight);
+               }
                req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
 
                req->rq_state |= RQ_NET_DONE;
@@ -690,8 +712,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                        dev_err(DEV, "FIXME (barrier_acked but pending)\n");
                        list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
                }
-               D_ASSERT(req->rq_state & RQ_NET_SENT);
-               req->rq_state |= RQ_NET_DONE;
+               if ((req->rq_state & RQ_NET_MASK) != 0) {
+                       req->rq_state |= RQ_NET_DONE;
+                       if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
+                               atomic_sub(req->size>>9, &mdev->ap_in_flight);
+               }
                _req_may_be_done(req, m); /* Allowed while state.susp */
                break;
 
@@ -738,14 +763,14 @@ static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int s
        return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
 }
 
-static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
+static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
 {
        const int rw = bio_rw(bio);
        const int size = bio->bi_size;
        const sector_t sector = bio->bi_sector;
        struct drbd_tl_epoch *b = NULL;
        struct drbd_request *req;
-       int local, remote;
+       int local, remote, send_oos = 0;
        int err = -EIO;
        int ret = 0;
 
@@ -759,6 +784,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
                bio_endio(bio, -ENOMEM);
                return 0;
        }
+       req->start_time = start_time;
 
        local = get_ldev(mdev);
        if (!local) {
@@ -808,9 +834,9 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
                drbd_al_begin_io(mdev, sector);
        }
 
-       remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
-                           (mdev->state.pdsk == D_INCONSISTENT &&
-                            mdev->state.conn >= C_CONNECTED));
+       remote = remote && drbd_should_do_remote(mdev->state);
+       send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
+       D_ASSERT(!(remote && send_oos));
 
        if (!(local || remote) && !is_susp(mdev->state)) {
                if (__ratelimit(&drbd_ratelimit_state))
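The open-coded peer-disk test is replaced by two predicates over a state
snapshot: mirror the write while we can (connected, or the peer is a resync
target), or merely mark it out of sync while we are ahead of the peer. Judging
from their use here, the drbd_req.h helpers look roughly like:

        static inline bool drbd_should_do_remote(union drbd_state s)
        {
                return s.pdsk == D_UP_TO_DATE ||
                        (s.pdsk >= D_INCONSISTENT &&
                         s.conn >= C_WF_BITMAP_T &&
                         s.conn < C_AHEAD);
        }

        static inline bool drbd_should_send_oos(union drbd_state s)
        {
                /* pdsk is D_INCONSISTENT in both of these states */
                return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
        }

The D_ASSERT(!(remote && send_oos)) then just documents that the two
predicates are mutually exclusive by construction.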
@@ -824,7 +850,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
         * but there is a race between testing the bit and pointer outside the
         * spinlock, and grabbing the spinlock.
         * if we lost that race, we retry.  */
-       if (rw == WRITE && remote &&
+       if (rw == WRITE && (remote || send_oos) &&
            mdev->unused_spare_tle == NULL &&
            test_bit(CREATE_BARRIER, &mdev->flags)) {
 allocate_barrier:
@@ -842,18 +868,19 @@ allocate_barrier:
        if (is_susp(mdev->state)) {
                /* If we got suspended, use the retry mechanism of
                   generic_make_request() to restart processing of this
-                  bio. In the next call to drbd_make_request_26
+                  bio. In the next call to drbd_make_request
                   we sleep in inc_ap_bio() */
                ret = 1;
                spin_unlock_irq(&mdev->req_lock);
                goto fail_free_complete;
        }
 
-       if (remote) {
-               remote = (mdev->state.pdsk == D_UP_TO_DATE ||
-                           (mdev->state.pdsk == D_INCONSISTENT &&
-                            mdev->state.conn >= C_CONNECTED));
-               if (!remote)
+       if (remote || send_oos) {
+               remote = drbd_should_do_remote(mdev->state);
+               send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
+               D_ASSERT(!(remote && send_oos));
+
+               if (!(remote || send_oos))
                        dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
                if (!(local || remote)) {
                        dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
@@ -866,7 +893,7 @@ allocate_barrier:
                mdev->unused_spare_tle = b;
                b = NULL;
        }
-       if (rw == WRITE && remote &&
+       if (rw == WRITE && (remote || send_oos) &&
            mdev->unused_spare_tle == NULL &&
            test_bit(CREATE_BARRIER, &mdev->flags)) {
                /* someone closed the current epoch
@@ -889,7 +916,7 @@ allocate_barrier:
         * barrier packet.  To get the write ordering right, we only have to
         * make sure that, if this is a write request and it triggered a
         * barrier packet, this request is queued within the same spinlock. */
-       if (remote && mdev->unused_spare_tle &&
+       if ((remote || send_oos) && mdev->unused_spare_tle &&
            test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
                _tl_add_barrier(mdev, mdev->unused_spare_tle);
                mdev->unused_spare_tle = NULL;
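Both the epoch-close path in _about_to_complete_local_write() and the
congestion path added below funnel into queue_barrier(). For reference, in the
version this patch applies to it is roughly:

        static void queue_barrier(struct drbd_conf *mdev)
        {
                struct drbd_tl_epoch *b;

                /* only once per epoch; CREATE_BARRIER is cleared again when
                 * the next barrier/epoch object is added */
                if (test_and_set_bit(CREATE_BARRIER, &mdev->flags))
                        return;

                b = mdev->newest_tle;
                b->w.cb = w_send_barrier;
                /* inc_ap_pending() here so we stay balanced on connection
                 * loss; dec_ap_pending() happens in got_BarrierAck() or,
                 * on connection loss, in tl_clear() */
                inc_ap_pending(mdev);
                drbd_queue_work(&mdev->data.work, &b->w);
        }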
@@ -937,6 +964,34 @@ allocate_barrier:
                                ? queue_for_net_write
                                : queue_for_net_read);
        }
+       if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
+               _req_mod(req, queue_for_send_oos);
+
+       if (remote &&
+           mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) {
+               int congested = 0;
+
+               if (mdev->net_conf->cong_fill &&
+                   atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
+                       dev_info(DEV, "Congestion-fill threshold reached\n");
+                       congested = 1;
+               }
+
+               if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
+                       dev_info(DEV, "Congestion-extents threshold reached\n");
+                       congested = 1;
+               }
+
+               if (congested) {
+                       queue_barrier(mdev); /* last barrier, after mirrored writes */
+
+                       if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
+                               _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
+                       else /* mdev->net_conf->on_congestion == OC_DISCONNECT */
+                               _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
+               }
+       }
+
        spin_unlock_irq(&mdev->req_lock);
        kfree(b); /* if someone else has beaten us to it... */
 
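The congestion block only arms for peers speaking protocol 96 or newer and
when on-congestion is set to something other than the default blocking
behaviour. The three policies are presumably the enum shared with userland
(names inferred from their use above):

        enum drbd_on_congestion {
                OC_BLOCK,       /* default: keep blocking on the congested socket */
                OC_PULL_AHEAD,  /* go C_AHEAD: mark writes out of sync, resync later */
                OC_DISCONNECT,  /* tear the connection down instead */
        };

Note the units of the two thresholds: cong_fill is compared against
ap_in_flight and is therefore in 512-byte sectors, while cong_extents is
compared against the number of activity-log extents in use.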
@@ -949,9 +1004,9 @@ allocate_barrier:
                 * stable storage, and this is a WRITE, we may not even submit
                 * this bio. */
                if (get_ldev(mdev)) {
-                       if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
-                                            : rw == READ  ? DRBD_FAULT_DT_RD
-                                            :               DRBD_FAULT_DT_RA))
+                       if (drbd_insert_fault(mdev,   rw == WRITE ? DRBD_FAULT_DT_WR
+                                                   : rw == READ  ? DRBD_FAULT_DT_RD
+                                                   :               DRBD_FAULT_DT_RA))
                                bio_endio(req->private_bio, -EIO);
                        else
                                generic_make_request(req->private_bio);
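FAULT_ACTIVE() is renamed to drbd_insert_fault(). A sketch of the new helper,
assuming it keeps the semantics of the old macro (evaluates to false unless
fault injection is compiled in and enabled for this fault type):

        static inline int drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
        {
        #ifdef CONFIG_DRBD_FAULT_INJECTION
                return fault_rate &&
                        (enable_faults & (1 << type)) &&
                        _drbd_insert_fault(mdev, type);
        #else
                return 0;
        #endif
        }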
@@ -1018,16 +1073,19 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
        return 0;
 }
 
-int drbd_make_request_26(struct request_queue *q, struct bio *bio)
+int drbd_make_request(struct request_queue *q, struct bio *bio)
 {
        unsigned int s_enr, e_enr;
        struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
+       unsigned long start_time;
 
        if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
                bio_endio(bio, -EPERM);
                return 0;
        }
 
+       start_time = jiffies;
+
        /*
         * what we "blindly" assume:
         */
@@ -1042,12 +1100,12 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
 
        if (likely(s_enr == e_enr)) {
                inc_ap_bio(mdev, 1);
-               return drbd_make_request_common(mdev, bio);
+               return drbd_make_request_common(mdev, bio, start_time);
        }
 
        /* can this bio be split generically?
         * Maybe add our own split-arbitrary-bios function. */
-       if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
+       if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
                /* rather error out here than BUG in bio_split */
                dev_err(DEV, "bio would need to, but cannot, be split: "
                    "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
@@ -1069,11 +1127,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
                const int sps = 1 << HT_SHIFT; /* sectors per slot */
                const int mask = sps - 1;
                const sector_t first_sectors = sps - (sect & mask);
-               bp = bio_split(bio,
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
-                               bio_split_pool,
-#endif
-                               first_sectors);
+               bp = bio_split(bio, first_sectors);
 
                /* we need to get a "reference count" (ap_bio_cnt)
                 * to avoid races with the disconnect/reconnect/suspend code.
@@ -1084,10 +1138,10 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
 
                D_ASSERT(e_enr == s_enr + 1);
 
-               while (drbd_make_request_common(mdev, &bp->bio1))
+               while (drbd_make_request_common(mdev, &bp->bio1, start_time))
                        inc_ap_bio(mdev, 1);
 
-               while (drbd_make_request_common(mdev, &bp->bio2))
+               while (drbd_make_request_common(mdev, &bp->bio2, start_time))
                        inc_ap_bio(mdev, 1);
 
                dec_ap_bio(mdev);
@@ -1098,7 +1152,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
 }
 
 /* This is called by bio_add_page().  With this function we reduce
- * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs
+ * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZE
  * units (was AL_EXTENTs).
  *
  * we do the calculation within the lower 32bit of the byte offsets,
@@ -1108,7 +1162,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
  * As long as the BIO is empty we have to allow at least one bvec,
  * regardless of size and offset.  so the resulting bio may still
  * cross extent boundaries.  those are dealt with (bio_split) in
- * drbd_make_request_26.
+ * drbd_make_request.
  */
 int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
 {
@@ -1118,8 +1172,8 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
        unsigned int bio_size = bvm->bi_size;
        int limit, backing_limit;
 
-       limit = DRBD_MAX_SEGMENT_SIZE
-             - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
+       limit = DRBD_MAX_BIO_SIZE
+             - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size);
        if (limit < 0)
                limit = 0;
        if (bio_size == 0) {
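A worked example of the limit computation, assuming HT_SHIFT == 8 so that
DRBD_MAX_BIO_SIZE == 1U << (9 + HT_SHIFT) == 128 KiB:

        /* bio currently 4 KiB, starting 120 KiB into its 128 KiB window:
         *
         *   bio_offset & (DRBD_MAX_BIO_SIZE - 1) == 122880
         *   limit = 131072 - (122880 + 4096)     ==   4096
         *
         * so bio_add_page() may merge at most 4 KiB more before this bio
         * would cross the 128 KiB boundary and need to be split. */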
@@ -1136,3 +1190,42 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
        }
        return limit;
 }
+
+void request_timer_fn(unsigned long data)
+{
+       struct drbd_conf *mdev = (struct drbd_conf *) data;
+       struct drbd_request *req; /* oldest request */
+       struct list_head *le;
+       unsigned long et = 0; /* effective timeout = ko_count * timeout */
+
+       if (get_net_conf(mdev)) {
+               et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count;
+               put_net_conf(mdev);
+       }
+       if (!et || mdev->state.conn < C_WF_REPORT_PARAMS)
+               return; /* Recurring timer stopped */
+
+       spin_lock_irq(&mdev->req_lock);
+       le = &mdev->oldest_tle->requests;
+       if (list_empty(le)) {
+               spin_unlock_irq(&mdev->req_lock);
+               mod_timer(&mdev->request_timer, jiffies + et);
+               return;
+       }
+
+       le = le->prev;
+       req = list_entry(le, struct drbd_request, tl_requests);
+       if (time_is_before_eq_jiffies(req->start_time + et)) {
+               if (req->rq_state & RQ_NET_PENDING) {
+                       dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
+                       _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL);
+               } else {
+                       dev_warn(DEV, "Local backing block device frozen?\n");
+                       mod_timer(&mdev->request_timer, jiffies + et);
+               }
+       } else {
+               mod_timer(&mdev->request_timer, req->start_time + et);
+       }
+
+       spin_unlock_irq(&mdev->req_lock);
+}
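request_timer_fn() re-arms itself with mod_timer() as long as the connection
is up; it presumably gets initialized together with the other per-device
timers and kicked once the connection is established, along the lines of:

        /* e.g. in drbd_init_set_defaults(): */
        init_timer(&mdev->request_timer);
        mdev->request_timer.function = request_timer_fn;
        mdev->request_timer.data = (unsigned long) mdev;

        /* ...and started once we reach C_WF_REPORT_PARAMS: */
        mod_timer(&mdev->request_timer, jiffies + HZ);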