net/rds/recv.c

   1 /*
   2  * Copyright (c) 2006 Oracle.  All rights reserved.
   3  *
   4  * This software is available to you under a choice of one of two
   5  * licenses.  You may choose to be licensed under the terms of the GNU
   6  * General Public License (GPL) Version 2, available from the file
   7  * COPYING in the main directory of this source tree, or the
   8  * OpenIB.org BSD license below:
   9  *
  10  *     Redistribution and use in source and binary forms, with or
  11  *     without modification, are permitted provided that the following
  12  *     conditions are met:
  13  *
  14  *      - Redistributions of source code must retain the above
  15  *        copyright notice, this list of conditions and the following
  16  *        disclaimer.
  17  *
  18  *      - Redistributions in binary form must reproduce the above
  19  *        copyright notice, this list of conditions and the following
  20  *        disclaimer in the documentation and/or other materials
  21  *        provided with the distribution.
  22  *
  23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30  * SOFTWARE.
  31  *
  32  */
  33 #include <linux/kernel.h>
  34 #include <linux/slab.h>
  35 #include <net/sock.h>
  36 #include <linux/in.h>
  37
  38 #include "rds.h"
  39
  40 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
  41                   __be32 saddr)
  42 {
  43         atomic_set(&inc->i_refcount, 1);
  44         INIT_LIST_HEAD(&inc->i_item);
  45         inc->i_conn = conn;
  46         inc->i_saddr = saddr;
  47         inc->i_rdma_cookie = 0;
  48 }
  49 EXPORT_SYMBOL_GPL(rds_inc_init);
  50
  51 static void rds_inc_addref(struct rds_incoming *inc)
  52 {
  53         rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
  54         atomic_inc(&inc->i_refcount);
  55 }
  56
  57 void rds_inc_put(struct rds_incoming *inc)
  58 {
  59         rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
  60         if (atomic_dec_and_test(&inc->i_refcount)) {
  61                 BUG_ON(!list_empty(&inc->i_item));
  62
  63                 inc->i_conn->c_trans->inc_free(inc);
  64         }
  65 }
  66 EXPORT_SYMBOL_GPL(rds_inc_put);
  67
  68 static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
  69                                   struct rds_cong_map *map,
  70                                   int delta, __be16 port)
  71 {
  72         int now_congested;
  73
  74         if (delta == 0)
  75                 return;
  76
  77         rs->rs_rcv_bytes += delta;
  78         now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
  79
  80         rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
  81           "now_cong %d delta %d\n",
  82           rs, &rs->rs_bound_addr,
  83           ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
  84           rds_sk_rcvbuf(rs), now_congested, delta);
  85
  86         /* wasn't -> am congested */
  87         if (!rs->rs_congested && now_congested) {
  88                 rs->rs_congested = 1;
  89                 rds_cong_set_bit(map, port);
  90                 rds_cong_queue_updates(map);
  91         }
  92         /* was -> aren't congested */
  93         /* Require more free space before reporting uncongested to prevent
  94            bouncing cong/uncong state too often */
  95         else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
  96                 rs->rs_congested = 0;
  97                 rds_cong_clear_bit(map, port);
  98                 rds_cong_queue_updates(map);
  99         }
 100
 101         /* do nothing if no change in cong state */
 102 }
 103
 104 /*
 105  * Process all extension headers that come with this message.
 106  */
 107 static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
 108 {
 109         struct rds_header *hdr = &inc->i_hdr;
 110         unsigned int pos = 0, type, len;
 111         union {
 112                 struct rds_ext_header_version version;
 113                 struct rds_ext_header_rdma rdma;
 114                 struct rds_ext_header_rdma_dest rdma_dest;
 115         } buffer;
 116
 117         while (1) {
 118                 len = sizeof(buffer);
 119                 type = rds_message_next_extension(hdr, &pos, &buffer, &len);
 120                 if (type == RDS_EXTHDR_NONE)
 121                         break;
 122                 /* Process extension header here */
 123                 switch (type) {
 124                 case RDS_EXTHDR_RDMA:
 125                         rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
 126                         break;
 127
 128                 case RDS_EXTHDR_RDMA_DEST:
 129                         /* We ignore the size for now. We could stash it
 130                          * somewhere and use it for error checking. */
 131                         inc->i_rdma_cookie = rds_rdma_make_cookie(
 132                                         be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
 133                                         be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
 134
 135                         break;
 136                 }
 137         }
 138 }
 139
 140 /*
 141  * The transport must make sure that this is serialized against other
 142  * rx and conn reset on this specific conn.
 143  *
 144  * We currently assert that only one fragmented message will be sent
 145  * down a connection at a time.  This lets us reassemble in the conn
 146  * instead of per-flow which means that we don't have to go digging through
 147  * flows to tear down partial reassembly progress on conn failure and
 148  * we save flow lookup and locking for each frag arrival.  It does mean
 149  * that small messages will wait behind large ones.  Fragmenting at all
 150  * is only to reduce the memory consumption of pre-posted buffers.
 151  *
 152  * The caller passes in saddr and daddr instead of us getting it from the
 153  * conn.  This lets loopback, who only has one conn for both directions,
 154  * tell us which roles the addrs in the conn are playing for this message.
 155  */
 156 void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 157                        struct rds_incoming *inc, gfp_t gfp, enum km_type km)
 158 {
 159         struct rds_sock *rs = NULL;
 160         struct sock *sk;
 161         unsigned long flags;
 162
 163         inc->i_conn = conn;
 164         inc->i_rx_jiffies = jiffies;
 165
 166         rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
 167                  "flags 0x%x rx_jiffies %lu\n", conn,
 168                  (unsigned long long)conn->c_next_rx_seq,
 169                  inc,
 170                  (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
 171                  be32_to_cpu(inc->i_hdr.h_len),
 172                  be16_to_cpu(inc->i_hdr.h_sport),
 173                  be16_to_cpu(inc->i_hdr.h_dport),
 174                  inc->i_hdr.h_flags,
 175                  inc->i_rx_jiffies);
 176
 177         /*
 178          * Sequence numbers should only increase.  Messages get their
 179          * sequence number as they're queued in a sending conn.  They
 180          * can be dropped, though, if the sending socket is closed before
 181          * they hit the wire.  So sequence numbers can skip forward
 182          * under normal operation.  They can also drop back in the conn
 183          * failover case as previously sent messages are resent down the
 184          * new instance of a conn.  We drop those, otherwise we have
 185          * to assume that the next valid seq does not come after a
 186          * hole in the fragment stream.
 187          *
 188          * The headers don't give us a way to realize if fragments of
 189          * a message have been dropped.  We assume that frags that arrive
 190          * to a flow are part of the current message on the flow that is
 191          * being reassembled.  This means that senders can't drop messages
 192          * from the sending conn until all their frags are sent.
 193          *
 194          * XXX we could spend more on the wire to get more robust failure
 195          * detection, arguably worth it to avoid data corruption.
 196          */
 197         if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
 198             (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
 199                 rds_stats_inc(s_recv_drop_old_seq);
 200                 goto out;
 201         }
 202         conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
 203
 204         if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
 205                 rds_stats_inc(s_recv_ping);
 206                 rds_send_pong(conn, inc->i_hdr.h_sport);
 207                 goto out;
 208         }
 209
 210         rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
 211         if (!rs) {
 212                 rds_stats_inc(s_recv_drop_no_sock);
 213                 goto out;
 214         }
 215
 216         /* Process extension headers */
 217         rds_recv_incoming_exthdrs(inc, rs);
 218
 219         /* We can be racing with rds_release() which marks the socket dead. */
 220         sk = rds_rs_to_sk(rs);
 221
 222         /* serialize with rds_release -> sock_orphan */
 223         write_lock_irqsave(&rs->rs_recv_lock, flags);
 224         if (!sock_flag(sk, SOCK_DEAD)) {
 225                 rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
 226                 rds_stats_inc(s_recv_queued);
 227                 rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
 228                                       be32_to_cpu(inc->i_hdr.h_len),
 229                                       inc->i_hdr.h_dport);
 230                 rds_inc_addref(inc);
 231                 list_add_tail(&inc->i_item, &rs->rs_recv_queue);
 232                 __rds_wake_sk_sleep(sk);
 233         } else {
 234                 rds_stats_inc(s_recv_drop_dead_sock);
 235         }
 236         write_unlock_irqrestore(&rs->rs_recv_lock, flags);
 237
 238 out:
 239         if (rs)
 240                 rds_sock_put(rs);
 241 }
 242 EXPORT_SYMBOL_GPL(rds_recv_incoming);
 243
 244 /*
 245  * be very careful here.  This is being called as the condition in
 246  * wait_event_*() needs to cope with being called many times.
 247  */
 248 static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
 249 {
 250         unsigned long flags;
 251
 252         if (!*inc) {
 253                 read_lock_irqsave(&rs->rs_recv_lock, flags);
 254                 if (!list_empty(&rs->rs_recv_queue)) {
 255                         *inc = list_entry(rs->rs_recv_queue.next,
 256                                           struct rds_incoming,
 257                                           i_item);
 258                         rds_inc_addref(*inc);
 259                 }
 260                 read_unlock_irqrestore(&rs->rs_recv_lock, flags);
 261         }
 262
 263         return *inc != NULL;
 264 }
 265
 266 static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
 267                             int drop)
 268 {
 269         struct sock *sk = rds_rs_to_sk(rs);
 270         int ret = 0;
 271         unsigned long flags;
 272
 273         write_lock_irqsave(&rs->rs_recv_lock, flags);
 274         if (!list_empty(&inc->i_item)) {
 275                 ret = 1;
 276                 if (drop) {
 277                         /* XXX make sure this i_conn is reliable */
 278                         rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
 279                                               -be32_to_cpu(inc->i_hdr.h_len),
 280                                               inc->i_hdr.h_dport);
 281                         list_del_init(&inc->i_item);
 282                         rds_inc_put(inc);
 283                 }
 284         }
 285         write_unlock_irqrestore(&rs->rs_recv_lock, flags);
 286
 287         rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
 288         return ret;
 289 }
 290
 291 /*
 292  * Pull errors off the error queue.
 293  * If msghdr is NULL, we will just purge the error queue.
 294  */
 295 int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
 296 {
 297         struct rds_notifier *notifier;
 298         struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
 299         unsigned int count = 0, max_messages = ~0U;
 300         unsigned long flags;
 301         LIST_HEAD(copy);
 302         int err = 0;
 303
 304
 305         /* put_cmsg copies to user space and thus may sleep. We can't do this
 306          * with rs_lock held, so first grab as many notifications as we can stuff
 307          * in the user provided cmsg buffer. We don't try to copy more, to avoid
 308          * losing notifications - except when the buffer is so small that it wouldn't
 309          * even hold a single notification. Then we give him as much of this single
 310          * msg as we can squeeze in, and set MSG_CTRUNC.
 311          */
 312         if (msghdr) {
 313                 max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
 314                 if (!max_messages)
 315                         max_messages = 1;
 316         }
 317
 318         spin_lock_irqsave(&rs->rs_lock, flags);
 319         while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
 320                 notifier = list_entry(rs->rs_notify_queue.next,
 321                                 struct rds_notifier, n_list);
 322                 list_move(&notifier->n_list, &copy);
 323                 count++;
 324         }
 325         spin_unlock_irqrestore(&rs->rs_lock, flags);
 326
 327         if (!count)
 328                 return 0;
 329
 330         while (!list_empty(&copy)) {
 331                 notifier = list_entry(copy.next, struct rds_notifier, n_list);
 332
 333                 if (msghdr) {
 334                         cmsg.user_token = notifier->n_user_token;
 335                         cmsg.status = notifier->n_status;
 336
 337                         err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
 338                                        sizeof(cmsg), &cmsg);
 339                         if (err)
 340                                 break;
 341                 }
 342
 343                 list_del_init(&notifier->n_list);
 344                 kfree(notifier);
 345         }
 346
 347         /* If we bailed out because of an error in put_cmsg,
 348          * we may be left with one or more notifications that we
 349          * didn't process. Return them to the head of the list. */
 350         if (!list_empty(&copy)) {
 351                 spin_lock_irqsave(&rs->rs_lock, flags);
 352                 list_splice(&copy, &rs->rs_notify_queue);
 353                 spin_unlock_irqrestore(&rs->rs_lock, flags);
 354         }
 355
 356         return err;
 357 }
 358
 359 /*
 360  * Queue a congestion notification
 361  */
 362 static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
 363 {
 364         uint64_t notify = rs->rs_cong_notify;
 365         unsigned long flags;
 366         int err;
 367
 368         err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
 369                         sizeof(notify), &notify);
 370         if (err)
 371                 return err;
 372
 373         spin_lock_irqsave(&rs->rs_lock, flags);
 374         rs->rs_cong_notify &= ~notify;
 375         spin_unlock_irqrestore(&rs->rs_lock, flags);
 376
 377         return 0;
 378 }
 379
 380 /*
 381  * Receive any control messages.
 382  */
 383 static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
 384 {
 385         int ret = 0;
 386
 387         if (inc->i_rdma_cookie) {
 388                 ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
 389                                 sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
 390                 if (ret)
 391                         return ret;
 392         }
 393
 394         return 0;
 395 }
 396
 397 int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 398                 size_t size, int msg_flags)
 399 {
 400         struct sock *sk = sock->sk;
 401         struct rds_sock *rs = rds_sk_to_rs(sk);
 402         long timeo;
 403         int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
 404         struct sockaddr_in *sin;
 405         struct rds_incoming *inc = NULL;
 406
 407         /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
 408         timeo = sock_rcvtimeo(sk, nonblock);
 409
 410         rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
 411
 412         if (msg_flags & MSG_OOB)
 413                 goto out;
 414
 415         while (1) {
 416                 /* If there are pending notifications, do those - and nothing else */
 417                 if (!list_empty(&rs->rs_notify_queue)) {
 418                         ret = rds_notify_queue_get(rs, msg);
 419                         break;
 420                 }
 421
 422                 if (rs->rs_cong_notify) {
 423                         ret = rds_notify_cong(rs, msg);
 424                         break;
 425                 }
 426
 427                 if (!rds_next_incoming(rs, &inc)) {
 428                         if (nonblock) {
 429                                 ret = -EAGAIN;
 430                                 break;
 431                         }
 432
 433                         timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
 434                                         (!list_empty(&rs->rs_notify_queue) ||
 435                                          rs->rs_cong_notify ||
 436                                          rds_next_incoming(rs, &inc)), timeo);
 437                         rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
 438                                  timeo);
 439                         if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
 440                                 continue;
 441
 442                         ret = timeo;
 443                         if (ret == 0)
 444                                 ret = -ETIMEDOUT;
 445                         break;
 446                 }
 447
 448                 rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
 449                          &inc->i_conn->c_faddr,
 450                          ntohs(inc->i_hdr.h_sport));
 451                 ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
 452                                                              size);
 453                 if (ret < 0)
 454                         break;
 455
 456                 /*
 457                  * if the message we just copied isn't at the head of the
 458                  * recv queue then someone else raced us to return it, try
 459                  * to get the next message.
 460                  */
 461                 if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
 462                         rds_inc_put(inc);
 463                         inc = NULL;
 464                         rds_stats_inc(s_recv_deliver_raced);
 465                         continue;
 466                 }
 467
 468                 if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
 469                         if (msg_flags & MSG_TRUNC)
 470                                 ret = be32_to_cpu(inc->i_hdr.h_len);
 471                         msg->msg_flags |= MSG_TRUNC;
 472                 }
 473
 474                 if (rds_cmsg_recv(inc, msg)) {
 475                         ret = -EFAULT;
 476                         goto out;
 477                 }
 478
 479                 rds_stats_inc(s_recv_delivered);
 480
 481                 sin = (struct sockaddr_in *)msg->msg_name;
 482                 if (sin) {
 483                         sin->sin_family = AF_INET;
 484                         sin->sin_port = inc->i_hdr.h_sport;
 485                         sin->sin_addr.s_addr = inc->i_saddr;
 486                         memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
 487                 }
 488                 break;
 489         }
 490
 491         if (inc)
 492                 rds_inc_put(inc);
 493
 494 out:
 495         return ret;
 496 }
 497
 498 /*
 499  * The socket is being shut down and we're asked to drop messages that were
 500  * queued for recvmsg.  The caller has unbound the socket so the receive path
 501  * won't queue any more incoming fragments or messages on the socket.
 502  */
 503 void rds_clear_recv_queue(struct rds_sock *rs)
 504 {
 505         struct sock *sk = rds_rs_to_sk(rs);
 506         struct rds_incoming *inc, *tmp;
 507         unsigned long flags;
 508
 509         write_lock_irqsave(&rs->rs_recv_lock, flags);
 510         list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
 511                 rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
 512                                       -be32_to_cpu(inc->i_hdr.h_len),
 513                                       inc->i_hdr.h_dport);
 514                 list_del_init(&inc->i_item);
 515                 rds_inc_put(inc);
 516         }
 517         write_unlock_irqrestore(&rs->rs_recv_lock, flags);
 518 }
 519
 520 /*
 521  * inc->i_saddr isn't used here because it is only set in the receive
 522  * path.
 523  */
 524 void rds_inc_info_copy(struct rds_incoming *inc,
 525                        struct rds_info_iterator *iter,
 526                        __be32 saddr, __be32 daddr, int flip)
 527 {
 528         struct rds_info_message minfo;
 529
 530         minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
 531         minfo.len = be32_to_cpu(inc->i_hdr.h_len);
 532
 533         if (flip) {
 534                 minfo.laddr = daddr;
 535                 minfo.faddr = saddr;
 536                 minfo.lport = inc->i_hdr.h_dport;
 537                 minfo.fport = inc->i_hdr.h_sport;
 538         } else {
 539                 minfo.laddr = saddr;
 540                 minfo.faddr = daddr;
 541                 minfo.lport = inc->i_hdr.h_sport;
 542                 minfo.fport = inc->i_hdr.h_dport;
 543         }
 544
 545         rds_info_copy(iter, &minfo, sizeof(minfo));
 546 }