dccp: Policy-based packet dequeueing infrastructure
[pandora-kernel.git] / net / dccp / proto.c
1 /*
2  *  net/dccp/proto.c
3  *
4  *  An implementation of the DCCP protocol
5  *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6  *
7  *      This program is free software; you can redistribute it and/or modify it
8  *      under the terms of the GNU General Public License version 2 as
9  *      published by the Free Software Foundation.
10  */
11
12 #include <linux/dccp.h>
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/sched.h>
16 #include <linux/kernel.h>
17 #include <linux/skbuff.h>
18 #include <linux/netdevice.h>
19 #include <linux/in.h>
20 #include <linux/if_arp.h>
21 #include <linux/init.h>
22 #include <linux/random.h>
23 #include <linux/slab.h>
24 #include <net/checksum.h>
25
26 #include <net/inet_sock.h>
27 #include <net/sock.h>
28 #include <net/xfrm.h>
29
30 #include <asm/ioctls.h>
31 #include <linux/spinlock.h>
32 #include <linux/timer.h>
33 #include <linux/delay.h>
34 #include <linux/poll.h>
35
36 #include "ccid.h"
37 #include "dccp.h"
38 #include "feat.h"
39
/* Per-CPU SNMP MIB counters for DCCP (updated via DCCP_INC_STATS etc.). */
DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;

EXPORT_SYMBOL_GPL(dccp_statistics);

/* Count of sockets closed by their owner but not yet fully destroyed. */
struct percpu_counter dccp_orphan_count;
EXPORT_SYMBOL_GPL(dccp_orphan_count);

/* Global lookup tables for bound/established DCCP sockets. */
struct inet_hashinfo dccp_hashinfo;
EXPORT_SYMBOL_GPL(dccp_hashinfo);

/* the maximum queue length for tx in packets. 0 is no limit */
int sysctl_dccp_tx_qlen __read_mostly = 5;
#ifdef CONFIG_IP_DCCP_DEBUG
/* Return a human-readable name for socket state @state (debug builds only). */
static const char *dccp_state_name(const int state)
{
	static const char *const names[] = {
	[DCCP_OPEN]		= "OPEN",
	[DCCP_REQUESTING]	= "REQUESTING",
	[DCCP_PARTOPEN]		= "PARTOPEN",
	[DCCP_LISTEN]		= "LISTEN",
	[DCCP_RESPOND]		= "RESPOND",
	[DCCP_CLOSING]		= "CLOSING",
	[DCCP_ACTIVE_CLOSEREQ]	= "CLOSEREQ",
	[DCCP_PASSIVE_CLOSE]	= "PASSIVE_CLOSE",
	[DCCP_PASSIVE_CLOSEREQ]	= "PASSIVE_CLOSEREQ",
	[DCCP_TIME_WAIT]	= "TIME_WAIT",
	[DCCP_CLOSED]		= "CLOSED",
	};

	return state < DCCP_MAX_STATES ? names[state] : "INVALID STATE!";
}
#endif
76
/*
 * dccp_set_state  -  transition socket @sk to @state, keeping MIBs in sync
 *
 * Maintains DCCP_MIB_CURRESTAB (count of established sockets) across
 * transitions and, when entering CLOSED, unhashes the socket and releases
 * its local port before the state is actually changed, so a closed socket
 * is never found in the lookup tables.
 */
void dccp_set_state(struct sock *sk, const int state)
{
	const int oldstate = sk->sk_state;

	dccp_pr_debug("%s(%p)  %s  -->  %s\n", dccp_role(sk), sk,
		      dccp_state_name(oldstate), dccp_state_name(state));
	WARN_ON(state == oldstate);

	switch (state) {
	case DCCP_OPEN:
		if (oldstate != DCCP_OPEN)
			DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
		/* Client retransmits all Confirm options until entering OPEN */
		if (oldstate == DCCP_PARTOPEN)
			dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
		break;

	case DCCP_CLOSED:
		/* Count teardown of a fully/partly established connection */
		if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
		    oldstate == DCCP_CLOSING)
			DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		/* Release the port unless user space pinned it (REUSEADDR) */
		if (inet_csk(sk)->icsk_bind_hash != NULL &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(sk);
		/* fall through */
	default:
		if (oldstate == DCCP_OPEN)
			DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	sk->sk_state = state;
}

EXPORT_SYMBOL_GPL(dccp_set_state);
116
117 static void dccp_finish_passive_close(struct sock *sk)
118 {
119         switch (sk->sk_state) {
120         case DCCP_PASSIVE_CLOSE:
121                 /* Node (client or server) has received Close packet. */
122                 dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
123                 dccp_set_state(sk, DCCP_CLOSED);
124                 break;
125         case DCCP_PASSIVE_CLOSEREQ:
126                 /*
127                  * Client received CloseReq. We set the `active' flag so that
128                  * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
129                  */
130                 dccp_send_close(sk, 1);
131                 dccp_set_state(sk, DCCP_CLOSING);
132         }
133 }
134
/*
 * dccp_done  -  final transition into CLOSED
 *
 * Stops all transmit/retransmit timers and marks both directions shut
 * down. If user space still holds the socket it is merely woken up;
 * an orphaned (SOCK_DEAD) socket is destroyed immediately.
 */
void dccp_done(struct sock *sk)
{
	dccp_set_state(sk, DCCP_CLOSED);
	dccp_clear_xmit_timers(sk);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_state_change(sk);	/* wake up anyone waiting */
	else
		inet_csk_destroy_sock(sk);
}

EXPORT_SYMBOL_GPL(dccp_done);
149
150 const char *dccp_packet_name(const int type)
151 {
152         static const char *const dccp_packet_names[] = {
153                 [DCCP_PKT_REQUEST]  = "REQUEST",
154                 [DCCP_PKT_RESPONSE] = "RESPONSE",
155                 [DCCP_PKT_DATA]     = "DATA",
156                 [DCCP_PKT_ACK]      = "ACK",
157                 [DCCP_PKT_DATAACK]  = "DATAACK",
158                 [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
159                 [DCCP_PKT_CLOSE]    = "CLOSE",
160                 [DCCP_PKT_RESET]    = "RESET",
161                 [DCCP_PKT_SYNC]     = "SYNC",
162                 [DCCP_PKT_SYNCACK]  = "SYNCACK",
163         };
164
165         if (type >= DCCP_NR_PKT_TYPES)
166                 return "INVALID";
167         else
168                 return dccp_packet_names[type];
169 }
170
171 EXPORT_SYMBOL_GPL(dccp_packet_name);
172
/*
 * dccp_init_sock  -  initialise the DCCP-specific part of a new socket
 * @sk: socket being set up
 * @ctl_sock_initialized: zero only while creating the control socket,
 *	which must skip feature negotiation setup
 *
 * Returns 0, or the error from dccp_feat_init().
 */
int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_rto		= DCCP_TIMEOUT_INIT;
	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
	sk->sk_state		= DCCP_CLOSED;
	sk->sk_write_space	= dccp_write_space;
	icsk->icsk_sync_mss	= dccp_sync_mss;
	dp->dccps_mss_cache	= 536;	/* conservative initial MPS */
	dp->dccps_rate_last	= jiffies;
	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
	dp->dccps_l_ack_ratio	= dp->dccps_r_ack_ratio = 1;
	dp->dccps_tx_qlen	= sysctl_dccp_tx_qlen;

	dccp_init_xmit_timers(sk);

	INIT_LIST_HEAD(&dp->dccps_featneg);
	/* control socket doesn't need feat nego */
	if (likely(ctl_sock_initialized))
		return dccp_feat_init(sk);
	return 0;
}

EXPORT_SYMBOL_GPL(dccp_init_sock);
200
/*
 * dccp_destroy_sock  -  free all DCCP-private resources of @sk
 *
 * Called on final socket destruction: drops the pending retransmit skb,
 * releases the bound port, frees the service list, the ack vector and
 * both CCID halves, and purges outstanding feature-negotiation entries.
 */
void dccp_destroy_sock(struct sock *sk)
{
	struct dccp_sock *dp = dccp_sk(sk);

	/*
	 * DCCP doesn't use sk_write_queue, just sk_send_head
	 * for retransmissions
	 */
	if (sk->sk_send_head != NULL) {
		kfree_skb(sk->sk_send_head);
		sk->sk_send_head = NULL;
	}

	/* Clean up a referenced DCCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash != NULL)
		inet_put_port(sk);

	kfree(dp->dccps_service_list);
	dp->dccps_service_list = NULL;

	if (dp->dccps_hc_rx_ackvec != NULL) {
		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
		dp->dccps_hc_rx_ackvec = NULL;
	}
	/* Pointers are NULLed after freeing to guard against stale reuse */
	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;

	/* clean up feature negotiation state */
	dccp_feat_list_purge(&dp->dccps_featneg);
}

EXPORT_SYMBOL_GPL(dccp_destroy_sock);
234
235 static inline int dccp_listen_start(struct sock *sk, int backlog)
236 {
237         struct dccp_sock *dp = dccp_sk(sk);
238
239         dp->dccps_role = DCCP_ROLE_LISTEN;
240         /* do not start to listen if feature negotiation setup fails */
241         if (dccp_feat_finalise_settings(dp))
242                 return -EPROTO;
243         return inet_csk_listen_start(sk, backlog);
244 }
245
246 static inline int dccp_need_reset(int state)
247 {
248         return state != DCCP_CLOSED && state != DCCP_LISTEN &&
249                state != DCCP_REQUESTING;
250 }
251
252 int dccp_disconnect(struct sock *sk, int flags)
253 {
254         struct inet_connection_sock *icsk = inet_csk(sk);
255         struct inet_sock *inet = inet_sk(sk);
256         int err = 0;
257         const int old_state = sk->sk_state;
258
259         if (old_state != DCCP_CLOSED)
260                 dccp_set_state(sk, DCCP_CLOSED);
261
262         /*
263          * This corresponds to the ABORT function of RFC793, sec. 3.8
264          * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
265          */
266         if (old_state == DCCP_LISTEN) {
267                 inet_csk_listen_stop(sk);
268         } else if (dccp_need_reset(old_state)) {
269                 dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
270                 sk->sk_err = ECONNRESET;
271         } else if (old_state == DCCP_REQUESTING)
272                 sk->sk_err = ECONNRESET;
273
274         dccp_clear_xmit_timers(sk);
275
276         __skb_queue_purge(&sk->sk_receive_queue);
277         __skb_queue_purge(&sk->sk_write_queue);
278         if (sk->sk_send_head != NULL) {
279                 __kfree_skb(sk->sk_send_head);
280                 sk->sk_send_head = NULL;
281         }
282
283         inet->inet_dport = 0;
284
285         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
286                 inet_reset_saddr(sk);
287
288         sk->sk_shutdown = 0;
289         sock_reset_flag(sk, SOCK_DONE);
290
291         icsk->icsk_backoff = 0;
292         inet_csk_delack_init(sk);
293         __sk_dst_reset(sk);
294
295         WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
296
297         sk->sk_error_report(sk);
298         return err;
299 }
300
301 EXPORT_SYMBOL_GPL(dccp_disconnect);
302
/*
 *	Wait for a DCCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 *
 *	Returns the POLL* event mask currently applicable to @sock.
 */
unsigned int dccp_poll(struct file *file, struct socket *sock,
		       poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;

	sock_poll_wait(file, sk_sleep(sk), wait);
	if (sk->sk_state == DCCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	   by poll logic and correct handling of state changes
	   made by another threads is impossible in any case.
	 */

	mask = 0;
	if (sk->sk_err)
		mask = POLLERR;

	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM | POLLRDHUP;

	/* Connected? (i.e. not in one of the two handshake-pending states) */
	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
		if (atomic_read(&sk->sk_rmem_alloc) > 0)
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {  /* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		}
	}
	return mask;
}

EXPORT_SYMBOL_GPL(dccp_poll);
360
/*
 * dccp_ioctl  -  socket-level ioctl handler
 *
 * Only SIOCINQ is implemented: it reports the length of the datagram at
 * the head of the receive queue, since a single recvmsg() consumes at
 * most one packet. Returns -ENOTCONN on listening sockets and
 * -ENOIOCTLCMD for unknown commands.
 */
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int rc = -ENOTCONN;

	lock_sock(sk);

	if (sk->sk_state == DCCP_LISTEN)
		goto out;

	switch (cmd) {
	case SIOCINQ: {
		struct sk_buff *skb;
		unsigned long amount = 0;

		skb = skb_peek(&sk->sk_receive_queue);
		if (skb != NULL) {
			/*
			 * We will only return the amount of this packet since
			 * that is all that will be read.
			 */
			amount = skb->len;
		}
		rc = put_user(amount, (int __user *)arg);
	}
		break;
	default:
		rc = -ENOIOCTLCMD;
		break;
	}
out:
	release_sock(sk);
	return rc;
}

EXPORT_SYMBOL_GPL(dccp_ioctl);
396
/*
 * dccp_setsockopt_service  -  install service code (plus optional list)
 * @service: the first service code, already copied in by the caller
 * @optval:  user buffer; any u32s beyond the first form the service list
 * @optlen:  total user-buffer length in bytes
 *
 * NOTE(review): an invalid code inside the list yields -EFAULT (it shares
 * the copy_from_user error path), not -EINVAL — confirm this is intended.
 */
static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
				   char __user *optval, unsigned int optlen)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct dccp_service_list *sl = NULL;

	if (service == DCCP_SERVICE_INVALID_VALUE ||
	    optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
		return -EINVAL;

	if (optlen > sizeof(service)) {
		sl = kmalloc(optlen, GFP_KERNEL);
		if (sl == NULL)
			return -ENOMEM;

		/* entries after the first service code form the list */
		sl->dccpsl_nr = optlen / sizeof(u32) - 1;
		if (copy_from_user(sl->dccpsl_list,
				   optval + sizeof(service),
				   optlen - sizeof(service)) ||
		    dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
			kfree(sl);
			return -EFAULT;
		}
	}

	lock_sock(sk);
	dp->dccps_service = service;

	/* replace any previously installed list */
	kfree(dp->dccps_service_list);

	dp->dccps_service_list = sl;
	release_sock(sk);
	return 0;
}
431
/*
 * dccp_setsockopt_cscov  -  register a minimum checksum-coverage value
 * @cscov: requested coverage, 0..15 (0 keeps the default and is a no-op)
 * @rx:    true for the RX side (RECV_CSCOV), false for TX (SEND_CSCOV)
 *
 * Registers the value range with feature negotiation and caches the
 * result in dccps_pcrlen/dccps_pcslen on success.
 */
static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
{
	u8 *list, len;
	int i, rc;

	if (cscov < 0 || cscov > 15)
		return -EINVAL;
	/*
	 * Populate a list of permissible values, in the range cscov...15. This
	 * is necessary since feature negotiation of single values only works if
	 * both sides incidentally choose the same value. Since the list starts
	 * lowest-value first, negotiation will pick the smallest shared value.
	 */
	if (cscov == 0)
		return 0;
	len = 16 - cscov;

	list = kmalloc(len, GFP_KERNEL);
	if (list == NULL)
		return -ENOBUFS;

	for (i = 0; i < len; i++)
		list[i] = cscov++;

	rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);

	if (rc == 0) {
		/*
		 * NOTE(review): the loop above has advanced cscov to 16
		 * (initial value + len), so 16 — not the requested minimum —
		 * is what gets cached here. Verify this is intentional.
		 */
		if (rx)
			dccp_sk(sk)->dccps_pcrlen = cscov;
		else
			dccp_sk(sk)->dccps_pcslen = cscov;
	}
	kfree(list);
	return rc;
}
467
468 static int dccp_setsockopt_ccid(struct sock *sk, int type,
469                                 char __user *optval, unsigned int optlen)
470 {
471         u8 *val;
472         int rc = 0;
473
474         if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
475                 return -EINVAL;
476
477         val = memdup_user(optval, optlen);
478         if (IS_ERR(val))
479                 return PTR_ERR(val);
480
481         lock_sock(sk);
482         if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
483                 rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
484
485         if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
486                 rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
487         release_sock(sk);
488
489         kfree(val);
490         return rc;
491 }
492
/*
 * do_dccp_setsockopt  -  handle SOL_DCCP setsockopt options
 *
 * Options carrying variable-length data (CCID preference lists, the
 * service list) are dispatched before the generic int-argument copy;
 * all remaining options require at least sizeof(int) of optval.
 */
static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
		char __user *optval, unsigned int optlen)
{
	struct dccp_sock *dp = dccp_sk(sk);
	int val, err = 0;

	switch (optname) {
	case DCCP_SOCKOPT_PACKET_SIZE:
		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
		return 0;
	case DCCP_SOCKOPT_CHANGE_L:
	case DCCP_SOCKOPT_CHANGE_R:
		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
		return 0;
	case DCCP_SOCKOPT_CCID:
	case DCCP_SOCKOPT_RX_CCID:
	case DCCP_SOCKOPT_TX_CCID:
		return dccp_setsockopt_ccid(sk, optname, optval, optlen);
	}

	if (optlen < (int)sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	if (optname == DCCP_SOCKOPT_SERVICE)
		return dccp_setsockopt_service(sk, val, optval, optlen);

	lock_sock(sk);
	switch (optname) {
	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
		/* only the server side may hold the TIMEWAIT state */
		if (dp->dccps_role != DCCP_ROLE_SERVER)
			err = -EOPNOTSUPP;
		else
			dp->dccps_server_timewait = (val != 0);
		break;
	case DCCP_SOCKOPT_SEND_CSCOV:
		err = dccp_setsockopt_cscov(sk, val, false);
		break;
	case DCCP_SOCKOPT_RECV_CSCOV:
		err = dccp_setsockopt_cscov(sk, val, true);
		break;
	case DCCP_SOCKOPT_QPOLICY_ID:
		/* the queuing policy can only change while unconnected */
		if (sk->sk_state != DCCP_CLOSED)
			err = -EISCONN;
		else if (val < 0 || val >= DCCPQ_POLICY_MAX)
			err = -EINVAL;
		else
			dp->dccps_qpolicy = val;
		break;
	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
		if (val < 0)
			err = -EINVAL;
		else
			dp->dccps_tx_qlen = val;
		break;
	default:
		err = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);

	return err;
}
558
559 int dccp_setsockopt(struct sock *sk, int level, int optname,
560                     char __user *optval, unsigned int optlen)
561 {
562         if (level != SOL_DCCP)
563                 return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
564                                                              optname, optval,
565                                                              optlen);
566         return do_dccp_setsockopt(sk, level, optname, optval, optlen);
567 }
568
569 EXPORT_SYMBOL_GPL(dccp_setsockopt);
570
#ifdef CONFIG_COMPAT
/* 32-bit-compat setsockopt entry point; SOL_DCCP needs no translation. */
int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	if (level == SOL_DCCP)
		return do_dccp_setsockopt(sk, level, optname, optval, optlen);

	return inet_csk_compat_setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
#endif
583
/*
 * dccp_getsockopt_service  -  copy service code and list to user space
 *
 * Reply layout: one u32 service code followed by the dccpsl_nr codes of
 * the service list, if one is installed. Fails with -EINVAL when the
 * user buffer (@len) cannot hold the complete reply.
 */
static int dccp_getsockopt_service(struct sock *sk, int len,
				   __be32 __user *optval,
				   int __user *optlen)
{
	const struct dccp_sock *dp = dccp_sk(sk);
	const struct dccp_service_list *sl;
	int err = -ENOENT, slen = 0, total_len = sizeof(u32);

	lock_sock(sk);
	if ((sl = dp->dccps_service_list) != NULL) {
		slen = sl->dccpsl_nr * sizeof(u32);
		total_len += slen;
	}

	err = -EINVAL;
	if (total_len > len)
		goto out;

	err = 0;
	if (put_user(total_len, optlen) ||
	    put_user(dp->dccps_service, optval) ||
	    (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
		err = -EFAULT;
out:
	release_sock(sk);
	return err;
}
611
/*
 * do_dccp_getsockopt  -  handle SOL_DCCP getsockopt options
 *
 * Most options copy back a single int. SERVICE and AVAILABLE_CCIDS have
 * variable-length replies and are delegated to helpers; option numbers
 * 128..191 / 192..255 are forwarded to the RX / TX CCID respectively.
 */
static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct dccp_sock *dp;
	int val, len;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < (int)sizeof(int))
		return -EINVAL;

	dp = dccp_sk(sk);

	switch (optname) {
	case DCCP_SOCKOPT_PACKET_SIZE:
		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
		return 0;
	case DCCP_SOCKOPT_SERVICE:
		return dccp_getsockopt_service(sk, len,
					       (__be32 __user *)optval, optlen);
	case DCCP_SOCKOPT_GET_CUR_MPS:
		val = dp->dccps_mss_cache;
		break;
	case DCCP_SOCKOPT_AVAILABLE_CCIDS:
		return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
	case DCCP_SOCKOPT_TX_CCID:
		val = ccid_get_current_tx_ccid(dp);
		if (val < 0)
			return -ENOPROTOOPT;
		break;
	case DCCP_SOCKOPT_RX_CCID:
		val = ccid_get_current_rx_ccid(dp);
		if (val < 0)
			return -ENOPROTOOPT;
		break;
	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
		val = dp->dccps_server_timewait;
		break;
	case DCCP_SOCKOPT_SEND_CSCOV:
		val = dp->dccps_pcslen;
		break;
	case DCCP_SOCKOPT_RECV_CSCOV:
		val = dp->dccps_pcrlen;
		break;
	case DCCP_SOCKOPT_QPOLICY_ID:
		val = dp->dccps_qpolicy;
		break;
	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
		val = dp->dccps_tx_qlen;
		break;
	case 128 ... 191:
		return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
					     len, (u32 __user *)optval, optlen);
	case 192 ... 255:
		return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
					     len, (u32 __user *)optval, optlen);
	default:
		return -ENOPROTOOPT;
	}

	len = sizeof(val);
	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
		return -EFAULT;

	return 0;
}
679
680 int dccp_getsockopt(struct sock *sk, int level, int optname,
681                     char __user *optval, int __user *optlen)
682 {
683         if (level != SOL_DCCP)
684                 return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
685                                                              optname, optval,
686                                                              optlen);
687         return do_dccp_getsockopt(sk, level, optname, optval, optlen);
688 }
689
690 EXPORT_SYMBOL_GPL(dccp_getsockopt);
691
#ifdef CONFIG_COMPAT
/* 32-bit-compat getsockopt entry point; SOL_DCCP needs no translation. */
int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	if (level == SOL_DCCP)
		return do_dccp_getsockopt(sk, level, optname, optval, optlen);

	return inet_csk_compat_getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
#endif
704
705 static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
706 {
707         struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
708
709         /*
710          * Assign an (opaque) qpolicy priority value to skb->priority.
711          *
712          * We are overloading this skb field for use with the qpolicy subystem.
713          * The skb->priority is normally used for the SO_PRIORITY option, which
714          * is initialised from sk_priority. Since the assignment of sk_priority
715          * to skb->priority happens later (on layer 3), we overload this field
716          * for use with queueing priorities as long as the skb is on layer 4.
717          * The default priority value (if nothing is set) is 0.
718          */
719         skb->priority = 0;
720
721         for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
722
723                 if (!CMSG_OK(msg, cmsg))
724                         return -EINVAL;
725
726                 if (cmsg->cmsg_level != SOL_DCCP)
727                         continue;
728
729                 switch (cmsg->cmsg_type) {
730                 case DCCP_SCM_PRIORITY:
731                         if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
732                                 return -EINVAL;
733                         skb->priority = *(__u32 *)CMSG_DATA(cmsg);
734                         break;
735                 default:
736                         return -EINVAL;
737                 }
738         }
739         return 0;
740 }
741
742 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
743                  size_t len)
744 {
745         const struct dccp_sock *dp = dccp_sk(sk);
746         const int flags = msg->msg_flags;
747         const int noblock = flags & MSG_DONTWAIT;
748         struct sk_buff *skb;
749         int rc, size;
750         long timeo;
751
752         if (len > dp->dccps_mss_cache)
753                 return -EMSGSIZE;
754
755         lock_sock(sk);
756
757         if (dccp_qpolicy_full(sk)) {
758                 rc = -EAGAIN;
759                 goto out_release;
760         }
761
762         timeo = sock_sndtimeo(sk, noblock);
763
764         /*
765          * We have to use sk_stream_wait_connect here to set sk_write_pending,
766          * so that the trick in dccp_rcv_request_sent_state_process.
767          */
768         /* Wait for a connection to finish. */
769         if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
770                 if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
771                         goto out_release;
772
773         size = sk->sk_prot->max_header + len;
774         release_sock(sk);
775         skb = sock_alloc_send_skb(sk, size, noblock, &rc);
776         lock_sock(sk);
777         if (skb == NULL)
778                 goto out_release;
779
780         skb_reserve(skb, sk->sk_prot->max_header);
781         rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
782         if (rc != 0)
783                 goto out_discard;
784
785         rc = dccp_msghdr_parse(msg, skb);
786         if (rc != 0)
787                 goto out_discard;
788
789         dccp_qpolicy_push(sk, skb);
790         /*
791          * The xmit_timer is set if the TX CCID is rate-based and will expire
792          * when congestion control permits to release further packets into the
793          * network. Window-based CCIDs do not use this timer.
794          */
795         if (!timer_pending(&dp->dccps_xmit_timer))
796                 dccp_write_xmit(sk);
797 out_release:
798         release_sock(sk);
799         return rc ? : len;
800 out_discard:
801         kfree_skb(skb);
802         goto out_release;
803 }
804
805 EXPORT_SYMBOL_GPL(dccp_sendmsg);
806
/*
 * dccp_recvmsg  -  receive one DCCP datagram
 *
 * Peeks at the head of the receive queue and acts on the packet type:
 *  - DATA/DATAACK: copy the payload to user space; excess data is cut off
 *    and reported via MSG_TRUNC;
 *  - CLOSE/CLOSEREQ: perform the passive-close transition (unless peeking)
 *    and return 0, EOF-style;
 *  - RESET: return 0 (connection is finished);
 *  - anything else is eaten and the loop continues.
 * Blocks according to the socket receive timeout unless @nonblock is set.
 * Returns the number of bytes copied or a negative error.
 */
int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		 size_t len, int nonblock, int flags, int *addr_len)
{
	const struct dccp_hdr *dh;
	long timeo;

	lock_sock(sk);

	if (sk->sk_state == DCCP_LISTEN) {
		len = -ENOTCONN;
		goto out;
	}

	timeo = sock_rcvtimeo(sk, nonblock);

	do {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		if (skb == NULL)
			goto verify_sock_status;

		dh = dccp_hdr(skb);

		switch (dh->dccph_type) {
		case DCCP_PKT_DATA:
		case DCCP_PKT_DATAACK:
			goto found_ok_skb;

		case DCCP_PKT_CLOSE:
		case DCCP_PKT_CLOSEREQ:
			if (!(flags & MSG_PEEK))
				dccp_finish_passive_close(sk);
			/* fall through */
		case DCCP_PKT_RESET:
			dccp_pr_debug("found fin (%s) ok!\n",
				      dccp_packet_name(dh->dccph_type));
			len = 0;
			goto found_fin_ok;
		default:
			/* non-data packet: drop it and keep looking */
			dccp_pr_debug("packet_type=%s\n",
				      dccp_packet_name(dh->dccph_type));
			sk_eat_skb(sk, skb, 0);
		}
verify_sock_status:
		if (sock_flag(sk, SOCK_DONE)) {
			len = 0;
			break;
		}

		if (sk->sk_err) {
			len = sock_error(sk);
			break;
		}

		if (sk->sk_shutdown & RCV_SHUTDOWN) {
			len = 0;
			break;
		}

		if (sk->sk_state == DCCP_CLOSED) {
			if (!sock_flag(sk, SOCK_DONE)) {
				/* This occurs when user tries to read
				 * from never connected socket.
				 */
				len = -ENOTCONN;
				break;
			}
			len = 0;
			break;
		}

		if (!timeo) {
			len = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			len = sock_intr_errno(timeo);
			break;
		}

		/* releases the socket lock while waiting for new packets */
		sk_wait_data(sk, &timeo);
		continue;
	found_ok_skb:
		if (len > skb->len)
			len = skb->len;
		else if (len < skb->len)
			msg->msg_flags |= MSG_TRUNC;

		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
			/* Exception. Bailout! */
			len = -EFAULT;
			break;
		}
		if (flags & MSG_TRUNC)
			len = skb->len;
	found_fin_ok:
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb, 0);
		break;
	} while (1);
out:
	release_sock(sk);
	return len;
}

EXPORT_SYMBOL_GPL(dccp_recvmsg);
914
915 int inet_dccp_listen(struct socket *sock, int backlog)
916 {
917         struct sock *sk = sock->sk;
918         unsigned char old_state;
919         int err;
920
921         lock_sock(sk);
922
923         err = -EINVAL;
924         if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
925                 goto out;
926
927         old_state = sk->sk_state;
928         if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
929                 goto out;
930
931         /* Really, if the socket is already in listen state
932          * we can only allow the backlog to be adjusted.
933          */
934         if (old_state != DCCP_LISTEN) {
935                 /*
936                  * FIXME: here it probably should be sk->sk_prot->listen_start
937                  * see tcp_listen_start
938                  */
939                 err = dccp_listen_start(sk, backlog);
940                 if (err)
941                         goto out;
942         }
943         sk->sk_max_ack_backlog = backlog;
944         err = 0;
945
946 out:
947         release_sock(sk);
948         return err;
949 }
950
951 EXPORT_SYMBOL_GPL(inet_dccp_listen);
952
/*
 * dccp_terminate_connection  -  perform the close action matching sk's state
 * @sk: connection being shut down
 *
 * Passive-close states are completed via dccp_finish_passive_close();
 * PARTOPEN/OPEN send an active close and enter the closing handshake; any
 * other state falls straight through to DCCP_CLOSED.
 */
static void dccp_terminate_connection(struct sock *sk)
{
	u8 next_state = DCCP_CLOSED;

	switch (sk->sk_state) {
	case DCCP_PASSIVE_CLOSE:
	case DCCP_PASSIVE_CLOSEREQ:
		/* Peer initiated the close - just complete it. */
		dccp_finish_passive_close(sk);
		break;
	case DCCP_PARTOPEN:
		/* No delayed-Ack work is needed once we start closing. */
		dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
		inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
		/* fall through */
	case DCCP_OPEN:
		dccp_send_close(sk, 1);

		/*
		 * Presumably a server that does not hold the timewait state
		 * itself hands that duty to the client (dccps_server_timewait)
		 * - confirm against RFC 4340's termination rules.
		 */
		if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
		    !dccp_sk(sk)->dccps_server_timewait)
			next_state = DCCP_ACTIVE_CLOSEREQ;
		else
			next_state = DCCP_CLOSING;
		/* fall through */
	default:
		dccp_set_state(sk, next_state);
	}
}
979
/*
 * dccp_close  -  close a DCCP socket from process context
 * @sk:      socket to close
 * @timeout: linger time budget for flushing/closing (may be consumed by
 *           dccp_flush_write_queue() and sk_stream_wait_close())
 *
 * Handles listening sockets, abortive close (unread data / zero linger)
 * and normal termination, then orphans the socket and either destroys it
 * immediately or leaves it to the protocol timers.
 */
void dccp_close(struct sock *sk, long timeout)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct sk_buff *skb;
	u32 data_was_unread = 0;
	int state;

	lock_sock(sk);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == DCCP_LISTEN) {
		dccp_set_state(sk, DCCP_CLOSED);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	sk_stop_timer(sk, &dp->dccps_xmit_timer);

	/*
	 * We need to flush the recv. buffs.  We do this only on the
	 * descriptor close, not protocol-sourced closes, because the
	 * reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		data_was_unread += skb->len;
		__kfree_skb(skb);
	}

	if (data_was_unread) {
		/* Unread data was tossed, send an appropriate Reset Code */
		DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
		dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
		dccp_set_state(sk, DCCP_CLOSED);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
	} else if (sk->sk_state != DCCP_CLOSED) {
		/*
		 * Normal connection termination. May need to wait if there are
		 * still packets in the TX queue that are delayed by the CCID.
		 */
		dccp_flush_write_queue(sk, &timeout);
		dccp_terminate_connection(sk);
	}

	/*
	 * Flush write queue. This may be necessary in several cases:
	 * - we have been closed by the peer but still have application data;
	 * - abortive termination (unread data or zero linger time),
	 * - normal termination but queue could not be flushed within time limit
	 */
	__skb_queue_purge(&sk->sk_write_queue);

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	/*
	 * It is the last release_sock in its life. It will remove backlog.
	 */
	release_sock(sk);
	/*
	 * Now socket is owned by kernel and we acquire BH lock
	 * to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
		goto out;

	if (sk->sk_state == DCCP_CLOSED)
		inet_csk_destroy_sock(sk);

	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);	/* drop the reference taken at sock_hold() above */
}
1072
1073 EXPORT_SYMBOL_GPL(dccp_close);
1074
/*
 * dccp_shutdown  -  shutdown(2) handler for DCCP sockets
 * @sk:  socket being shut down
 * @how: SHUT_RD/SHUT_WR/SHUT_RDWR mode requested by the caller
 *
 * NOTE(review): this is effectively a stub - it only logs the requested
 * mode and performs no shutdown work; half-close semantics are not
 * implemented here.
 */
void dccp_shutdown(struct sock *sk, int how)
{
	dccp_pr_debug("called shutdown(%x)\n", how);
}
1079
1080 EXPORT_SYMBOL_GPL(dccp_shutdown);
1081
/* Allocate the per-CPU DCCP SNMP (MIB) counter array; returns 0 or -errno. */
static inline int dccp_mib_init(void)
{
	return snmp_mib_init((void __percpu **)dccp_statistics,
			     sizeof(struct dccp_mib),
			     __alignof__(struct dccp_mib));
}
1088
/* Release the per-CPU MIB counters allocated by dccp_mib_init(). */
static inline void dccp_mib_exit(void)
{
	snmp_mib_free((void __percpu **)dccp_statistics);
}
1093
/*
 * Optional module-load override for the established-connection (ehash)
 * table size; 0 (default) lets dccp_init() size it from available memory.
 */
static int thash_entries;
module_param(thash_entries, int, 0444);
MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
1097
#ifdef CONFIG_IP_DCCP_DEBUG
/*
 * Runtime switch for dccp_pr_debug() output.
 * NOTE(review): declared as int but registered with module_param(..., bool,);
 * later kernels require a real bool here - confirm against the extern
 * declaration in dccp.h before changing the type.
 */
int dccp_debug;
module_param(dccp_debug, bool, 0644);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");

EXPORT_SYMBOL_GPL(dccp_debug);
#endif
1105
/*
 * dccp_init  -  module/built-in initialisation of the DCCP stack
 *
 * Sets up, in order: the orphan counter, the bind-bucket slab cache, the
 * established (ehash) and bind (bhash) hash tables, MIB statistics, ack
 * vectors, sysctls and the built-in CCIDs.  On any failure everything
 * already set up is unwound via the goto-cleanup chain and -errno is
 * returned.
 */
static int __init dccp_init(void)
{
	unsigned long goal;
	int ehash_order, bhash_order, i;
	int rc;

	/* dccp_skb_cb must fit into the skb control-block scratch area. */
	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
		     FIELD_SIZEOF(struct sk_buff, cb));
	rc = percpu_counter_init(&dccp_orphan_count, 0);
	if (rc)
		goto out_fail;
	rc = -ENOBUFS;
	inet_hashinfo_init(&dccp_hashinfo);
	dccp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("dccp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!dccp_hashinfo.bind_bucket_cachep)
		goto out_free_percpu;

	/*
	 * Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	if (totalram_pages >= (128 * 1024))
		goal = totalram_pages >> (21 - PAGE_SHIFT);
	else
		goal = totalram_pages >> (23 - PAGE_SHIFT);

	/* The thash_entries module parameter overrides the memory heuristic. */
	if (thash_entries)
		goal = (thash_entries *
			sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
	for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
		;
	/* Try progressively smaller orders until the allocation succeeds. */
	do {
		unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE /
					sizeof(struct inet_ehash_bucket);

		/* Round the bucket count down to a power of two so that
		 * ehash_mask works as a simple bitmask.
		 */
		while (hash_size & (hash_size - 1))
			hash_size--;
		dccp_hashinfo.ehash_mask = hash_size - 1;
		dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order);
	} while (!dccp_hashinfo.ehash && --ehash_order > 0);

	if (!dccp_hashinfo.ehash) {
		DCCP_CRIT("Failed to allocate DCCP established hash table");
		goto out_free_bind_bucket_cachep;
	}

	/* Initialise the nulls-terminated chains (established + timewait). */
	for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) {
		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
	}

	if (inet_ehash_locks_alloc(&dccp_hashinfo))
			goto out_free_dccp_ehash;

	/* Start the bind-hash sizing from the ehash order, capped below. */
	bhash_order = ehash_order;

	do {
		dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
					sizeof(struct inet_bind_hashbucket);
		/* Cap the bind hash at 64K buckets unless already at order 0. */
		if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
		    bhash_order > 0)
			continue;
		dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
	} while (!dccp_hashinfo.bhash && --bhash_order >= 0);

	if (!dccp_hashinfo.bhash) {
		DCCP_CRIT("Failed to allocate DCCP bind hash table");
		goto out_free_dccp_locks;
	}

	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
	}

	rc = dccp_mib_init();
	if (rc)
		goto out_free_dccp_bhash;

	rc = dccp_ackvec_init();
	if (rc)
		goto out_free_dccp_mib;

	rc = dccp_sysctl_init();
	if (rc)
		goto out_ackvec_exit;

	rc = ccid_initialize_builtins();
	if (rc)
		goto out_sysctl_exit;

	dccp_timestamping_init();

	return 0;

	/* Error unwinding: each label undoes the step immediately above it. */
out_sysctl_exit:
	dccp_sysctl_exit();
out_ackvec_exit:
	dccp_ackvec_exit();
out_free_dccp_mib:
	dccp_mib_exit();
out_free_dccp_bhash:
	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
out_free_dccp_locks:
	inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
out_free_bind_bucket_cachep:
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
out_free_percpu:
	percpu_counter_destroy(&dccp_orphan_count);
out_fail:
	/* Leave the hashinfo pointers NULL so dccp_fini() frees nothing. */
	dccp_hashinfo.bhash = NULL;
	dccp_hashinfo.ehash = NULL;
	dccp_hashinfo.bind_bucket_cachep = NULL;
	return rc;
}
1230
/*
 * dccp_fini  -  module unload: tear down in reverse order of dccp_init()
 *
 * NOTE(review): the page orders passed to free_pages() are recomputed from
 * bhash_size / ehash_mask rather than remembered from dccp_init().  Since
 * dccp_init() rounds the ehash bucket count down to a power of two after
 * allocating whole pages, get_order() here could under-count and leak the
 * excess pages - confirm against the allocation path in dccp_init().
 */
static void __exit dccp_fini(void)
{
	ccid_cleanup_builtins();
	dccp_mib_exit();
	free_pages((unsigned long)dccp_hashinfo.bhash,
		   get_order(dccp_hashinfo.bhash_size *
			     sizeof(struct inet_bind_hashbucket)));
	free_pages((unsigned long)dccp_hashinfo.ehash,
		   get_order((dccp_hashinfo.ehash_mask + 1) *
			     sizeof(struct inet_ehash_bucket)));
	inet_ehash_locks_free(&dccp_hashinfo);
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
	dccp_ackvec_exit();
	dccp_sysctl_exit();
	percpu_counter_destroy(&dccp_orphan_count);
}
1247
/* Module entry/exit points and metadata. */
module_init(dccp_init);
module_exit(dccp_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");