dccp: Policy-based packet dequeueing infrastructure
[pandora-kernel.git] / net / dccp / proto.c
1 /*
2  *  net/dccp/proto.c
3  *
4  *  An implementation of the DCCP protocol
5  *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6  *
7  *      This program is free software; you can redistribute it and/or modify it
8  *      under the terms of the GNU General Public License version 2 as
9  *      published by the Free Software Foundation.
10  */
11
12 #include <linux/dccp.h>
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/sched.h>
16 #include <linux/kernel.h>
17 #include <linux/skbuff.h>
18 #include <linux/netdevice.h>
19 #include <linux/in.h>
20 #include <linux/if_arp.h>
21 #include <linux/init.h>
22 #include <linux/random.h>
23 #include <net/checksum.h>
24
25 #include <net/inet_sock.h>
26 #include <net/sock.h>
27 #include <net/xfrm.h>
28
29 #include <asm/ioctls.h>
30 #include <linux/spinlock.h>
31 #include <linux/timer.h>
32 #include <linux/delay.h>
33 #include <linux/poll.h>
34
35 #include "ccid.h"
36 #include "dccp.h"
37 #include "feat.h"
38
/* Per-CPU SNMP (MIB) counters for DCCP; exported for the AF-specific modules. */
DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;

EXPORT_SYMBOL_GPL(dccp_statistics);

/* Count of orphaned DCCP sockets (updated by code outside this chunk). */
atomic_t dccp_orphan_count = ATOMIC_INIT(0);

EXPORT_SYMBOL_GPL(dccp_orphan_count);

/* Shared bind/established/listen hash tables for all DCCP sockets. */
struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
	.lhash_lock	= RW_LOCK_UNLOCKED,
	.lhash_users	= ATOMIC_INIT(0),
	.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
};

EXPORT_SYMBOL_GPL(dccp_hashinfo);

/* the maximum queue length for tx in packets. 0 is no limit */
int sysctl_dccp_tx_qlen __read_mostly = 5;
57
/**
 * dccp_set_state  -  Transition a socket to a new DCCP state
 * @sk:    socket being updated
 * @state: new state (DCCP_OPEN, DCCP_CLOSED, ...)
 *
 * Keeps the CURRESTAB/ESTABRESETS MIB counters consistent and, on a
 * transition to DCCP_CLOSED, unhashes the socket and releases its local
 * port (unless the user holds the bind-port lock).
 */
void dccp_set_state(struct sock *sk, const int state)
{
	const int oldstate = sk->sk_state;

	dccp_pr_debug("%s(%p)  %s  -->  %s\n", dccp_role(sk), sk,
		      dccp_state_name(oldstate), dccp_state_name(state));
	WARN_ON(state == oldstate);	/* a no-op transition indicates a caller bug */

	switch (state) {
	case DCCP_OPEN:
		if (oldstate != DCCP_OPEN)
			DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
		/* Client retransmits all Confirm options until entering OPEN */
		if (oldstate == DCCP_PARTOPEN)
			dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
		break;

	case DCCP_CLOSED:
		/* Only count resets of connections that were established */
		if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
		    oldstate == DCCP_CLOSING)
			DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		if (inet_csk(sk)->icsk_bind_hash != NULL &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(sk);
		/* fall through */
	default:
		if (oldstate == DCCP_OPEN)
			DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	sk->sk_state = state;
}

EXPORT_SYMBOL_GPL(dccp_set_state);
97
/*
 * Complete a passive close: answer the peer's Close/CloseReq and move to
 * the corresponding terminal/closing state. Any other state is a no-op.
 */
static void dccp_finish_passive_close(struct sock *sk)
{
	switch (sk->sk_state) {
	case DCCP_PASSIVE_CLOSE:
		/* Node (client or server) has received Close packet. */
		dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
		dccp_set_state(sk, DCCP_CLOSED);
		break;
	case DCCP_PASSIVE_CLOSEREQ:
		/*
		 * Client received CloseReq. We set the `active' flag so that
		 * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
		 */
		dccp_send_close(sk, 1);
		dccp_set_state(sk, DCCP_CLOSING);
		/* falls out of the switch; no further action here */
	}
}
115
/*
 * Final teardown of a connection: enter CLOSED, stop all transmit timers
 * and mark both directions shut down. A socket still referenced by user
 * space is only woken up; a dead (orphaned) one is destroyed immediately.
 */
void dccp_done(struct sock *sk)
{
	dccp_set_state(sk, DCCP_CLOSED);
	dccp_clear_xmit_timers(sk);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_state_change(sk);
	else
		inet_csk_destroy_sock(sk);
}

EXPORT_SYMBOL_GPL(dccp_done);
130
131 const char *dccp_packet_name(const int type)
132 {
133         static const char *dccp_packet_names[] = {
134                 [DCCP_PKT_REQUEST]  = "REQUEST",
135                 [DCCP_PKT_RESPONSE] = "RESPONSE",
136                 [DCCP_PKT_DATA]     = "DATA",
137                 [DCCP_PKT_ACK]      = "ACK",
138                 [DCCP_PKT_DATAACK]  = "DATAACK",
139                 [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
140                 [DCCP_PKT_CLOSE]    = "CLOSE",
141                 [DCCP_PKT_RESET]    = "RESET",
142                 [DCCP_PKT_SYNC]     = "SYNC",
143                 [DCCP_PKT_SYNCACK]  = "SYNCACK",
144         };
145
146         if (type >= DCCP_NR_PKT_TYPES)
147                 return "INVALID";
148         else
149                 return dccp_packet_names[type];
150 }
151
152 EXPORT_SYMBOL_GPL(dccp_packet_name);
153
154 const char *dccp_state_name(const int state)
155 {
156         static char *dccp_state_names[] = {
157         [DCCP_OPEN]             = "OPEN",
158         [DCCP_REQUESTING]       = "REQUESTING",
159         [DCCP_PARTOPEN]         = "PARTOPEN",
160         [DCCP_LISTEN]           = "LISTEN",
161         [DCCP_RESPOND]          = "RESPOND",
162         [DCCP_CLOSING]          = "CLOSING",
163         [DCCP_ACTIVE_CLOSEREQ]  = "CLOSEREQ",
164         [DCCP_PASSIVE_CLOSE]    = "PASSIVE_CLOSE",
165         [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
166         [DCCP_TIME_WAIT]        = "TIME_WAIT",
167         [DCCP_CLOSED]           = "CLOSED",
168         };
169
170         if (state >= DCCP_MAX_STATES)
171                 return "INVALID STATE!";
172         else
173                 return dccp_state_names[state];
174 }
175
176 EXPORT_SYMBOL_GPL(dccp_state_name);
177
/**
 * dccp_init_sock  -  Initialise a newly created DCCP socket
 * @sk: socket to initialise
 * @ctl_sock_initialized: zero only while creating the early control socket,
 *                        which skips feature-negotiation setup
 *
 * Returns 0, or the error from dccp_feat_init().
 */
int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_rto		= DCCP_TIMEOUT_INIT;
	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
	sk->sk_state		= DCCP_CLOSED;
	sk->sk_write_space	= dccp_write_space;
	icsk->icsk_sync_mss	= dccp_sync_mss;
	/* conservative initial MSS until the path is known — TODO confirm */
	dp->dccps_mss_cache	= 536;
	dp->dccps_rate_last	= jiffies;
	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
	dp->dccps_tx_qlen	= sysctl_dccp_tx_qlen;

	dccp_init_xmit_timers(sk);

	INIT_LIST_HEAD(&dp->dccps_featneg);
	/* control socket doesn't need feat nego */
	if (likely(ctl_sock_initialized))
		return dccp_feat_init(sk);
	return 0;
}

EXPORT_SYMBOL_GPL(dccp_init_sock);
204
/*
 * Release everything a DCCP socket owns: the retransmit skb, the bound
 * port, the service list, the ack vector, both CCID instances and any
 * pending feature-negotiation entries.
 */
void dccp_destroy_sock(struct sock *sk)
{
	struct dccp_sock *dp = dccp_sk(sk);

	/*
	 * DCCP doesn't use sk_write_queue, just sk_send_head
	 * for retransmissions
	 */
	if (sk->sk_send_head != NULL) {
		kfree_skb(sk->sk_send_head);
		sk->sk_send_head = NULL;
	}

	/* Clean up a referenced DCCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash != NULL)
		inet_put_port(sk);

	kfree(dp->dccps_service_list);
	dp->dccps_service_list = NULL;

	if (dp->dccps_hc_rx_ackvec != NULL) {
		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
		dp->dccps_hc_rx_ackvec = NULL;
	}
	/* NULL the CCID pointers so later teardown paths see them as gone */
	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;

	/* clean up feature negotiation state */
	dccp_feat_list_purge(&dp->dccps_featneg);
}

EXPORT_SYMBOL_GPL(dccp_destroy_sock);
238
/*
 * Put the socket into the LISTEN role. Feature settings are finalised
 * first so we never listen with an inconsistent feature set; on failure
 * the socket does not enter listen state.
 */
static inline int dccp_listen_start(struct sock *sk, int backlog)
{
	struct dccp_sock *dp = dccp_sk(sk);

	dp->dccps_role = DCCP_ROLE_LISTEN;
	/* do not start to listen if feature negotiation setup fails */
	if (dccp_feat_finalise_settings(dp))
		return -EPROTO;
	return inet_csk_listen_start(sk, backlog);
}
249
250 static inline int dccp_need_reset(int state)
251 {
252         return state != DCCP_CLOSED && state != DCCP_LISTEN &&
253                state != DCCP_REQUESTING;
254 }
255
256 int dccp_disconnect(struct sock *sk, int flags)
257 {
258         struct inet_connection_sock *icsk = inet_csk(sk);
259         struct inet_sock *inet = inet_sk(sk);
260         int err = 0;
261         const int old_state = sk->sk_state;
262
263         if (old_state != DCCP_CLOSED)
264                 dccp_set_state(sk, DCCP_CLOSED);
265
266         /*
267          * This corresponds to the ABORT function of RFC793, sec. 3.8
268          * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
269          */
270         if (old_state == DCCP_LISTEN) {
271                 inet_csk_listen_stop(sk);
272         } else if (dccp_need_reset(old_state)) {
273                 dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
274                 sk->sk_err = ECONNRESET;
275         } else if (old_state == DCCP_REQUESTING)
276                 sk->sk_err = ECONNRESET;
277
278         dccp_clear_xmit_timers(sk);
279
280         __skb_queue_purge(&sk->sk_receive_queue);
281         __skb_queue_purge(&sk->sk_write_queue);
282         if (sk->sk_send_head != NULL) {
283                 __kfree_skb(sk->sk_send_head);
284                 sk->sk_send_head = NULL;
285         }
286
287         inet->dport = 0;
288
289         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
290                 inet_reset_saddr(sk);
291
292         sk->sk_shutdown = 0;
293         sock_reset_flag(sk, SOCK_DONE);
294
295         icsk->icsk_backoff = 0;
296         inet_csk_delack_init(sk);
297         __sk_dst_reset(sk);
298
299         WARN_ON(inet->num && !icsk->icsk_bind_hash);
300
301         sk->sk_error_report(sk);
302         return err;
303 }
304
305 EXPORT_SYMBOL_GPL(dccp_disconnect);
306
/*
 *	Wait for a DCCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
unsigned int dccp_poll(struct file *file, struct socket *sock,
		       poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;

	poll_wait(file, sk->sk_sleep, wait);
	if (sk->sk_state == DCCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	   by poll logic and correct handling of state changes
	   made by another threads is impossible in any case.
	 */

	mask = 0;
	if (sk->sk_err)
		mask = POLLERR;

	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM | POLLRDHUP;

	/* Connected? (i.e. not still in the REQUESTING/RESPOND handshake) */
	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
		if (atomic_read(&sk->sk_rmem_alloc) > 0)
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {  /* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		}
	}
	return mask;
}

EXPORT_SYMBOL_GPL(dccp_poll);
364
/**
 * dccp_ioctl  -  Socket-level ioctl handler
 * @sk:  socket
 * @cmd: ioctl command; only SIOCINQ is handled here
 * @arg: user pointer for the result
 *
 * SIOCINQ reports the length of the *first* queued packet only, since
 * DCCP is datagram-oriented and a read consumes at most one packet.
 * Returns -ENOTCONN on listening sockets, -ENOIOCTLCMD for unknown cmds.
 */
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int rc = -ENOTCONN;

	lock_sock(sk);

	if (sk->sk_state == DCCP_LISTEN)
		goto out;

	switch (cmd) {
	case SIOCINQ: {
		struct sk_buff *skb;
		unsigned long amount = 0;

		skb = skb_peek(&sk->sk_receive_queue);
		if (skb != NULL) {
			/*
			 * We will only return the amount of this packet since
			 * that is all that will be read.
			 */
			amount = skb->len;
		}
		rc = put_user(amount, (int __user *)arg);
	}
		break;
	default:
		rc = -ENOIOCTLCMD;
		break;
	}
out:
	release_sock(sk);
	return rc;
}

EXPORT_SYMBOL_GPL(dccp_ioctl);
400
/*
 * Set the service code, plus an optional list of additional service codes
 * laid out after the first one in @optval. The list (when present) is
 * copied before taking the socket lock, then installed atomically.
 *
 * NOTE(review): an invalid service code *inside the list* also yields
 * -EFAULT (shared error path with copy_from_user) — arguably -EINVAL;
 * confirm whether callers depend on this.
 */
static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
				   char __user *optval, int optlen)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct dccp_service_list *sl = NULL;

	if (service == DCCP_SERVICE_INVALID_VALUE ||
	    optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
		return -EINVAL;

	if (optlen > sizeof(service)) {
		sl = kmalloc(optlen, GFP_KERNEL);
		if (sl == NULL)
			return -ENOMEM;

		/* entries beyond the primary service code */
		sl->dccpsl_nr = optlen / sizeof(u32) - 1;
		if (copy_from_user(sl->dccpsl_list,
				   optval + sizeof(service),
				   optlen - sizeof(service)) ||
		    dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
			kfree(sl);
			return -EFAULT;
		}
	}

	lock_sock(sk);
	dp->dccps_service = service;

	/* replace any previously installed list */
	kfree(dp->dccps_service_list);

	dp->dccps_service_list = sl;
	release_sock(sk);
	return 0;
}
435
/*
 * Register a Minimum-Checksum-Coverage preference for feature negotiation.
 * @cscov: requested minimum coverage, 0..15 (0 = full coverage, default,
 *         needs no negotiation)
 * @rx:    true for the receive direction, false for send
 */
static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
{
	u8 *list, len;
	int i, rc;

	if (cscov < 0 || cscov > 15)
		return -EINVAL;
	/*
	 * Populate a list of permissible values, in the range cscov...15. This
	 * is necessary since feature negotiation of single values only works if
	 * both sides incidentally choose the same value. Since the list starts
	 * lowest-value first, negotiation will pick the smallest shared value.
	 */
	if (cscov == 0)
		return 0;
	len = 16 - cscov;

	list = kmalloc(len, GFP_KERNEL);
	if (list == NULL)
		return -ENOBUFS;

	for (i = 0; i < len; i++)
		list[i] = cscov++;

	rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);

	if (rc == 0) {
		/*
		 * NOTE(review): the fill loop above has advanced cscov to 16
		 * at this point, so pcrlen/pcslen are assigned 16 rather than
		 * the user-supplied minimum — verify whether the pre-loop
		 * value was intended here.
		 */
		if (rx)
			dccp_sk(sk)->dccps_pcrlen = cscov;
		else
			dccp_sk(sk)->dccps_pcslen = cscov;
	}
	kfree(list);
	return rc;
}
471
472 static int dccp_setsockopt_ccid(struct sock *sk, int type,
473                                 char __user *optval, int optlen)
474 {
475         u8 *val;
476         int rc = 0;
477
478         if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
479                 return -EINVAL;
480
481         val = kmalloc(optlen, GFP_KERNEL);
482         if (val == NULL)
483                 return -ENOMEM;
484
485         if (copy_from_user(val, optval, optlen)) {
486                 kfree(val);
487                 return -EFAULT;
488         }
489
490         lock_sock(sk);
491         if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
492                 rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
493
494         if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
495                 rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
496         release_sock(sk);
497
498         kfree(val);
499         return rc;
500 }
501
/*
 * SOL_DCCP setsockopt worker. Options that carry their own payload format
 * (CCID lists, service codes) are dispatched before the generic
 * "read one int" path; the remainder operate on @val under the socket lock.
 */
static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
		char __user *optval, int optlen)
{
	struct dccp_sock *dp = dccp_sk(sk);
	int val, err = 0;

	switch (optname) {
	case DCCP_SOCKOPT_PACKET_SIZE:
		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
		return 0;
	case DCCP_SOCKOPT_CHANGE_L:
	case DCCP_SOCKOPT_CHANGE_R:
		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
		return 0;
	case DCCP_SOCKOPT_CCID:
	case DCCP_SOCKOPT_RX_CCID:
	case DCCP_SOCKOPT_TX_CCID:
		/* variable-length payload; handled without reading an int */
		return dccp_setsockopt_ccid(sk, optname, optval, optlen);
	}

	if (optlen < (int)sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	if (optname == DCCP_SOCKOPT_SERVICE)
		return dccp_setsockopt_service(sk, val, optval, optlen);

	lock_sock(sk);
	switch (optname) {
	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
		/* only the server side may hold the connection in TIMEWAIT */
		if (dp->dccps_role != DCCP_ROLE_SERVER)
			err = -EOPNOTSUPP;
		else
			dp->dccps_server_timewait = (val != 0);
		break;
	case DCCP_SOCKOPT_SEND_CSCOV:
		err = dccp_setsockopt_cscov(sk, val, false);
		break;
	case DCCP_SOCKOPT_RECV_CSCOV:
		err = dccp_setsockopt_cscov(sk, val, true);
		break;
	case DCCP_SOCKOPT_QPOLICY_ID:
		/* queueing policy may only be chosen before connecting */
		if (sk->sk_state != DCCP_CLOSED)
			err = -EISCONN;
		else if (val < 0 || val >= DCCPQ_POLICY_MAX)
			err = -EINVAL;
		else
			dp->dccps_qpolicy = val;
		break;
	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
		if (val < 0)
			err = -EINVAL;
		else
			dp->dccps_tx_qlen = val;
		break;
	default:
		err = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);

	return err;
}
567
568 int dccp_setsockopt(struct sock *sk, int level, int optname,
569                     char __user *optval, int optlen)
570 {
571         if (level != SOL_DCCP)
572                 return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
573                                                              optname, optval,
574                                                              optlen);
575         return do_dccp_setsockopt(sk, level, optname, optval, optlen);
576 }
577
578 EXPORT_SYMBOL_GPL(dccp_setsockopt);
579
580 #ifdef CONFIG_COMPAT
581 int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
582                            char __user *optval, int optlen)
583 {
584         if (level != SOL_DCCP)
585                 return inet_csk_compat_setsockopt(sk, level, optname,
586                                                   optval, optlen);
587         return do_dccp_setsockopt(sk, level, optname, optval, optlen);
588 }
589
590 EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
591 #endif
592
593 static int dccp_getsockopt_service(struct sock *sk, int len,
594                                    __be32 __user *optval,
595                                    int __user *optlen)
596 {
597         const struct dccp_sock *dp = dccp_sk(sk);
598         const struct dccp_service_list *sl;
599         int err = -ENOENT, slen = 0, total_len = sizeof(u32);
600
601         lock_sock(sk);
602         if ((sl = dp->dccps_service_list) != NULL) {
603                 slen = sl->dccpsl_nr * sizeof(u32);
604                 total_len += slen;
605         }
606
607         err = -EINVAL;
608         if (total_len > len)
609                 goto out;
610
611         err = 0;
612         if (put_user(total_len, optlen) ||
613             put_user(dp->dccps_service, optval) ||
614             (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
615                 err = -EFAULT;
616 out:
617         release_sock(sk);
618         return err;
619 }
620
/*
 * SOL_DCCP getsockopt worker. Most options return a single int via the
 * common tail; option numbers 128-191 and 192-255 are forwarded to the
 * RX and TX CCID modules respectively.
 */
static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct dccp_sock *dp;
	int val, len;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < (int)sizeof(int))
		return -EINVAL;

	dp = dccp_sk(sk);

	switch (optname) {
	case DCCP_SOCKOPT_PACKET_SIZE:
		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
		return 0;
	case DCCP_SOCKOPT_SERVICE:
		return dccp_getsockopt_service(sk, len,
					       (__be32 __user *)optval, optlen);
	case DCCP_SOCKOPT_GET_CUR_MPS:
		val = dp->dccps_mss_cache;
		break;
	case DCCP_SOCKOPT_AVAILABLE_CCIDS:
		return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
	case DCCP_SOCKOPT_TX_CCID:
		/* negative means no TX CCID is currently active */
		val = ccid_get_current_tx_ccid(dp);
		if (val < 0)
			return -ENOPROTOOPT;
		break;
	case DCCP_SOCKOPT_RX_CCID:
		val = ccid_get_current_rx_ccid(dp);
		if (val < 0)
			return -ENOPROTOOPT;
		break;
	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
		val = dp->dccps_server_timewait;
		break;
	case DCCP_SOCKOPT_SEND_CSCOV:
		val = dp->dccps_pcslen;
		break;
	case DCCP_SOCKOPT_RECV_CSCOV:
		val = dp->dccps_pcrlen;
		break;
	case DCCP_SOCKOPT_QPOLICY_ID:
		val = dp->dccps_qpolicy;
		break;
	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
		val = dp->dccps_tx_qlen;
		break;
	case 128 ... 191:	/* RX-CCID private option range */
		return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
					     len, (u32 __user *)optval, optlen);
	case 192 ... 255:	/* TX-CCID private option range */
		return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
					     len, (u32 __user *)optval, optlen);
	default:
		return -ENOPROTOOPT;
	}

	len = sizeof(val);
	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
		return -EFAULT;

	return 0;
}
688
689 int dccp_getsockopt(struct sock *sk, int level, int optname,
690                     char __user *optval, int __user *optlen)
691 {
692         if (level != SOL_DCCP)
693                 return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
694                                                              optname, optval,
695                                                              optlen);
696         return do_dccp_getsockopt(sk, level, optname, optval, optlen);
697 }
698
699 EXPORT_SYMBOL_GPL(dccp_getsockopt);
700
701 #ifdef CONFIG_COMPAT
702 int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
703                            char __user *optval, int __user *optlen)
704 {
705         if (level != SOL_DCCP)
706                 return inet_csk_compat_getsockopt(sk, level, optname,
707                                                   optval, optlen);
708         return do_dccp_getsockopt(sk, level, optname, optval, optlen);
709 }
710
711 EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
712 #endif
713
/*
 * Parse sendmsg() control messages: only SOL_DCCP/DCCP_SCM_PRIORITY is
 * accepted, setting the qpolicy priority on @skb. Any malformed or
 * unknown SOL_DCCP cmsg yields -EINVAL; cmsgs of other levels are
 * silently ignored.
 */
static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
{
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);

	/*
	 * Assign an (opaque) qpolicy priority value to skb->priority.
	 *
	 * We are overloading this skb field for use with the qpolicy subystem.
	 * The skb->priority is normally used for the SO_PRIORITY option, which
	 * is initialised from sk_priority. Since the assignment of sk_priority
	 * to skb->priority happens later (on layer 3), we overload this field
	 * for use with queueing priorities as long as the skb is on layer 4.
	 * The default priority value (if nothing is set) is 0.
	 */
	skb->priority = 0;

	for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {

		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;

		if (cmsg->cmsg_level != SOL_DCCP)
			continue;

		switch (cmsg->cmsg_type) {
		case DCCP_SCM_PRIORITY:
			if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
				return -EINVAL;
			skb->priority = *(__u32 *)CMSG_DATA(cmsg);
			break;
		default:
			return -EINVAL;
		}
	}
	return 0;
}
750
751 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
752                  size_t len)
753 {
754         const struct dccp_sock *dp = dccp_sk(sk);
755         const int flags = msg->msg_flags;
756         const int noblock = flags & MSG_DONTWAIT;
757         struct sk_buff *skb;
758         int rc, size;
759         long timeo;
760
761         if (len > dp->dccps_mss_cache)
762                 return -EMSGSIZE;
763
764         lock_sock(sk);
765
766         if (dccp_qpolicy_full(sk)) {
767                 rc = -EAGAIN;
768                 goto out_release;
769         }
770
771         timeo = sock_sndtimeo(sk, noblock);
772
773         /*
774          * We have to use sk_stream_wait_connect here to set sk_write_pending,
775          * so that the trick in dccp_rcv_request_sent_state_process.
776          */
777         /* Wait for a connection to finish. */
778         if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
779                 if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
780                         goto out_release;
781
782         size = sk->sk_prot->max_header + len;
783         release_sock(sk);
784         skb = sock_alloc_send_skb(sk, size, noblock, &rc);
785         lock_sock(sk);
786         if (skb == NULL)
787                 goto out_release;
788
789         skb_reserve(skb, sk->sk_prot->max_header);
790         rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
791         if (rc != 0)
792                 goto out_discard;
793
794         rc = dccp_msghdr_parse(msg, skb);
795         if (rc != 0)
796                 goto out_discard;
797
798         dccp_qpolicy_push(sk, skb);
799         dccp_write_xmit(sk);
800 out_release:
801         release_sock(sk);
802         return rc ? : len;
803 out_discard:
804         kfree_skb(skb);
805         goto out_release;
806 }
807
808 EXPORT_SYMBOL_GPL(dccp_sendmsg);
809
/**
 * dccp_recvmsg  -  Receive one datagram (or connection-end notification)
 * @iocb:     (unused) kiocb from the socket layer
 * @sk:       socket to read from
 * @msg:      destination for the payload
 * @len:      buffer size; shorter packets shrink it, longer ones set MSG_TRUNC
 * @nonblock: non-blocking flag
 * @flags:    MSG_PEEK et al.
 * @addr_len: unused here
 *
 * Returns the number of bytes copied, 0 at connection end, or a negative
 * errno. At most one packet is consumed per call.
 */
int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		 size_t len, int nonblock, int flags, int *addr_len)
{
	const struct dccp_hdr *dh;
	long timeo;

	lock_sock(sk);

	if (sk->sk_state == DCCP_LISTEN) {
		len = -ENOTCONN;
		goto out;
	}

	timeo = sock_rcvtimeo(sk, nonblock);

	do {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		if (skb == NULL)
			goto verify_sock_status;

		dh = dccp_hdr(skb);

		switch (dh->dccph_type) {
		case DCCP_PKT_DATA:
		case DCCP_PKT_DATAACK:
			goto found_ok_skb;

		case DCCP_PKT_CLOSE:
		case DCCP_PKT_CLOSEREQ:
			/* answer the peer's close unless we are only peeking */
			if (!(flags & MSG_PEEK))
				dccp_finish_passive_close(sk);
			/* fall through */
		case DCCP_PKT_RESET:
			dccp_pr_debug("found fin (%s) ok!\n",
				      dccp_packet_name(dh->dccph_type));
			len = 0;
			goto found_fin_ok;
		default:
			/* non-data packet: discard and keep scanning the queue
			 * (note: eaten even under MSG_PEEK) */
			dccp_pr_debug("packet_type=%s\n",
				      dccp_packet_name(dh->dccph_type));
			sk_eat_skb(sk, skb, 0);
		}
verify_sock_status:
		if (sock_flag(sk, SOCK_DONE)) {
			len = 0;
			break;
		}

		if (sk->sk_err) {
			len = sock_error(sk);
			break;
		}

		if (sk->sk_shutdown & RCV_SHUTDOWN) {
			len = 0;
			break;
		}

		if (sk->sk_state == DCCP_CLOSED) {
			if (!sock_flag(sk, SOCK_DONE)) {
				/* This occurs when user tries to read
				 * from never connected socket.
				 */
				len = -ENOTCONN;
				break;
			}
			len = 0;
			break;
		}

		if (!timeo) {
			len = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			len = sock_intr_errno(timeo);
			break;
		}

		/* sleep until data arrives or the timeout expires, then rescan */
		sk_wait_data(sk, &timeo);
		continue;
	found_ok_skb:
		if (len > skb->len)
			len = skb->len;
		else if (len < skb->len)
			msg->msg_flags |= MSG_TRUNC;

		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
			/* Exception. Bailout! */
			len = -EFAULT;
			break;
		}
	found_fin_ok:
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb, 0);
		break;
	} while (1);
out:
	release_sock(sk);
	return len;
}

EXPORT_SYMBOL_GPL(dccp_recvmsg);
915
916 int inet_dccp_listen(struct socket *sock, int backlog)
917 {
918         struct sock *sk = sock->sk;
919         unsigned char old_state;
920         int err;
921
922         lock_sock(sk);
923
924         err = -EINVAL;
925         if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
926                 goto out;
927
928         old_state = sk->sk_state;
929         if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
930                 goto out;
931
932         /* Really, if the socket is already in listen state
933          * we can only allow the backlog to be adjusted.
934          */
935         if (old_state != DCCP_LISTEN) {
936                 /*
937                  * FIXME: here it probably should be sk->sk_prot->listen_start
938                  * see tcp_listen_start
939                  */
940                 err = dccp_listen_start(sk, backlog);
941                 if (err)
942                         goto out;
943         }
944         sk->sk_max_ack_backlog = backlog;
945         err = 0;
946
947 out:
948         release_sock(sk);
949         return err;
950 }
951
952 EXPORT_SYMBOL_GPL(inet_dccp_listen);
953
/*
 * Begin the DCCP-level teardown of a connection, moving the socket
 * towards DCCP_CLOSED.  Called from dccp_close() for sockets that
 * still hold connection state.  The switch below relies on the two
 * marked fallthroughs: PARTOPEN first cancels its delayed-Ack timer
 * and then shares the OPEN path, which in turn falls into the default
 * case to commit the chosen next state.
 */
static void dccp_terminate_connection(struct sock *sk)
{
	u8 next_state = DCCP_CLOSED;

	switch (sk->sk_state) {
	case DCCP_PASSIVE_CLOSE:
	case DCCP_PASSIVE_CLOSEREQ:
		/* The peer initiated the close; just finish it. */
		dccp_finish_passive_close(sk);
		break;
	case DCCP_PARTOPEN:
		dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
		inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
		/* fall through */
	case DCCP_OPEN:
		dccp_send_close(sk, 1);

		/*
		 * A server not using the timewait state moves to
		 * ACTIVE_CLOSEREQ; all other endpoints go to CLOSING.
		 */
		if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
		    !dccp_sk(sk)->dccps_server_timewait)
			next_state = DCCP_ACTIVE_CLOSEREQ;
		else
			next_state = DCCP_CLOSING;
		/* fall through */
	default:
		dccp_set_state(sk, next_state);
	}
}
980
/*
 * Close a DCCP socket: flush queues, run the protocol-level connection
 * termination, orphan the socket and, if it has already reached
 * DCCP_CLOSED, destroy it.  @timeout is the lingering time budget used
 * while flushing the TX queue and waiting for the close handshake.
 */
void dccp_close(struct sock *sk, long timeout)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct sk_buff *skb;
	u32 data_was_unread = 0;
	int state;

	lock_sock(sk);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == DCCP_LISTEN) {
		dccp_set_state(sk, DCCP_CLOSED);

		/* Special case: a listener has no connection to tear down,
		 * only pending child sockets to drop.
		 */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	sk_stop_timer(sk, &dp->dccps_xmit_timer);

	/*
	 * We need to flush the recv. buffs.  We do this only on the
	 * descriptor close, not protocol-sourced closes, because the
	 * reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		data_was_unread += skb->len;
		__kfree_skb(skb);
	}

	if (data_was_unread) {
		/* Unread data was tossed, send an appropriate Reset Code */
		DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread);
		dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
		dccp_set_state(sk, DCCP_CLOSED);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
	} else if (sk->sk_state != DCCP_CLOSED) {
		/*
		 * Normal connection termination. May need to wait if there are
		 * still packets in the TX queue that are delayed by the CCID.
		 */
		dccp_flush_write_queue(sk, &timeout);
		dccp_terminate_connection(sk);
	}

	/*
	 * Flush write queue. This may be necessary in several cases:
	 * - we have been closed by the peer but still have application data;
	 * - abortive termination (unread data or zero linger time),
	 * - normal termination but queue could not be flushed within time limit
	 */
	__skb_queue_purge(&sk->sk_write_queue);

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	/* Hold a reference across the orphaning/destroy sequence below. */
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);
	atomic_inc(sk->sk_prot->orphan_count);

	/*
	 * It is the last release_sock in its life. It will remove backlog.
	 */
	release_sock(sk);
	/*
	 * Now socket is owned by kernel and we acquire BH lock
	 * to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
		goto out;

	if (sk->sk_state == DCCP_CLOSED)
		inet_csk_destroy_sock(sk);

	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}

EXPORT_SYMBOL_GPL(dccp_close);
1074
/*
 * shutdown(2) handler for DCCP sockets.  Currently a no-op stub that
 * only logs the request; half-close semantics (e.g. reacting to
 * SEND_SHUTDOWN) are not implemented here.
 */
void dccp_shutdown(struct sock *sk, int how)
{
	dccp_pr_debug("called shutdown(%x)\n", how);
}

EXPORT_SYMBOL_GPL(dccp_shutdown);
1081
/* Allocate the per-CPU dccp_statistics SNMP counter arrays. */
static inline int dccp_mib_init(void)
{
	return snmp_mib_init((void**)dccp_statistics, sizeof(struct dccp_mib));
}
1086
/* Free the per-CPU dccp_statistics SNMP counter arrays. */
static inline void dccp_mib_exit(void)
{
	snmp_mib_free((void**)dccp_statistics);
}
1091
/* Optional override for the number of ehash buckets (see dccp_init()). */
static int thash_entries;
module_param(thash_entries, int, 0444);
MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");

#ifdef CONFIG_IP_DCCP_DEBUG
/*
 * Runtime switch for dccp_pr_debug() messages.
 * NOTE(review): declared as int but registered with
 * module_param(..., bool, ...); newer kernels require the backing
 * variable to be bool — confirm against the extern declaration in
 * dccp.h before changing the type here.
 */
int dccp_debug;
module_param(dccp_debug, bool, 0644);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");

EXPORT_SYMBOL_GPL(dccp_debug);
#endif
1103
/*
 * Module initialisation: allocate the established ("ehash") and bind
 * ("bhash") socket hash tables, the bind-bucket slab cache, the SNMP
 * MIB counters, ack-vector state and sysctl entries.  On any failure
 * the already-acquired resources are released in reverse order via the
 * goto-unwind labels at the bottom.
 */
static int __init dccp_init(void)
{
	unsigned long goal;
	int ehash_order, bhash_order, i;
	int rc = -ENOBUFS;

	/* DCCP's per-skb control block must fit into skb->cb[]. */
	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
		     FIELD_SIZEOF(struct sk_buff, cb));

	dccp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("dccp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!dccp_hashinfo.bind_bucket_cachep)
		goto out;

	/*
	 * Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	if (num_physpages >= (128 * 1024))
		goal = num_physpages >> (21 - PAGE_SHIFT);
	else
		goal = num_physpages >> (23 - PAGE_SHIFT);

	/* The thash_entries module parameter overrides the heuristic. */
	if (thash_entries)
		goal = (thash_entries *
			sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
	for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
		;
	/*
	 * Try progressively smaller page orders until allocation
	 * succeeds; the bucket count is rounded down to a power of two
	 * so that hash masking works.
	 */
	do {
		dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
					sizeof(struct inet_ehash_bucket);
		while (dccp_hashinfo.ehash_size &
		       (dccp_hashinfo.ehash_size - 1))
			dccp_hashinfo.ehash_size--;
		dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
			__get_free_pages(GFP_ATOMIC, ehash_order);
	} while (!dccp_hashinfo.ehash && --ehash_order > 0);

	if (!dccp_hashinfo.ehash) {
		DCCP_CRIT("Failed to allocate DCCP established hash table");
		goto out_free_bind_bucket_cachep;
	}

	for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
	}

	if (inet_ehash_locks_alloc(&dccp_hashinfo))
			goto out_free_dccp_ehash;

	/* Start the bind table at the ehash order, shrinking while the
	 * resulting bucket count exceeds 64K (see `continue` below).
	 */
	bhash_order = ehash_order;

	do {
		dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
					sizeof(struct inet_bind_hashbucket);
		if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
		    bhash_order > 0)
			continue;
		dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
			__get_free_pages(GFP_ATOMIC, bhash_order);
	} while (!dccp_hashinfo.bhash && --bhash_order >= 0);

	if (!dccp_hashinfo.bhash) {
		DCCP_CRIT("Failed to allocate DCCP bind hash table");
		goto out_free_dccp_locks;
	}

	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
	}

	rc = dccp_mib_init();
	if (rc)
		goto out_free_dccp_bhash;

	rc = dccp_ackvec_init();
	if (rc)
		goto out_free_dccp_mib;

	rc = dccp_sysctl_init();
	if (rc)
		goto out_ackvec_exit;

	dccp_timestamping_init();
out:
	return rc;
	/* Error unwinding: release resources in reverse acquisition order. */
out_ackvec_exit:
	dccp_ackvec_exit();
out_free_dccp_mib:
	dccp_mib_exit();
out_free_dccp_bhash:
	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
	dccp_hashinfo.bhash = NULL;
out_free_dccp_locks:
	inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
	dccp_hashinfo.ehash = NULL;
out_free_bind_bucket_cachep:
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
	dccp_hashinfo.bind_bucket_cachep = NULL;
	goto out;
}
1213
/*
 * Module unload: release everything acquired by dccp_init().
 * NOTE(review): the page orders for free_pages() are recomputed here
 * with get_order() on size * sizeof(bucket), whereas dccp_init()
 * allocated by explicit order after rounding the bucket count down to
 * a power of two — this only matches exactly when the bucket structs
 * are power-of-two sized; confirm no pages can leak otherwise.
 */
static void __exit dccp_fini(void)
{
	dccp_mib_exit();
	free_pages((unsigned long)dccp_hashinfo.bhash,
		   get_order(dccp_hashinfo.bhash_size *
			     sizeof(struct inet_bind_hashbucket)));
	free_pages((unsigned long)dccp_hashinfo.ehash,
		   get_order(dccp_hashinfo.ehash_size *
			     sizeof(struct inet_ehash_bucket)));
	inet_ehash_locks_free(&dccp_hashinfo);
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
	dccp_ackvec_exit();
	dccp_sysctl_exit();
}
1228
/* Module registration and metadata. */
module_init(dccp_init);
module_exit(dccp_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");