net: packet: fix information leak to userland
net/packet/af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *              Johann Baudy    :       Added TX RING.
43  *
44  *              This program is free software; you can redistribute it and/or
45  *              modify it under the terms of the GNU General Public License
46  *              as published by the Free Software Foundation; either version
47  *              2 of the License, or (at your option) any later version.
48  *
49  */
50
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <linux/slab.h>
64 #include <net/net_namespace.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/cacheflush.h>
76 #include <asm/io.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
82 #include <linux/mutex.h>
83 #include <linux/if_vlan.h>
84 #include <linux/virtio_net.h>
85 #include <linux/errqueue.h>
86 #include <linux/net_tstamp.h>
87
88 #ifdef CONFIG_INET
89 #include <net/inet_common.h>
90 #endif
91
92 /*
93    Assumptions:
94    - if a device has no dev->hard_header routine, it adds and removes the ll
95      header itself. In this case the ll header is invisible outside of the
96      device, but higher levels should still reserve dev->hard_header_len.
97      Some devices are clever enough to reallocate the skb when the header
98      does not fit in the reserved space (tunnels); others are silly
99      (PPP).
100    - a packet socket receives packets with the ll header pulled off,
101      so SOCK_RAW must push it back.
102
103 On receive:
104 -----------
105
106 Incoming, dev->hard_header!=NULL
107    mac_header -> ll header
108    data       -> data
109
110 Outgoing, dev->hard_header!=NULL
111    mac_header -> ll header
112    data       -> ll header
113
114 Incoming, dev->hard_header==NULL
115    mac_header -> UNKNOWN position. It very likely points to the ll
116                  header.  PPP does this, which is wrong, because it
117                  introduces asymmetry between the rx and tx paths.
118    data       -> data
119
120 Outgoing, dev->hard_header==NULL
121    mac_header -> data. ll header is still not built!
122    data       -> data
123
124 Summary
125   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
126
127
128 On transmit:
129 ------------
130
131 dev->hard_header != NULL
132    mac_header -> ll header
133    data       -> ll header
134
135 dev->hard_header == NULL (ll header is added by device, we cannot control it)
136    mac_header -> data
137    data       -> data
138
139    We should set nh.raw on output to the correct position;
140    the packet classifier depends on it.
141  */
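
/*
 * For orientation, a minimal userspace sketch of the SOCK_RAW flavour
 * described above (hypothetical fd/buf names; requires CAP_NET_RAW,
 * error handling omitted). For SOCK_RAW the ll header has been pushed
 * back, so the buffer starts at the link-layer header:
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>	// ETH_P_ALL
 *	#include <arpa/inet.h>		// htons()
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	unsigned char buf[2048];
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 */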
142
143 /* Private packet socket structures. */
144
145 struct packet_mclist {
146         struct packet_mclist    *next;
147         int                     ifindex;
148         int                     count;
149         unsigned short          type;
150         unsigned short          alen;
151         unsigned char           addr[MAX_ADDR_LEN];
152 };
153 /* identical to struct packet_mreq except it has
154  * a longer address field.
155  */
156 struct packet_mreq_max {
157         int             mr_ifindex;
158         unsigned short  mr_type;
159         unsigned short  mr_alen;
160         unsigned char   mr_address[MAX_ADDR_LEN];
161 };
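
/*
 * Userspace passes the shorter struct packet_mreq from
 * <linux/if_packet.h>. A hedged sketch of enabling promiscuous mode
 * through this interface (fd is assumed to be an AF_PACKET socket,
 * ifindex the target device index):
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */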
162
163 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
164                 int closing, int tx_ring);
165
166 struct packet_ring_buffer {
167         char                    **pg_vec;
168         unsigned int            head;
169         unsigned int            frames_per_block;
170         unsigned int            frame_size;
171         unsigned int            frame_max;
172
173         unsigned int            pg_vec_order;
174         unsigned int            pg_vec_pages;
175         unsigned int            pg_vec_len;
176
177         atomic_t                pending;
178 };
179
180 struct packet_sock;
181 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
182
183 static void packet_flush_mclist(struct sock *sk);
184
185 struct packet_sock {
186         /* struct sock has to be the first member of packet_sock */
187         struct sock             sk;
188         struct tpacket_stats    stats;
189         struct packet_ring_buffer       rx_ring;
190         struct packet_ring_buffer       tx_ring;
191         int                     copy_thresh;
192         spinlock_t              bind_lock;
193         struct mutex            pg_vec_lock;
194         unsigned int            running:1,      /* prot_hook is attached*/
195                                 auxdata:1,
196                                 origdev:1,
197                                 has_vnet_hdr:1;
198         int                     ifindex;        /* bound device         */
199         __be16                  num;
200         struct packet_mclist    *mclist;
201         atomic_t                mapped;
202         enum tpacket_versions   tp_version;
203         unsigned int            tp_hdrlen;
204         unsigned int            tp_reserve;
205         unsigned int            tp_loss:1;
206         unsigned int            tp_tstamp;
207         struct packet_type      prot_hook ____cacheline_aligned_in_smp;
208 };
209
210 struct packet_skb_cb {
211         unsigned int origlen;
212         union {
213                 struct sockaddr_pkt pkt;
214                 struct sockaddr_ll ll;
215         } sa;
216 };
217
218 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
219
220 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
221 {
222         union {
223                 struct tpacket_hdr *h1;
224                 struct tpacket2_hdr *h2;
225                 void *raw;
226         } h;
227
228         h.raw = frame;
229         switch (po->tp_version) {
230         case TPACKET_V1:
231                 h.h1->tp_status = status;
232                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
233                 break;
234         case TPACKET_V2:
235                 h.h2->tp_status = status;
236                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
237                 break;
238         default:
239                 pr_err("TPACKET version not supported\n");
240                 BUG();
241         }
242
243         smp_wmb();
244 }
245
246 static int __packet_get_status(struct packet_sock *po, void *frame)
247 {
248         union {
249                 struct tpacket_hdr *h1;
250                 struct tpacket2_hdr *h2;
251                 void *raw;
252         } h;
253
254         smp_rmb();
255
256         h.raw = frame;
257         switch (po->tp_version) {
258         case TPACKET_V1:
259                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
260                 return h.h1->tp_status;
261         case TPACKET_V2:
262                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
263                 return h.h2->tp_status;
264         default:
265                 pr_err("TPACKET version not supported\n");
266                 BUG();
267                 return 0;
268         }
269 }
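
/*
 * The userspace half of this status handshake polls tp_status in the
 * mmap()ed frame and hands the frame back by writing TP_STATUS_KERNEL.
 * A sketch for TPACKET_V1 (fd and frame are assumed to come from ring
 * setup, not shown here):
 *
 *	struct tpacket_hdr *hdr = (struct tpacket_hdr *)frame;
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel
 *	// ... consume the frame contents ...
 *	hdr->tp_status = TP_STATUS_KERNEL;	// give it back
 */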
270
271 static void *packet_lookup_frame(struct packet_sock *po,
272                 struct packet_ring_buffer *rb,
273                 unsigned int position,
274                 int status)
275 {
276         unsigned int pg_vec_pos, frame_offset;
277         union {
278                 struct tpacket_hdr *h1;
279                 struct tpacket2_hdr *h2;
280                 void *raw;
281         } h;
282
283         pg_vec_pos = position / rb->frames_per_block;
284         frame_offset = position % rb->frames_per_block;
285
286         h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
287
288         if (status != __packet_get_status(po, h.raw))
289                 return NULL;
290
291         return h.raw;
292 }
293
294 static inline void *packet_current_frame(struct packet_sock *po,
295                 struct packet_ring_buffer *rb,
296                 int status)
297 {
298         return packet_lookup_frame(po, rb, rb->head, status);
299 }
300
301 static inline void *packet_previous_frame(struct packet_sock *po,
302                 struct packet_ring_buffer *rb,
303                 int status)
304 {
305         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
306         return packet_lookup_frame(po, rb, previous, status);
307 }
308
309 static inline void packet_increment_head(struct packet_ring_buffer *buff)
310 {
311         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
312 }
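
/*
 * Userspace walks the ring with the same arithmetic after mmap(). A
 * sketch, assuming req is the tpacket_req that was used for setup and
 * that frames do not straddle block boundaries (which packet_set_ring()
 * enforces):
 *
 *	static struct tpacket_req req;	// as passed to PACKET_RX_RING
 *
 *	static void *frame_at(void *ring, unsigned int i)
 *	{
 *		unsigned int fpb = req.tp_block_size / req.tp_frame_size;
 *
 *		return ring + (i / fpb) * req.tp_block_size
 *			    + (i % fpb) * req.tp_frame_size;
 *	}
 */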
313
314 static inline struct packet_sock *pkt_sk(struct sock *sk)
315 {
316         return (struct packet_sock *)sk;
317 }
318
319 static void packet_sock_destruct(struct sock *sk)
320 {
321         skb_queue_purge(&sk->sk_error_queue);
322
323         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
324         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
325
326         if (!sock_flag(sk, SOCK_DEAD)) {
327                 pr_err("Attempt to release alive packet socket: %p\n", sk);
328                 return;
329         }
330
331         sk_refcnt_debug_dec(sk);
332 }
333
334
335 static const struct proto_ops packet_ops;
336
337 static const struct proto_ops packet_ops_spkt;
338
339 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
340                            struct packet_type *pt, struct net_device *orig_dev)
341 {
342         struct sock *sk;
343         struct sockaddr_pkt *spkt;
344
345         /*
346          *      When we registered the protocol we saved the socket in the data
347          *      field for just this event.
348          */
349
350         sk = pt->af_packet_priv;
351
352         /*
353          *      Yank back the headers [hope the device set this
354          *      right or kerboom...]
355          *
356          *      Incoming packets have ll header pulled,
357          *      push it back.
358          *
359          *      For outgoing ones skb->data == skb_mac_header(skb),
360          *      so this procedure is a no-op.
361          */
362
363         if (skb->pkt_type == PACKET_LOOPBACK)
364                 goto out;
365
366         if (!net_eq(dev_net(dev), sock_net(sk)))
367                 goto out;
368
369         skb = skb_share_check(skb, GFP_ATOMIC);
370         if (skb == NULL)
371                 goto oom;
372
373         /* drop any routing info */
374         skb_dst_drop(skb);
375
376         /* drop conntrack reference */
377         nf_reset(skb);
378
379         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
380
381         skb_push(skb, skb->data - skb_mac_header(skb));
382
383         /*
384          *      The SOCK_PACKET socket receives _all_ frames.
385          */
386
387         spkt->spkt_family = dev->type;
388         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
389         spkt->spkt_protocol = skb->protocol;
390
391         /*
392          *      Charge the memory to the socket. This is done specifically
393          *      to prevent sockets using all the memory up.
394          */
395
396         if (sock_queue_rcv_skb(sk, skb) == 0)
397                 return 0;
398
399 out:
400         kfree_skb(skb);
401 oom:
402         return 0;
403 }
404
405
406 /*
407  *      Output a raw packet to a device layer. This bypasses all the other
408  *      protocol layers and you must therefore supply it with a complete frame
409  */
410
411 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
412                                struct msghdr *msg, size_t len)
413 {
414         struct sock *sk = sock->sk;
415         struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
416         struct sk_buff *skb = NULL;
417         struct net_device *dev;
418         __be16 proto = 0;
419         int err;
420
421         /*
422          *      Get and verify the address.
423          */
424
425         if (saddr) {
426                 if (msg->msg_namelen < sizeof(struct sockaddr))
427                         return -EINVAL;
428                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
429                         proto = saddr->spkt_protocol;
430         } else
431                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
432
433         /*
434          *      Find the device first to size check it
435          */
436
437         saddr->spkt_device[13] = 0;
438 retry:
439         rcu_read_lock();
440         dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
441         err = -ENODEV;
442         if (dev == NULL)
443                 goto out_unlock;
444
445         err = -ENETDOWN;
446         if (!(dev->flags & IFF_UP))
447                 goto out_unlock;
448
449         /*
450          * You may not queue a frame bigger than the mtu. This is the lowest level
451          * raw protocol and you must do your own fragmentation at this level.
452          */
453
454         err = -EMSGSIZE;
455         if (len > dev->mtu + dev->hard_header_len)
456                 goto out_unlock;
457
458         if (!skb) {
459                 size_t reserved = LL_RESERVED_SPACE(dev);
460                 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
461
462                 rcu_read_unlock();
463                 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
464                 if (skb == NULL)
465                         return -ENOBUFS;
466                 /* FIXME: Save some space for broken drivers that write a hard
467                  * header at transmission time by themselves. PPP is the notable
468                  * one here. This should really be fixed at the driver level.
469                  */
470                 skb_reserve(skb, reserved);
471                 skb_reset_network_header(skb);
472
473                 /* Try to align data part correctly */
474                 if (hhlen) {
475                         skb->data -= hhlen;
476                         skb->tail -= hhlen;
477                         if (len < hhlen)
478                                 skb_reset_network_header(skb);
479                 }
480                 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
481                 if (err)
482                         goto out_free;
483                 goto retry;
484         }
485
486
487         skb->protocol = proto;
488         skb->dev = dev;
489         skb->priority = sk->sk_priority;
490         skb->mark = sk->sk_mark;
491         err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
492         if (err < 0)
493                 goto out_unlock;
494
495         dev_queue_xmit(skb);
496         rcu_read_unlock();
497         return len;
498
499 out_unlock:
500         rcu_read_unlock();
501 out_free:
502         kfree_skb(skb);
503         return err;
504 }
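
/*
 * A hedged sketch of the obsolete SOCK_PACKET send side implemented
 * above; the device is named in sockaddr_pkt instead of being bound by
 * ifindex (fd, frame and frame_len are illustrative):
 *
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */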
505
506 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
507                                       unsigned int res)
508 {
509         struct sk_filter *filter;
510
511         rcu_read_lock_bh();
512         filter = rcu_dereference_bh(sk->sk_filter);
513         if (filter != NULL)
514                 res = sk_run_filter(skb, filter->insns, filter->len);
515         rcu_read_unlock_bh();
516
517         return res;
518 }
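
/*
 * The filter consulted here is installed from userspace with
 * SO_ATTACH_FILTER. A sketch of the simplest possible classic BPF
 * program, a single "accept the whole packet" instruction (fd is an
 * assumed AF_PACKET socket):
 *
 *	#include <linux/filter.h>
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },
 *	};
 *	struct sock_fprog prog = {
 *		.len    = sizeof(code) / sizeof(code[0]),
 *		.filter = code,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */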
519
520 /*
521    This function does lazy skb cloning in the hope that most packets
522    are discarded by BPF.
523
524    Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
525    and skb->cb are mangled. It works because (and as long as) packets
526    arriving here are owned by the current CPU. Output packets are cloned
527    by dev_queue_xmit_nit(), and input packets are processed by net_bh
528    sequentially, so if we return the skb to its original state on exit,
529    we will not harm anyone.
530  */
531
532 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
533                       struct packet_type *pt, struct net_device *orig_dev)
534 {
535         struct sock *sk;
536         struct sockaddr_ll *sll;
537         struct packet_sock *po;
538         u8 *skb_head = skb->data;
539         int skb_len = skb->len;
540         unsigned int snaplen, res;
541
542         if (skb->pkt_type == PACKET_LOOPBACK)
543                 goto drop;
544
545         sk = pt->af_packet_priv;
546         po = pkt_sk(sk);
547
548         if (!net_eq(dev_net(dev), sock_net(sk)))
549                 goto drop;
550
551         skb->dev = dev;
552
553         if (dev->header_ops) {
554                 /* The device has an explicit notion of ll header,
555                    exported to higher levels.
556
557                    Otherwise, the device hides the details of its frame
558                    structure, so the corresponding packet header is
559                    never delivered to the user.
560                  */
561                 if (sk->sk_type != SOCK_DGRAM)
562                         skb_push(skb, skb->data - skb_mac_header(skb));
563                 else if (skb->pkt_type == PACKET_OUTGOING) {
564                         /* Special case: outgoing packets have ll header at head */
565                         skb_pull(skb, skb_network_offset(skb));
566                 }
567         }
568
569         snaplen = skb->len;
570
571         res = run_filter(skb, sk, snaplen);
572         if (!res)
573                 goto drop_n_restore;
574         if (snaplen > res)
575                 snaplen = res;
576
577         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
578             (unsigned)sk->sk_rcvbuf)
579                 goto drop_n_acct;
580
581         if (skb_shared(skb)) {
582                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
583                 if (nskb == NULL)
584                         goto drop_n_acct;
585
586                 if (skb_head != skb->data) {
587                         skb->data = skb_head;
588                         skb->len = skb_len;
589                 }
590                 kfree_skb(skb);
591                 skb = nskb;
592         }
593
594         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
595                      sizeof(skb->cb));
596
597         sll = &PACKET_SKB_CB(skb)->sa.ll;
598         sll->sll_family = AF_PACKET;
599         sll->sll_hatype = dev->type;
600         sll->sll_protocol = skb->protocol;
601         sll->sll_pkttype = skb->pkt_type;
602         if (unlikely(po->origdev))
603                 sll->sll_ifindex = orig_dev->ifindex;
604         else
605                 sll->sll_ifindex = dev->ifindex;
606
607         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
608
609         PACKET_SKB_CB(skb)->origlen = skb->len;
610
611         if (pskb_trim(skb, snaplen))
612                 goto drop_n_acct;
613
614         skb_set_owner_r(skb, sk);
615         skb->dev = NULL;
616         skb_dst_drop(skb);
617
618         /* drop conntrack reference */
619         nf_reset(skb);
620
621         spin_lock(&sk->sk_receive_queue.lock);
622         po->stats.tp_packets++;
623         skb->dropcount = atomic_read(&sk->sk_drops);
624         __skb_queue_tail(&sk->sk_receive_queue, skb);
625         spin_unlock(&sk->sk_receive_queue.lock);
626         sk->sk_data_ready(sk, skb->len);
627         return 0;
628
629 drop_n_acct:
630         po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
631
632 drop_n_restore:
633         if (skb_head != skb->data && skb_shared(skb)) {
634                 skb->data = skb_head;
635                 skb->len = skb_len;
636         }
637 drop:
638         consume_skb(skb);
639         return 0;
640 }
641
642 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
643                        struct packet_type *pt, struct net_device *orig_dev)
644 {
645         struct sock *sk;
646         struct packet_sock *po;
647         struct sockaddr_ll *sll;
648         union {
649                 struct tpacket_hdr *h1;
650                 struct tpacket2_hdr *h2;
651                 void *raw;
652         } h;
653         u8 *skb_head = skb->data;
654         int skb_len = skb->len;
655         unsigned int snaplen, res;
656         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
657         unsigned short macoff, netoff, hdrlen;
658         struct sk_buff *copy_skb = NULL;
659         struct timeval tv;
660         struct timespec ts;
661         struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
662
663         if (skb->pkt_type == PACKET_LOOPBACK)
664                 goto drop;
665
666         sk = pt->af_packet_priv;
667         po = pkt_sk(sk);
668
669         if (!net_eq(dev_net(dev), sock_net(sk)))
670                 goto drop;
671
672         if (dev->header_ops) {
673                 if (sk->sk_type != SOCK_DGRAM)
674                         skb_push(skb, skb->data - skb_mac_header(skb));
675                 else if (skb->pkt_type == PACKET_OUTGOING) {
676                         /* Special case: outgoing packets have ll header at head */
677                         skb_pull(skb, skb_network_offset(skb));
678                 }
679         }
680
681         if (skb->ip_summed == CHECKSUM_PARTIAL)
682                 status |= TP_STATUS_CSUMNOTREADY;
683
684         snaplen = skb->len;
685
686         res = run_filter(skb, sk, snaplen);
687         if (!res)
688                 goto drop_n_restore;
689         if (snaplen > res)
690                 snaplen = res;
691
692         if (sk->sk_type == SOCK_DGRAM) {
693                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
694                                   po->tp_reserve;
695         } else {
696                 unsigned maclen = skb_network_offset(skb);
697                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
698                                        (maclen < 16 ? 16 : maclen)) +
699                         po->tp_reserve;
700                 macoff = netoff - maclen;
701         }
702
703         if (macoff + snaplen > po->rx_ring.frame_size) {
704                 if (po->copy_thresh &&
705                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
706                     (unsigned)sk->sk_rcvbuf) {
707                         if (skb_shared(skb)) {
708                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
709                         } else {
710                                 copy_skb = skb_get(skb);
711                                 skb_head = skb->data;
712                         }
713                         if (copy_skb)
714                                 skb_set_owner_r(copy_skb, sk);
715                 }
716                 snaplen = po->rx_ring.frame_size - macoff;
717                 if ((int)snaplen < 0)
718                         snaplen = 0;
719         }
720
721         spin_lock(&sk->sk_receive_queue.lock);
722         h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
723         if (!h.raw)
724                 goto ring_is_full;
725         packet_increment_head(&po->rx_ring);
726         po->stats.tp_packets++;
727         if (copy_skb) {
728                 status |= TP_STATUS_COPY;
729                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
730         }
731         if (!po->stats.tp_drops)
732                 status &= ~TP_STATUS_LOSING;
733         spin_unlock(&sk->sk_receive_queue.lock);
734
735         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
736
737         switch (po->tp_version) {
738         case TPACKET_V1:
739                 h.h1->tp_len = skb->len;
740                 h.h1->tp_snaplen = snaplen;
741                 h.h1->tp_mac = macoff;
742                 h.h1->tp_net = netoff;
743                 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
744                                 && shhwtstamps->syststamp.tv64)
745                         tv = ktime_to_timeval(shhwtstamps->syststamp);
746                 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
747                                 && shhwtstamps->hwtstamp.tv64)
748                         tv = ktime_to_timeval(shhwtstamps->hwtstamp);
749                 else if (skb->tstamp.tv64)
750                         tv = ktime_to_timeval(skb->tstamp);
751                 else
752                         do_gettimeofday(&tv);
753                 h.h1->tp_sec = tv.tv_sec;
754                 h.h1->tp_usec = tv.tv_usec;
755                 hdrlen = sizeof(*h.h1);
756                 break;
757         case TPACKET_V2:
758                 h.h2->tp_len = skb->len;
759                 h.h2->tp_snaplen = snaplen;
760                 h.h2->tp_mac = macoff;
761                 h.h2->tp_net = netoff;
762                 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
763                                 && shhwtstamps->syststamp.tv64)
764                         ts = ktime_to_timespec(shhwtstamps->syststamp);
765                 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
766                                 && shhwtstamps->hwtstamp.tv64)
767                         ts = ktime_to_timespec(shhwtstamps->hwtstamp);
768                 else if (skb->tstamp.tv64)
769                         ts = ktime_to_timespec(skb->tstamp);
770                 else
771                         getnstimeofday(&ts);
772                 h.h2->tp_sec = ts.tv_sec;
773                 h.h2->tp_nsec = ts.tv_nsec;
774                 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
775                 hdrlen = sizeof(*h.h2);
776                 break;
777         default:
778                 BUG();
779         }
780
781         sll = h.raw + TPACKET_ALIGN(hdrlen);
782         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
783         sll->sll_family = AF_PACKET;
784         sll->sll_hatype = dev->type;
785         sll->sll_protocol = skb->protocol;
786         sll->sll_pkttype = skb->pkt_type;
787         if (unlikely(po->origdev))
788                 sll->sll_ifindex = orig_dev->ifindex;
789         else
790                 sll->sll_ifindex = dev->ifindex;
791
792         __packet_set_status(po, h.raw, status);
793         smp_mb();
794         {
795                 struct page *p_start, *p_end;
796                 u8 *h_end = h.raw + macoff + snaplen - 1;
797
798                 p_start = virt_to_page(h.raw);
799                 p_end = virt_to_page(h_end);
800                 while (p_start <= p_end) {
801                         flush_dcache_page(p_start);
802                         p_start++;
803                 }
804         }
805
806         sk->sk_data_ready(sk, 0);
807
808 drop_n_restore:
809         if (skb_head != skb->data && skb_shared(skb)) {
810                 skb->data = skb_head;
811                 skb->len = skb_len;
812         }
813 drop:
814         kfree_skb(skb);
815         return 0;
816
817 ring_is_full:
818         po->stats.tp_drops++;
819         spin_unlock(&sk->sk_receive_queue.lock);
820
821         sk->sk_data_ready(sk, 0);
822         kfree_skb(copy_skb);
823         goto drop_n_restore;
824 }
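
/*
 * The receive path above fills frames that userspace set up with
 * PACKET_RX_RING and mmap(). A hedged setup sketch (the sizes are
 * illustrative only; error handling omitted):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_frame_size = 2048,
 *		.tp_block_nr   = 64,
 *		.tp_frame_nr   = 128,	// block_nr * frames per block
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * The payload of a ready frame f then starts at f + hdr->tp_mac and
 * the network header at f + hdr->tp_net, exactly as filled in above.
 */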
825
826 static void tpacket_destruct_skb(struct sk_buff *skb)
827 {
828         struct packet_sock *po = pkt_sk(skb->sk);
829         void *ph;
830
831         BUG_ON(skb == NULL);
832
833         if (likely(po->tx_ring.pg_vec)) {
834                 ph = skb_shinfo(skb)->destructor_arg;
835                 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
836                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
837                 atomic_dec(&po->tx_ring.pending);
838                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
839         }
840
841         sock_wfree(skb);
842 }
843
844 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
845                 void *frame, struct net_device *dev, int size_max,
846                 __be16 proto, unsigned char *addr)
847 {
848         union {
849                 struct tpacket_hdr *h1;
850                 struct tpacket2_hdr *h2;
851                 void *raw;
852         } ph;
853         int to_write, offset, len, tp_len, nr_frags, len_max;
854         struct socket *sock = po->sk.sk_socket;
855         struct page *page;
856         void *data;
857         int err;
858
859         ph.raw = frame;
860
861         skb->protocol = proto;
862         skb->dev = dev;
863         skb->priority = po->sk.sk_priority;
864         skb->mark = po->sk.sk_mark;
865         skb_shinfo(skb)->destructor_arg = ph.raw;
866
867         switch (po->tp_version) {
868         case TPACKET_V2:
869                 tp_len = ph.h2->tp_len;
870                 break;
871         default:
872                 tp_len = ph.h1->tp_len;
873                 break;
874         }
875         if (unlikely(tp_len > size_max)) {
876                 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
877                 return -EMSGSIZE;
878         }
879
880         skb_reserve(skb, LL_RESERVED_SPACE(dev));
881         skb_reset_network_header(skb);
882
883         data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
884         to_write = tp_len;
885
886         if (sock->type == SOCK_DGRAM) {
887                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
888                                 NULL, tp_len);
889                 if (unlikely(err < 0))
890                         return -EINVAL;
891         } else if (dev->hard_header_len) {
892                 /* net device doesn't like empty head */
893                 if (unlikely(tp_len <= dev->hard_header_len)) {
894                         pr_err("packet size is too short (%d < %d)\n",
895                                tp_len, dev->hard_header_len);
896                         return -EINVAL;
897                 }
898
899                 skb_push(skb, dev->hard_header_len);
900                 err = skb_store_bits(skb, 0, data,
901                                 dev->hard_header_len);
902                 if (unlikely(err))
903                         return err;
904
905                 data += dev->hard_header_len;
906                 to_write -= dev->hard_header_len;
907         }
908
909         err = -EFAULT;
910         page = virt_to_page(data);
911         offset = offset_in_page(data);
912         len_max = PAGE_SIZE - offset;
913         len = ((to_write > len_max) ? len_max : to_write);
914
915         skb->data_len = to_write;
916         skb->len += to_write;
917         skb->truesize += to_write;
918         atomic_add(to_write, &po->sk.sk_wmem_alloc);
919
920         while (likely(to_write)) {
921                 nr_frags = skb_shinfo(skb)->nr_frags;
922
923                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
924                         pr_err("Packet exceed the number of skb frags(%lu)\n",
925                                MAX_SKB_FRAGS);
926                         return -EFAULT;
927                 }
928
929                 flush_dcache_page(page);
930                 get_page(page);
931                 skb_fill_page_desc(skb,
932                                 nr_frags,
933                                 page++, offset, len);
934                 to_write -= len;
935                 offset = 0;
936                 len_max = PAGE_SIZE;
937                 len = ((to_write > len_max) ? len_max : to_write);
938         }
939
940         return tp_len;
941 }
942
943 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
944 {
945         struct socket *sock;
946         struct sk_buff *skb;
947         struct net_device *dev;
948         __be16 proto;
949         int ifindex, err, reserve = 0;
950         void *ph;
951         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
952         int tp_len, size_max;
953         unsigned char *addr;
954         int len_sum = 0;
955         int status = 0;
956
957         sock = po->sk.sk_socket;
958
959         mutex_lock(&po->pg_vec_lock);
960
961         err = -EBUSY;
962         if (saddr == NULL) {
963                 ifindex = po->ifindex;
964                 proto   = po->num;
965                 addr    = NULL;
966         } else {
967                 err = -EINVAL;
968                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
969                         goto out;
970                 if (msg->msg_namelen < (saddr->sll_halen
971                                         + offsetof(struct sockaddr_ll,
972                                                 sll_addr)))
973                         goto out;
974                 ifindex = saddr->sll_ifindex;
975                 proto   = saddr->sll_protocol;
976                 addr    = saddr->sll_addr;
977         }
978
979         dev = dev_get_by_index(sock_net(&po->sk), ifindex);
980         err = -ENXIO;
981         if (unlikely(dev == NULL))
982                 goto out;
983
984         reserve = dev->hard_header_len;
985
986         err = -ENETDOWN;
987         if (unlikely(!(dev->flags & IFF_UP)))
988                 goto out_put;
989
990         size_max = po->tx_ring.frame_size
991                 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
992
993         if (size_max > dev->mtu + reserve)
994                 size_max = dev->mtu + reserve;
995
996         do {
997                 ph = packet_current_frame(po, &po->tx_ring,
998                                 TP_STATUS_SEND_REQUEST);
999
1000                 if (unlikely(ph == NULL)) {
1001                         schedule();
1002                         continue;
1003                 }
1004
1005                 status = TP_STATUS_SEND_REQUEST;
1006                 skb = sock_alloc_send_skb(&po->sk,
1007                                 LL_ALLOCATED_SPACE(dev)
1008                                 + sizeof(struct sockaddr_ll),
1009                                 0, &err);
1010
1011                 if (unlikely(skb == NULL))
1012                         goto out_status;
1013
1014                 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1015                                 addr);
1016
1017                 if (unlikely(tp_len < 0)) {
1018                         if (po->tp_loss) {
1019                                 __packet_set_status(po, ph,
1020                                                 TP_STATUS_AVAILABLE);
1021                                 packet_increment_head(&po->tx_ring);
1022                                 kfree_skb(skb);
1023                                 continue;
1024                         } else {
1025                                 status = TP_STATUS_WRONG_FORMAT;
1026                                 err = tp_len;
1027                                 goto out_status;
1028                         }
1029                 }
1030
1031                 skb->destructor = tpacket_destruct_skb;
1032                 __packet_set_status(po, ph, TP_STATUS_SENDING);
1033                 atomic_inc(&po->tx_ring.pending);
1034
1035                 status = TP_STATUS_SEND_REQUEST;
1036                 err = dev_queue_xmit(skb);
1037                 if (unlikely(err > 0)) {
1038                         err = net_xmit_errno(err);
1039                         if (err && __packet_get_status(po, ph) ==
1040                                    TP_STATUS_AVAILABLE) {
1041                                 /* skb was destructed already */
1042                                 skb = NULL;
1043                                 goto out_status;
1044                         }
1045                         /*
1046                          * skb was dropped but not destructed yet;
1047                          * let's treat it like congestion or err < 0
1048                          */
1049                         err = 0;
1050                 }
1051                 packet_increment_head(&po->tx_ring);
1052                 len_sum += tp_len;
1053         } while (likely((ph != NULL) ||
1054                         ((!(msg->msg_flags & MSG_DONTWAIT)) &&
1055                          (atomic_read(&po->tx_ring.pending))))
1056                 );
1057
1058         err = len_sum;
1059         goto out_put;
1060
1061 out_status:
1062         __packet_set_status(po, ph, status);
1063         kfree_skb(skb);
1064 out_put:
1065         dev_put(dev);
1066 out:
1067         mutex_unlock(&po->pg_vec_lock);
1068         return err;
1069 }
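
/*
 * The matching userspace TX loop fills a frame, flags it
 * TP_STATUS_SEND_REQUEST and kicks the kernel with an empty send().
 * A TPACKET_V1 sketch (frame comes from a PACKET_TX_RING mapping;
 * pkt/pkt_len are an assumed, complete l2 frame):
 *
 *	struct tpacket_hdr *hdr = (struct tpacket_hdr *)frame;
 *	void *data = frame + TPACKET_HDRLEN - sizeof(struct sockaddr_ll);
 *
 *	memcpy(data, pkt, pkt_len);
 *	hdr->tp_len    = pkt_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);			// flush the ring
 */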
1070
1071 static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1072                                                size_t reserve, size_t len,
1073                                                size_t linear, int noblock,
1074                                                int *err)
1075 {
1076         struct sk_buff *skb;
1077
1078         /* Under a page?  Don't bother with paged skb. */
1079         if (prepad + len < PAGE_SIZE || !linear)
1080                 linear = len;
1081
1082         skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1083                                    err);
1084         if (!skb)
1085                 return NULL;
1086
1087         skb_reserve(skb, reserve);
1088         skb_put(skb, linear);
1089         skb->data_len = len - linear;
1090         skb->len += len - linear;
1091
1092         return skb;
1093 }
1094
1095 static int packet_snd(struct socket *sock,
1096                           struct msghdr *msg, size_t len)
1097 {
1098         struct sock *sk = sock->sk;
1099         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1100         struct sk_buff *skb;
1101         struct net_device *dev;
1102         __be16 proto;
1103         unsigned char *addr;
1104         int ifindex, err, reserve = 0;
1105         struct virtio_net_hdr vnet_hdr = { 0 };
1106         int offset = 0;
1107         int vnet_hdr_len;
1108         struct packet_sock *po = pkt_sk(sk);
1109         unsigned short gso_type = 0;
1110
1111         /*
1112          *      Get and verify the address.
1113          */
1114
1115         if (saddr == NULL) {
1116                 ifindex = po->ifindex;
1117                 proto   = po->num;
1118                 addr    = NULL;
1119         } else {
1120                 err = -EINVAL;
1121                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1122                         goto out;
1123                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1124                         goto out;
1125                 ifindex = saddr->sll_ifindex;
1126                 proto   = saddr->sll_protocol;
1127                 addr    = saddr->sll_addr;
1128         }
1129
1130
1131         dev = dev_get_by_index(sock_net(sk), ifindex);
1132         err = -ENXIO;
1133         if (dev == NULL)
1134                 goto out_unlock;
1135         if (sock->type == SOCK_RAW)
1136                 reserve = dev->hard_header_len;
1137
1138         err = -ENETDOWN;
1139         if (!(dev->flags & IFF_UP))
1140                 goto out_unlock;
1141
1142         if (po->has_vnet_hdr) {
1143                 vnet_hdr_len = sizeof(vnet_hdr);
1144
1145                 err = -EINVAL;
1146                 if (len < vnet_hdr_len)
1147                         goto out_unlock;
1148
1149                 len -= vnet_hdr_len;
1150
1151                 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1152                                        vnet_hdr_len);
1153                 if (err < 0)
1154                         goto out_unlock;
1155
1156                 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1157                     (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1158                       vnet_hdr.hdr_len))
1159                         vnet_hdr.hdr_len = vnet_hdr.csum_start +
1160                                                  vnet_hdr.csum_offset + 2;
1161
1162                 err = -EINVAL;
1163                 if (vnet_hdr.hdr_len > len)
1164                         goto out_unlock;
1165
1166                 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1167                         switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1168                         case VIRTIO_NET_HDR_GSO_TCPV4:
1169                                 gso_type = SKB_GSO_TCPV4;
1170                                 break;
1171                         case VIRTIO_NET_HDR_GSO_TCPV6:
1172                                 gso_type = SKB_GSO_TCPV6;
1173                                 break;
1174                         case VIRTIO_NET_HDR_GSO_UDP:
1175                                 gso_type = SKB_GSO_UDP;
1176                                 break;
1177                         default:
1178                                 goto out_unlock;
1179                         }
1180
1181                         if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1182                                 gso_type |= SKB_GSO_TCP_ECN;
1183
1184                         if (vnet_hdr.gso_size == 0)
1185                                 goto out_unlock;
1186
1187                 }
1188         }
1189
1190         err = -EMSGSIZE;
1191         if (!gso_type && (len > dev->mtu+reserve))
1192                 goto out_unlock;
1193
1194         err = -ENOBUFS;
1195         skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1196                                LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1197                                msg->msg_flags & MSG_DONTWAIT, &err);
1198         if (skb == NULL)
1199                 goto out_unlock;
1200
1201         skb_set_network_header(skb, reserve);
1202
1203         err = -EINVAL;
1204         if (sock->type == SOCK_DGRAM &&
1205             (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1206                 goto out_free;
1207
1208         /* Returns -EFAULT on error */
1209         err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1210         if (err)
1211                 goto out_free;
1212         err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1213         if (err < 0)
1214                 goto out_free;
1215
1216         skb->protocol = proto;
1217         skb->dev = dev;
1218         skb->priority = sk->sk_priority;
1219         skb->mark = sk->sk_mark;
1220
1221         if (po->has_vnet_hdr) {
1222                 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1223                         if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1224                                                   vnet_hdr.csum_offset)) {
1225                                 err = -EINVAL;
1226                                 goto out_free;
1227                         }
1228                 }
1229
1230                 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1231                 skb_shinfo(skb)->gso_type = gso_type;
1232
1233                 /* Header must be checked, and gso_segs computed. */
1234                 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1235                 skb_shinfo(skb)->gso_segs = 0;
1236
1237                 len += vnet_hdr_len;
1238         }
1239
1240         /*
1241          *      Now send it
1242          */
1243
1244         err = dev_queue_xmit(skb);
1245         if (err > 0 && (err = net_xmit_errno(err)) != 0)
1246                 goto out_unlock;
1247
1248         dev_put(dev);
1249
1250         return len;
1251
1252 out_free:
1253         kfree_skb(skb);
1254 out_unlock:
1255         if (dev)
1256                 dev_put(dev);
1257 out:
1258         return err;
1259 }
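
/*
 * The virtio_net_hdr handling above is enabled per socket from
 * userspace. A sketch of a plain (non-GSO) send with the header
 * prepended, assuming the socket was already bound to a device and
 * frame/frame_len are illustrative:
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &on, sizeof(on));
 *
 *	struct virtio_net_hdr vh = { .gso_type = VIRTIO_NET_HDR_GSO_NONE };
 *	struct iovec iov[2] = {
 *		{ &vh,   sizeof(vh) },
 *		{ frame, frame_len  },
 *	};
 *	writev(fd, iov, 2);
 */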
1260
1261 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1262                 struct msghdr *msg, size_t len)
1263 {
1264         struct sock *sk = sock->sk;
1265         struct packet_sock *po = pkt_sk(sk);
1266         if (po->tx_ring.pg_vec)
1267                 return tpacket_snd(po, msg);
1268         else
1269                 return packet_snd(sock, msg, len);
1270 }
1271
1272 /*
1273  *      Close a PACKET socket. This is fairly simple. We immediately go
1274  *      to 'closed' state and remove our protocol entry in the device list.
1275  */
1276
1277 static int packet_release(struct socket *sock)
1278 {
1279         struct sock *sk = sock->sk;
1280         struct packet_sock *po;
1281         struct net *net;
1282         struct tpacket_req req;
1283
1284         if (!sk)
1285                 return 0;
1286
1287         net = sock_net(sk);
1288         po = pkt_sk(sk);
1289
1290         spin_lock_bh(&net->packet.sklist_lock);
1291         sk_del_node_init_rcu(sk);
1292         sock_prot_inuse_add(net, sk->sk_prot, -1);
1293         spin_unlock_bh(&net->packet.sklist_lock);
1294
1295         spin_lock(&po->bind_lock);
1296         if (po->running) {
1297                 /*
1298                  * Remove from protocol table
1299                  */
1300                 po->running = 0;
1301                 po->num = 0;
1302                 __dev_remove_pack(&po->prot_hook);
1303                 __sock_put(sk);
1304         }
1305         spin_unlock(&po->bind_lock);
1306
1307         packet_flush_mclist(sk);
1308
1309         memset(&req, 0, sizeof(req));
1310
1311         if (po->rx_ring.pg_vec)
1312                 packet_set_ring(sk, &req, 1, 0);
1313
1314         if (po->tx_ring.pg_vec)
1315                 packet_set_ring(sk, &req, 1, 1);
1316
1317         synchronize_net();
1318         /*
1319          *      Now the socket is dead. No more input will appear.
1320          */
1321         sock_orphan(sk);
1322         sock->sk = NULL;
1323
1324         /* Purge queues */
1325
1326         skb_queue_purge(&sk->sk_receive_queue);
1327         sk_refcnt_debug_release(sk);
1328
1329         sock_put(sk);
1330         return 0;
1331 }
1332
1333 /*
1334  *      Attach a packet hook.
1335  */
1336
1337 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1338 {
1339         struct packet_sock *po = pkt_sk(sk);
1340         /*
1341          *      Detach an existing hook if present.
1342          */
1343
1344         lock_sock(sk);
1345
1346         spin_lock(&po->bind_lock);
1347         if (po->running) {
1348                 __sock_put(sk);
1349                 po->running = 0;
1350                 po->num = 0;
1351                 spin_unlock(&po->bind_lock);
1352                 dev_remove_pack(&po->prot_hook);
1353                 spin_lock(&po->bind_lock);
1354         }
1355
1356         po->num = protocol;
1357         po->prot_hook.type = protocol;
1358         po->prot_hook.dev = dev;
1359
1360         po->ifindex = dev ? dev->ifindex : 0;
1361
1362         if (protocol == 0)
1363                 goto out_unlock;
1364
1365         if (!dev || (dev->flags & IFF_UP)) {
1366                 dev_add_pack(&po->prot_hook);
1367                 sock_hold(sk);
1368                 po->running = 1;
1369         } else {
1370                 sk->sk_err = ENETDOWN;
1371                 if (!sock_flag(sk, SOCK_DEAD))
1372                         sk->sk_error_report(sk);
1373         }
1374
1375 out_unlock:
1376         spin_unlock(&po->bind_lock);
1377         release_sock(sk);
1378         return 0;
1379 }
1380
1381 /*
1382  *      Bind a packet socket to a device
1383  */
1384
1385 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1386                             int addr_len)
1387 {
1388         struct sock *sk = sock->sk;
1389         char name[15];
1390         struct net_device *dev;
1391         int err = -ENODEV;
1392
1393         /*
1394          *      Check legality
1395          */
1396
1397         if (addr_len != sizeof(struct sockaddr))
1398                 return -EINVAL;
1399         strlcpy(name, uaddr->sa_data, sizeof(name));
1400
1401         dev = dev_get_by_name(sock_net(sk), name);
1402         if (dev) {
1403                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1404                 dev_put(dev);
1405         }
1406         return err;
1407 }
1408
1409 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1410 {
1411         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1412         struct sock *sk = sock->sk;
1413         struct net_device *dev = NULL;
1414         int err;
1415
1416
1417         /*
1418          *      Check legality
1419          */
1420
1421         if (addr_len < sizeof(struct sockaddr_ll))
1422                 return -EINVAL;
1423         if (sll->sll_family != AF_PACKET)
1424                 return -EINVAL;
1425
1426         if (sll->sll_ifindex) {
1427                 err = -ENODEV;
1428                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1429                 if (dev == NULL)
1430                         goto out;
1431         }
1432         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1433         if (dev)
1434                 dev_put(dev);
1435
1436 out:
1437         return err;
1438 }
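
/*
 * A hedged userspace sketch of the bind() handled above, attaching the
 * socket to a single device by ifindex ("eth0" is illustrative):
 *
 *	#include <net/if.h>	// if_nametoindex()
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */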
1439
1440 static struct proto packet_proto = {
1441         .name     = "PACKET",
1442         .owner    = THIS_MODULE,
1443         .obj_size = sizeof(struct packet_sock),
1444 };
1445
1446 /*
1447  *      Create a packet of type SOCK_PACKET.
1448  */
1449
1450 static int packet_create(struct net *net, struct socket *sock, int protocol,
1451                          int kern)
1452 {
1453         struct sock *sk;
1454         struct packet_sock *po;
1455         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1456         int err;
1457
1458         if (!capable(CAP_NET_RAW))
1459                 return -EPERM;
1460         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1461             sock->type != SOCK_PACKET)
1462                 return -ESOCKTNOSUPPORT;
1463
1464         sock->state = SS_UNCONNECTED;
1465
1466         err = -ENOBUFS;
1467         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1468         if (sk == NULL)
1469                 goto out;
1470
1471         sock->ops = &packet_ops;
1472         if (sock->type == SOCK_PACKET)
1473                 sock->ops = &packet_ops_spkt;
1474
1475         sock_init_data(sock, sk);
1476
1477         po = pkt_sk(sk);
1478         sk->sk_family = PF_PACKET;
1479         po->num = proto;
1480
1481         sk->sk_destruct = packet_sock_destruct;
1482         sk_refcnt_debug_inc(sk);
1483
1484         /*
1485          *      Attach a protocol block
1486          */
1487
1488         spin_lock_init(&po->bind_lock);
1489         mutex_init(&po->pg_vec_lock);
1490         po->prot_hook.func = packet_rcv;
1491
1492         if (sock->type == SOCK_PACKET)
1493                 po->prot_hook.func = packet_rcv_spkt;
1494
1495         po->prot_hook.af_packet_priv = sk;
1496
1497         if (proto) {
1498                 po->prot_hook.type = proto;
1499                 dev_add_pack(&po->prot_hook);
1500                 sock_hold(sk);
1501                 po->running = 1;
1502         }
1503
1504         spin_lock_bh(&net->packet.sklist_lock);
1505         sk_add_node_rcu(sk, &net->packet.sklist);
1506         sock_prot_inuse_add(net, &packet_proto, 1);
1507         spin_unlock_bh(&net->packet.sklist_lock);
1508
1509         return 0;
1510 out:
1511         return err;
1512 }
1513
1514 static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
1515 {
1516         struct sock_exterr_skb *serr;
1517         struct sk_buff *skb, *skb2;
1518         int copied, err;
1519
1520         err = -EAGAIN;
1521         skb = skb_dequeue(&sk->sk_error_queue);
1522         if (skb == NULL)
1523                 goto out;
1524
1525         copied = skb->len;
1526         if (copied > len) {
1527                 msg->msg_flags |= MSG_TRUNC;
1528                 copied = len;
1529         }
1530         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1531         if (err)
1532                 goto out_free_skb;
1533
1534         sock_recv_timestamp(msg, sk, skb);
1535
1536         serr = SKB_EXT_ERR(skb);
1537         put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
1538                  sizeof(serr->ee), &serr->ee);
1539
1540         msg->msg_flags |= MSG_ERRQUEUE;
1541         err = copied;
1542
1543         /* Reset and regenerate socket error */
1544         spin_lock_bh(&sk->sk_error_queue.lock);
1545         sk->sk_err = 0;
1546         if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
1547                 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
1548                 spin_unlock_bh(&sk->sk_error_queue.lock);
1549                 sk->sk_error_report(sk);
1550         } else
1551                 spin_unlock_bh(&sk->sk_error_queue.lock);
1552
1553 out_free_skb:
1554         kfree_skb(skb);
1555 out:
1556         return err;
1557 }
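
/*
 * Userspace drains these TX-timestamp notifications from the error
 * queue. A sketch; the timestamp value itself arrives in a separate
 * SOL_SOCKET/SCM_TIMESTAMPING control message added by
 * sock_recv_timestamp() above:
 *
 *	char ctl[512], buf[256];
 *	struct iovec iov = { buf, sizeof(buf) };
 *	struct msghdr msg = {
 *		.msg_iov        = &iov,
 *		.msg_iovlen     = 1,
 *		.msg_control    = ctl,
 *		.msg_controllen = sizeof(ctl),
 *	};
 *	struct cmsghdr *cm;
 *
 *	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
 *		return;
 *	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
 *		if (cm->cmsg_level == SOL_PACKET &&
 *		    cm->cmsg_type == PACKET_TX_TIMESTAMP)
 *			;	// struct sock_extended_err at CMSG_DATA(cm)
 */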
1558
1559 /*
1560  *      Pull a packet from our receive queue and hand it to the user.
1561  *      If necessary we block.
1562  */
1563
1564 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1565                           struct msghdr *msg, size_t len, int flags)
1566 {
1567         struct sock *sk = sock->sk;
1568         struct sk_buff *skb;
1569         int copied, err;
1570         struct sockaddr_ll *sll;
1571         int vnet_hdr_len = 0;
1572
1573         err = -EINVAL;
1574         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1575                 goto out;
1576
1577 #if 0
1578         /* What error should we return now? EUNATTACH? */
1579         if (pkt_sk(sk)->ifindex < 0)
1580                 return -ENODEV;
1581 #endif
1582
1583         if (flags & MSG_ERRQUEUE) {
1584                 err = packet_recv_error(sk, msg, len);
1585                 goto out;
1586         }
1587
1588         /*
1589          *      Call the generic datagram receiver. This handles all sorts
1590          *      of horrible races and re-entrancy so we can forget about it
1591          *      in the protocol layers.
1592          *
1593          *      Now it will return ENETDOWN if the device has just gone
1594          *      down, but then it will block.
1595          */
1596
1597         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1598
1599         /*
1600          *      If an error occurred, return it.  skb_recv_datagram()
1601          *      handles the blocking for us, so we need not see or worry
1602          *      about blocking retries.
1603          */
1604
1605         if (skb == NULL)
1606                 goto out;
1607
1608         if (pkt_sk(sk)->has_vnet_hdr) {
1609                 struct virtio_net_hdr vnet_hdr = { 0 };
1610
1611                 err = -EINVAL;
1612                 vnet_hdr_len = sizeof(vnet_hdr);
                     /* len is a size_t, so the old "(len -= vnet_hdr_len) < 0"
                      * check could never fire; test before subtracting */
1613                 if (len < vnet_hdr_len)
1614                         goto out_free;
                     len -= vnet_hdr_len;
1615
1616                 if (skb_is_gso(skb)) {
1617                         struct skb_shared_info *sinfo = skb_shinfo(skb);
1618
1619                         /* This is a hint as to how much should be linear. */
1620                         vnet_hdr.hdr_len = skb_headlen(skb);
1621                         vnet_hdr.gso_size = sinfo->gso_size;
1622                         if (sinfo->gso_type & SKB_GSO_TCPV4)
1623                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1624                         else if (sinfo->gso_type & SKB_GSO_TCPV6)
1625                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1626                         else if (sinfo->gso_type & SKB_GSO_UDP)
1627                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1628                         else if (sinfo->gso_type & SKB_GSO_FCOE)
1629                                 goto out_free;
1630                         else
1631                                 BUG();
1632                         if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1633                                 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1634                 } else
1635                         vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1636
1637                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1638                         vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1639                         vnet_hdr.csum_start = skb->csum_start -
1640                                                         skb_headroom(skb);
1641                         vnet_hdr.csum_offset = skb->csum_offset;
1642                 } /* else everything is zero */
1643
1644                 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1645                                      vnet_hdr_len);
1646                 if (err < 0)
1647                         goto out_free;
1648         }
1649
1650         /*
1651          *      Fill in the address length now; the address itself is
1652          *      copied out below only if msg_name was supplied.
1653          */
1654
1655         sll = &PACKET_SKB_CB(skb)->sa.ll;
1656         if (sock->type == SOCK_PACKET)
1657                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1658         else
1659                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1660
1661         /*
1662          *      You lose any data beyond the buffer you gave. If this
1663          *      worries a user program, it can ask the device for its MTU
1664          *      anyway.
1664          */
1665
1666         copied = skb->len;
1667         if (copied > len) {
1668                 copied = len;
1669                 msg->msg_flags |= MSG_TRUNC;
1670         }
1671
1672         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1673         if (err)
1674                 goto out_free;
1675
1676         sock_recv_ts_and_drops(msg, sk, skb);
1677
1678         if (msg->msg_name)
1679                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1680                        msg->msg_namelen);
1681
1682         if (pkt_sk(sk)->auxdata) {
1683                 struct tpacket_auxdata aux;
1684
1685                 aux.tp_status = TP_STATUS_USER;
1686                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1687                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1688                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1689                 aux.tp_snaplen = skb->len;
1690                 aux.tp_mac = 0;
1691                 aux.tp_net = skb_network_offset(skb);
1692                 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1693
1694                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1695         }
1696
1697         /*
1698          *      Free or return the buffer as appropriate. Again this
1699          *      hides all the races and re-entrancy issues from us.
1700          */
1701         err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1702
1703 out_free:
1704         skb_free_datagram(sk, skb);
1705 out:
1706         return err;
1707 }
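
     /*
      * Illustrative only (userspace): with PACKET_AUXDATA enabled, each
      * recvmsg() also carries the control message put above, which can be
      * picked up alongside the packet data:
      *
      *	struct cmsghdr *cmsg;
      *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
      *	     cmsg = CMSG_NXTHDR(&msg, cmsg))
      *		if (cmsg->cmsg_level == SOL_PACKET &&
      *		    cmsg->cmsg_type == PACKET_AUXDATA)
      *			CMSG_DATA(cmsg) points at a struct tpacket_auxdata
      */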
1708
1709 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1710                                int *uaddr_len, int peer)
1711 {
1712         struct net_device *dev;
1713         struct sock *sk = sock->sk;
1714
1715         if (peer)
1716                 return -EOPNOTSUPP;
1717
1718         uaddr->sa_family = AF_PACKET;
1719         rcu_read_lock();
1720         dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
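             /*
              * Fill all 14 bytes of sa_data on both paths: strncpy()
              * NUL-pads short names, so no uninitialized kernel stack
              * data is copied back to userland.
              */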
1721         if (dev)
1722                 strncpy(uaddr->sa_data, dev->name, 14);
1723         else
1724                 memset(uaddr->sa_data, 0, 14);
1725         rcu_read_unlock();
1726         *uaddr_len = sizeof(*uaddr);
1727
1728         return 0;
1729 }
1730
1731 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1732                           int *uaddr_len, int peer)
1733 {
1734         struct net_device *dev;
1735         struct sock *sk = sock->sk;
1736         struct packet_sock *po = pkt_sk(sk);
1737         DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1738
1739         if (peer)
1740                 return -EOPNOTSUPP;
1741
1742         sll->sll_family = AF_PACKET;
1743         sll->sll_ifindex = po->ifindex;
1744         sll->sll_protocol = po->num;
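             /* meaningless on getname; zero it so no stack garbage leaks out */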
1745         sll->sll_pkttype = 0;
1746         rcu_read_lock();
1747         dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1748         if (dev) {
1749                 sll->sll_hatype = dev->type;
1750                 sll->sll_halen = dev->addr_len;
1751                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1752         } else {
1753                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1754                 sll->sll_halen = 0;
1755         }
1756         rcu_read_unlock();
1757         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1758
1759         return 0;
1760 }
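
     /*
      * Illustrative only (userspace): the address filled in above is
      * normally retrieved as
      *
      *	struct sockaddr_ll sll;
      *	socklen_t alen = sizeof(sll);
      *	getsockname(fd, (struct sockaddr *)&sll, &alen);
      *
      * where "fd" is assumed to be a bound AF_PACKET socket.
      */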
1761
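     /*
      * Apply (what == 1) or back out (what == -1) one membership request
      * on @dev.  For PACKET_MR_PROMISC and PACKET_MR_ALLMULTI the value
      * is passed straight through as a reference-count adjustment.
      */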
1762 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1763                          int what)
1764 {
1765         switch (i->type) {
1766         case PACKET_MR_MULTICAST:
1767                 if (i->alen != dev->addr_len)
1768                         return -EINVAL;
1769                 if (what > 0)
1770                         return dev_mc_add(dev, i->addr);
1771                 else
1772                         return dev_mc_del(dev, i->addr);
1773                 break;
1774         case PACKET_MR_PROMISC:
1775                 return dev_set_promiscuity(dev, what);
1776                 break;
1777         case PACKET_MR_ALLMULTI:
1778                 return dev_set_allmulti(dev, what);
1779                 break;
1780         case PACKET_MR_UNICAST:
1781                 if (i->alen != dev->addr_len)
1782                         return -EINVAL;
1783                 if (what > 0)
1784                         return dev_uc_add(dev, i->addr);
1785                 else
1786                         return dev_uc_del(dev, i->addr);
1787                 break;
1788         default:
1789                 break;
1790         }
1791         return 0;
1792 }
1793
1794 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1795 {
1796         for ( ; i; i = i->next) {
1797                 if (i->ifindex == dev->ifindex)
1798                         packet_dev_mc(dev, i, what);
1799         }
1800 }
1801
1802 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1803 {
1804         struct packet_sock *po = pkt_sk(sk);
1805         struct packet_mclist *ml, *i;
1806         struct net_device *dev;
1807         int err;
1808
1809         rtnl_lock();
1810
1811         err = -ENODEV;
1812         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1813         if (!dev)
1814                 goto done;
1815
1816         err = -EINVAL;
1817         if (mreq->mr_alen > dev->addr_len)
1818                 goto done;
1819
1820         err = -ENOBUFS;
1821         i = kmalloc(sizeof(*i), GFP_KERNEL);
1822         if (i == NULL)
1823                 goto done;
1824
1825         err = 0;
1826         for (ml = po->mclist; ml; ml = ml->next) {
1827                 if (ml->ifindex == mreq->mr_ifindex &&
1828                     ml->type == mreq->mr_type &&
1829                     ml->alen == mreq->mr_alen &&
1830                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1831                         ml->count++;
1832                         /* Free the new element ... */
1833                         kfree(i);
1834                         goto done;
1835                 }
1836         }
1837
1838         i->type = mreq->mr_type;
1839         i->ifindex = mreq->mr_ifindex;
1840         i->alen = mreq->mr_alen;
1841         memcpy(i->addr, mreq->mr_address, i->alen);
1842         i->count = 1;
1843         i->next = po->mclist;
1844         po->mclist = i;
1845         err = packet_dev_mc(dev, i, 1);
1846         if (err) {
1847                 po->mclist = i->next;
1848                 kfree(i);
1849         }
1850
1851 done:
1852         rtnl_unlock();
1853         return err;
1854 }
1855
1856 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1857 {
1858         struct packet_mclist *ml, **mlp;
1859
1860         rtnl_lock();
1861
1862         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1863                 if (ml->ifindex == mreq->mr_ifindex &&
1864                     ml->type == mreq->mr_type &&
1865                     ml->alen == mreq->mr_alen &&
1866                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1867                         if (--ml->count == 0) {
1868                                 struct net_device *dev;
1869                                 *mlp = ml->next;
1870                                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1871                                 if (dev)
1872                                         packet_dev_mc(dev, ml, -1);
1873                                 kfree(ml);
1874                         }
1875                         rtnl_unlock();
1876                         return 0;
1877                 }
1878         }
1879         rtnl_unlock();
1880         return -EADDRNOTAVAIL;
1881 }
1882
1883 static void packet_flush_mclist(struct sock *sk)
1884 {
1885         struct packet_sock *po = pkt_sk(sk);
1886         struct packet_mclist *ml;
1887
1888         if (!po->mclist)
1889                 return;
1890
1891         rtnl_lock();
1892         while ((ml = po->mclist) != NULL) {
1893                 struct net_device *dev;
1894
1895                 po->mclist = ml->next;
1896                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1897                 if (dev != NULL)
1898                         packet_dev_mc(dev, ml, -1);
1899                 kfree(ml);
1900         }
1901         rtnl_unlock();
1902 }
1903
1904 static int
1905 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1906 {
1907         struct sock *sk = sock->sk;
1908         struct packet_sock *po = pkt_sk(sk);
1909         int ret;
1910
1911         if (level != SOL_PACKET)
1912                 return -ENOPROTOOPT;
1913
1914         switch (optname) {
1915         case PACKET_ADD_MEMBERSHIP:
1916         case PACKET_DROP_MEMBERSHIP:
1917         {
1918                 struct packet_mreq_max mreq;
1919                 int len = optlen;
1920                 memset(&mreq, 0, sizeof(mreq));
1921                 if (len < sizeof(struct packet_mreq))
1922                         return -EINVAL;
1923                 if (len > sizeof(mreq))
1924                         len = sizeof(mreq);
1925                 if (copy_from_user(&mreq, optval, len))
1926                         return -EFAULT;
1927                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1928                         return -EINVAL;
1929                 if (optname == PACKET_ADD_MEMBERSHIP)
1930                         ret = packet_mc_add(sk, &mreq);
1931                 else
1932                         ret = packet_mc_drop(sk, &mreq);
1933                 return ret;
1934         }
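             /*
              * Userspace sketch (illustrative only): enabling promiscuous
              * mode through this option; "fd" and "ifindex" are assumed
              * to exist in the caller:
              *
              *	struct packet_mreq mreq = {
              *		.mr_ifindex = ifindex,
              *		.mr_type    = PACKET_MR_PROMISC,
              *	};
              *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
              *		   &mreq, sizeof(mreq));
              */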
1935
1936         case PACKET_RX_RING:
1937         case PACKET_TX_RING:
1938         {
1939                 struct tpacket_req req;
1940
1941                 if (optlen < sizeof(req))
1942                         return -EINVAL;
1943                 if (pkt_sk(sk)->has_vnet_hdr)
1944                         return -EINVAL;
1945                 if (copy_from_user(&req, optval, sizeof(req)))
1946                         return -EFAULT;
1947                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1948         }
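             /*
              * Illustrative only: a ring request must satisfy the geometry
              * checks in packet_set_ring().  Assuming 4 KiB pages:
              *
              *	struct tpacket_req req = {
              *		.tp_block_size = 4096,	(multiple of PAGE_SIZE)
              *		.tp_frame_size = 2048,	(TPACKET_ALIGNMENT multiple)
              *		.tp_block_nr   = 64,
              *		.tp_frame_nr   = 128,	(2 frames per block * 64)
              *	};
              *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
              *		   &req, sizeof(req));
              */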
1949         case PACKET_COPY_THRESH:
1950         {
1951                 int val;
1952
1953                 if (optlen != sizeof(val))
1954                         return -EINVAL;
1955                 if (copy_from_user(&val, optval, sizeof(val)))
1956                         return -EFAULT;
1957
1958                 pkt_sk(sk)->copy_thresh = val;
1959                 return 0;
1960         }
1961         case PACKET_VERSION:
1962         {
1963                 int val;
1964
1965                 if (optlen != sizeof(val))
1966                         return -EINVAL;
1967                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1968                         return -EBUSY;
1969                 if (copy_from_user(&val, optval, sizeof(val)))
1970                         return -EFAULT;
1971                 switch (val) {
1972                 case TPACKET_V1:
1973                 case TPACKET_V2:
1974                         po->tp_version = val;
1975                         return 0;
1976                 default:
1977                         return -EINVAL;
1978                 }
1979         }
1980         case PACKET_RESERVE:
1981         {
1982                 unsigned int val;
1983
1984                 if (optlen != sizeof(val))
1985                         return -EINVAL;
1986                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1987                         return -EBUSY;
1988                 if (copy_from_user(&val, optval, sizeof(val)))
1989                         return -EFAULT;
1990                 po->tp_reserve = val;
1991                 return 0;
1992         }
1993         case PACKET_LOSS:
1994         {
1995                 unsigned int val;
1996
1997                 if (optlen != sizeof(val))
1998                         return -EINVAL;
1999                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2000                         return -EBUSY;
2001                 if (copy_from_user(&val, optval, sizeof(val)))
2002                         return -EFAULT;
2003                 po->tp_loss = !!val;
2004                 return 0;
2005         }
2006         case PACKET_AUXDATA:
2007         {
2008                 int val;
2009
2010                 if (optlen < sizeof(val))
2011                         return -EINVAL;
2012                 if (copy_from_user(&val, optval, sizeof(val)))
2013                         return -EFAULT;
2014
2015                 po->auxdata = !!val;
2016                 return 0;
2017         }
2018         case PACKET_ORIGDEV:
2019         {
2020                 int val;
2021
2022                 if (optlen < sizeof(val))
2023                         return -EINVAL;
2024                 if (copy_from_user(&val, optval, sizeof(val)))
2025                         return -EFAULT;
2026
2027                 po->origdev = !!val;
2028                 return 0;
2029         }
2030         case PACKET_VNET_HDR:
2031         {
2032                 int val;
2033
2034                 if (sock->type != SOCK_RAW)
2035                         return -EINVAL;
2036                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2037                         return -EBUSY;
2038                 if (optlen < sizeof(val))
2039                         return -EINVAL;
2040                 if (copy_from_user(&val, optval, sizeof(val)))
2041                         return -EFAULT;
2042
2043                 po->has_vnet_hdr = !!val;
2044                 return 0;
2045         }
2046         case PACKET_TIMESTAMP:
2047         {
2048                 int val;
2049
2050                 if (optlen != sizeof(val))
2051                         return -EINVAL;
2052                 if (copy_from_user(&val, optval, sizeof(val)))
2053                         return -EFAULT;
2054
2055                 po->tp_tstamp = val;
2056                 return 0;
2057         }
2058         default:
2059                 return -ENOPROTOOPT;
2060         }
2061 }
2062
2063 static int packet_getsockopt(struct socket *sock, int level, int optname,
2064                              char __user *optval, int __user *optlen)
2065 {
2066         int len;
2067         int val;
2068         struct sock *sk = sock->sk;
2069         struct packet_sock *po = pkt_sk(sk);
2070         void *data;
2071         struct tpacket_stats st;
2072
2073         if (level != SOL_PACKET)
2074                 return -ENOPROTOOPT;
2075
2076         if (get_user(len, optlen))
2077                 return -EFAULT;
2078
2079         if (len < 0)
2080                 return -EINVAL;
2081
2082         switch (optname) {
2083         case PACKET_STATISTICS:
2084                 if (len > sizeof(struct tpacket_stats))
2085                         len = sizeof(struct tpacket_stats);
2086                 spin_lock_bh(&sk->sk_receive_queue.lock);
2087                 st = po->stats;
2088                 memset(&po->stats, 0, sizeof(st));
2089                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2090                 st.tp_packets += st.tp_drops;
2091
2092                 data = &st;
2093                 break;
2094         case PACKET_AUXDATA:
2095                 if (len > sizeof(int))
2096                         len = sizeof(int);
2097                 val = po->auxdata;
2098
2099                 data = &val;
2100                 break;
2101         case PACKET_ORIGDEV:
2102                 if (len > sizeof(int))
2103                         len = sizeof(int);
2104                 val = po->origdev;
2105
2106                 data = &val;
2107                 break;
2108         case PACKET_VNET_HDR:
2109                 if (len > sizeof(int))
2110                         len = sizeof(int);
2111                 val = po->has_vnet_hdr;
2112
2113                 data = &val;
2114                 break;
2115         case PACKET_VERSION:
2116                 if (len > sizeof(int))
2117                         len = sizeof(int);
2118                 val = po->tp_version;
2119                 data = &val;
2120                 break;
2121         case PACKET_HDRLEN:
                     /* a shorter read would leave part of val uninitialized */
2122                 if (len < sizeof(int))
2123                         return -EINVAL;
2124                 len = sizeof(int);
                     if (copy_from_user(&val, optval, len))
2125                         return -EFAULT;
2126                 switch (val) {
2127                 case TPACKET_V1:
2128                         val = sizeof(struct tpacket_hdr);
2129                         break;
2130                 case TPACKET_V2:
2131                         val = sizeof(struct tpacket2_hdr);
2132                         break;
2133                 default:
2134                         return -EINVAL;
2135                 }
2136                 data = &val;
2137                 break;
2138         case PACKET_RESERVE:
2139                 if (len > sizeof(unsigned int))
2140                         len = sizeof(unsigned int);
2141                 val = po->tp_reserve;
2142                 data = &val;
2143                 break;
2144         case PACKET_LOSS:
2145                 if (len > sizeof(unsigned int))
2146                         len = sizeof(unsigned int);
2147                 val = po->tp_loss;
2148                 data = &val;
2149                 break;
2150         case PACKET_TIMESTAMP:
2151                 if (len > sizeof(int))
2152                         len = sizeof(int);
2153                 val = po->tp_tstamp;
2154                 data = &val;
2155                 break;
2156         default:
2157                 return -ENOPROTOOPT;
2158         }
2159
2160         if (put_user(len, optlen))
2161                 return -EFAULT;
2162         if (copy_to_user(optval, data, len))
2163                 return -EFAULT;
2164         return 0;
2165 }
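
     /*
      * Illustrative only (userspace): reading, and thereby resetting,
      * the counters:
      *
      *	struct tpacket_stats st;
      *	socklen_t len = sizeof(st);
      *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
      *
      * Note that tp_packets includes tp_drops, and both are cleared by
      * the read, as implemented above.
      */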
2166
2167
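     /*
      * Netdevice notifier: when the bound device goes down or
      * unregisters, drop the protocol hook (and back out multicast
      * requests on unregister); re-attach the hook when the device
      * comes back up.
      */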
2168 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2169 {
2170         struct sock *sk;
2171         struct hlist_node *node;
2172         struct net_device *dev = data;
2173         struct net *net = dev_net(dev);
2174
2175         rcu_read_lock();
2176         sk_for_each_rcu(sk, node, &net->packet.sklist) {
2177                 struct packet_sock *po = pkt_sk(sk);
2178
2179                 switch (msg) {
2180                 case NETDEV_UNREGISTER:
2181                         if (po->mclist)
2182                                 packet_dev_mclist(dev, po->mclist, -1);
2183                         /* fallthrough */
2184
2185                 case NETDEV_DOWN:
2186                         if (dev->ifindex == po->ifindex) {
2187                                 spin_lock(&po->bind_lock);
2188                                 if (po->running) {
2189                                         __dev_remove_pack(&po->prot_hook);
2190                                         __sock_put(sk);
2191                                         po->running = 0;
2192                                         sk->sk_err = ENETDOWN;
2193                                         if (!sock_flag(sk, SOCK_DEAD))
2194                                                 sk->sk_error_report(sk);
2195                                 }
2196                                 if (msg == NETDEV_UNREGISTER) {
2197                                         po->ifindex = -1;
2198                                         po->prot_hook.dev = NULL;
2199                                 }
2200                                 spin_unlock(&po->bind_lock);
2201                         }
2202                         break;
2203                 case NETDEV_UP:
2204                         if (dev->ifindex == po->ifindex) {
2205                                 spin_lock(&po->bind_lock);
2206                                 if (po->num && !po->running) {
2207                                         dev_add_pack(&po->prot_hook);
2208                                         sock_hold(sk);
2209                                         po->running = 1;
2210                                 }
2211                                 spin_unlock(&po->bind_lock);
2212                         }
2213                         break;
2214                 }
2215         }
2216         rcu_read_unlock();
2217         return NOTIFY_DONE;
2218 }
2219
2220
2221 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2222                         unsigned long arg)
2223 {
2224         struct sock *sk = sock->sk;
2225
2226         switch (cmd) {
2227         case SIOCOUTQ:
2228         {
2229                 int amount = sk_wmem_alloc_get(sk);
2230
2231                 return put_user(amount, (int __user *)arg);
2232         }
2233         case SIOCINQ:
2234         {
2235                 struct sk_buff *skb;
2236                 int amount = 0;
2237
2238                 spin_lock_bh(&sk->sk_receive_queue.lock);
2239                 skb = skb_peek(&sk->sk_receive_queue);
2240                 if (skb)
2241                         amount = skb->len;
2242                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2243                 return put_user(amount, (int __user *)arg);
2244         }
2245         case SIOCGSTAMP:
2246                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2247         case SIOCGSTAMPNS:
2248                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2249
2250 #ifdef CONFIG_INET
2251         case SIOCADDRT:
2252         case SIOCDELRT:
2253         case SIOCDARP:
2254         case SIOCGARP:
2255         case SIOCSARP:
2256         case SIOCGIFADDR:
2257         case SIOCSIFADDR:
2258         case SIOCGIFBRDADDR:
2259         case SIOCSIFBRDADDR:
2260         case SIOCGIFNETMASK:
2261         case SIOCSIFNETMASK:
2262         case SIOCGIFDSTADDR:
2263         case SIOCSIFDSTADDR:
2264         case SIOCSIFFLAGS:
2265                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2266 #endif
2267
2268         default:
2269                 return -ENOIOCTLCMD;
2270         }
2271         return 0;
2272 }
2273
2274 static unsigned int packet_poll(struct file *file, struct socket *sock,
2275                                 poll_table *wait)
2276 {
2277         struct sock *sk = sock->sk;
2278         struct packet_sock *po = pkt_sk(sk);
2279         unsigned int mask = datagram_poll(file, sock, wait);
2280
2281         spin_lock_bh(&sk->sk_receive_queue.lock);
2282         if (po->rx_ring.pg_vec) {
2283                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2284                         mask |= POLLIN | POLLRDNORM;
2285         }
2286         spin_unlock_bh(&sk->sk_receive_queue.lock);
2287         spin_lock_bh(&sk->sk_write_queue.lock);
2288         if (po->tx_ring.pg_vec) {
2289                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2290                         mask |= POLLOUT | POLLWRNORM;
2291         }
2292         spin_unlock_bh(&sk->sk_write_queue.lock);
2293         return mask;
2294 }
2295
2296
2297 /* Dirty? Well, I still have not found a better way to account
2298  * for user mmaps.
2299  */
2300
2301 static void packet_mm_open(struct vm_area_struct *vma)
2302 {
2303         struct file *file = vma->vm_file;
2304         struct socket *sock = file->private_data;
2305         struct sock *sk = sock->sk;
2306
2307         if (sk)
2308                 atomic_inc(&pkt_sk(sk)->mapped);
2309 }
2310
2311 static void packet_mm_close(struct vm_area_struct *vma)
2312 {
2313         struct file *file = vma->vm_file;
2314         struct socket *sock = file->private_data;
2315         struct sock *sk = sock->sk;
2316
2317         if (sk)
2318                 atomic_dec(&pkt_sk(sk)->mapped);
2319 }
2320
2321 static const struct vm_operations_struct packet_mmap_ops = {
2322         .open   =       packet_mm_open,
2323         .close  =       packet_mm_close,
2324 };
2325
2326 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2327 {
2328         int i;
2329
2330         for (i = 0; i < len; i++) {
2331                 if (likely(pg_vec[i]))
2332                         free_pages((unsigned long) pg_vec[i], order);
2333         }
2334         kfree(pg_vec);
2335 }
2336
2337 static inline char *alloc_one_pg_vec_page(unsigned long order)
2338 {
2339         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2340
2341         return (char *) __get_free_pages(gfp_flags, order);
2342 }
2343
2344 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2345 {
2346         unsigned int block_nr = req->tp_block_nr;
2347         char **pg_vec;
2348         int i;
2349
             /* kcalloc() checks the block_nr * sizeof() multiply for overflow */
2350         pg_vec = kcalloc(block_nr, sizeof(char *), GFP_KERNEL);
2351         if (unlikely(!pg_vec))
2352                 goto out;
2353
2354         for (i = 0; i < block_nr; i++) {
2355                 pg_vec[i] = alloc_one_pg_vec_page(order);
2356                 if (unlikely(!pg_vec[i]))
2357                         goto out_free_pgvec;
2358         }
2359
2360 out:
2361         return pg_vec;
2362
2363 out_free_pgvec:
2364         free_pg_vec(pg_vec, order, block_nr);
2365         pg_vec = NULL;
2366         goto out;
2367 }
2368
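     /*
      * Ring geometry, as a worked example (assuming 4 KiB pages):
      * tp_block_size = 4096 and tp_frame_size = 2048 give
      * frames_per_block = 4096 / 2048 = 2, so tp_frame_nr must be
      * exactly 2 * tp_block_nr or the request is rejected below.
      */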
2369 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2370                 int closing, int tx_ring)
2371 {
2372         char **pg_vec = NULL;
2373         struct packet_sock *po = pkt_sk(sk);
2374         int was_running, order = 0;
2375         struct packet_ring_buffer *rb;
2376         struct sk_buff_head *rb_queue;
2377         __be16 num;
2378         int err;
2379
2380         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2381         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2382
2383         err = -EBUSY;
2384         if (!closing) {
2385                 if (atomic_read(&po->mapped))
2386                         goto out;
2387                 if (atomic_read(&rb->pending))
2388                         goto out;
2389         }
2390
2391         if (req->tp_block_nr) {
2392                 /* Sanity tests and some calculations */
2393                 err = -EBUSY;
2394                 if (unlikely(rb->pg_vec))
2395                         goto out;
2396
2397                 switch (po->tp_version) {
2398                 case TPACKET_V1:
2399                         po->tp_hdrlen = TPACKET_HDRLEN;
2400                         break;
2401                 case TPACKET_V2:
2402                         po->tp_hdrlen = TPACKET2_HDRLEN;
2403                         break;
2404                 }
2405
2406                 err = -EINVAL;
2407                 if (unlikely((int)req->tp_block_size <= 0))
2408                         goto out;
2409                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2410                         goto out;
2411                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2412                                         po->tp_reserve))
2413                         goto out;
2414                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2415                         goto out;
2416
2417                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2418                 if (unlikely(rb->frames_per_block <= 0))
2419                         goto out;
2420                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2421                                         req->tp_frame_nr))
2422                         goto out;
2423
2424                 err = -ENOMEM;
2425                 order = get_order(req->tp_block_size);
2426                 pg_vec = alloc_pg_vec(req, order);
2427                 if (unlikely(!pg_vec))
2428                         goto out;
2429         }
2430         /* Done */
2431         else {
2432                 err = -EINVAL;
2433                 if (unlikely(req->tp_frame_nr))
2434                         goto out;
2435         }
2436
2437         lock_sock(sk);
2438
2439         /* Detach socket from network */
2440         spin_lock(&po->bind_lock);
2441         was_running = po->running;
2442         num = po->num;
2443         if (was_running) {
2444                 __dev_remove_pack(&po->prot_hook);
2445                 po->num = 0;
2446                 po->running = 0;
2447                 __sock_put(sk);
2448         }
2449         spin_unlock(&po->bind_lock);
2450
2451         synchronize_net();
2452
2453         err = -EBUSY;
2454         mutex_lock(&po->pg_vec_lock);
2455         if (closing || atomic_read(&po->mapped) == 0) {
2456                 err = 0;
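                     /* XC(a, b): store b in a and hand back a's old value */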
2457 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2458                 spin_lock_bh(&rb_queue->lock);
2459                 pg_vec = XC(rb->pg_vec, pg_vec);
2460                 rb->frame_max = (req->tp_frame_nr - 1);
2461                 rb->head = 0;
2462                 rb->frame_size = req->tp_frame_size;
2463                 spin_unlock_bh(&rb_queue->lock);
2464
2465                 order = XC(rb->pg_vec_order, order);
2466                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2467
2468                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2469                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2470                                                 tpacket_rcv : packet_rcv;
2471                 skb_queue_purge(rb_queue);
2472 #undef XC
2473                 if (atomic_read(&po->mapped))
2474                         pr_err("packet_mmap: vma is busy: %d\n",
2475                                atomic_read(&po->mapped));
2476         }
2477         mutex_unlock(&po->pg_vec_lock);
2478
2479         spin_lock(&po->bind_lock);
2480         if (was_running && !po->running) {
2481                 sock_hold(sk);
2482                 po->running = 1;
2483                 po->num = num;
2484                 dev_add_pack(&po->prot_hook);
2485         }
2486         spin_unlock(&po->bind_lock);
2487
2488         release_sock(sk);
2489
2490         if (pg_vec)
2491                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2492 out:
2493         return err;
2494 }
2495
2496 static int packet_mmap(struct file *file, struct socket *sock,
2497                 struct vm_area_struct *vma)
2498 {
2499         struct sock *sk = sock->sk;
2500         struct packet_sock *po = pkt_sk(sk);
2501         unsigned long size, expected_size;
2502         struct packet_ring_buffer *rb;
2503         unsigned long start;
2504         int err = -EINVAL;
2505         int i;
2506
2507         if (vma->vm_pgoff)
2508                 return -EINVAL;
2509
2510         mutex_lock(&po->pg_vec_lock);
2511
2512         expected_size = 0;
2513         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2514                 if (rb->pg_vec) {
2515                         expected_size += rb->pg_vec_len
2516                                                 * rb->pg_vec_pages
2517                                                 * PAGE_SIZE;
2518                 }
2519         }
2520
2521         if (expected_size == 0)
2522                 goto out;
2523
2524         size = vma->vm_end - vma->vm_start;
2525         if (size != expected_size)
2526                 goto out;
2527
2528         start = vma->vm_start;
2529         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2530                 if (rb->pg_vec == NULL)
2531                         continue;
2532
2533                 for (i = 0; i < rb->pg_vec_len; i++) {
2534                         struct page *page = virt_to_page(rb->pg_vec[i]);
2535                         int pg_num;
2536
2537                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2538                                         pg_num++, page++) {
2539                                 err = vm_insert_page(vma, start, page);
2540                                 if (unlikely(err))
2541                                         goto out;
2542                                 start += PAGE_SIZE;
2543                         }
2544                 }
2545         }
2546
2547         atomic_inc(&po->mapped);
2548         vma->vm_ops = &packet_mmap_ops;
2549         err = 0;
2550
2551 out:
2552         mutex_unlock(&po->pg_vec_lock);
2553         return err;
2554 }
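
     /*
      * Illustrative only (userspace): after PACKET_RX_RING succeeds, the
      * whole ring is mapped in one contiguous chunk:
      *
      *	size_t maplen = req.tp_block_size * req.tp_block_nr;
      *	void *ring = mmap(NULL, maplen, PROT_READ | PROT_WRITE,
      *			  MAP_SHARED, fd, 0);
      *
      * vm_pgoff must be zero and the length must match the configured
      * rings exactly, as enforced above.
      */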
2555
2556 static const struct proto_ops packet_ops_spkt = {
2557         .family =       PF_PACKET,
2558         .owner =        THIS_MODULE,
2559         .release =      packet_release,
2560         .bind =         packet_bind_spkt,
2561         .connect =      sock_no_connect,
2562         .socketpair =   sock_no_socketpair,
2563         .accept =       sock_no_accept,
2564         .getname =      packet_getname_spkt,
2565         .poll =         datagram_poll,
2566         .ioctl =        packet_ioctl,
2567         .listen =       sock_no_listen,
2568         .shutdown =     sock_no_shutdown,
2569         .setsockopt =   sock_no_setsockopt,
2570         .getsockopt =   sock_no_getsockopt,
2571         .sendmsg =      packet_sendmsg_spkt,
2572         .recvmsg =      packet_recvmsg,
2573         .mmap =         sock_no_mmap,
2574         .sendpage =     sock_no_sendpage,
2575 };
2576
2577 static const struct proto_ops packet_ops = {
2578         .family =       PF_PACKET,
2579         .owner =        THIS_MODULE,
2580         .release =      packet_release,
2581         .bind =         packet_bind,
2582         .connect =      sock_no_connect,
2583         .socketpair =   sock_no_socketpair,
2584         .accept =       sock_no_accept,
2585         .getname =      packet_getname,
2586         .poll =         packet_poll,
2587         .ioctl =        packet_ioctl,
2588         .listen =       sock_no_listen,
2589         .shutdown =     sock_no_shutdown,
2590         .setsockopt =   packet_setsockopt,
2591         .getsockopt =   packet_getsockopt,
2592         .sendmsg =      packet_sendmsg,
2593         .recvmsg =      packet_recvmsg,
2594         .mmap =         packet_mmap,
2595         .sendpage =     sock_no_sendpage,
2596 };
2597
2598 static const struct net_proto_family packet_family_ops = {
2599         .family =       PF_PACKET,
2600         .create =       packet_create,
2601         .owner  =       THIS_MODULE,
2602 };
2603
2604 static struct notifier_block packet_netdev_notifier = {
2605         .notifier_call =        packet_notifier,
2606 };
2607
2608 #ifdef CONFIG_PROC_FS
2609
2610 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2611         __acquires(RCU)
2612 {
2613         struct net *net = seq_file_net(seq);
2614
2615         rcu_read_lock();
2616         return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2617 }
2618
2619 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2620 {
2621         struct net *net = seq_file_net(seq);
2622         return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2623 }
2624
2625 static void packet_seq_stop(struct seq_file *seq, void *v)
2626         __releases(RCU)
2627 {
2628         rcu_read_unlock();
2629 }
2630
2631 static int packet_seq_show(struct seq_file *seq, void *v)
2632 {
2633         if (v == SEQ_START_TOKEN)
2634                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2635         else {
2636                 struct sock *s = sk_entry(v);
2637                 const struct packet_sock *po = pkt_sk(s);
2638
2639                 seq_printf(seq,
2640                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2641                            s,
2642                            atomic_read(&s->sk_refcnt),
2643                            s->sk_type,
2644                            ntohs(po->num),
2645                            po->ifindex,
2646                            po->running,
2647                            atomic_read(&s->sk_rmem_alloc),
2648                            sock_i_uid(s),
2649                            sock_i_ino(s));
2650         }
2651
2652         return 0;
2653 }
2654
2655 static const struct seq_operations packet_seq_ops = {
2656         .start  = packet_seq_start,
2657         .next   = packet_seq_next,
2658         .stop   = packet_seq_stop,
2659         .show   = packet_seq_show,
2660 };
2661
2662 static int packet_seq_open(struct inode *inode, struct file *file)
2663 {
2664         return seq_open_net(inode, file, &packet_seq_ops,
2665                             sizeof(struct seq_net_private));
2666 }
2667
2668 static const struct file_operations packet_seq_fops = {
2669         .owner          = THIS_MODULE,
2670         .open           = packet_seq_open,
2671         .read           = seq_read,
2672         .llseek         = seq_lseek,
2673         .release        = seq_release_net,
2674 };
2675
2676 #endif
2677
2678 static int __net_init packet_net_init(struct net *net)
2679 {
2680         spin_lock_init(&net->packet.sklist_lock);
2681         INIT_HLIST_HEAD(&net->packet.sklist);
2682
2683         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2684                 return -ENOMEM;
2685
2686         return 0;
2687 }
2688
2689 static void __net_exit packet_net_exit(struct net *net)
2690 {
2691         proc_net_remove(net, "packet");
2692 }
2693
2694 static struct pernet_operations packet_net_ops = {
2695         .init = packet_net_init,
2696         .exit = packet_net_exit,
2697 };
2698
2699
2700 static void __exit packet_exit(void)
2701 {
2702         unregister_netdevice_notifier(&packet_netdev_notifier);
2703         unregister_pernet_subsys(&packet_net_ops);
2704         sock_unregister(PF_PACKET);
2705         proto_unregister(&packet_proto);
2706 }
2707
2708 static int __init packet_init(void)
2709 {
2710         int rc = proto_register(&packet_proto, 0);
2711
2712         if (rc != 0)
2713                 goto out;
2714
2715         sock_register(&packet_family_ops);
2716         register_pernet_subsys(&packet_net_ops);
2717         register_netdevice_notifier(&packet_netdev_notifier);
2718 out:
2719         return rc;
2720 }
2721
2722 module_init(packet_init);
2723 module_exit(packet_exit);
2724 MODULE_LICENSE("GPL");
2725 MODULE_ALIAS_NETPROTO(PF_PACKET);