You've already forked linux-apfs
mirror of
https://github.com/linux-apfs/linux-apfs.git
synced 2026-05-01 15:00:59 -07:00
net: TX_RING and packet mmap
New packet socket feature that makes packet socket more efficient for transmission. - It reduces number of system call through a PACKET_TX_RING mechanism, based on PACKET_RX_RING (Circular buffer allocated in kernel space which is mmapped from user space). - It minimizes CPU copy using fragmented SKB (almost zero copy). Signed-off-by: Johann Baudy <johann.baudy@gnu-log.net> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
committed by
David S. Miller
parent
f67f340849
commit
69e3c75f4d
@@ -4,16 +4,18 @@
|
|||||||
|
|
||||||
This file documents the CONFIG_PACKET_MMAP option available with the PACKET
|
This file documents the CONFIG_PACKET_MMAP option available with the PACKET
|
||||||
socket interface on 2.4 and 2.6 kernels. This type of socket is used to
|
socket interface on 2.4 and 2.6 kernels. This type of socket is used to
|
||||||
capture network traffic with utilities like tcpdump or any other that uses
|
capture network traffic with utilities like tcpdump or any other that needs
|
||||||
the libpcap library.
|
raw access to a network interface.
|
||||||
|
|
||||||
You can find the latest version of this document at
|
|
||||||
|
|
||||||
|
You can find the latest version of this document at:
|
||||||
http://pusa.uv.es/~ulisses/packet_mmap/
|
http://pusa.uv.es/~ulisses/packet_mmap/
|
||||||
|
|
||||||
Please send me your comments to
|
Howto can be found at:
|
||||||
|
http://wiki.gnu-log.net (packet_mmap)
|
||||||
|
|
||||||
|
Please send your comments to
|
||||||
Ulisses Alonso Camaró <uaca@i.hate.spam.alumni.uv.es>
|
Ulisses Alonso Camaró <uaca@i.hate.spam.alumni.uv.es>
|
||||||
|
Johann Baudy <johann.baudy@gnu-log.net>
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
-------------------------------------------------------------------------------
|
||||||
+ Why use PACKET_MMAP
|
+ Why use PACKET_MMAP
|
||||||
@@ -25,19 +27,24 @@ to capture each packet, it requires two if you want to get packet's
|
|||||||
timestamp (like libpcap always does).
|
timestamp (like libpcap always does).
|
||||||
|
|
||||||
On the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size
|
On the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size
|
||||||
configurable circular buffer mapped in user space. This way reading packets just
|
configurable circular buffer mapped in user space that can be used to either
|
||||||
needs to wait for them, most of the time there is no need to issue a single
|
send or receive packets. This way reading packets just needs to wait for them,
|
||||||
system call. By using a shared buffer between the kernel and the user
|
most of the time there is no need to issue a single system call. Concerning
|
||||||
also has the benefit of minimizing packet copies.
|
transmission, multiple packets can be sent through one system call to get the
|
||||||
|
highest bandwidth.
|
||||||
|
Using a shared buffer between the kernel and the user also has the benefit
|
||||||
|
of minimizing packet copies.
|
||||||
|
|
||||||
It's fine to use PACKET_MMAP to improve the performance of the capture process,
|
It's fine to use PACKET_MMAP to improve the performance of the capture and
|
||||||
but it isn't everything. At least, if you are capturing at high speeds (this
|
transmission process, but it isn't everything. At least, if you are capturing
|
||||||
is relative to the cpu speed), you should check if the device driver of your
|
at high speeds (this is relative to the cpu speed), you should check if the
|
||||||
network interface card supports some sort of interrupt load mitigation or
|
device driver of your network interface card supports some sort of interrupt
|
||||||
(even better) if it supports NAPI, also make sure it is enabled.
|
load mitigation or (even better) if it supports NAPI, also make sure it is
|
||||||
|
enabled. For transmission, check the MTU (Maximum Transmission Unit) used and
|
||||||
|
supported by devices of your network.
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
||||||
+ How to use CONFIG_PACKET_MMAP
|
+ How to use CONFIG_PACKET_MMAP to improve capture process
|
||||||
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
||||||
|
|
||||||
From the user standpoint, you should use the higher level libpcap library, which
|
From the user standpoint, you should use the higher level libpcap library, which
|
||||||
@@ -57,7 +64,7 @@ the low level details or want to improve libpcap by including PACKET_MMAP
|
|||||||
support.
|
support.
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
||||||
+ How to use CONFIG_PACKET_MMAP directly
|
+ How to use CONFIG_PACKET_MMAP directly to improve capture process
|
||||||
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
||||||
|
|
||||||
From the system calls stand point, the use of PACKET_MMAP involves
|
From the system calls stand point, the use of PACKET_MMAP involves
|
||||||
@@ -66,6 +73,7 @@ the following process:
|
|||||||
|
|
||||||
[setup] socket() -------> creation of the capture socket
|
[setup] socket() -------> creation of the capture socket
|
||||||
setsockopt() ---> allocation of the circular buffer (ring)
|
setsockopt() ---> allocation of the circular buffer (ring)
|
||||||
|
option: PACKET_RX_RING
|
||||||
mmap() ---------> mapping of the allocated buffer to the
|
mmap() ---------> mapping of the allocated buffer to the
|
||||||
user process
|
user process
|
||||||
|
|
||||||
@@ -96,6 +104,65 @@ Next I will describe PACKET_MMAP settings and it's constraints,
|
|||||||
also the mapping of the circular buffer in the user process and
|
also the mapping of the circular buffer in the user process and
|
||||||
the use of this buffer.
|
the use of this buffer.
|
||||||
|
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
+ How to use CONFIG_PACKET_MMAP directly to improve transmission process
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
Transmission process is similar to capture as shown below.
|
||||||
|
|
||||||
|
[setup] socket() -------> creation of the transmission socket
|
||||||
|
setsockopt() ---> allocation of the circular buffer (ring)
|
||||||
|
option: PACKET_TX_RING
|
||||||
|
bind() ---------> bind transmission socket with a network interface
|
||||||
|
mmap() ---------> mapping of the allocated buffer to the
|
||||||
|
user process
|
||||||
|
|
||||||
|
[transmission] poll() ---------> wait for free packets (optional)
|
||||||
|
send() ---------> send all packets that are set as ready in
|
||||||
|
the ring
|
||||||
|
The flag MSG_DONTWAIT can be used to return
|
||||||
|
before end of transfer.
|
||||||
|
|
||||||
|
[shutdown] close() --------> destruction of the transmission socket and
|
||||||
|
deallocation of all associated resources.
|
||||||
|
|
||||||
|
Binding the socket to your network interface is mandatory (with zero copy) to
|
||||||
|
know the header size of frames used in the circular buffer.
|
||||||
|
|
||||||
|
As with capture, each frame contains two parts:
|
||||||
|
|
||||||
|
--------------------
|
||||||
|
| struct tpacket_hdr | Header. It contains the status of
|
||||||
|
| | this frame
|
||||||
|
|--------------------|
|
||||||
|
| data buffer |
|
||||||
|
. . Data that will be sent over the network interface.
|
||||||
|
. .
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
bind() associates the socket to your network interface thanks to
|
||||||
|
sll_ifindex parameter of struct sockaddr_ll.
|
||||||
|
|
||||||
|
Initialization example:
|
||||||
|
|
||||||
|
struct sockaddr_ll my_addr;
|
||||||
|
struct ifreq s_ifr;
|
||||||
|
...
|
||||||
|
|
||||||
|
strncpy (s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name));
|
||||||
|
|
||||||
|
/* get interface index of eth0 */
|
||||||
|
ioctl(this->socket, SIOCGIFINDEX, &s_ifr);
|
||||||
|
|
||||||
|
/* fill sockaddr_ll struct to prepare binding */
|
||||||
|
my_addr.sll_family = AF_PACKET;
|
||||||
|
my_addr.sll_protocol = ETH_P_ALL;
|
||||||
|
my_addr.sll_ifindex = s_ifr.ifr_ifindex;
|
||||||
|
|
||||||
|
/* bind socket to eth0 */
|
||||||
|
bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll));
|
||||||
|
|
||||||
|
A complete tutorial is available at: http://wiki.gnu-log.net/
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
||||||
+ PACKET_MMAP settings
|
+ PACKET_MMAP settings
|
||||||
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
||||||
@@ -103,7 +170,10 @@ the use of this buffer.
|
|||||||
|
|
||||||
To setup PACKET_MMAP from user level code is done with a call like
|
To setup PACKET_MMAP from user level code is done with a call like
|
||||||
|
|
||||||
|
- Capture process
|
||||||
setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req))
|
setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req))
|
||||||
|
- Transmission process
|
||||||
|
setsockopt(fd, SOL_PACKET, PACKET_TX_RING, (void *) &req, sizeof(req))
|
||||||
|
|
||||||
The most significant argument in the previous call is the req parameter,
|
The most significant argument in the previous call is the req parameter,
|
||||||
this parameter must have the following structure:
|
this parameter must have the following structure:
|
||||||
@@ -117,11 +187,11 @@ this parameter must to have the following structure:
|
|||||||
};
|
};
|
||||||
|
|
||||||
This structure is defined in /usr/include/linux/if_packet.h and establishes a
|
This structure is defined in /usr/include/linux/if_packet.h and establishes a
|
||||||
circular buffer (ring) of unswappable memory mapped in the capture process.
|
circular buffer (ring) of unswappable memory.
|
||||||
Being mapped in the capture process allows reading the captured frames and
|
Being mapped in the capture process allows reading the captured frames and
|
||||||
related meta-information like timestamps without requiring a system call.
|
related meta-information like timestamps without requiring a system call.
|
||||||
|
|
||||||
Captured frames are grouped in blocks. Each block is a physically contiguous
|
Frames are grouped in blocks. Each block is a physically contiguous
|
||||||
region of memory and holds tp_block_size/tp_frame_size frames. The total number
|
region of memory and holds tp_block_size/tp_frame_size frames. The total number
|
||||||
of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because
|
of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because
|
||||||
|
|
||||||
@@ -336,6 +406,7 @@ struct tpacket_hdr). If this field is 0 means that the frame is ready
|
|||||||
to be used by the kernel. If not, there is a frame the user can read
|
to be used by the kernel. If not, there is a frame the user can read
|
||||||
and the following flags apply:
|
and the following flags apply:
|
||||||
|
|
||||||
|
+++ Capture process:
|
||||||
from include/linux/if_packet.h
|
from include/linux/if_packet.h
|
||||||
|
|
||||||
#define TP_STATUS_COPY 2
|
#define TP_STATUS_COPY 2
|
||||||
@@ -391,6 +462,37 @@ packets are in the ring:
|
|||||||
It doesn't incur a race condition to first check the status value and
|
It doesn't incur a race condition to first check the status value and
|
||||||
then poll for frames.
|
then poll for frames.
|
||||||
|
|
||||||
|
|
||||||
|
++ Transmission process
|
||||||
|
Those defines are also used for transmission:
|
||||||
|
|
||||||
|
#define TP_STATUS_AVAILABLE 0 // Frame is available
|
||||||
|
#define TP_STATUS_SEND_REQUEST 1 // Frame will be sent on next send()
|
||||||
|
#define TP_STATUS_SENDING 2 // Frame is currently in transmission
|
||||||
|
#define TP_STATUS_WRONG_FORMAT 4 // Frame format is not correct
|
||||||
|
|
||||||
|
First, the kernel initializes all frames to TP_STATUS_AVAILABLE. To send a
|
||||||
|
packet, the user fills a data buffer of an available frame, sets tp_len to
|
||||||
|
current data buffer size and sets its status field to TP_STATUS_SEND_REQUEST.
|
||||||
|
This can be done on multiple frames. Once the user is ready to transmit, it
|
||||||
|
calls send(). Then all buffers with status equal to TP_STATUS_SEND_REQUEST are
|
||||||
|
forwarded to the network device. The kernel updates each status of sent
|
||||||
|
frames with TP_STATUS_SENDING until the end of transfer.
|
||||||
|
At the end of each transfer, buffer status returns to TP_STATUS_AVAILABLE.
|
||||||
|
|
||||||
|
header->tp_len = in_i_size;
|
||||||
|
header->tp_status = TP_STATUS_SEND_REQUEST;
|
||||||
|
retval = send(this->socket, NULL, 0, 0);
|
||||||
|
|
||||||
|
The user can also use poll() to check if a buffer is available:
|
||||||
|
(status == TP_STATUS_SENDING)
|
||||||
|
|
||||||
|
struct pollfd pfd;
|
||||||
|
pfd.fd = fd;
|
||||||
|
pfd.revents = 0;
|
||||||
|
pfd.events = POLLOUT;
|
||||||
|
retval = poll(&pfd, 1, timeout);
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
||||||
+ THANKS
|
+ THANKS
|
||||||
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -46,6 +46,8 @@ struct sockaddr_ll
|
|||||||
#define PACKET_VERSION 10
|
#define PACKET_VERSION 10
|
||||||
#define PACKET_HDRLEN 11
|
#define PACKET_HDRLEN 11
|
||||||
#define PACKET_RESERVE 12
|
#define PACKET_RESERVE 12
|
||||||
|
#define PACKET_TX_RING 13
|
||||||
|
#define PACKET_LOSS 14
|
||||||
|
|
||||||
struct tpacket_stats
|
struct tpacket_stats
|
||||||
{
|
{
|
||||||
@@ -63,14 +65,22 @@ struct tpacket_auxdata
|
|||||||
__u16 tp_vlan_tci;
|
__u16 tp_vlan_tci;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* Rx ring - header status */
|
||||||
|
#define TP_STATUS_KERNEL 0x0
|
||||||
|
#define TP_STATUS_USER 0x1
|
||||||
|
#define TP_STATUS_COPY 0x2
|
||||||
|
#define TP_STATUS_LOSING 0x4
|
||||||
|
#define TP_STATUS_CSUMNOTREADY 0x8
|
||||||
|
|
||||||
|
/* Tx ring - header status */
|
||||||
|
#define TP_STATUS_AVAILABLE 0x0
|
||||||
|
#define TP_STATUS_SEND_REQUEST 0x1
|
||||||
|
#define TP_STATUS_SENDING 0x2
|
||||||
|
#define TP_STATUS_WRONG_FORMAT 0x4
|
||||||
|
|
||||||
struct tpacket_hdr
|
struct tpacket_hdr
|
||||||
{
|
{
|
||||||
unsigned long tp_status;
|
unsigned long tp_status;
|
||||||
#define TP_STATUS_KERNEL 0
|
|
||||||
#define TP_STATUS_USER 1
|
|
||||||
#define TP_STATUS_COPY 2
|
|
||||||
#define TP_STATUS_LOSING 4
|
|
||||||
#define TP_STATUS_CSUMNOTREADY 8
|
|
||||||
unsigned int tp_len;
|
unsigned int tp_len;
|
||||||
unsigned int tp_snaplen;
|
unsigned int tp_snaplen;
|
||||||
unsigned short tp_mac;
|
unsigned short tp_mac;
|
||||||
|
|||||||
@@ -203,6 +203,9 @@ struct skb_shared_info {
|
|||||||
#ifdef CONFIG_HAS_DMA
|
#ifdef CONFIG_HAS_DMA
|
||||||
dma_addr_t dma_maps[MAX_SKB_FRAGS + 1];
|
dma_addr_t dma_maps[MAX_SKB_FRAGS + 1];
|
||||||
#endif
|
#endif
|
||||||
|
/* Intermediate layers must ensure that destructor_arg
|
||||||
|
* remains valid until skb destructor */
|
||||||
|
void * destructor_arg;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* We divide dataref into two halves. The higher 16 bits hold references
|
/* We divide dataref into two halves. The higher 16 bits hold references
|
||||||
|
|||||||
+484
-118
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user