You've already forked linux-apfs
mirror of
https://github.com/linux-apfs/linux-apfs.git
synced 2026-05-01 15:00:59 -07:00
Merge branch 'origin' into devel
Conflicts: sound/soc/pxa/pxa2xx-i2s.c
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
What: security/ima/policy
|
||||
Date: May 2008
|
||||
Contact: Mimi Zohar <zohar@us.ibm.com>
|
||||
Description:
|
||||
The Trusted Computing Group(TCG) runtime Integrity
|
||||
Measurement Architecture(IMA) maintains a list of hash
|
||||
values of executables and other sensitive system files
|
||||
loaded into the run-time of this system. At runtime,
|
||||
the policy can be constrained based on LSM specific data.
|
||||
Policies are loaded into the securityfs file ima/policy
|
||||
by opening the file, writing the rules one at a time and
|
||||
then closing the file. The new policy takes effect after
|
||||
the file ima/policy is closed.
|
||||
|
||||
rule format: action [condition ...]
|
||||
|
||||
action: measure | dont_measure
|
||||
condition:= base | lsm
|
||||
base: [[func=] [mask=] [fsmagic=] [uid=]]
|
||||
lsm: [[subj_user=] [subj_role=] [subj_type=]
|
||||
[obj_user=] [obj_role=] [obj_type=]]
|
||||
|
||||
base: func:= [BPRM_CHECK][FILE_MMAP][INODE_PERMISSION]
|
||||
mask:= [MAY_READ] [MAY_WRITE] [MAY_APPEND] [MAY_EXEC]
|
||||
fsmagic:= hex value
|
||||
uid:= decimal value
|
||||
lsm: are LSM specific
|
||||
|
||||
default policy:
|
||||
# PROC_SUPER_MAGIC
|
||||
dont_measure fsmagic=0x9fa0
|
||||
# SYSFS_MAGIC
|
||||
dont_measure fsmagic=0x62656572
|
||||
# DEBUGFS_MAGIC
|
||||
dont_measure fsmagic=0x64626720
|
||||
# TMPFS_MAGIC
|
||||
dont_measure fsmagic=0x01021994
|
||||
# SECURITYFS_MAGIC
|
||||
dont_measure fsmagic=0x73636673
|
||||
|
||||
measure func=BPRM_CHECK
|
||||
measure func=FILE_MMAP mask=MAY_EXEC
|
||||
measure func=INODE_PERM mask=MAY_READ uid=0
|
||||
|
||||
The default policy measures all executables in bprm_check,
|
||||
all files mmapped executable in file_mmap, and all files
|
||||
open for read by root in inode_permission.
|
||||
|
||||
Examples of LSM specific definitions:
|
||||
|
||||
SELinux:
|
||||
# SELINUX_MAGIC
|
||||
dont_measure fsmagic=0xF97CFF8C
|
||||
|
||||
dont_measure obj_type=var_log_t
|
||||
dont_measure obj_type=auditd_log_t
|
||||
measure subj_user=system_u func=INODE_PERM mask=MAY_READ
|
||||
measure subj_role=system_r func=INODE_PERM mask=MAY_READ
|
||||
|
||||
Smack:
|
||||
measure subj_user=_ func=INODE_PERM mask=MAY_READ
|
||||
@@ -12,7 +12,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \
|
||||
kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \
|
||||
gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
|
||||
genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
|
||||
mac80211.xml debugobjects.xml sh.xml regulator.xml
|
||||
mac80211.xml debugobjects.xml sh.xml regulator.xml \
|
||||
alsa-driver-api.xml writing-an-alsa-driver.xml
|
||||
|
||||
###
|
||||
# The build process is as follows (targets):
|
||||
|
||||
+13
-4
@@ -1,11 +1,11 @@
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V4.1//EN">
|
||||
|
||||
<book>
|
||||
<?dbhtml filename="index.html">
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<!-- ****************************************************** -->
|
||||
<!-- Header -->
|
||||
<!-- ****************************************************** -->
|
||||
<book id="ALSA-Driver-API">
|
||||
<bookinfo>
|
||||
<title>The ALSA Driver API</title>
|
||||
|
||||
@@ -35,6 +35,8 @@
|
||||
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
|
||||
<chapter><title>Management of Cards and Devices</title>
|
||||
<sect1><title>Card Management</title>
|
||||
!Esound/core/init.c
|
||||
@@ -71,6 +73,10 @@
|
||||
!Esound/pci/ac97/ac97_codec.c
|
||||
!Esound/pci/ac97/ac97_pcm.c
|
||||
</sect1>
|
||||
<sect1><title>Virtual Master Control API</title>
|
||||
!Esound/core/vmaster.c
|
||||
!Iinclude/sound/control.h
|
||||
</sect1>
|
||||
</chapter>
|
||||
<chapter><title>MIDI API</title>
|
||||
<sect1><title>Raw MIDI API</title>
|
||||
@@ -88,6 +94,9 @@
|
||||
<chapter><title>Miscellaneous Functions</title>
|
||||
<sect1><title>Hardware-Dependent Devices API</title>
|
||||
!Esound/core/hwdep.c
|
||||
</sect1>
|
||||
<sect1><title>Jack Abstraction Layer API</title>
|
||||
!Esound/core/jack.c
|
||||
</sect1>
|
||||
<sect1><title>ISA DMA Helpers</title>
|
||||
!Esound/core/isadma.c
|
||||
@@ -440,6 +440,7 @@ desc->chip->end();
|
||||
used in the generic IRQ layer.
|
||||
</para>
|
||||
!Iinclude/linux/irq.h
|
||||
!Iinclude/linux/interrupt.h
|
||||
</chapter>
|
||||
|
||||
<chapter id="pubfunctions">
|
||||
|
||||
@@ -17,8 +17,7 @@
|
||||
</authorgroup>
|
||||
|
||||
<copyright>
|
||||
<year>2007</year>
|
||||
<year>2008</year>
|
||||
<year>2007-2009</year>
|
||||
<holder>Johannes Berg</holder>
|
||||
</copyright>
|
||||
|
||||
@@ -165,8 +164,8 @@ usage should require reading the full document.
|
||||
!Pinclude/net/mac80211.h Frame format
|
||||
</sect1>
|
||||
<sect1>
|
||||
<title>Alignment issues</title>
|
||||
<para>TBD</para>
|
||||
<title>Packet alignment</title>
|
||||
!Pnet/mac80211/rx.c Packet alignment
|
||||
</sect1>
|
||||
<sect1>
|
||||
<title>Calling into mac80211 from interrupts</title>
|
||||
@@ -223,6 +222,17 @@ usage should require reading the full document.
|
||||
!Finclude/net/mac80211.h ieee80211_key_flags
|
||||
</chapter>
|
||||
|
||||
<chapter id="powersave">
|
||||
<title>Powersave support</title>
|
||||
!Pinclude/net/mac80211.h Powersave support
|
||||
</chapter>
|
||||
|
||||
<chapter id="beacon-filter">
|
||||
<title>Beacon filter support</title>
|
||||
!Pinclude/net/mac80211.h Beacon filter support
|
||||
!Finclude/net/mac80211.h ieee80211_beacon_loss
|
||||
</chapter>
|
||||
|
||||
<chapter id="qos">
|
||||
<title>Multiple queues and QoS support</title>
|
||||
<para>TBD</para>
|
||||
|
||||
@@ -41,6 +41,13 @@ GPL version 2.
|
||||
</abstract>
|
||||
|
||||
<revhistory>
|
||||
<revision>
|
||||
<revnumber>0.8</revnumber>
|
||||
<date>2008-12-24</date>
|
||||
<authorinitials>hjk</authorinitials>
|
||||
<revremark>Added name attributes in mem and portio sysfs directories.
|
||||
</revremark>
|
||||
</revision>
|
||||
<revision>
|
||||
<revnumber>0.7</revnumber>
|
||||
<date>2008-12-23</date>
|
||||
@@ -303,10 +310,17 @@ interested in translating it, please email me
|
||||
appear if the size of the mapping is not 0.
|
||||
</para>
|
||||
<para>
|
||||
Each <filename>mapX/</filename> directory contains two read-only files
|
||||
that show start address and size of the memory:
|
||||
Each <filename>mapX/</filename> directory contains four read-only files
|
||||
that show attributes of the memory:
|
||||
</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
<filename>name</filename>: A string identifier for this mapping. This
|
||||
is optional, the string can be empty. Drivers can set this to make it
|
||||
easier for userspace to find the correct mapping.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<filename>addr</filename>: The address of memory that can be mapped.
|
||||
@@ -366,10 +380,17 @@ offset = N * getpagesize();
|
||||
<filename>/sys/class/uio/uioX/portio/</filename>.
|
||||
</para>
|
||||
<para>
|
||||
Each <filename>portX/</filename> directory contains three read-only
|
||||
files that show start, size, and type of the port region:
|
||||
Each <filename>portX/</filename> directory contains four read-only
|
||||
files that show name, start, size, and type of the port region:
|
||||
</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
<filename>name</filename>: A string identifier for this port region.
|
||||
The string is optional and can be empty. Drivers can set it to make it
|
||||
easier for userspace to find a certain port region.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<filename>start</filename>: The first port of this region.
|
||||
|
||||
+29
-23
@@ -1,11 +1,11 @@
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V4.1//EN">
|
||||
|
||||
<book>
|
||||
<?dbhtml filename="index.html">
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<!-- ****************************************************** -->
|
||||
<!-- Header -->
|
||||
<!-- ****************************************************** -->
|
||||
<book id="Writing-an-ALSA-Driver">
|
||||
<bookinfo>
|
||||
<title>Writing an ALSA Driver</title>
|
||||
<author>
|
||||
@@ -492,9 +492,9 @@
|
||||
}
|
||||
|
||||
/* (2) */
|
||||
card = snd_card_new(index[dev], id[dev], THIS_MODULE, 0);
|
||||
if (card == NULL)
|
||||
return -ENOMEM;
|
||||
err = snd_card_create(index[dev], id[dev], THIS_MODULE, 0, &card);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
/* (3) */
|
||||
err = snd_mychip_create(card, pci, &chip);
|
||||
@@ -590,8 +590,9 @@
|
||||
<programlisting>
|
||||
<![CDATA[
|
||||
struct snd_card *card;
|
||||
int err;
|
||||
....
|
||||
card = snd_card_new(index[dev], id[dev], THIS_MODULE, 0);
|
||||
err = snd_card_create(index[dev], id[dev], THIS_MODULE, 0, &card);
|
||||
]]>
|
||||
</programlisting>
|
||||
</informalexample>
|
||||
@@ -809,26 +810,28 @@
|
||||
|
||||
<para>
|
||||
As mentioned above, to create a card instance, call
|
||||
<function>snd_card_new()</function>.
|
||||
<function>snd_card_create()</function>.
|
||||
|
||||
<informalexample>
|
||||
<programlisting>
|
||||
<![CDATA[
|
||||
struct snd_card *card;
|
||||
card = snd_card_new(index, id, module, extra_size);
|
||||
int err;
|
||||
err = snd_card_create(index, id, module, extra_size, &card);
|
||||
]]>
|
||||
</programlisting>
|
||||
</informalexample>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The function takes four arguments, the card-index number, the
|
||||
The function takes five arguments, the card-index number, the
|
||||
id string, the module pointer (usually
|
||||
<constant>THIS_MODULE</constant>),
|
||||
and the size of extra-data space. The last argument is used to
|
||||
the size of extra-data space, and the pointer to return the
|
||||
card instance. The extra_size argument is used to
|
||||
allocate card->private_data for the
|
||||
chip-specific data. Note that these data
|
||||
are allocated by <function>snd_card_new()</function>.
|
||||
are allocated by <function>snd_card_create()</function>.
|
||||
</para>
|
||||
</section>
|
||||
|
||||
@@ -915,15 +918,16 @@
|
||||
</para>
|
||||
|
||||
<section id="card-management-chip-specific-snd-card-new">
|
||||
<title>1. Allocating via <function>snd_card_new()</function>.</title>
|
||||
<title>1. Allocating via <function>snd_card_create()</function>.</title>
|
||||
<para>
|
||||
As mentioned above, you can pass the extra-data-length
|
||||
to the 4th argument of <function>snd_card_new()</function>, i.e.
|
||||
to the 4th argument of <function>snd_card_create()</function>, i.e.
|
||||
|
||||
<informalexample>
|
||||
<programlisting>
|
||||
<![CDATA[
|
||||
card = snd_card_new(index[dev], id[dev], THIS_MODULE, sizeof(struct mychip));
|
||||
err = snd_card_create(index[dev], id[dev], THIS_MODULE,
|
||||
sizeof(struct mychip), &card);
|
||||
]]>
|
||||
</programlisting>
|
||||
</informalexample>
|
||||
@@ -952,8 +956,8 @@
|
||||
|
||||
<para>
|
||||
After allocating a card instance via
|
||||
<function>snd_card_new()</function> (with
|
||||
<constant>NULL</constant> on the 4th arg), call
|
||||
<function>snd_card_create()</function> (with
|
||||
<constant>0</constant> on the 4th arg), call
|
||||
<function>kzalloc()</function>.
|
||||
|
||||
<informalexample>
|
||||
@@ -961,7 +965,7 @@
|
||||
<![CDATA[
|
||||
struct snd_card *card;
|
||||
struct mychip *chip;
|
||||
card = snd_card_new(index[dev], id[dev], THIS_MODULE, NULL);
|
||||
err = snd_card_create(index[dev], id[dev], THIS_MODULE, 0, &card);
|
||||
.....
|
||||
chip = kzalloc(sizeof(*chip), GFP_KERNEL);
|
||||
]]>
|
||||
@@ -5750,8 +5754,9 @@ struct _snd_pcm_runtime {
|
||||
....
|
||||
struct snd_card *card;
|
||||
struct mychip *chip;
|
||||
int err;
|
||||
....
|
||||
card = snd_card_new(index[dev], id[dev], THIS_MODULE, NULL);
|
||||
err = snd_card_create(index[dev], id[dev], THIS_MODULE, 0, &card);
|
||||
....
|
||||
chip = kzalloc(sizeof(*chip), GFP_KERNEL);
|
||||
....
|
||||
@@ -5763,7 +5768,7 @@ struct _snd_pcm_runtime {
|
||||
</informalexample>
|
||||
|
||||
When you created the chip data with
|
||||
<function>snd_card_new()</function>, it's anyway accessible
|
||||
<function>snd_card_create()</function>, it's anyway accessible
|
||||
via <structfield>private_data</structfield> field.
|
||||
|
||||
<informalexample>
|
||||
@@ -5775,9 +5780,10 @@ struct _snd_pcm_runtime {
|
||||
....
|
||||
struct snd_card *card;
|
||||
struct mychip *chip;
|
||||
int err;
|
||||
....
|
||||
card = snd_card_new(index[dev], id[dev], THIS_MODULE,
|
||||
sizeof(struct mychip));
|
||||
err = snd_card_create(index[dev], id[dev], THIS_MODULE,
|
||||
sizeof(struct mychip), &card);
|
||||
....
|
||||
chip = card->private_data;
|
||||
....
|
||||
@@ -35,9 +35,3 @@ noop anticipatory deadline [cfq]
|
||||
# echo anticipatory > /sys/block/hda/queue/scheduler
|
||||
# cat /sys/block/hda/queue/scheduler
|
||||
noop [anticipatory] deadline cfq
|
||||
|
||||
Each io queue has a set of io scheduler tunables associated with it. These
|
||||
tunables control how the io scheduler works. You can find these entries
|
||||
in:
|
||||
|
||||
/sys/block/<device>/queue/iosched
|
||||
|
||||
@@ -117,10 +117,28 @@ accessible parameters:
|
||||
sampling_rate: measured in uS (10^-6 seconds), this is how often you
|
||||
want the kernel to look at the CPU usage and to make decisions on
|
||||
what to do about the frequency. Typically this is set to values of
|
||||
around '10000' or more.
|
||||
around '10000' or more. It's default value is (cmp. with users-guide.txt):
|
||||
transition_latency * 1000
|
||||
The lowest value you can set is:
|
||||
transition_latency * 100 or it may get restricted to a value where it
|
||||
makes not sense for the kernel anymore to poll that often which depends
|
||||
on your HZ config variable (HZ=1000: max=20000us, HZ=250: max=5000).
|
||||
Be aware that transition latency is in ns and sampling_rate is in us, so you
|
||||
get the same sysfs value by default.
|
||||
Sampling rate should always get adjusted considering the transition latency
|
||||
To set the sampling rate 750 times as high as the transition latency
|
||||
in the bash (as said, 1000 is default), do:
|
||||
echo `$(($(cat cpuinfo_transition_latency) * 750 / 1000)) \
|
||||
>ondemand/sampling_rate
|
||||
|
||||
show_sampling_rate_(min|max): the minimum and maximum sampling rates
|
||||
available that you may set 'sampling_rate' to.
|
||||
show_sampling_rate_(min|max): THIS INTERFACE IS DEPRECATED, DON'T USE IT.
|
||||
You can use wider ranges now and the general
|
||||
cpuinfo_transition_latency variable (cmp. with user-guide.txt) can be
|
||||
used to obtain exactly the same info:
|
||||
show_sampling_rate_min = transtition_latency * 500 / 1000
|
||||
show_sampling_rate_max = transtition_latency * 500000 / 1000
|
||||
(divided by 1000 is to illustrate that sampling rate is in us and
|
||||
transition latency is exported ns).
|
||||
|
||||
up_threshold: defines what the average CPU usage between the samplings
|
||||
of 'sampling_rate' needs to be for the kernel to make a decision on
|
||||
|
||||
@@ -152,6 +152,18 @@ cpuinfo_min_freq : this file shows the minimum operating
|
||||
frequency the processor can run at(in kHz)
|
||||
cpuinfo_max_freq : this file shows the maximum operating
|
||||
frequency the processor can run at(in kHz)
|
||||
cpuinfo_transition_latency The time it takes on this CPU to
|
||||
switch between two frequencies in nano
|
||||
seconds. If unknown or known to be
|
||||
that high that the driver does not
|
||||
work with the ondemand governor, -1
|
||||
(CPUFREQ_ETERNAL) will be returned.
|
||||
Using this information can be useful
|
||||
to choose an appropriate polling
|
||||
frequency for a kernel governor or
|
||||
userspace daemon. Make sure to not
|
||||
switch the frequency too often
|
||||
resulting in performance loss.
|
||||
scaling_driver : this file shows what cpufreq driver is
|
||||
used to set the frequency on this CPU
|
||||
|
||||
|
||||
@@ -62,7 +62,6 @@ aic7*reg_print.c*
|
||||
aic7*seq.h*
|
||||
aicasm
|
||||
aicdb.h*
|
||||
asm
|
||||
asm-offsets.h
|
||||
asm_offsets.h
|
||||
autoconf.h*
|
||||
|
||||
@@ -0,0 +1,240 @@
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
This document describes how to use the dynamic debug (ddebug) feature.
|
||||
|
||||
Dynamic debug is designed to allow you to dynamically enable/disable kernel
|
||||
code to obtain additional kernel information. Currently, if
|
||||
CONFIG_DYNAMIC_DEBUG is set, then all pr_debug()/dev_debug() calls can be
|
||||
dynamically enabled per-callsite.
|
||||
|
||||
Dynamic debug has even more useful features:
|
||||
|
||||
* Simple query language allows turning on and off debugging statements by
|
||||
matching any combination of:
|
||||
|
||||
- source filename
|
||||
- function name
|
||||
- line number (including ranges of line numbers)
|
||||
- module name
|
||||
- format string
|
||||
|
||||
* Provides a debugfs control file: <debugfs>/dynamic_debug/control which can be
|
||||
read to display the complete list of known debug statements, to help guide you
|
||||
|
||||
Controlling dynamic debug Behaviour
|
||||
===============================
|
||||
|
||||
The behaviour of pr_debug()/dev_debug()s are controlled via writing to a
|
||||
control file in the 'debugfs' filesystem. Thus, you must first mount the debugfs
|
||||
filesystem, in order to make use of this feature. Subsequently, we refer to the
|
||||
control file as: <debugfs>/dynamic_debug/control. For example, if you want to
|
||||
enable printing from source file 'svcsock.c', line 1603 you simply do:
|
||||
|
||||
nullarbor:~ # echo 'file svcsock.c line 1603 +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
|
||||
If you make a mistake with the syntax, the write will fail thus:
|
||||
|
||||
nullarbor:~ # echo 'file svcsock.c wtf 1 +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
-bash: echo: write error: Invalid argument
|
||||
|
||||
Viewing Dynamic Debug Behaviour
|
||||
===========================
|
||||
|
||||
You can view the currently configured behaviour of all the debug statements
|
||||
via:
|
||||
|
||||
nullarbor:~ # cat <debugfs>/dynamic_debug/control
|
||||
# filename:lineno [module]function flags format
|
||||
/usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svc_rdma.c:323 [svcxprt_rdma]svc_rdma_cleanup - "SVCRDMA Module Removed, deregister RPC RDMA transport\012"
|
||||
/usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svc_rdma.c:341 [svcxprt_rdma]svc_rdma_init - "\011max_inline : %d\012"
|
||||
/usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svc_rdma.c:340 [svcxprt_rdma]svc_rdma_init - "\011sq_depth : %d\012"
|
||||
/usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svc_rdma.c:338 [svcxprt_rdma]svc_rdma_init - "\011max_requests : %d\012"
|
||||
...
|
||||
|
||||
|
||||
You can also apply standard Unix text manipulation filters to this
|
||||
data, e.g.
|
||||
|
||||
nullarbor:~ # grep -i rdma <debugfs>/dynamic_debug/control | wc -l
|
||||
62
|
||||
|
||||
nullarbor:~ # grep -i tcp <debugfs>/dynamic_debug/control | wc -l
|
||||
42
|
||||
|
||||
Note in particular that the third column shows the enabled behaviour
|
||||
flags for each debug statement callsite (see below for definitions of the
|
||||
flags). The default value, no extra behaviour enabled, is "-". So
|
||||
you can view all the debug statement callsites with any non-default flags:
|
||||
|
||||
nullarbor:~ # awk '$3 != "-"' <debugfs>/dynamic_debug/control
|
||||
# filename:lineno [module]function flags format
|
||||
/usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svcsock.c:1603 [sunrpc]svc_send p "svc_process: st_sendto returned %d\012"
|
||||
|
||||
|
||||
Command Language Reference
|
||||
==========================
|
||||
|
||||
At the lexical level, a command comprises a sequence of words separated
|
||||
by whitespace characters. Note that newlines are treated as word
|
||||
separators and do *not* end a command or allow multiple commands to
|
||||
be done together. So these are all equivalent:
|
||||
|
||||
nullarbor:~ # echo -c 'file svcsock.c line 1603 +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
nullarbor:~ # echo -c ' file svcsock.c line 1603 +p ' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
nullarbor:~ # echo -c 'file svcsock.c\nline 1603 +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
|
||||
Commands are bounded by a write() system call. If you want to do
|
||||
multiple commands you need to do a separate "echo" for each, like:
|
||||
|
||||
nullarbor:~ # echo 'file svcsock.c line 1603 +p' > /proc/dprintk ;\
|
||||
> echo 'file svcsock.c line 1563 +p' > /proc/dprintk
|
||||
|
||||
or even like:
|
||||
|
||||
nullarbor:~ # (
|
||||
> echo 'file svcsock.c line 1603 +p' ;\
|
||||
> echo 'file svcsock.c line 1563 +p' ;\
|
||||
> ) > /proc/dprintk
|
||||
|
||||
At the syntactical level, a command comprises a sequence of match
|
||||
specifications, followed by a flags change specification.
|
||||
|
||||
command ::= match-spec* flags-spec
|
||||
|
||||
The match-spec's are used to choose a subset of the known dprintk()
|
||||
callsites to which to apply the flags-spec. Think of them as a query
|
||||
with implicit ANDs between each pair. Note that an empty list of
|
||||
match-specs is possible, but is not very useful because it will not
|
||||
match any debug statement callsites.
|
||||
|
||||
A match specification comprises a keyword, which controls the attribute
|
||||
of the callsite to be compared, and a value to compare against. Possible
|
||||
keywords are:
|
||||
|
||||
match-spec ::= 'func' string |
|
||||
'file' string |
|
||||
'module' string |
|
||||
'format' string |
|
||||
'line' line-range
|
||||
|
||||
line-range ::= lineno |
|
||||
'-'lineno |
|
||||
lineno'-' |
|
||||
lineno'-'lineno
|
||||
// Note: line-range cannot contain space, e.g.
|
||||
// "1-30" is valid range but "1 - 30" is not.
|
||||
|
||||
lineno ::= unsigned-int
|
||||
|
||||
The meanings of each keyword are:
|
||||
|
||||
func
|
||||
The given string is compared against the function name
|
||||
of each callsite. Example:
|
||||
|
||||
func svc_tcp_accept
|
||||
|
||||
file
|
||||
The given string is compared against either the full
|
||||
pathname or the basename of the source file of each
|
||||
callsite. Examples:
|
||||
|
||||
file svcsock.c
|
||||
file /usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svcsock.c
|
||||
|
||||
module
|
||||
The given string is compared against the module name
|
||||
of each callsite. The module name is the string as
|
||||
seen in "lsmod", i.e. without the directory or the .ko
|
||||
suffix and with '-' changed to '_'. Examples:
|
||||
|
||||
module sunrpc
|
||||
module nfsd
|
||||
|
||||
format
|
||||
The given string is searched for in the dynamic debug format
|
||||
string. Note that the string does not need to match the
|
||||
entire format, only some part. Whitespace and other
|
||||
special characters can be escaped using C octal character
|
||||
escape \ooo notation, e.g. the space character is \040.
|
||||
Alternatively, the string can be enclosed in double quote
|
||||
characters (") or single quote characters (').
|
||||
Examples:
|
||||
|
||||
format svcrdma: // many of the NFS/RDMA server dprintks
|
||||
format readahead // some dprintks in the readahead cache
|
||||
format nfsd:\040SETATTR // one way to match a format with whitespace
|
||||
format "nfsd: SETATTR" // a neater way to match a format with whitespace
|
||||
format 'nfsd: SETATTR' // yet another way to match a format with whitespace
|
||||
|
||||
line
|
||||
The given line number or range of line numbers is compared
|
||||
against the line number of each dprintk() callsite. A single
|
||||
line number matches the callsite line number exactly. A
|
||||
range of line numbers matches any callsite between the first
|
||||
and last line number inclusive. An empty first number means
|
||||
the first line in the file, an empty line number means the
|
||||
last number in the file. Examples:
|
||||
|
||||
line 1603 // exactly line 1603
|
||||
line 1600-1605 // the six lines from line 1600 to line 1605
|
||||
line -1605 // the 1605 lines from line 1 to line 1605
|
||||
line 1600- // all lines from line 1600 to the end of the file
|
||||
|
||||
The flags specification comprises a change operation followed
|
||||
by one or more flag characters. The change operation is one
|
||||
of the characters:
|
||||
|
||||
-
|
||||
remove the given flags
|
||||
|
||||
+
|
||||
add the given flags
|
||||
|
||||
=
|
||||
set the flags to the given flags
|
||||
|
||||
The flags are:
|
||||
|
||||
p
|
||||
Causes a printk() message to be emitted to dmesg
|
||||
|
||||
Note the regexp ^[-+=][scp]+$ matches a flags specification.
|
||||
Note also that there is no convenient syntax to remove all
|
||||
the flags at once, you need to use "-psc".
|
||||
|
||||
Examples
|
||||
========
|
||||
|
||||
// enable the message at line 1603 of file svcsock.c
|
||||
nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
|
||||
// enable all the messages in file svcsock.c
|
||||
nullarbor:~ # echo -n 'file svcsock.c +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
|
||||
// enable all the messages in the NFS server module
|
||||
nullarbor:~ # echo -n 'module nfsd +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
|
||||
// enable all 12 messages in the function svc_process()
|
||||
nullarbor:~ # echo -n 'func svc_process +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
|
||||
// disable all 12 messages in the function svc_process()
|
||||
nullarbor:~ # echo -n 'func svc_process -p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
|
||||
// enable messages for NFS calls READ, READLINK, READDIR and READDIR+.
|
||||
nullarbor:~ # echo -n 'format "nfsd: READ" +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
@@ -6,20 +6,47 @@ be removed from this file.
|
||||
|
||||
---------------------------
|
||||
|
||||
What: old static regulatory information and ieee80211_regdom module parameter
|
||||
When: 2.6.29
|
||||
What: The ieee80211_regdom module parameter
|
||||
When: March 2010 / desktop catchup
|
||||
|
||||
Why: This was inherited by the CONFIG_WIRELESS_OLD_REGULATORY code,
|
||||
and currently serves as an option for users to define an
|
||||
ISO / IEC 3166 alpha2 code for the country they are currently
|
||||
present in. Although there are userspace API replacements for this
|
||||
through nl80211 distributions haven't yet caught up with implementing
|
||||
decent alternatives through standard GUIs. Although available as an
|
||||
option through iw or wpa_supplicant its just a matter of time before
|
||||
distributions pick up good GUI options for this. The ideal solution
|
||||
would actually consist of intelligent designs which would do this for
|
||||
the user automatically even when travelling through different countries.
|
||||
Until then we leave this module parameter as a compromise.
|
||||
|
||||
When userspace improves with reasonable widely-available alternatives for
|
||||
this we will no longer need this module parameter. This entry hopes that
|
||||
by the super-futuristically looking date of "March 2010" we will have
|
||||
such replacements widely available.
|
||||
|
||||
Who: Luis R. Rodriguez <lrodriguez@atheros.com>
|
||||
|
||||
---------------------------
|
||||
|
||||
What: CONFIG_WIRELESS_OLD_REGULATORY - old static regulatory information
|
||||
When: March 2010 / desktop catchup
|
||||
|
||||
Why: The old regulatory infrastructure has been replaced with a new one
|
||||
which does not require statically defined regulatory domains. We do
|
||||
not want to keep static regulatory domains in the kernel due to the
|
||||
the dynamic nature of regulatory law and localization. We kept around
|
||||
the old static definitions for the regulatory domains of:
|
||||
|
||||
* US
|
||||
* JP
|
||||
* EU
|
||||
|
||||
and used by default the US when CONFIG_WIRELESS_OLD_REGULATORY was
|
||||
set. We also kept around the ieee80211_regdom module parameter in case
|
||||
some applications were relying on it. Changing regulatory domains
|
||||
can now be done instead by using nl80211, as is done with iw.
|
||||
set. We will remove this option once the standard Linux desktop catches
|
||||
up with the new userspace APIs we have implemented.
|
||||
|
||||
Who: Luis R. Rodriguez <lrodriguez@atheros.com>
|
||||
|
||||
---------------------------
|
||||
@@ -229,7 +256,9 @@ Who: Jan Engelhardt <jengelh@computergmbh.de>
|
||||
---------------------------
|
||||
|
||||
What: b43 support for firmware revision < 410
|
||||
When: July 2008
|
||||
When: The schedule was July 2008, but it was decided that we are going to keep the
|
||||
code as long as there are no major maintanance headaches.
|
||||
So it _could_ be removed _any_ time now, if it conflicts with something new.
|
||||
Why: The support code for the old firmware hurts code readability/maintainability
|
||||
and slightly hurts runtime performance. Bugfixes for the old firmware
|
||||
are not provided by Broadcom anymore.
|
||||
@@ -344,3 +373,20 @@ Why: See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and
|
||||
Removal is subject to fixing any remaining bugs in ACPI which may
|
||||
cause the thermal throttling not to happen at the right time.
|
||||
Who: Dave Jones <davej@redhat.com>, Matthew Garrett <mjg@redhat.com>
|
||||
|
||||
-----------------------------
|
||||
|
||||
What: __do_IRQ all in one fits nothing interrupt handler
|
||||
When: 2.6.32
|
||||
Why: __do_IRQ was kept for easy migration to the type flow handlers.
|
||||
More than two years of migration time is enough.
|
||||
Who: Thomas Gleixner <tglx@linutronix.de>
|
||||
|
||||
-----------------------------
|
||||
|
||||
What: obsolete generic irq defines and typedefs
|
||||
When: 2.6.30
|
||||
Why: The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t)
|
||||
have been kept around for migration reasons. After more than two years
|
||||
it's time to remove them finally
|
||||
Who: Thomas Gleixner <tglx@linutronix.de>
|
||||
|
||||
@@ -437,8 +437,11 @@ grab BKL for cases when we close a file that had been opened r/w, but that
|
||||
can and should be done using the internal locking with smaller critical areas).
|
||||
Current worst offender is ext2_get_block()...
|
||||
|
||||
->fasync() is a mess. This area needs a big cleanup and that will probably
|
||||
affect locking.
|
||||
->fasync() is called without BKL protection, and is responsible for
|
||||
maintaining the FASYNC bit in filp->f_flags. Most instances call
|
||||
fasync_helper(), which does that maintenance, so it's not normally
|
||||
something one needs to worry about. Return values > 0 will be mapped to
|
||||
zero in the VFS layer.
|
||||
|
||||
->readdir() and ->ioctl() on directories must be changed. Ideally we would
|
||||
move ->readdir() to inode_operations and use a separate method for directory
|
||||
|
||||
@@ -44,6 +44,7 @@ parameter is applicable:
|
||||
FB The frame buffer device is enabled.
|
||||
HW Appropriate hardware is enabled.
|
||||
IA-64 IA-64 architecture is enabled.
|
||||
IMA Integrity measurement architecture is enabled.
|
||||
IOSCHED More than one I/O scheduler is enabled.
|
||||
IP_PNP IP DHCP, BOOTP, or RARP is enabled.
|
||||
ISAPNP ISA PnP code is enabled.
|
||||
@@ -492,10 +493,12 @@ and is between 256 and 4096 characters. It is defined in the file
|
||||
Default: 64
|
||||
|
||||
hpet= [X86-32,HPET] option to control HPET usage
|
||||
Format: { enable (default) | disable | force }
|
||||
Format: { enable (default) | disable | force |
|
||||
verbose }
|
||||
disable: disable HPET and use PIT instead
|
||||
force: allow force enabled of undocumented chips (ICH4,
|
||||
VIA, nVidia)
|
||||
verbose: show contents of HPET registers during setup
|
||||
|
||||
com20020= [HW,NET] ARCnet - COM20020 chipset
|
||||
Format:
|
||||
@@ -829,6 +832,9 @@ and is between 256 and 4096 characters. It is defined in the file
|
||||
|
||||
hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC)
|
||||
terminal devices. Valid values: 0..8
|
||||
hvc_iucv_allow= [S390] Comma-separated list of z/VM user IDs.
|
||||
If specified, z/VM IUCV HVC accepts connections
|
||||
from listed z/VM user IDs only.
|
||||
|
||||
i2c_bus= [HW] Override the default board specific I2C bus speed
|
||||
or register an additional I2C bus that is not
|
||||
@@ -908,6 +914,15 @@ and is between 256 and 4096 characters. It is defined in the file
|
||||
ihash_entries= [KNL]
|
||||
Set number of hash buckets for inode cache.
|
||||
|
||||
ima_audit= [IMA]
|
||||
Format: { "0" | "1" }
|
||||
0 -- integrity auditing messages. (Default)
|
||||
1 -- enable informational integrity auditing messages.
|
||||
|
||||
ima_hash= [IMA]
|
||||
Formt: { "sha1" | "md5" }
|
||||
default: "sha1"
|
||||
|
||||
in2000= [HW,SCSI]
|
||||
See header of drivers/scsi/in2000.c.
|
||||
|
||||
@@ -1822,11 +1837,6 @@ and is between 256 and 4096 characters. It is defined in the file
|
||||
autoconfiguration.
|
||||
Ranges are in pairs (memory base and size).
|
||||
|
||||
dynamic_printk Enables pr_debug()/dev_dbg() calls if
|
||||
CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled.
|
||||
These can also be switched on/off via
|
||||
<debugfs>/dynamic_printk/modules
|
||||
|
||||
print-fatal-signals=
|
||||
[KNL] debug: print fatal signals
|
||||
print-fatal-signals=1: print segfault info to
|
||||
|
||||
@@ -141,7 +141,8 @@ rx_ccid = 2
|
||||
Default CCID for the receiver-sender half-connection; see tx_ccid.
|
||||
|
||||
seq_window = 100
|
||||
The initial sequence window (sec. 7.5.2).
|
||||
The initial sequence window (sec. 7.5.2) of the sender. This influences
|
||||
the local ackno validity and the remote seqno validity windows (7.5.1).
|
||||
|
||||
tx_qlen = 5
|
||||
The size of the transmit buffer in packets. A value of 0 corresponds
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
ip_forward - BOOLEAN
|
||||
0 - disabled (default)
|
||||
not 0 - enabled
|
||||
not 0 - enabled
|
||||
|
||||
Forward Packets between interfaces.
|
||||
|
||||
@@ -36,49 +36,49 @@ rt_cache_rebuild_count - INTEGER
|
||||
IP Fragmentation:
|
||||
|
||||
ipfrag_high_thresh - INTEGER
|
||||
Maximum memory used to reassemble IP fragments. When
|
||||
Maximum memory used to reassemble IP fragments. When
|
||||
ipfrag_high_thresh bytes of memory is allocated for this purpose,
|
||||
the fragment handler will toss packets until ipfrag_low_thresh
|
||||
is reached.
|
||||
|
||||
|
||||
ipfrag_low_thresh - INTEGER
|
||||
See ipfrag_high_thresh
|
||||
See ipfrag_high_thresh
|
||||
|
||||
ipfrag_time - INTEGER
|
||||
Time in seconds to keep an IP fragment in memory.
|
||||
Time in seconds to keep an IP fragment in memory.
|
||||
|
||||
ipfrag_secret_interval - INTEGER
|
||||
Regeneration interval (in seconds) of the hash secret (or lifetime
|
||||
Regeneration interval (in seconds) of the hash secret (or lifetime
|
||||
for the hash secret) for IP fragments.
|
||||
Default: 600
|
||||
|
||||
ipfrag_max_dist - INTEGER
|
||||
ipfrag_max_dist is a non-negative integer value which defines the
|
||||
maximum "disorder" which is allowed among fragments which share a
|
||||
common IP source address. Note that reordering of packets is
|
||||
not unusual, but if a large number of fragments arrive from a source
|
||||
IP address while a particular fragment queue remains incomplete, it
|
||||
probably indicates that one or more fragments belonging to that queue
|
||||
have been lost. When ipfrag_max_dist is positive, an additional check
|
||||
is done on fragments before they are added to a reassembly queue - if
|
||||
ipfrag_max_dist (or more) fragments have arrived from a particular IP
|
||||
address between additions to any IP fragment queue using that source
|
||||
address, it's presumed that one or more fragments in the queue are
|
||||
lost. The existing fragment queue will be dropped, and a new one
|
||||
ipfrag_max_dist is a non-negative integer value which defines the
|
||||
maximum "disorder" which is allowed among fragments which share a
|
||||
common IP source address. Note that reordering of packets is
|
||||
not unusual, but if a large number of fragments arrive from a source
|
||||
IP address while a particular fragment queue remains incomplete, it
|
||||
probably indicates that one or more fragments belonging to that queue
|
||||
have been lost. When ipfrag_max_dist is positive, an additional check
|
||||
is done on fragments before they are added to a reassembly queue - if
|
||||
ipfrag_max_dist (or more) fragments have arrived from a particular IP
|
||||
address between additions to any IP fragment queue using that source
|
||||
address, it's presumed that one or more fragments in the queue are
|
||||
lost. The existing fragment queue will be dropped, and a new one
|
||||
started. An ipfrag_max_dist value of zero disables this check.
|
||||
|
||||
Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can
|
||||
result in unnecessarily dropping fragment queues when normal
|
||||
reordering of packets occurs, which could lead to poor application
|
||||
performance. Using a very large value, e.g. 50000, increases the
|
||||
likelihood of incorrectly reassembling IP fragments that originate
|
||||
reordering of packets occurs, which could lead to poor application
|
||||
performance. Using a very large value, e.g. 50000, increases the
|
||||
likelihood of incorrectly reassembling IP fragments that originate
|
||||
from different IP datagrams, which could result in data corruption.
|
||||
Default: 64
|
||||
|
||||
INET peer storage:
|
||||
|
||||
inet_peer_threshold - INTEGER
|
||||
The approximate size of the storage. Starting from this threshold
|
||||
The approximate size of the storage. Starting from this threshold
|
||||
entries will be thrown aggressively. This threshold also determines
|
||||
entries' time-to-live and time intervals between garbage collection
|
||||
passes. More entries, less time-to-live, less GC interval.
|
||||
@@ -105,7 +105,7 @@ inet_peer_gc_maxtime - INTEGER
|
||||
in effect under low (or absent) memory pressure on the pool.
|
||||
Measured in seconds.
|
||||
|
||||
TCP variables:
|
||||
TCP variables:
|
||||
|
||||
somaxconn - INTEGER
|
||||
Limit of socket listen() backlog, known in userspace as SOMAXCONN.
|
||||
@@ -310,7 +310,7 @@ tcp_orphan_retries - INTEGER
|
||||
|
||||
tcp_reordering - INTEGER
|
||||
Maximal reordering of packets in a TCP stream.
|
||||
Default: 3
|
||||
Default: 3
|
||||
|
||||
tcp_retrans_collapse - BOOLEAN
|
||||
Bug-to-bug compatibility with some broken printers.
|
||||
@@ -521,7 +521,7 @@ IP Variables:
|
||||
|
||||
ip_local_port_range - 2 INTEGERS
|
||||
Defines the local port range that is used by TCP and UDP to
|
||||
choose the local port. The first number is the first, the
|
||||
choose the local port. The first number is the first, the
|
||||
second the last local port number. Default value depends on
|
||||
amount of memory available on the system:
|
||||
> 128Mb 32768-61000
|
||||
@@ -594,12 +594,12 @@ icmp_errors_use_inbound_ifaddr - BOOLEAN
|
||||
|
||||
If zero, icmp error messages are sent with the primary address of
|
||||
the exiting interface.
|
||||
|
||||
|
||||
If non-zero, the message will be sent with the primary address of
|
||||
the interface that received the packet that caused the icmp error.
|
||||
This is the behaviour network many administrators will expect from
|
||||
a router. And it can make debugging complicated network layouts
|
||||
much easier.
|
||||
much easier.
|
||||
|
||||
Note that if no primary address exists for the interface selected,
|
||||
then the primary address of the first non-loopback interface that
|
||||
@@ -611,7 +611,7 @@ igmp_max_memberships - INTEGER
|
||||
Change the maximum number of multicast groups we can subscribe to.
|
||||
Default: 20
|
||||
|
||||
conf/interface/* changes special settings per interface (where "interface" is
|
||||
conf/interface/* changes special settings per interface (where "interface" is
|
||||
the name of your network interface)
|
||||
conf/all/* is special, changes the settings for all interfaces
|
||||
|
||||
@@ -625,11 +625,11 @@ log_martians - BOOLEAN
|
||||
accept_redirects - BOOLEAN
|
||||
Accept ICMP redirect messages.
|
||||
accept_redirects for the interface will be enabled if:
|
||||
- both conf/{all,interface}/accept_redirects are TRUE in the case forwarding
|
||||
for the interface is enabled
|
||||
- both conf/{all,interface}/accept_redirects are TRUE in the case
|
||||
forwarding for the interface is enabled
|
||||
or
|
||||
- at least one of conf/{all,interface}/accept_redirects is TRUE in the case
|
||||
forwarding for the interface is disabled
|
||||
- at least one of conf/{all,interface}/accept_redirects is TRUE in the
|
||||
case forwarding for the interface is disabled
|
||||
accept_redirects for the interface will be disabled otherwise
|
||||
default TRUE (host)
|
||||
FALSE (router)
|
||||
@@ -640,8 +640,8 @@ forwarding - BOOLEAN
|
||||
mc_forwarding - BOOLEAN
|
||||
Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE
|
||||
and a multicast routing daemon is required.
|
||||
conf/all/mc_forwarding must also be set to TRUE to enable multicast routing
|
||||
for the interface
|
||||
conf/all/mc_forwarding must also be set to TRUE to enable multicast
|
||||
routing for the interface
|
||||
|
||||
medium_id - INTEGER
|
||||
Integer value used to differentiate the devices by the medium they
|
||||
@@ -649,7 +649,7 @@ medium_id - INTEGER
|
||||
the broadcast packets are received only on one of them.
|
||||
The default value 0 means that the device is the only interface
|
||||
to its medium, value of -1 means that medium is not known.
|
||||
|
||||
|
||||
Currently, it is used to change the proxy_arp behavior:
|
||||
the proxy_arp feature is enabled for packets forwarded between
|
||||
two devices attached to different media.
|
||||
@@ -699,16 +699,22 @@ accept_source_route - BOOLEAN
|
||||
default TRUE (router)
|
||||
FALSE (host)
|
||||
|
||||
rp_filter - BOOLEAN
|
||||
1 - do source validation by reversed path, as specified in RFC1812
|
||||
Recommended option for single homed hosts and stub network
|
||||
routers. Could cause troubles for complicated (not loop free)
|
||||
networks running a slow unreliable protocol (sort of RIP),
|
||||
or using static routes.
|
||||
|
||||
rp_filter - INTEGER
|
||||
0 - No source validation.
|
||||
1 - Strict mode as defined in RFC3704 Strict Reverse Path
|
||||
Each incoming packet is tested against the FIB and if the interface
|
||||
is not the best reverse path the packet check will fail.
|
||||
By default failed packets are discarded.
|
||||
2 - Loose mode as defined in RFC3704 Loose Reverse Path
|
||||
Each incoming packet's source address is also tested against the FIB
|
||||
and if the source address is not reachable via any interface
|
||||
the packet check will fail.
|
||||
|
||||
conf/all/rp_filter must also be set to TRUE to do source validation
|
||||
Current recommended practice in RFC3704 is to enable strict mode
|
||||
to prevent IP spoofing from DDos attacks. If using asymmetric routing
|
||||
or other complicated routing, then loose mode is recommended.
|
||||
|
||||
conf/all/rp_filter must also be set to non-zero to do source validation
|
||||
on the interface
|
||||
|
||||
Default value is 0. Note that some distributions enable it
|
||||
@@ -782,6 +788,12 @@ arp_ignore - INTEGER
|
||||
The max value from conf/{all,interface}/arp_ignore is used
|
||||
when ARP request is received on the {interface}
|
||||
|
||||
arp_notify - BOOLEAN
|
||||
Define mode for notification of address and device changes.
|
||||
0 - (default): do nothing
|
||||
1 - Generate gratuitous arp replies when device is brought up
|
||||
or hardware address changes.
|
||||
|
||||
arp_accept - BOOLEAN
|
||||
Define behavior when gratuitous arp replies are received:
|
||||
0 - drop gratuitous arp frames
|
||||
@@ -823,7 +835,7 @@ apply to IPv6 [XXX?].
|
||||
|
||||
bindv6only - BOOLEAN
|
||||
Default value for IPV6_V6ONLY socket option,
|
||||
which restricts use of the IPv6 socket to IPv6 communication
|
||||
which restricts use of the IPv6 socket to IPv6 communication
|
||||
only.
|
||||
TRUE: disable IPv4-mapped address feature
|
||||
FALSE: enable IPv4-mapped address feature
|
||||
@@ -833,19 +845,19 @@ bindv6only - BOOLEAN
|
||||
IPv6 Fragmentation:
|
||||
|
||||
ip6frag_high_thresh - INTEGER
|
||||
Maximum memory used to reassemble IPv6 fragments. When
|
||||
Maximum memory used to reassemble IPv6 fragments. When
|
||||
ip6frag_high_thresh bytes of memory is allocated for this purpose,
|
||||
the fragment handler will toss packets until ip6frag_low_thresh
|
||||
is reached.
|
||||
|
||||
|
||||
ip6frag_low_thresh - INTEGER
|
||||
See ip6frag_high_thresh
|
||||
See ip6frag_high_thresh
|
||||
|
||||
ip6frag_time - INTEGER
|
||||
Time in seconds to keep an IPv6 fragment in memory.
|
||||
|
||||
ip6frag_secret_interval - INTEGER
|
||||
Regeneration interval (in seconds) of the hash secret (or lifetime
|
||||
Regeneration interval (in seconds) of the hash secret (or lifetime
|
||||
for the hash secret) for IPv6 fragments.
|
||||
Default: 600
|
||||
|
||||
@@ -854,17 +866,17 @@ conf/default/*:
|
||||
|
||||
|
||||
conf/all/*:
|
||||
Change all the interface-specific settings.
|
||||
Change all the interface-specific settings.
|
||||
|
||||
[XXX: Other special features than forwarding?]
|
||||
|
||||
conf/all/forwarding - BOOLEAN
|
||||
Enable global IPv6 forwarding between all interfaces.
|
||||
Enable global IPv6 forwarding between all interfaces.
|
||||
|
||||
IPv4 and IPv6 work differently here; e.g. netfilter must be used
|
||||
IPv4 and IPv6 work differently here; e.g. netfilter must be used
|
||||
to control which interfaces may forward packets and which not.
|
||||
|
||||
This also sets all interfaces' Host/Router setting
|
||||
This also sets all interfaces' Host/Router setting
|
||||
'forwarding' to the specified value. See below for details.
|
||||
|
||||
This referred to as global forwarding.
|
||||
@@ -875,12 +887,12 @@ proxy_ndp - BOOLEAN
|
||||
conf/interface/*:
|
||||
Change special settings per interface.
|
||||
|
||||
The functional behaviour for certain settings is different
|
||||
The functional behaviour for certain settings is different
|
||||
depending on whether local forwarding is enabled or not.
|
||||
|
||||
accept_ra - BOOLEAN
|
||||
Accept Router Advertisements; autoconfigure using them.
|
||||
|
||||
|
||||
Functional default: enabled if local forwarding is disabled.
|
||||
disabled if local forwarding is enabled.
|
||||
|
||||
@@ -926,7 +938,7 @@ accept_source_route - INTEGER
|
||||
Default: 0
|
||||
|
||||
autoconf - BOOLEAN
|
||||
Autoconfigure addresses using Prefix Information in Router
|
||||
Autoconfigure addresses using Prefix Information in Router
|
||||
Advertisements.
|
||||
|
||||
Functional default: enabled if accept_ra_pinfo is enabled.
|
||||
@@ -935,11 +947,11 @@ autoconf - BOOLEAN
|
||||
dad_transmits - INTEGER
|
||||
The amount of Duplicate Address Detection probes to send.
|
||||
Default: 1
|
||||
|
||||
forwarding - BOOLEAN
|
||||
Configure interface-specific Host/Router behaviour.
|
||||
|
||||
Note: It is recommended to have the same setting on all
|
||||
forwarding - BOOLEAN
|
||||
Configure interface-specific Host/Router behaviour.
|
||||
|
||||
Note: It is recommended to have the same setting on all
|
||||
interfaces; mixed router/host scenarios are rather uncommon.
|
||||
|
||||
FALSE:
|
||||
@@ -948,13 +960,13 @@ forwarding - BOOLEAN
|
||||
|
||||
1. IsRouter flag is not set in Neighbour Advertisements.
|
||||
2. Router Solicitations are being sent when necessary.
|
||||
3. If accept_ra is TRUE (default), accept Router
|
||||
3. If accept_ra is TRUE (default), accept Router
|
||||
Advertisements (and do autoconfiguration).
|
||||
4. If accept_redirects is TRUE (default), accept Redirects.
|
||||
|
||||
TRUE:
|
||||
|
||||
If local forwarding is enabled, Router behaviour is assumed.
|
||||
If local forwarding is enabled, Router behaviour is assumed.
|
||||
This means exactly the reverse from the above:
|
||||
|
||||
1. IsRouter flag is set in Neighbour Advertisements.
|
||||
@@ -989,7 +1001,7 @@ router_solicitation_interval - INTEGER
|
||||
Default: 4
|
||||
|
||||
router_solicitations - INTEGER
|
||||
Number of Router Solicitations to send until assuming no
|
||||
Number of Router Solicitations to send until assuming no
|
||||
routers are present.
|
||||
Default: 3
|
||||
|
||||
@@ -1013,11 +1025,11 @@ temp_prefered_lft - INTEGER
|
||||
|
||||
max_desync_factor - INTEGER
|
||||
Maximum value for DESYNC_FACTOR, which is a random value
|
||||
that ensures that clients don't synchronize with each
|
||||
that ensures that clients don't synchronize with each
|
||||
other and generate new addresses at exactly the same time.
|
||||
value is in seconds.
|
||||
Default: 600
|
||||
|
||||
|
||||
regen_max_retry - INTEGER
|
||||
Number of attempts before give up attempting to generate
|
||||
valid temporary addresses.
|
||||
@@ -1025,13 +1037,15 @@ regen_max_retry - INTEGER
|
||||
|
||||
max_addresses - INTEGER
|
||||
Number of maximum addresses per interface. 0 disables limitation.
|
||||
It is recommended not set too large value (or 0) because it would
|
||||
be too easy way to crash kernel to allow to create too much of
|
||||
It is recommended not set too large value (or 0) because it would
|
||||
be too easy way to crash kernel to allow to create too much of
|
||||
autoconfigured addresses.
|
||||
Default: 16
|
||||
|
||||
disable_ipv6 - BOOLEAN
|
||||
Disable IPv6 operation.
|
||||
Disable IPv6 operation. If accept_dad is set to 2, this value
|
||||
will be dynamically set to TRUE if DAD fails for the link-local
|
||||
address.
|
||||
Default: FALSE (enable IPv6 operation)
|
||||
|
||||
accept_dad - INTEGER
|
||||
|
||||
@@ -0,0 +1,199 @@
|
||||
Linux Base Driver for 10 Gigabit PCI Express Intel(R) Network Connection
|
||||
========================================================================
|
||||
|
||||
March 10, 2009
|
||||
|
||||
|
||||
Contents
|
||||
========
|
||||
|
||||
- In This Release
|
||||
- Identifying Your Adapter
|
||||
- Building and Installation
|
||||
- Additional Configurations
|
||||
- Support
|
||||
|
||||
|
||||
|
||||
In This Release
|
||||
===============
|
||||
|
||||
This file describes the ixgbe Linux Base Driver for the 10 Gigabit PCI
|
||||
Express Intel(R) Network Connection. This driver includes support for
|
||||
Itanium(R)2-based systems.
|
||||
|
||||
For questions related to hardware requirements, refer to the documentation
|
||||
supplied with your 10 Gigabit adapter. All hardware requirements listed apply
|
||||
to use with Linux.
|
||||
|
||||
The following features are available in this kernel:
|
||||
- Native VLANs
|
||||
- Channel Bonding (teaming)
|
||||
- SNMP
|
||||
- Generic Receive Offload
|
||||
- Data Center Bridging
|
||||
|
||||
Channel Bonding documentation can be found in the Linux kernel source:
|
||||
/Documentation/networking/bonding.txt
|
||||
|
||||
Ethtool, lspci, and ifconfig can be used to display device and driver
|
||||
specific information.
|
||||
|
||||
|
||||
Identifying Your Adapter
|
||||
========================
|
||||
|
||||
This driver supports devices based on the 82598 controller and the 82599
|
||||
controller.
|
||||
|
||||
For specific information on identifying which adapter you have, please visit:
|
||||
|
||||
http://support.intel.com/support/network/sb/CS-008441.htm
|
||||
|
||||
|
||||
Building and Installation
|
||||
=========================
|
||||
|
||||
select m for "Intel(R) 10GbE PCI Express adapters support" located at:
|
||||
Location:
|
||||
-> Device Drivers
|
||||
-> Network device support (NETDEVICES [=y])
|
||||
-> Ethernet (10000 Mbit) (NETDEV_10000 [=y])
|
||||
|
||||
1. make modules & make modules_install
|
||||
|
||||
2. Load the module:
|
||||
|
||||
# modprobe ixgbe
|
||||
|
||||
The insmod command can be used if the full
|
||||
path to the driver module is specified. For example:
|
||||
|
||||
insmod /lib/modules/<KERNEL VERSION>/kernel/drivers/net/ixgbe/ixgbe.ko
|
||||
|
||||
With 2.6 based kernels also make sure that older ixgbe drivers are
|
||||
removed from the kernel, before loading the new module:
|
||||
|
||||
rmmod ixgbe; modprobe ixgbe
|
||||
|
||||
3. Assign an IP address to the interface by entering the following, where
|
||||
x is the interface number:
|
||||
|
||||
ifconfig ethx <IP_address>
|
||||
|
||||
4. Verify that the interface works. Enter the following, where <IP_address>
|
||||
is the IP address for another machine on the same subnet as the interface
|
||||
that is being tested:
|
||||
|
||||
ping <IP_address>
|
||||
|
||||
|
||||
Additional Configurations
|
||||
=========================
|
||||
|
||||
Viewing Link Messages
|
||||
---------------------
|
||||
Link messages will not be displayed to the console if the distribution is
|
||||
restricting system messages. In order to see network driver link messages on
|
||||
your console, set dmesg to eight by entering the following:
|
||||
|
||||
dmesg -n 8
|
||||
|
||||
NOTE: This setting is not saved across reboots.
|
||||
|
||||
|
||||
Jumbo Frames
|
||||
------------
|
||||
The driver supports Jumbo Frames for all adapters. Jumbo Frames support is
|
||||
enabled by changing the MTU to a value larger than the default of 1500.
|
||||
The maximum value for the MTU is 16110. Use the ifconfig command to
|
||||
increase the MTU size. For example:
|
||||
|
||||
ifconfig ethx mtu 9000 up
|
||||
|
||||
The maximum MTU setting for Jumbo Frames is 16110. This value coincides
|
||||
with the maximum Jumbo Frames size of 16128.
|
||||
|
||||
Generic Receive Offload, aka GRO
|
||||
--------------------------------
|
||||
The driver supports the in-kernel software implementation of GRO. GRO has
|
||||
shown that by coalescing Rx traffic into larger chunks of data, CPU
|
||||
utilization can be significantly reduced when under large Rx load. GRO is an
|
||||
evolution of the previously-used LRO interface. GRO is able to coalesce
|
||||
other protocols besides TCP. It's also safe to use with configurations that
|
||||
are problematic for LRO, namely bridging and iSCSI.
|
||||
|
||||
GRO is enabled by default in the driver. Future versions of ethtool will
|
||||
support disabling and re-enabling GRO on the fly.
|
||||
|
||||
|
||||
Data Center Bridging, aka DCB
|
||||
-----------------------------
|
||||
|
||||
DCB is a configuration Quality of Service implementation in hardware.
|
||||
It uses the VLAN priority tag (802.1p) to filter traffic. That means
|
||||
that there are 8 different priorities that traffic can be filtered into.
|
||||
It also enables priority flow control which can limit or eliminate the
|
||||
number of dropped packets during network stress. Bandwidth can be
|
||||
allocated to each of these priorities, which is enforced at the hardware
|
||||
level.
|
||||
|
||||
To enable DCB support in ixgbe, you must enable the DCB netlink layer to
|
||||
allow the userspace tools (see below) to communicate with the driver.
|
||||
This can be found in the kernel configuration here:
|
||||
|
||||
-> Networking support
|
||||
-> Networking options
|
||||
-> Data Center Bridging support
|
||||
|
||||
Once this is selected, DCB support must be selected for ixgbe. This can
|
||||
be found here:
|
||||
|
||||
-> Device Drivers
|
||||
-> Network device support (NETDEVICES [=y])
|
||||
-> Ethernet (10000 Mbit) (NETDEV_10000 [=y])
|
||||
-> Intel(R) 10GbE PCI Express adapters support
|
||||
-> Data Center Bridging (DCB) Support
|
||||
|
||||
After these options are selected, you must rebuild your kernel and your
|
||||
modules.
|
||||
|
||||
In order to use DCB, userspace tools must be downloaded and installed.
|
||||
The dcbd tools can be found at:
|
||||
|
||||
http://e1000.sf.net
|
||||
|
||||
|
||||
Ethtool
|
||||
-------
|
||||
The driver utilizes the ethtool interface for driver configuration and
|
||||
diagnostics, as well as displaying statistical information. Ethtool
|
||||
version 3.0 or later is required for this functionality.
|
||||
|
||||
The latest release of ethtool can be found from
|
||||
http://sourceforge.net/projects/gkernel.
|
||||
|
||||
|
||||
NAPI
|
||||
----
|
||||
|
||||
NAPI (Rx polling mode) is supported in the ixgbe driver. NAPI is enabled
|
||||
by default in the driver.
|
||||
|
||||
See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI.
|
||||
|
||||
|
||||
Support
|
||||
=======
|
||||
|
||||
For general information, go to the Intel support website at:
|
||||
|
||||
http://support.intel.com
|
||||
|
||||
or the Intel Wired Networking project hosted by Sourceforge at:
|
||||
|
||||
http://e1000.sourceforge.net
|
||||
|
||||
If an issue is identified with the released source code on the supported
|
||||
kernel with a supported adapter, email the specific information related
|
||||
to the issue to e1000-devel@lists.sf.net
|
||||
@@ -0,0 +1,356 @@
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
This readme tries to provide some background on the hows and whys of RDS,
|
||||
and will hopefully help you find your way around the code.
|
||||
|
||||
In addition, please see this email about RDS origins:
|
||||
http://oss.oracle.com/pipermail/rds-devel/2007-November/000228.html
|
||||
|
||||
RDS Architecture
|
||||
================
|
||||
|
||||
RDS provides reliable, ordered datagram delivery by using a single
|
||||
reliable connection between any two nodes in the cluster. This allows
|
||||
applications to use a single socket to talk to any other process in the
|
||||
cluster - so in a cluster with N processes you need N sockets, in contrast
|
||||
to N*N if you use a connection-oriented socket transport like TCP.
|
||||
|
||||
RDS is not Infiniband-specific; it was designed to support different
|
||||
transports. The current implementation used to support RDS over TCP as well
|
||||
as IB. Work is in progress to support RDS over iWARP, and using DCE to
|
||||
guarantee no dropped packets on Ethernet, it may be possible to use RDS over
|
||||
UDP in the future.
|
||||
|
||||
The high-level semantics of RDS from the application's point of view are
|
||||
|
||||
* Addressing
|
||||
RDS uses IPv4 addresses and 16bit port numbers to identify
|
||||
the end point of a connection. All socket operations that involve
|
||||
passing addresses between kernel and user space generally
|
||||
use a struct sockaddr_in.
|
||||
|
||||
The fact that IPv4 addresses are used does not mean the underlying
|
||||
transport has to be IP-based. In fact, RDS over IB uses a
|
||||
reliable IB connection; the IP address is used exclusively to
|
||||
locate the remote node's GID (by ARPing for the given IP).
|
||||
|
||||
The port space is entirely independent of UDP, TCP or any other
|
||||
protocol.
|
||||
|
||||
* Socket interface
|
||||
RDS sockets work *mostly* as you would expect from a BSD
|
||||
socket. The next section will cover the details. At any rate,
|
||||
all I/O is performed through the standard BSD socket API.
|
||||
Some additions like zerocopy support are implemented through
|
||||
control messages, while other extensions use the getsockopt/
|
||||
setsockopt calls.
|
||||
|
||||
Sockets must be bound before you can send or receive data.
|
||||
This is needed because binding also selects a transport and
|
||||
attaches it to the socket. Once bound, the transport assignment
|
||||
does not change. RDS will tolerate IPs moving around (eg in
|
||||
a active-active HA scenario), but only as long as the address
|
||||
doesn't move to a different transport.
|
||||
|
||||
* sysctls
|
||||
RDS supports a number of sysctls in /proc/sys/net/rds
|
||||
|
||||
|
||||
Socket Interface
|
||||
================
|
||||
|
||||
AF_RDS, PF_RDS, SOL_RDS
|
||||
These constants haven't been assigned yet, because RDS isn't in
|
||||
mainline yet. Currently, the kernel module assigns some constant
|
||||
and publishes it to user space through two sysctl files
|
||||
/proc/sys/net/rds/pf_rds
|
||||
/proc/sys/net/rds/sol_rds
|
||||
|
||||
fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
|
||||
This creates a new, unbound RDS socket.
|
||||
|
||||
setsockopt(SOL_SOCKET): send and receive buffer size
|
||||
RDS honors the send and receive buffer size socket options.
|
||||
You are not allowed to queue more than SO_SNDSIZE bytes to
|
||||
a socket. A message is queued when sendmsg is called, and
|
||||
it leaves the queue when the remote system acknowledges
|
||||
its arrival.
|
||||
|
||||
The SO_RCVSIZE option controls the maximum receive queue length.
|
||||
This is a soft limit rather than a hard limit - RDS will
|
||||
continue to accept and queue incoming messages, even if that
|
||||
takes the queue length over the limit. However, it will also
|
||||
mark the port as "congested" and send a congestion update to
|
||||
the source node. The source node is supposed to throttle any
|
||||
processes sending to this congested port.
|
||||
|
||||
bind(fd, &sockaddr_in, ...)
|
||||
This binds the socket to a local IP address and port, and a
|
||||
transport.
|
||||
|
||||
sendmsg(fd, ...)
|
||||
Sends a message to the indicated recipient. The kernel will
|
||||
transparently establish the underlying reliable connection
|
||||
if it isn't up yet.
|
||||
|
||||
An attempt to send a message that exceeds SO_SNDSIZE will
|
||||
return with -EMSGSIZE
|
||||
|
||||
An attempt to send a message that would take the total number
|
||||
of queued bytes over the SO_SNDSIZE threshold will return
|
||||
EAGAIN.
|
||||
|
||||
An attempt to send a message to a destination that is marked
|
||||
as "congested" will return ENOBUFS.
|
||||
|
||||
recvmsg(fd, ...)
|
||||
Receives a message that was queued to this socket. The sockets
|
||||
recv queue accounting is adjusted, and if the queue length
|
||||
drops below SO_SNDSIZE, the port is marked uncongested, and
|
||||
a congestion update is sent to all peers.
|
||||
|
||||
Applications can ask the RDS kernel module to receive
|
||||
notifications via control messages (for instance, there is a
|
||||
notification when a congestion update arrived, or when a RDMA
|
||||
operation completes). These notifications are received through
|
||||
the msg.msg_control buffer of struct msghdr. The format of the
|
||||
messages is described in manpages.
|
||||
|
||||
poll(fd)
|
||||
RDS supports the poll interface to allow the application
|
||||
to implement async I/O.
|
||||
|
||||
POLLIN handling is pretty straightforward. When there's an
|
||||
incoming message queued to the socket, or a pending notification,
|
||||
we signal POLLIN.
|
||||
|
||||
POLLOUT is a little harder. Since you can essentially send
|
||||
to any destination, RDS will always signal POLLOUT as long as
|
||||
there's room on the send queue (ie the number of bytes queued
|
||||
is less than the sendbuf size).
|
||||
|
||||
However, the kernel will refuse to accept messages to
|
||||
a destination marked congested - in this case you will loop
|
||||
forever if you rely on poll to tell you what to do.
|
||||
This isn't a trivial problem, but applications can deal with
|
||||
this - by using congestion notifications, and by checking for
|
||||
ENOBUFS errors returned by sendmsg.
|
||||
|
||||
setsockopt(SOL_RDS, RDS_CANCEL_SENT_TO, &sockaddr_in)
|
||||
This allows the application to discard all messages queued to a
|
||||
specific destination on this particular socket.
|
||||
|
||||
This allows the application to cancel outstanding messages if
|
||||
it detects a timeout. For instance, if it tried to send a message,
|
||||
and the remote host is unreachable, RDS will keep trying forever.
|
||||
The application may decide it's not worth it, and cancel the
|
||||
operation. In this case, it would use RDS_CANCEL_SENT_TO to
|
||||
nuke any pending messages.
|
||||
|
||||
|
||||
RDMA for RDS
|
||||
============
|
||||
|
||||
see rds-rdma(7) manpage (available in rds-tools)
|
||||
|
||||
|
||||
Congestion Notifications
|
||||
========================
|
||||
|
||||
see rds(7) manpage
|
||||
|
||||
|
||||
RDS Protocol
|
||||
============
|
||||
|
||||
Message header
|
||||
|
||||
The message header is a 'struct rds_header' (see rds.h):
|
||||
Fields:
|
||||
h_sequence:
|
||||
per-packet sequence number
|
||||
h_ack:
|
||||
piggybacked acknowledgment of last packet received
|
||||
h_len:
|
||||
length of data, not including header
|
||||
h_sport:
|
||||
source port
|
||||
h_dport:
|
||||
destination port
|
||||
h_flags:
|
||||
CONG_BITMAP - this is a congestion update bitmap
|
||||
ACK_REQUIRED - receiver must ack this packet
|
||||
RETRANSMITTED - packet has previously been sent
|
||||
h_credit:
|
||||
indicate to other end of connection that
|
||||
it has more credits available (i.e. there is
|
||||
more send room)
|
||||
h_padding[4]:
|
||||
unused, for future use
|
||||
h_csum:
|
||||
header checksum
|
||||
h_exthdr:
|
||||
optional data can be passed here. This is currently used for
|
||||
passing RDMA-related information.
|
||||
|
||||
ACK and retransmit handling
|
||||
|
||||
One might think that with reliable IB connections you wouldn't need
|
||||
to ack messages that have been received. The problem is that IB
|
||||
hardware generates an ack message before it has DMAed the message
|
||||
into memory. This creates a potential message loss if the HCA is
|
||||
disabled for any reason between when it sends the ack and before
|
||||
the message is DMAed and processed. This is only a potential issue
|
||||
if another HCA is available for fail-over.
|
||||
|
||||
Sending an ack immediately would allow the sender to free the sent
|
||||
message from their send queue quickly, but could cause excessive
|
||||
traffic to be used for acks. RDS piggybacks acks on sent data
|
||||
packets. Ack-only packets are reduced by only allowing one to be
|
||||
in flight at a time, and by the sender only asking for acks when
|
||||
its send buffers start to fill up. All retransmissions are also
|
||||
acked.
|
||||
|
||||
Flow Control
|
||||
|
||||
RDS's IB transport uses a credit-based mechanism to verify that
|
||||
there is space in the peer's receive buffers for more data. This
|
||||
eliminates the need for hardware retries on the connection.
|
||||
|
||||
Congestion
|
||||
|
||||
Messages waiting in the receive queue on the receiving socket
|
||||
are accounted against the sockets SO_RCVBUF option value. Only
|
||||
the payload bytes in the message are accounted for. If the
|
||||
number of bytes queued equals or exceeds rcvbuf then the socket
|
||||
is congested. All sends attempted to this socket's address
|
||||
should return block or return -EWOULDBLOCK.
|
||||
|
||||
Applications are expected to be reasonably tuned such that this
|
||||
situation very rarely occurs. An application encountering this
|
||||
"back-pressure" is considered a bug.
|
||||
|
||||
This is implemented by having each node maintain bitmaps which
|
||||
indicate which ports on bound addresses are congested. As the
|
||||
bitmap changes it is sent through all the connections which
|
||||
terminate in the local address of the bitmap which changed.
|
||||
|
||||
The bitmaps are allocated as connections are brought up. This
|
||||
avoids allocation in the interrupt handling path which queues
|
||||
sages on sockets. The dense bitmaps let transports send the
|
||||
entire bitmap on any bitmap change reasonably efficiently. This
|
||||
is much easier to implement than some finer-grained
|
||||
communication of per-port congestion. The sender does a very
|
||||
inexpensive bit test to test if the port it's about to send to
|
||||
is congested or not.
|
||||
|
||||
|
||||
RDS Transport Layer
|
||||
==================
|
||||
|
||||
As mentioned above, RDS is not IB-specific. Its code is divided
|
||||
into a general RDS layer and a transport layer.
|
||||
|
||||
The general layer handles the socket API, congestion handling,
|
||||
loopback, stats, usermem pinning, and the connection state machine.
|
||||
|
||||
The transport layer handles the details of the transport. The IB
|
||||
transport, for example, handles all the queue pairs, work requests,
|
||||
CM event handlers, and other Infiniband details.
|
||||
|
||||
|
||||
RDS Kernel Structures
|
||||
=====================
|
||||
|
||||
struct rds_message
|
||||
aka possibly "rds_outgoing", the generic RDS layer copies data to
|
||||
be sent and sets header fields as needed, based on the socket API.
|
||||
This is then queued for the individual connection and sent by the
|
||||
connection's transport.
|
||||
struct rds_incoming
|
||||
a generic struct referring to incoming data that can be handed from
|
||||
the transport to the general code and queued by the general code
|
||||
while the socket is awoken. It is then passed back to the transport
|
||||
code to handle the actual copy-to-user.
|
||||
struct rds_socket
|
||||
per-socket information
|
||||
struct rds_connection
|
||||
per-connection information
|
||||
struct rds_transport
|
||||
pointers to transport-specific functions
|
||||
struct rds_statistics
|
||||
non-transport-specific statistics
|
||||
struct rds_cong_map
|
||||
wraps the raw congestion bitmap, contains rbnode, waitq, etc.
|
||||
|
||||
Connection management
|
||||
=====================
|
||||
|
||||
Connections may be in UP, DOWN, CONNECTING, DISCONNECTING, and
|
||||
ERROR states.
|
||||
|
||||
The first time an attempt is made by an RDS socket to send data to
|
||||
a node, a connection is allocated and connected. That connection is
|
||||
then maintained forever -- if there are transport errors, the
|
||||
connection will be dropped and re-established.
|
||||
|
||||
Dropping a connection while packets are queued will cause queued or
|
||||
partially-sent datagrams to be retransmitted when the connection is
|
||||
re-established.
|
||||
|
||||
|
||||
The send path
|
||||
=============
|
||||
|
||||
rds_sendmsg()
|
||||
struct rds_message built from incoming data
|
||||
CMSGs parsed (e.g. RDMA ops)
|
||||
transport connection alloced and connected if not already
|
||||
rds_message placed on send queue
|
||||
send worker awoken
|
||||
rds_send_worker()
|
||||
calls rds_send_xmit() until queue is empty
|
||||
rds_send_xmit()
|
||||
transmits congestion map if one is pending
|
||||
may set ACK_REQUIRED
|
||||
calls transport to send either non-RDMA or RDMA message
|
||||
(RDMA ops never retransmitted)
|
||||
rds_ib_xmit()
|
||||
allocs work requests from send ring
|
||||
adds any new send credits available to peer (h_credits)
|
||||
maps the rds_message's sg list
|
||||
piggybacks ack
|
||||
populates work requests
|
||||
post send to connection's queue pair
|
||||
|
||||
The recv path
|
||||
=============
|
||||
|
||||
rds_ib_recv_cq_comp_handler()
|
||||
looks at write completions
|
||||
unmaps recv buffer from device
|
||||
no errors, call rds_ib_process_recv()
|
||||
refill recv ring
|
||||
rds_ib_process_recv()
|
||||
validate header checksum
|
||||
copy header to rds_ib_incoming struct if start of a new datagram
|
||||
add to ibinc's fraglist
|
||||
if competed datagram:
|
||||
update cong map if datagram was cong update
|
||||
call rds_recv_incoming() otherwise
|
||||
note if ack is required
|
||||
rds_recv_incoming()
|
||||
drop duplicate packets
|
||||
respond to pings
|
||||
find the sock associated with this datagram
|
||||
add to sock queue
|
||||
wake up sock
|
||||
do some congestion calculations
|
||||
rds_recvmsg
|
||||
copy data into user iovec
|
||||
handle CMSGs
|
||||
return to application
|
||||
|
||||
|
||||
@@ -0,0 +1,180 @@
|
||||
The existing interfaces for getting network packages time stamped are:
|
||||
|
||||
* SO_TIMESTAMP
|
||||
Generate time stamp for each incoming packet using the (not necessarily
|
||||
monotonous!) system time. Result is returned via recv_msg() in a
|
||||
control message as timeval (usec resolution).
|
||||
|
||||
* SO_TIMESTAMPNS
|
||||
Same time stamping mechanism as SO_TIMESTAMP, but returns result as
|
||||
timespec (nsec resolution).
|
||||
|
||||
* IP_MULTICAST_LOOP + SO_TIMESTAMP[NS]
|
||||
Only for multicasts: approximate send time stamp by receiving the looped
|
||||
packet and using its receive time stamp.
|
||||
|
||||
The following interface complements the existing ones: receive time
|
||||
stamps can be generated and returned for arbitrary packets and much
|
||||
closer to the point where the packet is really sent. Time stamps can
|
||||
be generated in software (as before) or in hardware (if the hardware
|
||||
has such a feature).
|
||||
|
||||
SO_TIMESTAMPING:
|
||||
|
||||
Instructs the socket layer which kind of information is wanted. The
|
||||
parameter is an integer with some of the following bits set. Setting
|
||||
other bits is an error and doesn't change the current state.
|
||||
|
||||
SOF_TIMESTAMPING_TX_HARDWARE: try to obtain send time stamp in hardware
|
||||
SOF_TIMESTAMPING_TX_SOFTWARE: if SOF_TIMESTAMPING_TX_HARDWARE is off or
|
||||
fails, then do it in software
|
||||
SOF_TIMESTAMPING_RX_HARDWARE: return the original, unmodified time stamp
|
||||
as generated by the hardware
|
||||
SOF_TIMESTAMPING_RX_SOFTWARE: if SOF_TIMESTAMPING_RX_HARDWARE is off or
|
||||
fails, then do it in software
|
||||
SOF_TIMESTAMPING_RAW_HARDWARE: return original raw hardware time stamp
|
||||
SOF_TIMESTAMPING_SYS_HARDWARE: return hardware time stamp transformed to
|
||||
the system time base
|
||||
SOF_TIMESTAMPING_SOFTWARE: return system time stamp generated in
|
||||
software
|
||||
|
||||
SOF_TIMESTAMPING_TX/RX determine how time stamps are generated.
|
||||
SOF_TIMESTAMPING_RAW/SYS determine how they are reported in the
|
||||
following control message:
|
||||
struct scm_timestamping {
|
||||
struct timespec systime;
|
||||
struct timespec hwtimetrans;
|
||||
struct timespec hwtimeraw;
|
||||
};
|
||||
|
||||
recvmsg() can be used to get this control message for regular incoming
|
||||
packets. For send time stamps the outgoing packet is looped back to
|
||||
the socket's error queue with the send time stamp(s) attached. It can
|
||||
be received with recvmsg(flags=MSG_ERRQUEUE). The call returns the
|
||||
original outgoing packet data including all headers preprended down to
|
||||
and including the link layer, the scm_timestamping control message and
|
||||
a sock_extended_err control message with ee_errno==ENOMSG and
|
||||
ee_origin==SO_EE_ORIGIN_TIMESTAMPING. A socket with such a pending
|
||||
bounced packet is ready for reading as far as select() is concerned.
|
||||
If the outgoing packet has to be fragmented, then only the first
|
||||
fragment is time stamped and returned to the sending socket.
|
||||
|
||||
All three values correspond to the same event in time, but were
|
||||
generated in different ways. Each of these values may be empty (= all
|
||||
zero), in which case no such value was available. If the application
|
||||
is not interested in some of these values, they can be left blank to
|
||||
avoid the potential overhead of calculating them.
|
||||
|
||||
systime is the value of the system time at that moment. This
|
||||
corresponds to the value also returned via SO_TIMESTAMP[NS]. If the
|
||||
time stamp was generated by hardware, then this field is
|
||||
empty. Otherwise it is filled in if SOF_TIMESTAMPING_SOFTWARE is
|
||||
set.
|
||||
|
||||
hwtimeraw is the original hardware time stamp. Filled in if
|
||||
SOF_TIMESTAMPING_RAW_HARDWARE is set. No assumptions about its
|
||||
relation to system time should be made.
|
||||
|
||||
hwtimetrans is the hardware time stamp transformed so that it
|
||||
corresponds as good as possible to system time. This correlation is
|
||||
not perfect; as a consequence, sorting packets received via different
|
||||
NICs by their hwtimetrans may differ from the order in which they were
|
||||
received. hwtimetrans may be non-monotonic even for the same NIC.
|
||||
Filled in if SOF_TIMESTAMPING_SYS_HARDWARE is set. Requires support
|
||||
by the network device and will be empty without that support.
|
||||
|
||||
|
||||
SIOCSHWTSTAMP:
|
||||
|
||||
Hardware time stamping must also be initialized for each device driver
|
||||
that is expected to do hardware time stamping. The parameter is:
|
||||
|
||||
struct hwtstamp_config {
|
||||
int flags; /* no flags defined right now, must be zero */
|
||||
int tx_type; /* HWTSTAMP_TX_* */
|
||||
int rx_filter; /* HWTSTAMP_FILTER_* */
|
||||
};
|
||||
|
||||
Desired behavior is passed into the kernel and to a specific device by
|
||||
calling ioctl(SIOCSHWTSTAMP) with a pointer to a struct ifreq whose
|
||||
ifr_data points to a struct hwtstamp_config. The tx_type and
|
||||
rx_filter are hints to the driver what it is expected to do. If
|
||||
the requested fine-grained filtering for incoming packets is not
|
||||
supported, the driver may time stamp more than just the requested types
|
||||
of packets.
|
||||
|
||||
A driver which supports hardware time stamping shall update the struct
|
||||
with the actual, possibly more permissive configuration. If the
|
||||
requested packets cannot be time stamped, then nothing should be
|
||||
changed and ERANGE shall be returned (in contrast to EINVAL, which
|
||||
indicates that SIOCSHWTSTAMP is not supported at all).
|
||||
|
||||
Only a processes with admin rights may change the configuration. User
|
||||
space is responsible to ensure that multiple processes don't interfere
|
||||
with each other and that the settings are reset.
|
||||
|
||||
/* possible values for hwtstamp_config->tx_type */
|
||||
enum {
|
||||
/*
|
||||
* no outgoing packet will need hardware time stamping;
|
||||
* should a packet arrive which asks for it, no hardware
|
||||
* time stamping will be done
|
||||
*/
|
||||
HWTSTAMP_TX_OFF,
|
||||
|
||||
/*
|
||||
* enables hardware time stamping for outgoing packets;
|
||||
* the sender of the packet decides which are to be
|
||||
* time stamped by setting SOF_TIMESTAMPING_TX_SOFTWARE
|
||||
* before sending the packet
|
||||
*/
|
||||
HWTSTAMP_TX_ON,
|
||||
};
|
||||
|
||||
/* possible values for hwtstamp_config->rx_filter */
|
||||
enum {
|
||||
/* time stamp no incoming packet at all */
|
||||
HWTSTAMP_FILTER_NONE,
|
||||
|
||||
/* time stamp any incoming packet */
|
||||
HWTSTAMP_FILTER_ALL,
|
||||
|
||||
/* return value: time stamp all packets requested plus some others */
|
||||
HWTSTAMP_FILTER_SOME,
|
||||
|
||||
/* PTP v1, UDP, any kind of event packet */
|
||||
HWTSTAMP_FILTER_PTP_V1_L4_EVENT,
|
||||
|
||||
...
|
||||
};
|
||||
|
||||
|
||||
DEVICE IMPLEMENTATION
|
||||
|
||||
A driver which supports hardware time stamping must support the
|
||||
SIOCSHWTSTAMP ioctl. Time stamps for received packets must be stored
|
||||
in the skb with skb_hwtstamp_set().
|
||||
|
||||
Time stamps for outgoing packets are to be generated as follows:
|
||||
- In hard_start_xmit(), check if skb_hwtstamp_check_tx_hardware()
|
||||
returns non-zero. If yes, then the driver is expected
|
||||
to do hardware time stamping.
|
||||
- If this is possible for the skb and requested, then declare
|
||||
that the driver is doing the time stamping by calling
|
||||
skb_hwtstamp_tx_in_progress(). A driver not supporting
|
||||
hardware time stamping doesn't do that. A driver must never
|
||||
touch sk_buff::tstamp! It is used to store how time stamping
|
||||
for an outgoing packets is to be done.
|
||||
- As soon as the driver has sent the packet and/or obtained a
|
||||
hardware time stamp for it, it passes the time stamp back by
|
||||
calling skb_hwtstamp_tx() with the original skb, the raw
|
||||
hardware time stamp and a handle to the device (necessary
|
||||
to convert the hardware time stamp to system time). If obtaining
|
||||
the hardware time stamp somehow fails, then the driver should
|
||||
not fall back to software time stamping. The rationale is that
|
||||
this would occur at a later time in the processing pipeline
|
||||
than other software time stamping and therefore could lead
|
||||
to unexpected deltas between time stamps.
|
||||
- If the driver did not call skb_hwtstamp_tx_in_progress(), then
|
||||
dev_hard_start_xmit() checks whether software time stamping
|
||||
is wanted as fallback and potentially generates the time stamp.
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user