You've already forked linux-apfs
mirror of
https://github.com/linux-apfs/linux-apfs.git
synced 2026-05-01 15:00:59 -07:00
Merge branch 'linus' into timers/core
Reason: Further posix_cpu_timer patches depend on mainline changes Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
This commit is contained in:
@@ -160,7 +160,7 @@ Description:
|
||||
match the driver to the device. For example:
|
||||
# echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id
|
||||
|
||||
What: /sys/bus/usb/device/.../avoid_reset
|
||||
What: /sys/bus/usb/device/.../avoid_reset_quirk
|
||||
Date: December 2009
|
||||
Contact: Oliver Neukum <oliver@neukum.org>
|
||||
Description:
|
||||
|
||||
@@ -107,10 +107,6 @@ void (*dev_config) (struct ata_port *, struct ata_device *);
|
||||
issue of SET FEATURES - XFER MODE, and prior to operation.
|
||||
</para>
|
||||
<para>
|
||||
Called by ata_device_add() after ata_dev_identify() determines
|
||||
a device is present.
|
||||
</para>
|
||||
<para>
|
||||
This entry may be specified as NULL in ata_port_operations.
|
||||
</para>
|
||||
|
||||
@@ -154,8 +150,8 @@ unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned in
|
||||
|
||||
<sect2><title>Taskfile read/write</title>
|
||||
<programlisting>
|
||||
void (*tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
|
||||
void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
|
||||
void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
|
||||
void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
|
||||
</programlisting>
|
||||
|
||||
<para>
|
||||
@@ -164,36 +160,35 @@ void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
|
||||
hardware registers / DMA buffers, to obtain the current set of
|
||||
taskfile register values.
|
||||
Most drivers for taskfile-based hardware (PIO or MMIO) use
|
||||
ata_tf_load() and ata_tf_read() for these hooks.
|
||||
ata_sff_tf_load() and ata_sff_tf_read() for these hooks.
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2><title>PIO data read/write</title>
|
||||
<programlisting>
|
||||
void (*data_xfer) (struct ata_device *, unsigned char *, unsigned int, int);
|
||||
void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int);
|
||||
</programlisting>
|
||||
|
||||
<para>
|
||||
All bmdma-style drivers must implement this hook. This is the low-level
|
||||
operation that actually copies the data bytes during a PIO data
|
||||
transfer.
|
||||
Typically the driver
|
||||
will choose one of ata_pio_data_xfer_noirq(), ata_pio_data_xfer(), or
|
||||
ata_mmio_data_xfer().
|
||||
Typically the driver will choose one of ata_sff_data_xfer_noirq(),
|
||||
ata_sff_data_xfer(), or ata_sff_data_xfer32().
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2><title>ATA command execute</title>
|
||||
<programlisting>
|
||||
void (*exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
|
||||
void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
|
||||
</programlisting>
|
||||
|
||||
<para>
|
||||
causes an ATA command, previously loaded with
|
||||
->tf_load(), to be initiated in hardware.
|
||||
Most drivers for taskfile-based hardware use ata_exec_command()
|
||||
Most drivers for taskfile-based hardware use ata_sff_exec_command()
|
||||
for this hook.
|
||||
</para>
|
||||
|
||||
@@ -218,8 +213,8 @@ command.
|
||||
|
||||
<sect2><title>Read specific ATA shadow registers</title>
|
||||
<programlisting>
|
||||
u8 (*check_status)(struct ata_port *ap);
|
||||
u8 (*check_altstatus)(struct ata_port *ap);
|
||||
u8 (*sff_check_status)(struct ata_port *ap);
|
||||
u8 (*sff_check_altstatus)(struct ata_port *ap);
|
||||
</programlisting>
|
||||
|
||||
<para>
|
||||
@@ -227,20 +222,14 @@ u8 (*check_altstatus)(struct ata_port *ap);
|
||||
hardware. On some hardware, reading the Status register has
|
||||
the side effect of clearing the interrupt condition.
|
||||
Most drivers for taskfile-based hardware use
|
||||
ata_check_status() for this hook.
|
||||
</para>
|
||||
<para>
|
||||
Note that because this is called from ata_device_add(), at
|
||||
least a dummy function that clears device interrupts must be
|
||||
provided for all drivers, even if the controller doesn't
|
||||
actually have a taskfile status register.
|
||||
ata_sff_check_status() for this hook.
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2><title>Select ATA device on bus</title>
|
||||
<programlisting>
|
||||
void (*dev_select)(struct ata_port *ap, unsigned int device);
|
||||
void (*sff_dev_select)(struct ata_port *ap, unsigned int device);
|
||||
</programlisting>
|
||||
|
||||
<para>
|
||||
@@ -251,9 +240,7 @@ void (*dev_select)(struct ata_port *ap, unsigned int device);
|
||||
</para>
|
||||
<para>
|
||||
Most drivers for taskfile-based hardware use
|
||||
ata_std_dev_select() for this hook. Controllers which do not
|
||||
support second drives on a port (such as SATA contollers) will
|
||||
use ata_noop_dev_select().
|
||||
ata_sff_dev_select() for this hook.
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
@@ -441,13 +428,13 @@ void (*irq_clear) (struct ata_port *);
|
||||
to struct ata_host_set.
|
||||
</para>
|
||||
<para>
|
||||
Most legacy IDE drivers use ata_interrupt() for the
|
||||
Most legacy IDE drivers use ata_sff_interrupt() for the
|
||||
irq_handler hook, which scans all ports in the host_set,
|
||||
determines which queued command was active (if any), and calls
|
||||
ata_host_intr(ap,qc).
|
||||
ata_sff_host_intr(ap,qc).
|
||||
</para>
|
||||
<para>
|
||||
Most legacy IDE drivers use ata_bmdma_irq_clear() for the
|
||||
Most legacy IDE drivers use ata_sff_irq_clear() for the
|
||||
irq_clear() hook, which simply clears the interrupt and error
|
||||
flags in the DMA status register.
|
||||
</para>
|
||||
@@ -496,10 +483,6 @@ void (*host_stop) (struct ata_host_set *host_set);
|
||||
data from port at this time.
|
||||
</para>
|
||||
<para>
|
||||
Many drivers use ata_port_stop() as this hook, which frees the
|
||||
PRD table.
|
||||
</para>
|
||||
<para>
|
||||
->host_stop() is called after all ->port_stop() calls
|
||||
have completed. The hook must finalize hardware shutdown, release DMA
|
||||
and other resources, etc.
|
||||
|
||||
@@ -16,6 +16,15 @@
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
<author>
|
||||
<firstname>William</firstname>
|
||||
<surname>Cohen</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>wcohen@redhat.com</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
</authorgroup>
|
||||
|
||||
<legalnotice>
|
||||
@@ -91,4 +100,8 @@
|
||||
!Iinclude/trace/events/signal.h
|
||||
</chapter>
|
||||
|
||||
<chapter id="block">
|
||||
<title>Block IO</title>
|
||||
!Iinclude/trace/events/block.h
|
||||
</chapter>
|
||||
</book>
|
||||
|
||||
+1
-1
@@ -234,7 +234,7 @@ process is as follows:
|
||||
Linus, usually the patches that have already been included in the
|
||||
-next kernel for a few weeks. The preferred way to submit big changes
|
||||
is using git (the kernel's source management tool, more information
|
||||
can be found at http://git.or.cz/) but plain patches are also just
|
||||
can be found at http://git-scm.com/) but plain patches are also just
|
||||
fine.
|
||||
- After two weeks a -rc1 kernel is released it is now possible to push
|
||||
only patches that do not include new features that could affect the
|
||||
|
||||
@@ -34,7 +34,7 @@ NMI handler.
|
||||
cpu = smp_processor_id();
|
||||
++nmi_count(cpu);
|
||||
|
||||
if (!rcu_dereference(nmi_callback)(regs, cpu))
|
||||
if (!rcu_dereference_sched(nmi_callback)(regs, cpu))
|
||||
default_do_nmi(regs);
|
||||
|
||||
nmi_exit();
|
||||
@@ -47,12 +47,13 @@ function pointer. If this handler returns zero, do_nmi() invokes the
|
||||
default_do_nmi() function to handle a machine-specific NMI. Finally,
|
||||
preemption is restored.
|
||||
|
||||
Strictly speaking, rcu_dereference() is not needed, since this code runs
|
||||
only on i386, which does not need rcu_dereference() anyway. However,
|
||||
it is a good documentation aid, particularly for anyone attempting to
|
||||
do something similar on Alpha.
|
||||
In theory, rcu_dereference_sched() is not needed, since this code runs
|
||||
only on i386, which in theory does not need rcu_dereference_sched()
|
||||
anyway. However, in practice it is a good documentation aid, particularly
|
||||
for anyone attempting to do something similar on Alpha or on systems
|
||||
with aggressive optimizing compilers.
|
||||
|
||||
Quick Quiz: Why might the rcu_dereference() be necessary on Alpha,
|
||||
Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha,
|
||||
given that the code referenced by the pointer is read-only?
|
||||
|
||||
|
||||
@@ -99,17 +100,21 @@ invoke irq_enter() and irq_exit() on NMI entry and exit, respectively.
|
||||
|
||||
Answer to Quick Quiz
|
||||
|
||||
Why might the rcu_dereference() be necessary on Alpha, given
|
||||
Why might the rcu_dereference_sched() be necessary on Alpha, given
|
||||
that the code referenced by the pointer is read-only?
|
||||
|
||||
Answer: The caller to set_nmi_callback() might well have
|
||||
initialized some data that is to be used by the
|
||||
new NMI handler. In this case, the rcu_dereference()
|
||||
would be needed, because otherwise a CPU that received
|
||||
an NMI just after the new handler was set might see
|
||||
the pointer to the new NMI handler, but the old
|
||||
pre-initialized version of the handler's data.
|
||||
initialized some data that is to be used by the new NMI
|
||||
handler. In this case, the rcu_dereference_sched() would
|
||||
be needed, because otherwise a CPU that received an NMI
|
||||
just after the new handler was set might see the pointer
|
||||
to the new NMI handler, but the old pre-initialized
|
||||
version of the handler's data.
|
||||
|
||||
More important, the rcu_dereference() makes it clear
|
||||
to someone reading the code that the pointer is being
|
||||
protected by RCU.
|
||||
This same sad story can happen on other CPUs when using
|
||||
a compiler with aggressive pointer-value speculation
|
||||
optimizations.
|
||||
|
||||
More important, the rcu_dereference_sched() makes it
|
||||
clear to someone reading the code that the pointer is
|
||||
being protected by RCU-sched.
|
||||
|
||||
@@ -260,7 +260,8 @@ over a rather long period of time, but improvements are always welcome!
|
||||
The reason that it is permissible to use RCU list-traversal
|
||||
primitives when the update-side lock is held is that doing so
|
||||
can be quite helpful in reducing code bloat when common code is
|
||||
shared between readers and updaters.
|
||||
shared between readers and updaters. Additional primitives
|
||||
are provided for this case, as discussed in lockdep.txt.
|
||||
|
||||
10. Conversely, if you are in an RCU read-side critical section,
|
||||
and you don't hold the appropriate update-side lock, you -must-
|
||||
@@ -344,8 +345,8 @@ over a rather long period of time, but improvements are always welcome!
|
||||
requiring SRCU's read-side deadlock immunity or low read-side
|
||||
realtime latency.
|
||||
|
||||
Note that, rcu_assign_pointer() and rcu_dereference() relate to
|
||||
SRCU just as they do to other forms of RCU.
|
||||
Note that, rcu_assign_pointer() relates to SRCU just as they do
|
||||
to other forms of RCU.
|
||||
|
||||
15. The whole point of call_rcu(), synchronize_rcu(), and friends
|
||||
is to wait until all pre-existing readers have finished before
|
||||
|
||||
@@ -32,9 +32,20 @@ checking of rcu_dereference() primitives:
|
||||
srcu_dereference(p, sp):
|
||||
Check for SRCU read-side critical section.
|
||||
rcu_dereference_check(p, c):
|
||||
Use explicit check expression "c".
|
||||
Use explicit check expression "c". This is useful in
|
||||
code that is invoked by both readers and updaters.
|
||||
rcu_dereference_raw(p)
|
||||
Don't check. (Use sparingly, if at all.)
|
||||
rcu_dereference_protected(p, c):
|
||||
Use explicit check expression "c", and omit all barriers
|
||||
and compiler constraints. This is useful when the data
|
||||
structure cannot change, for example, in code that is
|
||||
invoked only by updaters.
|
||||
rcu_access_pointer(p):
|
||||
Return the value of the pointer and omit all barriers,
|
||||
but retain the compiler constraints that prevent duplicating
|
||||
or coalescsing. This is useful when when testing the
|
||||
value of the pointer itself, for example, against NULL.
|
||||
|
||||
The rcu_dereference_check() check expression can be any boolean
|
||||
expression, but would normally include one of the rcu_read_lock_held()
|
||||
@@ -59,7 +70,20 @@ In case (1), the pointer is picked up in an RCU-safe manner for vanilla
|
||||
RCU read-side critical sections, in case (2) the ->file_lock prevents
|
||||
any change from taking place, and finally, in case (3) the current task
|
||||
is the only task accessing the file_struct, again preventing any change
|
||||
from taking place.
|
||||
from taking place. If the above statement was invoked only from updater
|
||||
code, it could instead be written as follows:
|
||||
|
||||
file = rcu_dereference_protected(fdt->fd[fd],
|
||||
lockdep_is_held(&files->file_lock) ||
|
||||
atomic_read(&files->count) == 1);
|
||||
|
||||
This would verify cases #2 and #3 above, and furthermore lockdep would
|
||||
complain if this was used in an RCU read-side critical section unless one
|
||||
of these two cases held. Because rcu_dereference_protected() omits all
|
||||
barriers and compiler constraints, it generates better code than do the
|
||||
other flavors of rcu_dereference(). On the other hand, it is illegal
|
||||
to use rcu_dereference_protected() if either the RCU-protected pointer
|
||||
or the RCU-protected data that it points to can change concurrently.
|
||||
|
||||
There are currently only "universal" versions of the rcu_assign_pointer()
|
||||
and RCU list-/tree-traversal primitives, which do not (yet) check for
|
||||
|
||||
@@ -840,6 +840,12 @@ SRCU: Initialization/cleanup
|
||||
init_srcu_struct
|
||||
cleanup_srcu_struct
|
||||
|
||||
All: lockdep-checked RCU-protected pointer access
|
||||
|
||||
rcu_dereference_check
|
||||
rcu_dereference_protected
|
||||
rcu_access_pointer
|
||||
|
||||
See the comment headers in the source code (or the docbook generated
|
||||
from them) for more information.
|
||||
|
||||
|
||||
@@ -1162,8 +1162,8 @@ where a driver received a request ala this before:
|
||||
|
||||
As mentioned, there is no virtual mapping of a bio. For DMA, this is
|
||||
not a problem as the driver probably never will need a virtual mapping.
|
||||
Instead it needs a bus mapping (pci_map_page for a single segment or
|
||||
use blk_rq_map_sg for scatter gather) to be able to ship it to the driver. For
|
||||
Instead it needs a bus mapping (dma_map_page for a single segment or
|
||||
use dma_map_sg for scatter gather) to be able to ship it to the driver. For
|
||||
PIO drivers (or drivers that need to revert to PIO transfer once in a
|
||||
while (IDE for example)), where the CPU is doing the actual data
|
||||
transfer a virtual mapping is needed. If the driver supports highmem I/O,
|
||||
|
||||
@@ -235,8 +235,7 @@ containing the following files describing that cgroup:
|
||||
- cgroup.procs: list of tgids in the cgroup. This list is not
|
||||
guaranteed to be sorted or free of duplicate tgids, and userspace
|
||||
should sort/uniquify the list if this property is required.
|
||||
Writing a tgid into this file moves all threads with that tgid into
|
||||
this cgroup.
|
||||
This is a read-only file, for now.
|
||||
- notify_on_release flag: run the release agent on exit?
|
||||
- release_agent: the path to use for release notifications (this file
|
||||
exists in the top cgroup only)
|
||||
|
||||
@@ -340,7 +340,7 @@ Note:
|
||||
5.3 swappiness
|
||||
Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
|
||||
|
||||
Following cgroups' swapiness can't be changed.
|
||||
Following cgroups' swappiness can't be changed.
|
||||
- root cgroup (uses /proc/sys/vm/swappiness).
|
||||
- a cgroup which uses hierarchy and it has child cgroup.
|
||||
- a cgroup which uses hierarchy and not the root of hierarchy.
|
||||
|
||||
@@ -0,0 +1,234 @@
|
||||
================
|
||||
CIRCULAR BUFFERS
|
||||
================
|
||||
|
||||
By: David Howells <dhowells@redhat.com>
|
||||
Paul E. McKenney <paulmck@linux.vnet.ibm.com>
|
||||
|
||||
|
||||
Linux provides a number of features that can be used to implement circular
|
||||
buffering. There are two sets of such features:
|
||||
|
||||
(1) Convenience functions for determining information about power-of-2 sized
|
||||
buffers.
|
||||
|
||||
(2) Memory barriers for when the producer and the consumer of objects in the
|
||||
buffer don't want to share a lock.
|
||||
|
||||
To use these facilities, as discussed below, there needs to be just one
|
||||
producer and just one consumer. It is possible to handle multiple producers by
|
||||
serialising them, and to handle multiple consumers by serialising them.
|
||||
|
||||
|
||||
Contents:
|
||||
|
||||
(*) What is a circular buffer?
|
||||
|
||||
(*) Measuring power-of-2 buffers.
|
||||
|
||||
(*) Using memory barriers with circular buffers.
|
||||
- The producer.
|
||||
- The consumer.
|
||||
|
||||
|
||||
==========================
|
||||
WHAT IS A CIRCULAR BUFFER?
|
||||
==========================
|
||||
|
||||
First of all, what is a circular buffer? A circular buffer is a buffer of
|
||||
fixed, finite size into which there are two indices:
|
||||
|
||||
(1) A 'head' index - the point at which the producer inserts items into the
|
||||
buffer.
|
||||
|
||||
(2) A 'tail' index - the point at which the consumer finds the next item in
|
||||
the buffer.
|
||||
|
||||
Typically when the tail pointer is equal to the head pointer, the buffer is
|
||||
empty; and the buffer is full when the head pointer is one less than the tail
|
||||
pointer.
|
||||
|
||||
The head index is incremented when items are added, and the tail index when
|
||||
items are removed. The tail index should never jump the head index, and both
|
||||
indices should be wrapped to 0 when they reach the end of the buffer, thus
|
||||
allowing an infinite amount of data to flow through the buffer.
|
||||
|
||||
Typically, items will all be of the same unit size, but this isn't strictly
|
||||
required to use the techniques below. The indices can be increased by more
|
||||
than 1 if multiple items or variable-sized items are to be included in the
|
||||
buffer, provided that neither index overtakes the other. The implementer must
|
||||
be careful, however, as a region more than one unit in size may wrap the end of
|
||||
the buffer and be broken into two segments.
|
||||
|
||||
|
||||
============================
|
||||
MEASURING POWER-OF-2 BUFFERS
|
||||
============================
|
||||
|
||||
Calculation of the occupancy or the remaining capacity of an arbitrarily sized
|
||||
circular buffer would normally be a slow operation, requiring the use of a
|
||||
modulus (divide) instruction. However, if the buffer is of a power-of-2 size,
|
||||
then a much quicker bitwise-AND instruction can be used instead.
|
||||
|
||||
Linux provides a set of macros for handling power-of-2 circular buffers. These
|
||||
can be made use of by:
|
||||
|
||||
#include <linux/circ_buf.h>
|
||||
|
||||
The macros are:
|
||||
|
||||
(*) Measure the remaining capacity of a buffer:
|
||||
|
||||
CIRC_SPACE(head_index, tail_index, buffer_size);
|
||||
|
||||
This returns the amount of space left in the buffer[1] into which items
|
||||
can be inserted.
|
||||
|
||||
|
||||
(*) Measure the maximum consecutive immediate space in a buffer:
|
||||
|
||||
CIRC_SPACE_TO_END(head_index, tail_index, buffer_size);
|
||||
|
||||
This returns the amount of consecutive space left in the buffer[1] into
|
||||
which items can be immediately inserted without having to wrap back to the
|
||||
beginning of the buffer.
|
||||
|
||||
|
||||
(*) Measure the occupancy of a buffer:
|
||||
|
||||
CIRC_CNT(head_index, tail_index, buffer_size);
|
||||
|
||||
This returns the number of items currently occupying a buffer[2].
|
||||
|
||||
|
||||
(*) Measure the non-wrapping occupancy of a buffer:
|
||||
|
||||
CIRC_CNT_TO_END(head_index, tail_index, buffer_size);
|
||||
|
||||
This returns the number of consecutive items[2] that can be extracted from
|
||||
the buffer without having to wrap back to the beginning of the buffer.
|
||||
|
||||
|
||||
Each of these macros will nominally return a value between 0 and buffer_size-1,
|
||||
however:
|
||||
|
||||
[1] CIRC_SPACE*() are intended to be used in the producer. To the producer
|
||||
they will return a lower bound as the producer controls the head index,
|
||||
but the consumer may still be depleting the buffer on another CPU and
|
||||
moving the tail index.
|
||||
|
||||
To the consumer it will show an upper bound as the producer may be busy
|
||||
depleting the space.
|
||||
|
||||
[2] CIRC_CNT*() are intended to be used in the consumer. To the consumer they
|
||||
will return a lower bound as the consumer controls the tail index, but the
|
||||
producer may still be filling the buffer on another CPU and moving the
|
||||
head index.
|
||||
|
||||
To the producer it will show an upper bound as the consumer may be busy
|
||||
emptying the buffer.
|
||||
|
||||
[3] To a third party, the order in which the writes to the indices by the
|
||||
producer and consumer become visible cannot be guaranteed as they are
|
||||
independent and may be made on different CPUs - so the result in such a
|
||||
situation will merely be a guess, and may even be negative.
|
||||
|
||||
|
||||
===========================================
|
||||
USING MEMORY BARRIERS WITH CIRCULAR BUFFERS
|
||||
===========================================
|
||||
|
||||
By using memory barriers in conjunction with circular buffers, you can avoid
|
||||
the need to:
|
||||
|
||||
(1) use a single lock to govern access to both ends of the buffer, thus
|
||||
allowing the buffer to be filled and emptied at the same time; and
|
||||
|
||||
(2) use atomic counter operations.
|
||||
|
||||
There are two sides to this: the producer that fills the buffer, and the
|
||||
consumer that empties it. Only one thing should be filling a buffer at any one
|
||||
time, and only one thing should be emptying a buffer at any one time, but the
|
||||
two sides can operate simultaneously.
|
||||
|
||||
|
||||
THE PRODUCER
|
||||
------------
|
||||
|
||||
The producer will look something like this:
|
||||
|
||||
spin_lock(&producer_lock);
|
||||
|
||||
unsigned long head = buffer->head;
|
||||
unsigned long tail = ACCESS_ONCE(buffer->tail);
|
||||
|
||||
if (CIRC_SPACE(head, tail, buffer->size) >= 1) {
|
||||
/* insert one item into the buffer */
|
||||
struct item *item = buffer[head];
|
||||
|
||||
produce_item(item);
|
||||
|
||||
smp_wmb(); /* commit the item before incrementing the head */
|
||||
|
||||
buffer->head = (head + 1) & (buffer->size - 1);
|
||||
|
||||
/* wake_up() will make sure that the head is committed before
|
||||
* waking anyone up */
|
||||
wake_up(consumer);
|
||||
}
|
||||
|
||||
spin_unlock(&producer_lock);
|
||||
|
||||
This will instruct the CPU that the contents of the new item must be written
|
||||
before the head index makes it available to the consumer and then instructs the
|
||||
CPU that the revised head index must be written before the consumer is woken.
|
||||
|
||||
Note that wake_up() doesn't have to be the exact mechanism used, but whatever
|
||||
is used must guarantee a (write) memory barrier between the update of the head
|
||||
index and the change of state of the consumer, if a change of state occurs.
|
||||
|
||||
|
||||
THE CONSUMER
|
||||
------------
|
||||
|
||||
The consumer will look something like this:
|
||||
|
||||
spin_lock(&consumer_lock);
|
||||
|
||||
unsigned long head = ACCESS_ONCE(buffer->head);
|
||||
unsigned long tail = buffer->tail;
|
||||
|
||||
if (CIRC_CNT(head, tail, buffer->size) >= 1) {
|
||||
/* read index before reading contents at that index */
|
||||
smp_read_barrier_depends();
|
||||
|
||||
/* extract one item from the buffer */
|
||||
struct item *item = buffer[tail];
|
||||
|
||||
consume_item(item);
|
||||
|
||||
smp_mb(); /* finish reading descriptor before incrementing tail */
|
||||
|
||||
buffer->tail = (tail + 1) & (buffer->size - 1);
|
||||
}
|
||||
|
||||
spin_unlock(&consumer_lock);
|
||||
|
||||
This will instruct the CPU to make sure the index is up to date before reading
|
||||
the new item, and then it shall make sure the CPU has finished reading the item
|
||||
before it writes the new tail pointer, which will erase the item.
|
||||
|
||||
|
||||
Note the use of ACCESS_ONCE() in both algorithms to read the opposition index.
|
||||
This prevents the compiler from discarding and reloading its cached value -
|
||||
which some compilers will do across smp_read_barrier_depends(). This isn't
|
||||
strictly needed if you can be sure that the opposition index will _only_ be
|
||||
used the once.
|
||||
|
||||
|
||||
===============
|
||||
FURTHER READING
|
||||
===============
|
||||
|
||||
See also Documentation/memory-barriers.txt for a description of Linux's memory
|
||||
barrier facilities.
|
||||
@@ -25,6 +25,7 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/timer.h>
|
||||
|
||||
#include <linux/connector.h>
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
|
||||
What is imacfb?
|
||||
What is efifb?
|
||||
===============
|
||||
|
||||
This is a generic EFI platform driver for Intel based Apple computers.
|
||||
Imacfb is only for EFI booted Intel Macs.
|
||||
efifb is only for EFI booted Intel Macs.
|
||||
|
||||
Supported Hardware
|
||||
==================
|
||||
@@ -16,16 +16,16 @@ MacMini
|
||||
How to use it?
|
||||
==============
|
||||
|
||||
Imacfb does not have any kind of autodetection of your machine.
|
||||
efifb does not have any kind of autodetection of your machine.
|
||||
You have to add the following kernel parameters in your elilo.conf:
|
||||
Macbook :
|
||||
video=imacfb:macbook
|
||||
video=efifb:macbook
|
||||
MacMini :
|
||||
video=imacfb:mini
|
||||
video=efifb:mini
|
||||
Macbook Pro 15", iMac 17" :
|
||||
video=imacfb:i17
|
||||
video=efifb:i17
|
||||
Macbook Pro 17", iMac 20" :
|
||||
video=imacfb:i20
|
||||
video=efifb:i20
|
||||
|
||||
--
|
||||
Edgar Hucek <gimli@dark-green.com>
|
||||
@@ -599,3 +599,26 @@ Why: Useful in 2003, implementation is a hack.
|
||||
Generally invoked by accident today.
|
||||
Seen as doing more harm than good.
|
||||
Who: Len Brown <len.brown@intel.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: video4linux /dev/vtx teletext API support
|
||||
When: 2.6.35
|
||||
Files: drivers/media/video/saa5246a.c drivers/media/video/saa5249.c
|
||||
include/linux/videotext.h
|
||||
Why: The vtx device nodes have been superseded by vbi device nodes
|
||||
for many years. No applications exist that use the vtx support.
|
||||
Of the two i2c drivers that actually support this API the saa5249
|
||||
has been impossible to use for a year now and no known hardware
|
||||
that supports this device exists. The saa5246a is theoretically
|
||||
supported by the old mxb boards, but it never actually worked.
|
||||
|
||||
In summary: there is no hardware that can use this API and there
|
||||
are no applications actually implementing this API.
|
||||
|
||||
The vtx support still reserves minors 192-223 and we would really
|
||||
like to reuse those for upcoming new functionality. In the unlikely
|
||||
event that new hardware appears that wants to use the functionality
|
||||
provided by the vtx API, then that functionality should be build
|
||||
around the sliced VBI API instead.
|
||||
Who: Hans Verkuil <hverkuil@xs4all.nl>
|
||||
|
||||
@@ -16,6 +16,8 @@ befs.txt
|
||||
- information about the BeOS filesystem for Linux.
|
||||
bfs.txt
|
||||
- info for the SCO UnixWare Boot Filesystem (BFS).
|
||||
ceph.txt
|
||||
- info for the Ceph Distributed File System
|
||||
cifs.txt
|
||||
- description of the CIFS filesystem.
|
||||
coda.txt
|
||||
|
||||
@@ -37,6 +37,15 @@ For Plan 9 From User Space applications (http://swtch.com/plan9)
|
||||
|
||||
mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER
|
||||
|
||||
For server running on QEMU host with virtio transport:
|
||||
|
||||
mount -t 9p -o trans=virtio <mount_tag> /mnt/9
|
||||
|
||||
where mount_tag is the tag associated by the server to each of the exported
|
||||
mount points. Each 9P export is seen by the client as a virtio device with an
|
||||
associated "mount_tag" property. Available mount tags can be
|
||||
seen by reading /sys/bus/virtio/drivers/9pnet_virtio/virtio<n>/mount_tag files.
|
||||
|
||||
OPTIONS
|
||||
=======
|
||||
|
||||
@@ -47,7 +56,7 @@ OPTIONS
|
||||
fd - used passed file descriptors for connection
|
||||
(see rfdno and wfdno)
|
||||
virtio - connect to the next virtio channel available
|
||||
(from lguest or KVM with trans_virtio module)
|
||||
(from QEMU with trans_virtio module)
|
||||
rdma - connect to a specified RDMA channel
|
||||
|
||||
uname=name user name to attempt mount as on the remote server. The
|
||||
@@ -85,7 +94,12 @@ OPTIONS
|
||||
|
||||
port=n port to connect to on the remote server
|
||||
|
||||
noextend force legacy mode (no 9p2000.u semantics)
|
||||
noextend force legacy mode (no 9p2000.u or 9p2000.L semantics)
|
||||
|
||||
version=name Select 9P protocol version. Valid options are:
|
||||
9p2000 - Legacy mode (same as noextend)
|
||||
9p2000.u - Use 9P2000.u protocol
|
||||
9p2000.L - Use 9P2000.L protocol
|
||||
|
||||
dfltuid attempt to mount as a particular uid
|
||||
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
Ceph Distributed File System
|
||||
============================
|
||||
|
||||
Ceph is a distributed network file system designed to provide good
|
||||
performance, reliability, and scalability.
|
||||
|
||||
Basic features include:
|
||||
|
||||
* POSIX semantics
|
||||
* Seamless scaling from 1 to many thousands of nodes
|
||||
* High availability and reliability. No single point of failure.
|
||||
* N-way replication of data across storage nodes
|
||||
* Fast recovery from node failures
|
||||
* Automatic rebalancing of data on node addition/removal
|
||||
* Easy deployment: most FS components are userspace daemons
|
||||
|
||||
Also,
|
||||
* Flexible snapshots (on any directory)
|
||||
* Recursive accounting (nested files, directories, bytes)
|
||||
|
||||
In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
|
||||
on symmetric access by all clients to shared block devices, Ceph
|
||||
separates data and metadata management into independent server
|
||||
clusters, similar to Lustre. Unlike Lustre, however, metadata and
|
||||
storage nodes run entirely as user space daemons. Storage nodes
|
||||
utilize btrfs to store data objects, leveraging its advanced features
|
||||
(checksumming, metadata replication, etc.). File data is striped
|
||||
across storage nodes in large chunks to distribute workload and
|
||||
facilitate high throughputs. When storage nodes fail, data is
|
||||
re-replicated in a distributed fashion by the storage nodes themselves
|
||||
(with some minimal coordination from a cluster monitor), making the
|
||||
system extremely efficient and scalable.
|
||||
|
||||
Metadata servers effectively form a large, consistent, distributed
|
||||
in-memory cache above the file namespace that is extremely scalable,
|
||||
dynamically redistributes metadata in response to workload changes,
|
||||
and can tolerate arbitrary (well, non-Byzantine) node failures. The
|
||||
metadata server takes a somewhat unconventional approach to metadata
|
||||
storage to significantly improve performance for common workloads. In
|
||||
particular, inodes with only a single link are embedded in
|
||||
directories, allowing entire directories of dentries and inodes to be
|
||||
loaded into its cache with a single I/O operation. The contents of
|
||||
extremely large directories can be fragmented and managed by
|
||||
independent metadata servers, allowing scalable concurrent access.
|
||||
|
||||
The system offers automatic data rebalancing/migration when scaling
|
||||
from a small cluster of just a few nodes to many hundreds, without
|
||||
requiring an administrator carve the data set into static volumes or
|
||||
go through the tedious process of migrating data between servers.
|
||||
When the file system approaches full, new nodes can be easily added
|
||||
and things will "just work."
|
||||
|
||||
Ceph includes flexible snapshot mechanism that allows a user to create
|
||||
a snapshot on any subdirectory (and its nested contents) in the
|
||||
system. Snapshot creation and deletion are as simple as 'mkdir
|
||||
.snap/foo' and 'rmdir .snap/foo'.
|
||||
|
||||
Ceph also provides some recursive accounting on directories for nested
|
||||
files and bytes. That is, a 'getfattr -d foo' on any directory in the
|
||||
system will reveal the total number of nested regular files and
|
||||
subdirectories, and a summation of all nested file sizes. This makes
|
||||
the identification of large disk space consumers relatively quick, as
|
||||
no 'du' or similar recursive scan of the file system is required.
|
||||
|
||||
|
||||
Mount Syntax
|
||||
============
|
||||
|
||||
The basic mount syntax is:
|
||||
|
||||
# mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
|
||||
|
||||
You only need to specify a single monitor, as the client will get the
|
||||
full list when it connects. (However, if the monitor you specify
|
||||
happens to be down, the mount won't succeed.) The port can be left
|
||||
off if the monitor is using the default. So if the monitor is at
|
||||
1.2.3.4,
|
||||
|
||||
# mount -t ceph 1.2.3.4:/ /mnt/ceph
|
||||
|
||||
is sufficient. If /sbin/mount.ceph is installed, a hostname can be
|
||||
used instead of an IP address.
|
||||
|
||||
|
||||
|
||||
Mount Options
|
||||
=============
|
||||
|
||||
ip=A.B.C.D[:N]
|
||||
Specify the IP and/or port the client should bind to locally.
|
||||
There is normally not much reason to do this. If the IP is not
|
||||
specified, the client's IP address is determined by looking at the
|
||||
address it's connection to the monitor originates from.
|
||||
|
||||
wsize=X
|
||||
Specify the maximum write size in bytes. By default there is no
|
||||
maximum. Ceph will normally size writes based on the file stripe
|
||||
size.
|
||||
|
||||
rsize=X
|
||||
Specify the maximum readahead.
|
||||
|
||||
mount_timeout=X
|
||||
Specify the timeout value for mount (in seconds), in the case
|
||||
of a non-responsive Ceph file system. The default is 30
|
||||
seconds.
|
||||
|
||||
rbytes
|
||||
When stat() is called on a directory, set st_size to 'rbytes',
|
||||
the summation of file sizes over all files nested beneath that
|
||||
directory. This is the default.
|
||||
|
||||
norbytes
|
||||
When stat() is called on a directory, set st_size to the
|
||||
number of entries in that directory.
|
||||
|
||||
nocrc
|
||||
Disable CRC32C calculation for data writes. If set, the storage node
|
||||
must rely on TCP's error correction to detect data corruption
|
||||
in the data payload.
|
||||
|
||||
noasyncreaddir
|
||||
Disable client's use its local cache to satisfy readdir
|
||||
requests. (This does not change correctness; the client uses
|
||||
cached metadata only when a lease or capability ensures it is
|
||||
valid.)
|
||||
|
||||
|
||||
More Information
|
||||
================
|
||||
|
||||
For more information on Ceph, see the home page at
|
||||
http://ceph.newdream.net/
|
||||
|
||||
The Linux kernel client source tree is available at
|
||||
git://ceph.newdream.net/git/ceph-client.git
|
||||
git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
|
||||
|
||||
and the source for the full system is at
|
||||
git://ceph.newdream.net/git/ceph.git
|
||||
@@ -82,11 +82,13 @@ tmpfs has a mount option to set the NUMA memory allocation policy for
|
||||
all files in that instance (if CONFIG_NUMA is enabled) - which can be
|
||||
adjusted on the fly via 'mount -o remount ...'
|
||||
|
||||
mpol=default prefers to allocate memory from the local node
|
||||
mpol=default use the process allocation policy
|
||||
(see set_mempolicy(2))
|
||||
mpol=prefer:Node prefers to allocate memory from the given Node
|
||||
mpol=bind:NodeList allocates memory only from nodes in NodeList
|
||||
mpol=interleave prefers to allocate from each node in turn
|
||||
mpol=interleave:NodeList allocates from each node of NodeList in turn
|
||||
mpol=local prefers to allocate memory from the local node
|
||||
|
||||
NodeList format is a comma-separated list of decimal numbers and ranges,
|
||||
a range being two hyphen-separated decimal numbers, the smallest and
|
||||
@@ -134,3 +136,5 @@ Author:
|
||||
Christoph Rohland <cr@sap.com>, 1.12.01
|
||||
Updated:
|
||||
Hugh Dickins, 4 June 2007
|
||||
Updated:
|
||||
KOSAKI Motohiro, 16 Mar 2010
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user