Merge branch 'for-3.10/drivers' of git://git.kernel.dk/linux-block

Pull block driver updates from Jens Axboe:
 "It might look big in volume, but when categorized, not a lot of
  drivers are touched. The pull request contains:

   - mtip32xx fixes from Micron.
   - A slew of drbd updates, this time in a nicer series.
   - bcache, a flash/ssd caching framework from Kent.
   - Fixes for cciss"

* 'for-3.10/drivers' of git://git.kernel.dk/linux-block: (66 commits)
  bcache: Use bd_link_disk_holder()
  bcache: Allocator cleanup/fixes
  cciss: bug fix to prevent cciss from loading in kdump crash kernel
  cciss: add cciss_allow_hpsa module parameter
  drivers/block/mg_disk.c: add CONFIG_PM_SLEEP to suspend/resume functions
  mtip32xx: Workaround for unaligned writes
  bcache: Make sure blocksize isn't smaller than device blocksize
  bcache: Fix merge_bvec_fn usage for when it modifies the bvm
  bcache: Correctly check against BIO_MAX_PAGES
  bcache: Hack around stuff that clones up to bi_max_vecs
  bcache: Set ra_pages based on backing device's ra_pages
  bcache: Take data offset from the bdev superblock.
  mtip32xx: mtip32xx: Disable TRIM support
  mtip32xx: fix a smatch warning
  bcache: Disable broken btree fuzz tester
  bcache: Fix a format string overflow
  bcache: Fix a minor memory leak on device teardown
  bcache: Documentation updates
  bcache: Use WARN_ONCE() instead of __WARN()
  bcache: Add missing #include <linux/prefetch.h>
  ...
@@ -0,0 +1,156 @@
What:		/sys/block/<disk>/bcache/unregister
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		A write to this file causes the backing device or cache to be
		unregistered. If a backing device had dirty data in the cache,
		writeback mode is automatically disabled and all dirty data is
		flushed before the device is unregistered. Caches unregister
		all associated backing devices before unregistering themselves.

What:		/sys/block/<disk>/bcache/clear_stats
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		Writing to this file resets all the statistics for the device.

What:		/sys/block/<disk>/bcache/cache
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For a backing device that has cache, a symlink to
		the bcache/ dir of that cache.

What:		/sys/block/<disk>/bcache/cache_hits
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For backing devices: integer number of full cache hits,
		counted per bio. A partial cache hit counts as a miss.

What:		/sys/block/<disk>/bcache/cache_misses
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For backing devices: integer number of cache misses.

What:		/sys/block/<disk>/bcache/cache_hit_ratio
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For backing devices: cache hits as a percentage.

What:		/sys/block/<disk>/bcache/sequential_cutoff
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For backing devices: Threshold past which sequential IO will
		skip the cache. Read and written as bytes in human readable
		units (i.e. echo 10M > sequential_cutoff).

What:		/sys/block/<disk>/bcache/bypassed
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		Sum of all reads and writes that have bypassed the cache (due
		to the sequential cutoff). Expressed as bytes in human
		readable units.

What:		/sys/block/<disk>/bcache/writeback
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For backing devices: When on, writeback caching is enabled and
		writes will be buffered in the cache. When off, caching is in
		writethrough mode; reads and writes will be added to the
		cache but no write buffering will take place.

What:		/sys/block/<disk>/bcache/writeback_running
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For backing devices: when off, dirty data will not be written
		from the cache to the backing device. The cache will still be
		used to buffer writes until it is mostly full, at which point
		writes transparently revert to writethrough mode. Intended only
		for benchmarking/testing.

What:		/sys/block/<disk>/bcache/writeback_delay
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For backing devices: In writeback mode, when dirty data is
		written to the cache and the cache held no dirty data for that
		backing device, writeback from cache to backing device starts
		after this delay, expressed as an integer number of seconds.

What:		/sys/block/<disk>/bcache/writeback_percent
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For backing devices: If nonzero, writeback from cache to
		backing device only takes place when more than this percentage
		of the cache is used, allowing more write coalescing to take
		place and reducing total number of writes sent to the backing
		device. Integer between 0 and 40.

What:		/sys/block/<disk>/bcache/synchronous
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For a cache, a boolean that allows synchronous mode to be
		switched on and off. In synchronous mode all writes are ordered
		such that the cache can reliably recover from unclean shutdown;
		if disabled bcache will not generally wait for writes to
		complete but if the cache is not shut down cleanly all data
		will be discarded from the cache. Should not be turned off with
		writeback caching enabled.

What:		/sys/block/<disk>/bcache/discard
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For a cache, a boolean allowing discard/TRIM to be turned off
		or back on if the device supports it.

What:		/sys/block/<disk>/bcache/bucket_size
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For a cache, bucket size in human readable units, as set at
		cache creation time; should match the erase block size of the
		SSD for optimal performance.

What:		/sys/block/<disk>/bcache/nbuckets
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For a cache, the number of usable buckets.

What:		/sys/block/<disk>/bcache/tree_depth
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For a cache, height of the btree excluding leaf nodes (i.e. a
		one node tree will have a depth of 0).

What:		/sys/block/<disk>/bcache/btree_cache_size
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		Number of btree buckets/nodes that are currently cached in
		memory; cache dynamically grows and shrinks in response to
		memory pressure from the rest of the system.

What:		/sys/block/<disk>/bcache/written
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For a cache, total amount of data in human readable units
		written to the cache, excluding all metadata.

What:		/sys/block/<disk>/bcache/btree_written
Date:		November 2010
Contact:	Kent Overstreet <kent.overstreet@gmail.com>
Description:
		For a cache, sum of all btree writes in human readable units.
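
A quick sketch of exercising the interface described above from a shell,
assuming the backing device registered as bcache0:

  cat /sys/block/bcache0/bcache/cache_hit_ratio
  echo 10M > /sys/block/bcache0/bcache/sequential_cutoff
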
@@ -0,0 +1,431 @@
Say you've got a big slow raid 6, and an X-25E or three. Wouldn't it be
nice if you could use them as cache... Hence bcache.

Wiki and git repositories are at:
  http://bcache.evilpiepirate.org
  http://evilpiepirate.org/git/linux-bcache.git
  http://evilpiepirate.org/git/bcache-tools.git

It's designed around the performance characteristics of SSDs - it only allocates
in erase block sized buckets, and it uses a hybrid btree/log to track cached
extents (which can be anywhere from a single sector to the bucket size). It's
designed to avoid random writes at all costs; it fills up an erase block
sequentially, then issues a discard before reusing it.

Both writethrough and writeback caching are supported. Writeback defaults to
off, but can be switched on and off arbitrarily at runtime. Bcache goes to
great lengths to protect your data - it reliably handles unclean shutdown. (It
doesn't even have a notion of a clean shutdown; bcache simply doesn't return
writes as completed until they're on stable storage).

Writeback caching can use most of the cache for buffering writes - writing
dirty data to the backing device is always done sequentially, scanning from the
start to the end of the index.

Since random IO is what SSDs excel at, there generally won't be much benefit
to caching large sequential IO. Bcache detects sequential IO and skips it;
it also keeps a rolling average of the IO sizes per task, and as long as the
average is above the cutoff it will skip all IO from that task - instead of
caching the first 512k after every seek. Backups and large file copies should
thus entirely bypass the cache.

In the event of a data IO error on the flash it will try to recover by reading
from disk or invalidating cache entries. For unrecoverable errors (meta data
or dirty data), caching is automatically disabled; if dirty data was present
in the cache it first disables writeback caching and waits for all dirty data
to be flushed.

Getting started:
You'll need make-bcache from the bcache-tools repository. Both the cache device
and backing device must be formatted before use.

  make-bcache -B /dev/sdb
  make-bcache -C /dev/sdc

make-bcache has the ability to format multiple devices at the same time - if
you format your backing devices and cache device at the same time, you won't
have to manually attach:

  make-bcache -B /dev/sda /dev/sdb -C /dev/sdc

To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register:

  echo /dev/sdb > /sys/fs/bcache/register
  echo /dev/sdc > /sys/fs/bcache/register

To register your bcache devices automatically, you could add something like
this to an init script:

  echo /dev/sd* > /sys/fs/bcache/register_quiet

It'll look for bcache superblocks and ignore everything that doesn't have one.

Registering the backing device makes the bcache show up in /dev; you can now
format it and use it as normal. But the first time using a new bcache device,
it'll be running in passthrough mode until you attach it to a cache. See the
section on attaching.

The devices show up at /dev/bcacheN, and can be controlled via sysfs from
/sys/block/bcacheN/bcache:

  mkfs.ext4 /dev/bcache0
  mount /dev/bcache0 /mnt

Cache devices are managed as sets; multiple caches per set isn't supported yet
but will allow for mirroring of metadata and dirty data in the future. Your new
cache set shows up as /sys/fs/bcache/<UUID>.

ATTACHING:

After your cache device and backing device are registered, the backing device
must be attached to your cache set to enable caching. Attaching a backing
device to a cache set is done thusly, with the UUID of the cache set in
/sys/fs/bcache:

  echo <UUID> > /sys/block/bcache0/bcache/attach

This only has to be done once. The next time you reboot, just reregister all
your bcache devices. If a backing device has data in a cache somewhere, the
/dev/bcache# device won't be created until the cache shows up - particularly
important if you have writeback caching turned on.

If you're booting up and your cache device is gone and never coming back, you
can force run the backing device:

  echo 1 > /sys/block/sdb/bcache/running

(You need to use /sys/block/sdb (or whatever your backing device is called), not
/sys/block/bcache0, because bcache0 doesn't exist yet. If you're using a
partition, the bcache directory would be at /sys/block/sdb/sdb2/bcache.)

The backing device will still use that cache set if it shows up in the future,
but all the cached data will be invalidated. If there was dirty data in the
cache, don't expect the filesystem to be recoverable - you will have massive
filesystem corruption, though ext4's fsck does work miracles.

ERROR HANDLING:

Bcache tries to transparently handle IO errors to/from the cache device without
affecting normal operation; if it sees too many errors (the threshold is
configurable, and defaults to 0) it shuts down the cache device and switches all
the backing devices to passthrough mode.

 - For reads from the cache, if they error we just retry the read from the
   backing device.

 - For writethrough writes, if the write to the cache errors we just switch to
   invalidating the data at that lba in the cache (i.e. the same thing we do for
   a write that bypasses the cache)

 - For writeback writes, we currently pass that error back up to the
   filesystem/userspace. This could be improved - we could retry it as a write
   that skips the cache so we don't have to error the write.

 - When we detach, we first try to flush any dirty data (if we were running in
   writeback mode). It currently doesn't do anything intelligent if it fails to
   read some of the dirty data, though.
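
The error threshold mentioned above is exposed through the cache set's
io_error_limit file (described under SYSFS - CACHE SET below). A sketch of
loosening it, with <UUID> standing in for your cache set's directory:

  echo 16 > /sys/fs/bcache/<UUID>/io_error_limit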

TROUBLESHOOTING PERFORMANCE:

Bcache has a bunch of config options and tunables. The defaults are intended to
be reasonable for typical desktop and server workloads, but they're not what you
want for getting the best possible numbers when benchmarking.

 - Bad write performance

   If write performance is not what you expected, you probably wanted to be
   running in writeback mode, which isn't the default (not due to a lack of
   maturity, but simply because in writeback mode you'll lose data if something
   happens to your SSD):

   # echo writeback > /sys/block/bcache0/bcache/cache_mode

 - Bad performance, or traffic not going to the SSD that you'd expect

   By default, bcache doesn't cache everything. It tries to skip sequential IO -
   because you really want to be caching the random IO, and if you copy a 10
   gigabyte file you probably don't want that pushing 10 gigabytes of randomly
   accessed data out of your cache.

   But if you want to benchmark reads from cache, and you start out with fio
   writing an 8 gigabyte test file, you'll want to disable that:

   # echo 0 > /sys/block/bcache0/bcache/sequential_cutoff

   To set it back to the default (4 MB), do:

   # echo 4M > /sys/block/bcache0/bcache/sequential_cutoff

 - Traffic's still going to the spindle/still getting cache misses

   In the real world, SSDs don't always keep up with disks - particularly with
   slower SSDs, many disks being cached by one SSD, or mostly sequential IO. So
   you want to avoid being bottlenecked by the SSD and having it slow everything
   down.

   To avoid that bcache tracks latency to the cache device, and gradually
   throttles traffic if the latency exceeds a threshold (it does this by
   cranking down the sequential bypass).

   You can disable this if you need to by setting the thresholds to 0:

   # echo 0 > /sys/fs/bcache/<cache set>/congested_read_threshold_us
   # echo 0 > /sys/fs/bcache/<cache set>/congested_write_threshold_us

   The default is 2000 us (2 milliseconds) for reads, and 20000 for writes.

 - Still getting cache misses, of the same data

   One last issue that sometimes trips people up is actually an old bug, due to
   the way cache coherency is handled for cache misses. If a btree node is full,
   a cache miss won't be able to insert a key for the new data and the data
   won't be written to the cache.

   In practice this isn't an issue because as soon as a write comes along it'll
   cause the btree node to be split, and you need almost no write traffic for
   this to not show up enough to be noticeable (especially since bcache's btree
   nodes are huge and index large regions of the device). But when you're
   benchmarking, if you're trying to warm the cache by reading a bunch of data
   and there's no other traffic - that can be a problem.

   Solution: warm the cache by doing writes, or use the testing branch (there's
   a fix for the issue there).

SYSFS - BACKING DEVICE:

attach
  Echo the UUID of a cache set to this file to enable caching.

cache_mode
  Can be one of either writethrough, writeback, writearound or none.

clear_stats
  Writing to this file resets the running total stats (not the day/hour/5 minute
  decaying versions).

detach
  Write to this file to detach from a cache set. If there is dirty data in the
  cache, it will be flushed first.

dirty_data
  Amount of dirty data for this backing device in the cache. Continuously
  updated unlike the cache set's version, but may be slightly off.

label
  Name of underlying device.

readahead
  Size of readahead that should be performed. Defaults to 0. If set to e.g.
  1M, it will round cache miss reads up to that size, but without overlapping
  existing cache entries.

running
  1 if bcache is running (i.e. whether the /dev/bcache device exists, whether
  it's in passthrough mode or caching).

sequential_cutoff
  A sequential IO will bypass the cache once it passes this threshold; the
  most recent 128 IOs are tracked so sequential IO can be detected even when
  it isn't all done at once.

sequential_merge
  If non zero, bcache keeps a list of the last 128 requests submitted to compare
  against all new requests to determine which new requests are sequential
  continuations of previous requests for the purpose of determining sequential
  cutoff. This is necessary if the sequential cutoff value is greater than the
  maximum acceptable sequential size for any single request.

state
  The backing device can be in one of four different states:

  no cache: Has never been attached to a cache set.

  clean: Part of a cache set, and there is no cached dirty data.

  dirty: Part of a cache set, and there is cached dirty data.

  inconsistent: The backing device was forcibly run by the user when there was
  dirty data cached but the cache set was unavailable; whatever data was on the
  backing device has likely been corrupted.

stop
  Write to this file to shut down the bcache device and close the backing
  device.

writeback_delay
  When dirty data is written to the cache and it previously did not contain
  any, waits some number of seconds before initiating writeback. Defaults to
  30.

writeback_percent
  If nonzero, bcache tries to keep around this percentage of the cache dirty by
  throttling background writeback and using a PD controller to smoothly adjust
  the rate.

writeback_rate
  Rate in sectors per second - if writeback_percent is nonzero, background
  writeback is throttled to this rate. Continuously adjusted by bcache but may
  also be set by the user.

writeback_running
  If off, writeback of dirty data will not take place at all. Dirty data will
  still be added to the cache until it is mostly full; only meant for
  benchmarking. Defaults to on.
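
To get a feel for these knobs in practice, a sketch (assuming the device
registered as bcache0):

  cat /sys/block/bcache0/bcache/state
  echo 10 > /sys/block/bcache0/bcache/writeback_percent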

SYSFS - BACKING DEVICE STATS:

There are directories with these numbers for a running total, as well as
versions that decay over the past day, hour and 5 minutes; they're also
aggregated in the cache set directory.

bypassed
  Amount of IO (both reads and writes) that has bypassed the cache.

cache_hits
cache_misses
cache_hit_ratio
  Hits and misses are counted per individual IO as bcache sees them; a
  partial hit is counted as a miss.

cache_bypass_hits
cache_bypass_misses
  Hits and misses for IO that is intended to skip the cache are still counted,
  but broken out here.

cache_miss_collisions
  Counts instances where data was going to be inserted into the cache from a
  cache miss, but raced with a write and data was already present (usually 0
  since the synchronization for cache misses was rewritten).

cache_readaheads
  Count of times readahead occurred.
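
A sketch of dumping the running totals in one go; the stats_total directory
name is an assumption - the text above doesn't name the directories:

  grep . /sys/block/bcache0/bcache/stats_total/*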

SYSFS - CACHE SET:

average_key_size
  Average data per key in the btree.

bdev<0..n>
  Symlink to each of the attached backing devices.

block_size
  Block size of the cache devices.

btree_cache_size
  Amount of memory currently used by the btree cache.

bucket_size
  Size of buckets.

cache<0..n>
  Symlink to each of the cache devices comprising this cache set.

cache_available_percent
  Percentage of cache device free.

clear_stats
  Clears the statistics associated with this cache.

dirty_data
  Amount of dirty data in the cache (updated when garbage collection runs).

flash_vol_create
  Echoing a size to this file (in human readable units, k/M/G) creates a thinly
  provisioned volume backed by the cache set (see the sketch after this list).

io_error_halflife
io_error_limit
  These determine how many errors we accept before disabling the cache.
  Each error is decayed by the half life (in # ios). If the decaying count
  reaches io_error_limit, dirty data is written out and the cache is disabled.

journal_delay_ms
  Journal writes will delay for up to this many milliseconds, unless a cache
  flush happens sooner. Defaults to 100.

root_usage_percent
  Percentage of the root btree node in use. If this gets too high the node
  will split, increasing the tree depth.

stop
  Write to this file to shut down the cache set - waits until all attached
  backing devices have been shut down.

tree_depth
  Depth of the btree (a single node btree has depth 0).

unregister
  Detaches all backing devices and closes the cache devices; if dirty data is
  present it will disable writeback caching and wait for it to be flushed.
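
As promised under flash_vol_create, a sketch of carving out a flash-only
volume (the 1G size is arbitrary):

  echo 1G > /sys/fs/bcache/<UUID>/flash_vol_create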

SYSFS - CACHE SET INTERNAL:

This directory also exposes timings for a number of internal operations, with
separate files for average duration, average frequency, last occurrence and max
duration: garbage collection, btree read, btree node sorts and btree splits.

active_journal_entries
  Number of journal entries that are newer than the index.

btree_nodes
  Total nodes in the btree.

btree_used_percent
  Average fraction of btree in use.

bset_tree_stats
  Statistics about the auxiliary search trees.

btree_cache_max_chain
  Longest chain in the btree node cache's hash table.

cache_read_races
  Counts instances where while data was being read from the cache, the bucket
  was reused and invalidated - i.e. where the pointer was stale after the read
  completed. When this occurs the data is reread from the backing device.

trigger_gc
  Writing to this file forces garbage collection to run (see the sketch after
  this list).
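
A sketch of forcing a gc run; the internal/ path component follows from the
section title but isn't spelled out above, so treat it as an assumption:

  echo 1 > /sys/fs/bcache/<UUID>/internal/trigger_gc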

SYSFS - CACHE DEVICE:

block_size
  Minimum granularity of writes - should match hardware sector size.

btree_written
  Sum of all btree writes, in (kilo/mega/giga) bytes.

bucket_size
  Size of buckets.

cache_replacement_policy
  One of either lru, fifo or random.

discard
  Boolean; if on a discard/TRIM will be issued to each bucket before it is
  reused. Defaults to off, since SATA TRIM is an unqueued command (and thus
  slow).

freelist_percent
  Size of the freelist as a percentage of nbuckets. Can be written to in order
  to increase the number of buckets kept on the freelist, which lets you
  artificially reduce the size of the cache at runtime. Mostly for testing
  purposes (i.e. testing how different size caches affect your hit rate), but
  since buckets are discarded when they move on to the freelist, this will also
  make the SSD's garbage collection easier by effectively giving it more
  reserved space.

io_errors
  Number of errors that have occurred, decayed by io_error_halflife.

metadata_written
  Sum of all non data writes (btree writes and all other metadata).

nbuckets
  Total buckets in this cache.

priority_stats
  Statistics about how recently data in the cache has been accessed. This can
  reveal your working set size.

written
  Sum of all data that has been written to the cache; comparison with
  btree_written gives the amount of write inflation in bcache (see the sketch
  after this list).
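
Rounding that out, a sketch of eyeballing write inflation from the two
counters just described (both are reported in human readable units):

  cat /sys/block/<disk>/bcache/written
  cat /sys/block/<disk>/bcache/btree_written
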
@@ -1620,6 +1620,13 @@ W: http://www.baycom.org/~tom/ham/ham.html
S:	Maintained
F:	drivers/net/hamradio/baycom*

BCACHE (BLOCK LAYER CACHE)
M:	Kent Overstreet <koverstreet@google.com>
L:	linux-bcache@vger.kernel.org
W:	http://bcache.evilpiepirate.org
S:	Maintained
F:	drivers/md/bcache/

BEFS FILE SYSTEM
S:	Orphan
F:	Documentation/filesystems/befs.txt

@@ -920,16 +920,14 @@ bio_pagedec(struct bio *bio)
static void
bufinit(struct buf *buf, struct request *rq, struct bio *bio)
{
	struct bio_vec *bv;

	memset(buf, 0, sizeof(*buf));
	buf->rq = rq;
	buf->bio = bio;
	buf->resid = bio->bi_size;
	buf->sector = bio->bi_sector;
	bio_pageinc(bio);
	buf->bv = bv = bio_iovec(bio);
	buf->bv_resid = bv->bv_len;
	buf->bv = bio_iovec(bio);
	buf->bv_resid = buf->bv->bv_len;
	WARN_ON(buf->bv_resid == 0);
}

@@ -75,6 +75,12 @@ module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(cciss_simple_mode,
	"Use 'simple mode' rather than 'performant mode'");

static int cciss_allow_hpsa;
module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(cciss_allow_hpsa,
	"Prevent cciss driver from accessing hardware known to be "
	" supported by the hpsa driver");

static DEFINE_MUTEX(cciss_mutex);
static struct proc_dir_entry *proc_cciss;
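
/* Illustration only, not part of the patch: the new knob would typically be
 * set at module load time, e.g. "modprobe cciss cciss_allow_hpsa=1", making
 * cciss stand aside so the hpsa SCSI driver can claim these boards. */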
@@ -4115,9 +4121,13 @@ static int cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id)
	*board_id = ((subsystem_device_id << 16) & 0xffff0000) |
			subsystem_vendor_id;

	for (i = 0; i < ARRAY_SIZE(products); i++)
	for (i = 0; i < ARRAY_SIZE(products); i++) {
		/* Stand aside for hpsa driver on request */
		if (cciss_allow_hpsa)
			return -ENODEV;
		if (*board_id == products[i].board_id)
			return i;
	}
	dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n",
		*board_id);
	return -ENODEV;
@@ -4959,6 +4969,16 @@ static int cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
	ctlr_info_t *h;
	unsigned long flags;

	/*
	 * By default the cciss driver is used for all older HP Smart Array
	 * controllers. There are module parameters that allow a user to
	 * override this behavior and instead use the hpsa SCSI driver. If
	 * this is the case cciss may be loaded first from the kdump initrd
	 * image and cause a kernel panic. So if reset_devices is true and
	 * cciss_allow_hpsa is set just bail.
	 */
	if ((reset_devices) && (cciss_allow_hpsa == 1))
		return -ENODEV;
	rc = cciss_init_reset_devices(pdev);
	if (rc) {
		if (rc != -ENOTSUPP)
@@ -104,7 +104,6 @@ struct update_al_work {
	int err;
};

static int al_write_transaction(struct drbd_conf *mdev);

void *drbd_md_get_buffer(struct drbd_conf *mdev)
{
@@ -168,7 +167,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
	bio->bi_end_io = drbd_md_io_complete;
	bio->bi_rw = rw;

	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* Corresponding put_ldev in drbd_md_io_complete() */
	if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL)
		/* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
		;
	else if (!get_ldev_if_state(mdev, D_ATTACHING)) {
		/* Corresponding put_ldev in drbd_md_io_complete() */
		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
		err = -ENODEV;
		goto out;
@@ -199,9 +202,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,

	BUG_ON(!bdev->md_bdev);

	dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n",
	dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
	     current->comm, current->pid, __func__,
	     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
	     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
	     (void*)_RET_IP_ );

	if (sector < drbd_md_first_sector(bdev) ||
	    sector + 7 > drbd_md_last_sector(bdev))
@@ -209,7 +213,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
		     current->comm, current->pid, __func__,
		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");

	err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE);
	/* we do all our meta data IO in aligned 4k blocks. */
	err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096);
	if (err) {
		dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
@@ -217,44 +222,99 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
	return err;
}

static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr)
{
	struct lc_element *al_ext;
	struct lc_element *tmp;
	int wake;

	spin_lock_irq(&mdev->al_lock);
	tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
	if (unlikely(tmp != NULL)) {
		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
		if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
			wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
			spin_unlock_irq(&mdev->al_lock);
			if (wake)
				wake_up(&mdev->al_wait);
			return NULL;
		}
		if (test_bit(BME_NO_WRITES, &bm_ext->flags))
			return bm_ext;
	}
	al_ext = lc_get(mdev->act_log, enr);
	return NULL;
}

static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock)
{
	struct lc_element *al_ext;
	struct bm_extent *bm_ext;
	int wake;

	spin_lock_irq(&mdev->al_lock);
	bm_ext = find_active_resync_extent(mdev, enr);
	if (bm_ext) {
		wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
		spin_unlock_irq(&mdev->al_lock);
		if (wake)
			wake_up(&mdev->al_wait);
		return NULL;
	}
	if (nonblock)
		al_ext = lc_try_get(mdev->act_log, enr);
	else
		al_ext = lc_get(mdev->act_log, enr);
	spin_unlock_irq(&mdev->al_lock);
	return al_ext;
}

void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i)
{
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);

	D_ASSERT((unsigned)(last - first) <= 1);
	D_ASSERT(atomic_read(&mdev->local_cnt) > 0);

	/* FIXME figure out a fast path for bios crossing AL extent boundaries */
	if (first != last)
		return false;

	return _al_get(mdev, first, true);
}

bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i)
{
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
	unsigned enr;
	bool locked = false;

	bool need_transaction = false;

	D_ASSERT(first <= last);
	D_ASSERT(atomic_read(&mdev->local_cnt) > 0);

	for (enr = first; enr <= last; enr++)
		wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL);
	for (enr = first; enr <= last; enr++) {
		struct lc_element *al_ext;
		wait_event(mdev->al_wait,
				(al_ext = _al_get(mdev, enr, false)) != NULL);
		if (al_ext->lc_number != enr)
			need_transaction = true;
	}
	return need_transaction;
}

static int al_write_transaction(struct drbd_conf *mdev, bool delegate);

/* When called through generic_make_request(), we must delegate
 * activity log I/O to the worker thread: a further request
 * submitted via generic_make_request() within the same task
 * would be queued on current->bio_list, and would only start
 * after this function returns (see generic_make_request()).
 *
 * However, if we *are* the worker, we must not delegate to ourselves.
 */

/*
 * @delegate:   delegate activity log I/O to the worker thread
 */
void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate)
{
	bool locked = false;

	BUG_ON(delegate && current == mdev->tconn->worker.task);

	/* Serialize multiple transactions.
	 * This uses test_and_set_bit, memory barrier is implicit.
@@ -264,13 +324,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
		 (locked = lc_try_lock_for_transaction(mdev->act_log)));

	if (locked) {
		/* drbd_al_write_transaction(mdev,al_ext,enr);
		 * recurses into generic_make_request(), which
		 * disallows recursion, bios being serialized on the
		 * current->bio_tail list now.
		 * we have to delegate updates to the activity log
		 * to the worker thread. */

		/* Double check: it may have been committed by someone else,
		 * while we have been waiting for the lock. */
		if (mdev->act_log->pending_changes) {
@@ -280,11 +333,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
			write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
			rcu_read_unlock();

			if (write_al_updates) {
				al_write_transaction(mdev);
				mdev->al_writ_cnt++;
			}

			if (write_al_updates)
				al_write_transaction(mdev, delegate);
			spin_lock_irq(&mdev->al_lock);
			/* FIXME
			if (err)
@@ -298,6 +348,66 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
	}
}

/*
 * @delegate:   delegate activity log I/O to the worker thread
 */
void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate)
{
	BUG_ON(delegate && current == mdev->tconn->worker.task);

	if (drbd_al_begin_io_prepare(mdev, i))
		drbd_al_begin_io_commit(mdev, delegate);
}

int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i)
{
	struct lru_cache *al = mdev->act_log;
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
	unsigned nr_al_extents;
	unsigned available_update_slots;
	unsigned enr;

	D_ASSERT(first <= last);

	nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */
	available_update_slots = min(al->nr_elements - al->used,
				al->max_pending_changes - al->pending_changes);

	/* We want all necessary updates for a given request within the same transaction
	 * We could first check how many updates are *actually* needed,
	 * and use that instead of the worst-case nr_al_extents */
	if (available_update_slots < nr_al_extents)
		return -EWOULDBLOCK;

	/* Is resync active in this area? */
	for (enr = first; enr <= last; enr++) {
		struct lc_element *tmp;
		tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
		if (unlikely(tmp != NULL)) {
			struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
			if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
				if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
					return -EBUSY;
				return -EWOULDBLOCK;
			}
		}
	}

	/* Checkout the refcounts.
	 * Given that we checked for available elements and update slots above,
	 * this has to be successful. */
	for (enr = first; enr <= last; enr++) {
		struct lc_element *al_ext;
		al_ext = lc_get_cumulative(mdev->act_log, enr);
		if (!al_ext)
			dev_info(DEV, "LOGIC BUG for enr=%u\n", enr);
	}
	return 0;
}

void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
	/* for bios crossing activity log extent boundaries,
@@ -350,6 +460,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
		 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}

static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev)
{
	const unsigned int stripes = mdev->ldev->md.al_stripes;
	const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k;

	/* transaction number, modulo on-disk ring buffer wrap around */
	unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k);

	/* ... to aligned 4k on disk block */
	t = ((t % stripes) * stripe_size_4kB) + t/stripes;

	/* ... to 512 byte sector in activity log */
	t *= 8;

	/* ... plus offset to the on disk position */
	return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t;
}
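
/* For illustration (hypothetical values, not from this patch): with
 * al_stripes = 4, al_stripe_size_4k = 16 and thus al_size_4k = 64,
 * transaction number 70 gives t = 70 % 64 = 6, which maps to 4k block
 * (6 % 4) * 16 + 6 / 4 = 33, i.e. sector 33 * 8 = 264 past
 * md_offset + al_offset. */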

static int
_al_write_transaction(struct drbd_conf *mdev)
{
@@ -432,23 +560,27 @@ _al_write_transaction(struct drbd_conf *mdev)
	if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
		mdev->al_tr_cycle = 0;

	sector = mdev->ldev->md.md_offset
		+ mdev->ldev->md.al_offset
		+ mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
	sector = al_tr_number_to_on_disk_sector(mdev);

	crc = crc32c(0, buffer, 4096);
	buffer->crc32c = cpu_to_be32(crc);

	if (drbd_bm_write_hinted(mdev))
		err = -EIO;
		/* drbd_chk_io_error done already */
	else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
		err = -EIO;
		drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
	} else {
		/* advance ringbuffer position and transaction counter */
		mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
		mdev->al_tr_number++;
	else {
		bool write_al_updates;
		rcu_read_lock();
		write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
		rcu_read_unlock();
		if (write_al_updates) {
			if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
				err = -EIO;
				drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
			} else {
				mdev->al_tr_number++;
				mdev->al_writ_cnt++;
			}
		}
	}

	drbd_md_put_buffer(mdev);
@@ -474,20 +606,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused)
/* Calls from worker context (see w_restart_disk_io()) need to write the
   transaction directly. Others came through generic_make_request(),
   those need to delegate it to the worker. */
static int al_write_transaction(struct drbd_conf *mdev)
static int al_write_transaction(struct drbd_conf *mdev, bool delegate)
{
	struct update_al_work al_work;

	if (current == mdev->tconn->worker.task)
	if (delegate) {
		struct update_al_work al_work;
		init_completion(&al_work.event);
		al_work.w.cb = w_al_write_transaction;
		al_work.w.mdev = mdev;
		drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
		wait_for_completion(&al_work.event);
		return al_work.err;
	} else
		return _al_write_transaction(mdev);

	init_completion(&al_work.event);
	al_work.w.cb = w_al_write_transaction;
	al_work.w.mdev = mdev;
	drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
	wait_for_completion(&al_work.event);

	return al_work.err;
}

static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)

@@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
	}
}

/* For the layout, see comment above drbd_md_set_sector_offsets(). */
static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
{
	u64 bitmap_sectors;
	if (ldev->md.al_offset == 8)
		bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
	else
		bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
	return bitmap_sectors << (9 + 3);
}
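
/* The shift by (9 + 3) converts 512-byte sectors to bits:
 * << 9 for bytes per sector, << 3 for bits per byte. */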

/*
 * make sure the bitmap has enough room for the attached storage,
 * if necessary, resize.
@@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
	words = ALIGN(bits, 64) >> LN2_BPL;

	if (get_ldev(mdev)) {
		u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
		u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev);
		put_ldev(mdev);
		if (bits > bits_on_disk) {
			dev_info(DEV, "bits = %lu\n", bits);

@@ -753,13 +753,16 @@ struct drbd_md {
	u32 flags;
	u32 md_size_sect;

	s32 al_offset;	/* signed relative sector offset to al area */
	s32 al_offset;	/* signed relative sector offset to activity log */
	s32 bm_offset;	/* signed relative sector offset to bitmap */

	/* u32 al_nr_extents;	   important for restoring the AL
	 * is stored into ldev->dc.al_extents, which in turn
	 * gets applied to act_log->nr_elements
	 */
	/* cached value of bdev->disk_conf->meta_dev_idx (see below) */
	s32 meta_dev_idx;

	/* see al_tr_number_to_on_disk_sector() */
	u32 al_stripes;
	u32 al_stripe_size_4k;
	u32 al_size_4k; /* cached product of the above */
};

struct drbd_backing_dev {
@@ -891,6 +894,14 @@ struct drbd_tconn { /* is a resource from the config file */
	} send;
};

struct submit_worker {
	struct workqueue_struct *wq;
	struct work_struct worker;

	spinlock_t lock;
	struct list_head writes;
};

struct drbd_conf {
	struct drbd_tconn *tconn;
	int vnr;	/* volume number within the connection */
@@ -1009,7 +1020,6 @@ struct drbd_conf {
	struct lru_cache *act_log;	/* activity log */
	unsigned int al_tr_number;
	int al_tr_cycle;
	int al_tr_pos;	/* position of the next transaction in the journal */
	wait_queue_head_t seq_wait;
	atomic_t packet_seq;
	unsigned int peer_seq;
@@ -1032,6 +1042,10 @@ struct drbd_conf {
	atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
	unsigned int peer_max_bio_size;
	unsigned int local_max_bio_size;

	/* any requests that would block in drbd_make_request()
	 * are deferred to this single-threaded work queue */
	struct submit_worker submit;
};

static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1148,25 +1162,44 @@ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
		char *why, enum bm_flag flags);
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern void drbd_go_diskless(struct drbd_conf *mdev);
extern void drbd_ldev_destroy(struct drbd_conf *mdev);

/* Meta data layout
   We reserve a 128MB Block (4k aligned)
   * either at the end of the backing device
   * or on a separate meta data device. */
 *
 * We currently have two possible layouts.
 * Offsets in (512 byte) sectors.
 * external:
 *   |----------- md_size_sect ------------------|
 *   [ 4k superblock ][ activity log ][  Bitmap  ]
 *   | al_offset == 8 |
 *   | bm_offset = al_offset + X      |
 *  ==> bitmap sectors = md_size_sect - bm_offset
 *
 * Variants:
 * old, indexed fixed size meta data:
 *
 * internal:
 *            |----------- md_size_sect ------------------|
 * [data.....][  Bitmap  ][ activity log ][ 4k superblock ][padding*]
 *                        | al_offset < 0 |
 *            | bm_offset = al_offset - Y |
 *  ==> bitmap sectors = Y = al_offset - bm_offset
 *
 *  [padding*] are zero or up to 7 unused 512 Byte sectors to the
 *  end of the device, so that the [4k superblock] will be 4k aligned.
 *
 *  The activity log consists of 4k transaction blocks,
 *  which are written in a ring-buffer, or striped ring-buffer like fashion.
 *  Its total size used to be fixed 32kB,
 *  but is about to become configurable.
 */
|
||||
/* The following numbers are sectors */
|
||||
/* Allows up to about 3.8TB, so if you want more,
|
||||
/* Our old fixed size meta data layout
|
||||
* allows up to about 3.8TB, so if you want more,
|
||||
* you need to use the "flexible" meta data format. */
|
||||
#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
|
||||
#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
|
||||
#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */
|
||||
#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
|
||||
|
||||
/* we do all meta data IO in 4k blocks */
|
||||
#define MD_BLOCK_SHIFT 12
|
||||
#define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT)
|
||||
#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */
|
||||
#define MD_4kB_SECT 8
|
||||
#define MD_32kB_SECT 64
|
||||
|
||||
/* One activity log extent represents 4M of storage */
|
||||
#define AL_EXTENT_SHIFT 22
|
||||
@@ -1256,7 +1289,6 @@ struct bm_extent {
|
||||
|
||||
/* in one sector of the bitmap, we have this many activity_log extents. */
|
||||
#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
|
||||
#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
|
||||
|
||||
#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
|
||||
#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
|
||||
@@ -1276,16 +1308,18 @@ struct bm_extent {
|
||||
*/
|
||||
|
||||
#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
|
||||
#define DRBD_MAX_SECTORS_BM \
|
||||
((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
|
||||
#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
|
||||
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
|
||||
#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
|
||||
#elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
|
||||
/* we have a certain meta data variant that has a fixed on-disk size of 128
|
||||
* MiB, of which 4k are our "superblock", and 32k are the fixed size activity
|
||||
* log, leaving this many sectors for the bitmap.
|
||||
*/
|
||||
|
||||
#define DRBD_MAX_SECTORS_FIXED_BM \
|
||||
((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
|
||||
#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
|
||||
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
|
||||
#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
|
||||
#else
|
||||
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
|
||||
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM
|
||||
/* 16 TB in units of sectors */
|
||||
#if BITS_PER_LONG == 32
|
||||
/* adjust by one page worth of bitmap,
|
||||
@@ -1418,6 +1452,7 @@ extern void conn_free_crypto(struct drbd_tconn *tconn);
|
||||
extern int proc_details;
|
||||
|
||||
/* drbd_req */
|
||||
extern void do_submit(struct work_struct *ws);
|
||||
extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
|
||||
extern void drbd_make_request(struct request_queue *q, struct bio *bio);
|
||||
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
|
||||
@@ -1576,7 +1611,10 @@ extern const char *drbd_conn_str(enum drbd_conns s);
|
||||
extern const char *drbd_role_str(enum drbd_role s);
|
||||
|
||||
/* drbd_actlog.c */
|
||||
extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i);
|
||||
extern int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i);
|
||||
extern void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate);
|
||||
extern bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i);
|
||||
extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate);
|
||||
extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i);
|
||||
extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
|
||||
extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
|
||||
@@ -1755,9 +1793,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
|
||||
* BTW, for internal meta data, this happens to be the maximum capacity
|
||||
* we could agree upon with our peer node.
|
||||
*/
|
||||
static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev)
|
||||
static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
|
||||
{
|
||||
switch (meta_dev_idx) {
|
||||
switch (bdev->md.meta_dev_idx) {
|
||||
case DRBD_MD_INDEX_INTERNAL:
|
||||
case DRBD_MD_INDEX_FLEX_INT:
|
||||
return bdev->md.md_offset + bdev->md.bm_offset;
|
||||
@@ -1767,36 +1805,19 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi
|
||||
}
|
||||
}
|
||||
|
||||
static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
|
||||
{
|
||||
int meta_dev_idx;
|
||||
|
||||
rcu_read_lock();
|
||||
meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
|
||||
rcu_read_unlock();
|
||||
|
||||
return _drbd_md_first_sector(meta_dev_idx, bdev);
|
||||
}
|
||||
|
||||
/**
|
||||
* drbd_md_last_sector() - Return the last sector number of the meta data area
|
||||
* @bdev: Meta data block device.
|
||||
*/
|
||||
static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
|
||||
{
|
||||
int meta_dev_idx;
|
||||
|
||||
rcu_read_lock();
|
||||
meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
|
||||
rcu_read_unlock();
|
||||
|
||||
switch (meta_dev_idx) {
|
||||
switch (bdev->md.meta_dev_idx) {
|
||||
case DRBD_MD_INDEX_INTERNAL:
|
||||
case DRBD_MD_INDEX_FLEX_INT:
|
||||
return bdev->md.md_offset + MD_AL_OFFSET - 1;
|
||||
return bdev->md.md_offset + MD_4kB_SECT -1;
|
||||
case DRBD_MD_INDEX_FLEX_EXT:
|
||||
default:
|
||||
return bdev->md.md_offset + bdev->md.md_size_sect;
|
||||
return bdev->md.md_offset + bdev->md.md_size_sect -1;
|
||||
}
|
||||
}

@@ -1818,18 +1839,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev)
static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
{
    sector_t s;
    int meta_dev_idx;

    rcu_read_lock();
    meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
    rcu_read_unlock();

    switch (meta_dev_idx) {
    switch (bdev->md.meta_dev_idx) {
    case DRBD_MD_INDEX_INTERNAL:
    case DRBD_MD_INDEX_FLEX_INT:
        s = drbd_get_capacity(bdev->backing_bdev)
            ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
                _drbd_md_first_sector(meta_dev_idx, bdev))
                drbd_md_first_sector(bdev))
            : 0;
        break;
    case DRBD_MD_INDEX_FLEX_EXT:
@@ -1848,39 +1864,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
}

/**
 * drbd_md_ss__() - Return the sector number of our meta data super block
 * @mdev: DRBD device.
 * drbd_md_ss() - Return the sector number of our meta data super block
 * @bdev: Meta data block device.
 */
static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
                                    struct drbd_backing_dev *bdev)
static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
{
    int meta_dev_idx;
    const int meta_dev_idx = bdev->md.meta_dev_idx;

    rcu_read_lock();
    meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
    rcu_read_unlock();

    switch (meta_dev_idx) {
    default: /* external, some index */
        return MD_RESERVED_SECT * meta_dev_idx;
    case DRBD_MD_INDEX_INTERNAL:
        /* with drbd08, internal meta data is always "flexible" */
    case DRBD_MD_INDEX_FLEX_INT:
        /* sizeof(struct md_on_disk_07) == 4k
         * position: last 4k aligned block of 4k size */
        if (!bdev->backing_bdev) {
            if (__ratelimit(&drbd_ratelimit_state)) {
                dev_err(DEV, "bdev->backing_bdev==NULL\n");
                dump_stack();
            }
            return 0;
        }
        return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
            - MD_AL_OFFSET;
    case DRBD_MD_INDEX_FLEX_EXT:
    if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT)
        return 0;
    }

    /* Since drbd08, internal meta data is always "flexible".
     * position: last 4k aligned block of 4k size */
    if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
        meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)
        return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;

    /* external, some index; this is the old fixed size layout */
    return MD_128MB_SECT * bdev->md.meta_dev_idx;
}
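The "last 4k aligned block of 4k size" placement above is plain sector arithmetic: round the capacity (in 512-byte sectors) down to a 4k boundary, then step back one 4k block. A standalone sketch of just that expression (the capacity is hypothetical):

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t internal_md_superblock_sector(uint64_t capacity_sect)
    {
        /* mask off the low 3 bits (4k alignment), minus one 4k block */
        return (capacity_sect & ~7ULL) - 8;
    }

    int main(void)
    {
        /* hypothetical 1 TiB backing device: 2^31 sectors */
        printf("%llu\n",
               (unsigned long long)internal_md_superblock_sector(1ULL << 31));
        return 0;
    }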

static inline void
@@ -2053,9 +2054,11 @@ static inline void put_ldev(struct drbd_conf *mdev)
    if (mdev->state.disk == D_DISKLESS)
        /* even internal references gone, safe to destroy */
        drbd_ldev_destroy(mdev);
    if (mdev->state.disk == D_FAILED)
    if (mdev->state.disk == D_FAILED) {
        /* all application IO references gone. */
        drbd_go_diskless(mdev);
        if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
            drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
    }
    wake_up(&mdev->misc_wait);
}
}

+213 -42
@@ -45,7 +45,7 @@
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#include <linux/workqueue.h>
#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>
@@ -2299,6 +2299,7 @@ static void drbd_cleanup(void)
    idr_for_each_entry(&minors, mdev, i) {
        idr_remove(&minors, mdev_to_minor(mdev));
        idr_remove(&mdev->tconn->volumes, mdev->vnr);
        destroy_workqueue(mdev->submit.wq);
        del_gendisk(mdev->vdisk);
        /* synchronize_rcu(); No other threads running at this point */
        kref_put(&mdev->kref, &drbd_minor_destroy);
@@ -2588,6 +2589,21 @@ void conn_destroy(struct kref *kref)
    kfree(tconn);
}

int init_submitter(struct drbd_conf *mdev)
{
    /* opencoded create_singlethread_workqueue(),
     * to be able to say "drbd%d", ..., minor */
    mdev->submit.wq = alloc_workqueue("drbd%u_submit",
            WQ_UNBOUND | WQ_MEM_RECLAIM, 1, mdev->minor);
    if (!mdev->submit.wq)
        return -ENOMEM;

    INIT_WORK(&mdev->submit.worker, do_submit);
    spin_lock_init(&mdev->submit.lock);
    INIT_LIST_HEAD(&mdev->submit.writes);
    return 0;
}
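init_submitter() open-codes create_singlethread_workqueue() only because alloc_workqueue() takes a printf-style name, so each minor gets a distinctly named queue. A hedged kernel-style sketch of the same pattern for a hypothetical driver (struct my_dev and my_work_fn are stand-ins, assuming kernel build context):

    #include <linux/workqueue.h>

    struct my_dev {                     /* hypothetical device, for illustration */
        unsigned int minor;
        struct workqueue_struct *wq;
        struct work_struct worker;
    };

    static void my_work_fn(struct work_struct *ws);   /* assumed elsewhere */

    static int my_init_submitter(struct my_dev *dev)
    {
        /* single-threaded (max_active == 1), named after the minor */
        dev->wq = alloc_workqueue("mydrv%u_submit",
                WQ_UNBOUND | WQ_MEM_RECLAIM, 1, dev->minor);
        if (!dev->wq)
            return -ENOMEM;
        INIT_WORK(&dev->worker, my_work_fn);
        return 0;
    }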

enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
{
    struct drbd_conf *mdev;
@@ -2677,6 +2693,12 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
        goto out_idr_remove_minor;
    }

    if (init_submitter(mdev)) {
        err = ERR_NOMEM;
        drbd_msg_put_info("unable to create submit workqueue");
        goto out_idr_remove_vol;
    }

    add_disk(disk);
    kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */

@@ -2687,6 +2709,8 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,

    return NO_ERROR;

out_idr_remove_vol:
    idr_remove(&tconn->volumes, vnr_got);
out_idr_remove_minor:
    idr_remove(&minors, minor_got);
    synchronize_rcu();
@@ -2794,6 +2818,7 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
    blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
    blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);

    kfree(ldev->disk_conf);
    kfree(ldev);
}

@@ -2833,8 +2858,9 @@ void conn_md_sync(struct drbd_tconn *tconn)
    rcu_read_unlock();
}

/* aligned 4kByte */
struct meta_data_on_disk {
    u64 la_size;           /* last agreed size. */
    u64 la_size_sect;      /* last agreed size. */
    u64 uuid[UI_SIZE];     /* UUIDs. */
    u64 device_uuid;
    u64 reserved_u64_1;
@@ -2842,13 +2868,17 @@ struct meta_data_on_disk {
    u32 magic;
    u32 md_size_sect;
    u32 al_offset;         /* offset to this block */
    u32 al_nr_extents;     /* important for restoring the AL */
    u32 al_nr_extents;     /* important for restoring the AL (userspace) */
          /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
    u32 bm_offset;         /* offset to the bitmap, from here */
    u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
    u32 la_peer_max_bio_size;   /* last peer max_bio_size */
    u32 reserved_u32[3];

    /* see al_tr_number_to_on_disk_sector() */
    u32 al_stripes;
    u32 al_stripe_size_4k;

    u8 reserved_u8[4096 - (7*8 + 10*4)];
} __packed;

/**
@@ -2861,6 +2891,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
    sector_t sector;
    int i;

    /* Don't accidentally change the DRBD meta data layout. */
    BUILD_BUG_ON(UI_SIZE != 4);
    BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);

    del_timer(&mdev->md_sync_timer);
    /* timer may be rearmed by drbd_md_mark_dirty() now. */
    if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
@@ -2875,9 +2909,9 @@ void drbd_md_sync(struct drbd_conf *mdev)
    if (!buffer)
        goto out;

    memset(buffer, 0, 512);
    memset(buffer, 0, sizeof(*buffer));

    buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
    buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
    for (i = UI_CURRENT; i < UI_SIZE; i++)
        buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
    buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
@@ -2892,7 +2926,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
    buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
    buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);

    D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
    buffer->al_stripes = cpu_to_be32(mdev->ldev->md.al_stripes);
    buffer->al_stripe_size_4k = cpu_to_be32(mdev->ldev->md.al_stripe_size_4k);

    D_ASSERT(drbd_md_ss(mdev->ldev) == mdev->ldev->md.md_offset);
    sector = mdev->ldev->md.md_offset;

    if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
@@ -2910,13 +2947,141 @@ out:
    put_ldev(mdev);
}

static int check_activity_log_stripe_size(struct drbd_conf *mdev,
        struct meta_data_on_disk *on_disk,
        struct drbd_md *in_core)
{
    u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
    u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
    u64 al_size_4k;

    /* both not set: default to old fixed size activity log */
    if (al_stripes == 0 && al_stripe_size_4k == 0) {
        al_stripes = 1;
        al_stripe_size_4k = MD_32kB_SECT/8;
    }

    /* some paranoia plausibility checks */

    /* we need both values to be set */
    if (al_stripes == 0 || al_stripe_size_4k == 0)
        goto err;

    al_size_4k = (u64)al_stripes * al_stripe_size_4k;

    /* Upper limit of activity log area, to avoid potential overflow
     * problems in al_tr_number_to_on_disk_sector(). As right now, more
     * than 72 * 4k blocks total only increases the amount of history,
     * limiting this arbitrarily to 16 GB is not a real limitation ;-) */
    if (al_size_4k > (16 * 1024 * 1024/4))
        goto err;

    /* Lower limit: we need at least 8 transaction slots (32kB)
     * to not break existing setups */
    if (al_size_4k < MD_32kB_SECT/8)
        goto err;

    in_core->al_stripe_size_4k = al_stripe_size_4k;
    in_core->al_stripes = al_stripes;
    in_core->al_size_4k = al_size_4k;

    return 0;
err:
    dev_err(DEV, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
            al_stripes, al_stripe_size_4k);
    return -EINVAL;
}
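The plausibility window above is easy to reproduce outside the kernel: with MD_32kB_SECT == 64 (32kB in 512-byte sectors) the lower bound is 64/8 == 8 four-kB blocks, and the upper bound is 16GB worth of 4kB blocks. A standalone sketch of just that check (not driver code):

    #include <stdio.h>
    #include <stdint.h>

    #define MD_32kB_SECT 64u

    static int al_size_ok(uint32_t stripes, uint32_t stripe_size_4k)
    {
        uint64_t al_size_4k;

        if (stripes == 0 && stripe_size_4k == 0) {  /* old fixed layout */
            stripes = 1;
            stripe_size_4k = MD_32kB_SECT / 8;
        }
        if (stripes == 0 || stripe_size_4k == 0)
            return 0;
        al_size_4k = (uint64_t)stripes * stripe_size_4k;
        return al_size_4k >= MD_32kB_SECT / 8 &&        /* >= 32kB */
               al_size_4k <= 16ULL * 1024 * 1024 / 4;   /* <= 16GB */
    }

    int main(void)
    {
        printf("default ok: %d\n", al_size_ok(0, 0));            /* 1 */
        printf("oversized ok: %d\n", al_size_ok(1u << 16, 1u << 16)); /* 0 */
        return 0;
    }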

static int check_offsets_and_sizes(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
{
    sector_t capacity = drbd_get_capacity(bdev->md_bdev);
    struct drbd_md *in_core = &bdev->md;
    s32 on_disk_al_sect;
    s32 on_disk_bm_sect;

    /* The on-disk size of the activity log, calculated from offsets, and
     * the size of the activity log calculated from the stripe settings,
     * should match.
     * Though we could relax this a bit: it is ok, if the striped activity log
     * fits in the available on-disk activity log size.
     * Right now, that would break how resize is implemented.
     * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
     * of possible unused padding space in the on disk layout. */
    if (in_core->al_offset < 0) {
        if (in_core->bm_offset > in_core->al_offset)
            goto err;
        on_disk_al_sect = -in_core->al_offset;
        on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
    } else {
        if (in_core->al_offset != MD_4kB_SECT)
            goto err;
        if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
            goto err;

        on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
        on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
    }

    /* old fixed size meta data is exactly that: fixed. */
    if (in_core->meta_dev_idx >= 0) {
        if (in_core->md_size_sect != MD_128MB_SECT
            || in_core->al_offset != MD_4kB_SECT
            || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
            || in_core->al_stripes != 1
            || in_core->al_stripe_size_4k != MD_32kB_SECT/8)
            goto err;
    }

    if (capacity < in_core->md_size_sect)
        goto err;
    if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
        goto err;

    /* should be aligned, and at least 32k */
    if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
        goto err;

    /* should fit (for now: exactly) into the available on-disk space;
     * overflow prevention is in check_activity_log_stripe_size() above. */
    if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
        goto err;

    /* again, should be aligned */
    if (in_core->bm_offset & 7)
        goto err;

    /* FIXME check for device grow with flex external meta data? */

    /* can the available bitmap space cover the last agreed device size? */
    if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
        goto err;

    return 0;

err:
    dev_err(DEV, "meta data offsets don't make sense: idx=%d "
            "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
            "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
            in_core->meta_dev_idx,
            in_core->al_stripes, in_core->al_stripe_size_4k,
            in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
            (unsigned long long)in_core->la_size_sect,
            (unsigned long long)capacity);

    return -EINVAL;
}
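The bitmap-coverage bound in that last check is worth unpacking: one bit tracks one 4kB block, so a device of S 512-byte sectors needs about S/8 bits, i.e. S/8/8 bytes, i.e. S/8/8/512 sectors of on-disk bitmap. A standalone sketch of the same expression (not driver code):

    #include <stdio.h>
    #include <stdint.h>

    #define MD_4kB_SECT 8u

    static uint64_t min_bitmap_sectors(uint64_t la_size_sect)
    {
        /* sectors -> 4kB blocks (bits) -> bytes -> 512-byte sectors */
        return (la_size_sect + 7) / MD_4kB_SECT / 8 / 512;
    }

    int main(void)
    {
        /* hypothetical 4 TiB device: 2^33 sectors -> 262144 bitmap
         * sectors, i.e. a 128 MiB on-disk bitmap */
        printf("%llu\n", (unsigned long long)min_bitmap_sectors(1ULL << 33));
        return 0;
    }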


/**
 * drbd_md_read() - Reads in the meta data super block
 * @mdev: DRBD device.
 * @bdev: Device from which the meta data should be read in.
 *
 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
 * Return NO_ERROR on success, and an enum drbd_ret_code in case
 * something goes wrong.
 *
 * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
 * even before @bdev is assigned to @mdev->ldev.
 */
int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
{
@@ -2924,12 +3089,17 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
    u32 magic, flags;
    int i, rv = NO_ERROR;

    if (!get_ldev_if_state(mdev, D_ATTACHING))
        return ERR_IO_MD_DISK;
    if (mdev->state.disk != D_DISKLESS)
        return ERR_DISK_CONFIGURED;

    buffer = drbd_md_get_buffer(mdev);
    if (!buffer)
        goto out;
        return ERR_NOMEM;

    /* First, figure out where our meta data superblock is located,
     * and read it. */
    bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
    bdev->md.md_offset = drbd_md_ss(bdev);

    if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
        /* NOTE: can't do normal error processing here as this is
@@ -2948,46 +3118,52 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
        rv = ERR_MD_UNCLEAN;
        goto err;
    }

    rv = ERR_MD_INVALID;
    if (magic != DRBD_MD_MAGIC_08) {
        if (magic == DRBD_MD_MAGIC_07)
            dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
        else
            dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
        rv = ERR_MD_INVALID;
        goto err;
    }
    if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
        dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
            be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
        rv = ERR_MD_INVALID;
        goto err;
    }
    if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
        dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
            be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
        rv = ERR_MD_INVALID;
        goto err;
    }
    if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
        dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
            be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
        rv = ERR_MD_INVALID;
        goto err;
    }

    if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
        dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
            be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
        rv = ERR_MD_INVALID;
        goto err;
    }

    bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);

    /* convert to in_core endian */
    bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
    for (i = UI_CURRENT; i < UI_SIZE; i++)
        bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
    bdev->md.flags = be32_to_cpu(buffer->flags);
    bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);

    bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
    bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
    bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);

    if (check_activity_log_stripe_size(mdev, buffer, &bdev->md))
        goto err;
    if (check_offsets_and_sizes(mdev, bdev))
        goto err;

    if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
        dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
            be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
        goto err;
    }
    if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
        dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
            be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
        goto err;
    }

    rv = NO_ERROR;

    spin_lock_irq(&mdev->tconn->req_lock);
    if (mdev->state.conn < C_CONNECTED) {
        unsigned int peer;
@@ -2999,8 +3175,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)

err:
    drbd_md_put_buffer(mdev);
out:
    put_ldev(mdev);

    return rv;
}
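The reworked drbd_md_read() keeps a fixed order: read the superblock, check the magic, then convert every field from on-disk big-endian to host order before validating the layout. A standalone userspace sketch of that decode order (not the driver's code; the magic value here is a made-up stand-in, not the real DRBD constant):

    #include <stdio.h>
    #include <stdint.h>
    #include <endian.h>

    #define MD_MAGIC_08 0x4D443038u   /* hypothetical stand-in ("MD08") */

    struct md_on_disk {
        uint32_t magic;
        uint64_t la_size_sect;
    } __attribute__((packed));

    static int md_decode(const struct md_on_disk *b, uint64_t *la_size_sect)
    {
        if (be32toh(b->magic) != MD_MAGIC_08)
            return -1;                      /* "create-md" needed */
        *la_size_sect = be64toh(b->la_size_sect);  /* convert after check */
        return 0;
    }

    int main(void)
    {
        struct md_on_disk b = { htobe32(MD_MAGIC_08), htobe64(123) };
        uint64_t v;
        printf("%d\n", md_decode(&b, &v) == 0 && v == 123);  /* 1 */
        return 0;
    }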

@@ -3238,8 +3412,12 @@ static int w_go_diskless(struct drbd_work *w, int unused)
     * end up here after a failed attach, before ldev was even assigned.
     */
    if (mdev->bitmap && mdev->ldev) {
        /* An interrupted resync or similar is allowed to recounts bits
         * while we detach.
         * Any modifications would not be expected anymore, though.
         */
        if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
                    "detach", BM_LOCKED_MASK)) {
                    "detach", BM_LOCKED_TEST_ALLOWED)) {
            if (test_bit(WAS_READ_ERROR, &mdev->flags)) {
                drbd_md_set_flag(mdev, MDF_FULL_SYNC);
                drbd_md_sync(mdev);
@@ -3251,13 +3429,6 @@ static int w_go_diskless(struct drbd_work *w, int unused)
    return 0;
}

void drbd_go_diskless(struct drbd_conf *mdev)
{
    D_ASSERT(mdev->state.disk == D_FAILED);
    if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
        drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
}

/**
 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
 * @mdev: DRBD device.

+124 -76
@@ -696,37 +696,52 @@ out:
    return 0;
}

/* initializes the md.*_offset members, so we are able to find
 * the on disk meta data */
/* Initializes the md.*_offset members, so we are able to find
 * the on disk meta data.
 *
 * We currently have two possible layouts:
 * external:
 *   |----------- md_size_sect ------------------|
 *   [ 4k superblock ][ activity log ][ Bitmap ]
 *   | al_offset == 8 |
 *   | bm_offset = al_offset + X      |
 *  ==> bitmap sectors = md_size_sect - bm_offset
 *
 * internal:
 *            |----------- md_size_sect ------------------|
 * [data.....][ Bitmap ][ activity log ][ 4k superblock ]
 *                        | al_offset < 0 |
 *            | bm_offset = al_offset - Y |
 *  ==> bitmap sectors = Y = al_offset - bm_offset
 *
 * Activity log size used to be fixed 32kB,
 * but is about to become configurable.
 */
static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
                                       struct drbd_backing_dev *bdev)
{
    sector_t md_size_sect = 0;
    int meta_dev_idx;
    unsigned int al_size_sect = bdev->md.al_size_4k * 8;

    rcu_read_lock();
    meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
    bdev->md.md_offset = drbd_md_ss(bdev);

    switch (meta_dev_idx) {
    switch (bdev->md.meta_dev_idx) {
    default:
        /* v07 style fixed size indexed meta data */
        bdev->md.md_size_sect = MD_RESERVED_SECT;
        bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
        bdev->md.al_offset = MD_AL_OFFSET;
        bdev->md.bm_offset = MD_BM_OFFSET;
        bdev->md.md_size_sect = MD_128MB_SECT;
        bdev->md.al_offset = MD_4kB_SECT;
        bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
        break;
    case DRBD_MD_INDEX_FLEX_EXT:
        /* just occupy the full device; unit: sectors */
        bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
        bdev->md.md_offset = 0;
        bdev->md.al_offset = MD_AL_OFFSET;
        bdev->md.bm_offset = MD_BM_OFFSET;
        bdev->md.al_offset = MD_4kB_SECT;
        bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
        break;
    case DRBD_MD_INDEX_INTERNAL:
    case DRBD_MD_INDEX_FLEX_INT:
        bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
        /* al size is still fixed */
        bdev->md.al_offset = -MD_AL_SECTORS;
        bdev->md.al_offset = -al_size_sect;
        /* we need (slightly less than) ~ this much bitmap sectors: */
        md_size_sect = drbd_get_capacity(bdev->backing_bdev);
        md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
@@ -735,14 +750,13 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,

        /* plus the "drbd meta data super block",
         * and the activity log; */
        md_size_sect += MD_BM_OFFSET;
        md_size_sect += MD_4kB_SECT + al_size_sect;

        bdev->md.md_size_sect = md_size_sect;
        /* bitmap offset is adjusted by 'super' block size */
        bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
        bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT;
        break;
    }
    rcu_read_unlock();
}
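For the internal layout in the comment above, both offsets are negative: they are relative to the superblock that sits at the end of the device. A standalone sketch of that arithmetic with hypothetical sizes (not driver code):

    #include <stdio.h>
    #include <stdint.h>

    #define MD_4kB_SECT 8

    int main(void)
    {
        int64_t al_size_sect = 64;              /* 32kB default AL */
        int64_t md_size_sect = 4096;            /* hypothetical total md size */
        int64_t al_offset = -al_size_sect;      /* AL right below superblock */
        int64_t bm_offset = -md_size_sect + MD_4kB_SECT;

        /* bitmap sectors = Y = al_offset - bm_offset, per the diagram */
        printf("al_offset=%lld bm_offset=%lld bitmap=%lld\n",
               (long long)al_offset, (long long)bm_offset,
               (long long)(al_offset - bm_offset));
        return 0;
    }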

/* input size is expected to be in KB */
@@ -805,7 +819,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
{
    sector_t prev_first_sect, prev_size; /* previous meta location */
    sector_t la_size, u_size;
    sector_t la_size_sect, u_size;
    sector_t size;
    char ppb[10];

@@ -828,7 +842,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds

    prev_first_sect = drbd_md_first_sector(mdev->ldev);
    prev_size = mdev->ldev->md.md_size_sect;
    la_size = mdev->ldev->md.la_size_sect;
    la_size_sect = mdev->ldev->md.la_size_sect;

    /* TODO: should only be some assert here, not (re)init... */
    drbd_md_set_sector_offsets(mdev, mdev->ldev);
@@ -864,7 +878,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
    if (rv == dev_size_error)
        goto out;

    la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
    la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);

    md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
        || prev_size != mdev->ldev->md.md_size_sect;
@@ -886,9 +900,9 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
        drbd_md_mark_dirty(mdev);
    }

    if (size > la_size)
    if (size > la_size_sect)
        rv = grew;
    if (size < la_size)
    if (size < la_size_sect)
        rv = shrunk;
out:
    lc_unlock(mdev->act_log);
@@ -903,7 +917,7 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
          sector_t u_size, int assume_peer_has_space)
{
    sector_t p_size = mdev->p_size;   /* partner's disk size. */
    sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
    sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
    sector_t m_size; /* my size */
    sector_t size = 0;

@@ -917,8 +931,8 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
    if (p_size && m_size) {
        size = min_t(sector_t, p_size, m_size);
    } else {
        if (la_size) {
            size = la_size;
        if (la_size_sect) {
            size = la_size_sect;
            if (m_size && m_size < size)
                size = m_size;
            if (p_size && p_size < size)
@@ -1127,15 +1141,32 @@ static bool should_set_defaults(struct genl_info *info)
    return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
}

static void enforce_disk_conf_limits(struct disk_conf *dc)
static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
{
    if (dc->al_extents < DRBD_AL_EXTENTS_MIN)
        dc->al_extents = DRBD_AL_EXTENTS_MIN;
    if (dc->al_extents > DRBD_AL_EXTENTS_MAX)
        dc->al_extents = DRBD_AL_EXTENTS_MAX;
    /* This is limited by 16 bit "slot" numbers,
     * and by available on-disk context storage.
     *
     * Also (u16)~0 is special (denotes a "free" extent).
     *
     * One transaction occupies one 4kB on-disk block,
     * we have n such blocks in the on disk ring buffer,
     * the "current" transaction may fail (n-1),
     * and there is 919 slot numbers context information per transaction.
     *
     * 72 transaction blocks amounts to more than 2**16 context slots,
     * so cap there first.
     */
    const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
    const unsigned int sufficient_on_disk =
        (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
        /AL_CONTEXT_PER_TRANSACTION;

    if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
        dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
    unsigned int al_size_4k = bdev->md.al_size_4k;

    if (al_size_4k > sufficient_on_disk)
        return max_al_nr;

    return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
}
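The arithmetic in drbd_al_extents_max() follows directly from the comment: each 4kB transaction block stores 919 context slots, one block is reserved for the in-flight transaction, and slot numbers are 16-bit. A standalone sketch (not driver code; the DRBD_AL_EXTENTS_MAX value is an assumed stand-in for illustration):

    #include <stdio.h>

    #define AL_CONTEXT_PER_TRANSACTION 919      /* per the comment above */
    #define DRBD_AL_EXTENTS_MAX 65534u          /* assumption, for illustration */

    static unsigned int al_extents_max(unsigned int al_size_4k)
    {
        const unsigned int sufficient_on_disk =
            (DRBD_AL_EXTENTS_MAX + AL_CONTEXT_PER_TRANSACTION - 1)
            / AL_CONTEXT_PER_TRANSACTION;

        if (al_size_4k > sufficient_on_disk)
            return DRBD_AL_EXTENTS_MAX;
        return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
    }

    int main(void)
    {
        printf("8 blocks  -> %u extents\n", al_extents_max(8));  /* 6433 */
        printf("73 blocks -> %u extents\n", al_extents_max(73)); /* capped */
        return 0;
    }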

int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
@@ -1182,7 +1213,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
    if (!expect(new_disk_conf->resync_rate >= 1))
        new_disk_conf->resync_rate = 1;

    enforce_disk_conf_limits(new_disk_conf);
    if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
        new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
    if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev))
        new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev);

    if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
        new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;

    fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
    if (fifo_size != mdev->rs_plan_s->size) {
@@ -1330,7 +1367,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        goto fail;
    }

    enforce_disk_conf_limits(new_disk_conf);
    if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
        new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;

    new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
    if (!new_plan) {
@@ -1343,6 +1381,12 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        goto fail;
    }

    write_lock_irq(&global_state_lock);
    retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after);
    write_unlock_irq(&global_state_lock);
    if (retcode != NO_ERROR)
        goto fail;

    rcu_read_lock();
    nc = rcu_dereference(mdev->tconn->net_conf);
    if (nc) {
@@ -1399,8 +1443,16 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        goto fail;
    }

    /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
    drbd_md_set_sector_offsets(mdev, nbc);
    /* Read our meta data super block early.
     * This also sets other on-disk offsets. */
    retcode = drbd_md_read(mdev, nbc);
    if (retcode != NO_ERROR)
        goto fail;

    if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
        new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
    if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
        new_disk_conf->al_extents = drbd_al_extents_max(nbc);

    if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
        dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
@@ -1416,7 +1468,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        min_md_device_sectors = (2<<10);
    } else {
        max_possible_sectors = DRBD_MAX_SECTORS;
        min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1);
        min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
    }

    if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
@@ -1467,8 +1519,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
    if (!get_ldev_if_state(mdev, D_ATTACHING))
        goto force_diskless;

    drbd_md_set_sector_offsets(mdev, nbc);

    if (!mdev->bitmap) {
        if (drbd_bm_init(mdev)) {
            retcode = ERR_NOMEM;
@@ -1476,10 +1526,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        }
    }

    retcode = drbd_md_read(mdev, nbc);
    if (retcode != NO_ERROR)
        goto force_diskless_dec;

    if (mdev->state.conn < C_CONNECTED &&
        mdev->state.role == R_PRIMARY &&
        (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
@@ -2158,8 +2204,11 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool for
        return SS_SUCCESS;
    case SS_PRIMARY_NOP:
        /* Our state checking code wants to see the peer outdated. */
        rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING,
                           pdsk, D_OUTDATED), CS_VERBOSE);
        rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);

        if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
            rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE);

        break;
    case SS_CW_FAILED_BY_PEER:
        /* The peer probably wants to see us outdated. */
@@ -2406,22 +2455,19 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
    wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
    drbd_flush_workqueue(mdev);

    retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);

    if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
    /* If we happen to be C_STANDALONE R_SECONDARY, just change to
     * D_INCONSISTENT, and set all bits in the bitmap. Otherwise,
     * try to start a resync handshake as sync target for full sync.
     */
    if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) {
        retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT));
        if (retcode >= SS_SUCCESS) {
            if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
                "set_n_write from invalidate", BM_LOCKED_MASK))
                retcode = ERR_IO_MD_DISK;
        }
    } else
        retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));

    while (retcode == SS_NEED_CONNECTION) {
        spin_lock_irq(&mdev->tconn->req_lock);
        if (mdev->state.conn < C_CONNECTED)
            retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
        spin_unlock_irq(&mdev->tconn->req_lock);

        if (retcode != SS_NEED_CONNECTION)
            break;

        retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
    }
    drbd_resume_io(mdev);

out:
@@ -2475,21 +2521,22 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
    wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
    drbd_flush_workqueue(mdev);

    retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
    if (retcode < SS_SUCCESS) {
        if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
            /* The peer will get a resync upon connect anyways.
             * Just make that into a full resync. */
            retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
            if (retcode >= SS_SUCCESS) {
                if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
                    "set_n_write from invalidate_peer",
                    BM_LOCKED_SET_ALLOWED))
                    retcode = ERR_IO_MD_DISK;
            }
        } else
            retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
    }
    /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
     * in the bitmap. Otherwise, try to start a resync handshake
     * as sync source for full sync.
     */
    if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) {
        /* The peer will get a resync upon connect anyways. Just make that
           into a full resync. */
        retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
        if (retcode >= SS_SUCCESS) {
            if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
                "set_n_write from invalidate_peer",
                BM_LOCKED_SET_ALLOWED))
                retcode = ERR_IO_MD_DISK;
        }
    } else
        retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
    drbd_resume_io(mdev);

out:
@@ -3162,6 +3209,7 @@ static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev)
                CS_VERBOSE + CS_WAIT_COMPLETE);
    idr_remove(&mdev->tconn->volumes, mdev->vnr);
    idr_remove(&minors, mdev_to_minor(mdev));
    destroy_workqueue(mdev->submit.wq);
    del_gendisk(mdev->vdisk);
    synchronize_rcu();
    kref_put(&mdev->kref, &drbd_minor_destroy);

@@ -313,8 +313,14 @@ static int drbd_seq_show(struct seq_file *seq, void *v)

static int drbd_proc_open(struct inode *inode, struct file *file)
{
    if (try_module_get(THIS_MODULE))
        return single_open(file, drbd_seq_show, PDE_DATA(inode));
    int err;

    if (try_module_get(THIS_MODULE)) {
        err = single_open(file, drbd_seq_show, PDE_DATA(inode));
        if (err)
            module_put(THIS_MODULE);
        return err;
    }
    return -ENODEV;
}
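The drbd_proc_open() change fixes a module reference leak: if single_open() fails after try_module_get() succeeded, the reference must be dropped again or the module can never be unloaded. A hedged kernel-style sketch of the pattern in isolation (example_seq_show is a hypothetical show callback, assuming kernel build context):

    #include <linux/module.h>
    #include <linux/seq_file.h>

    static int example_seq_show(struct seq_file *m, void *v);  /* assumed */

    static int example_proc_open(struct inode *inode, struct file *file)
    {
        int err;

        if (!try_module_get(THIS_MODULE))
            return -ENODEV;
        err = single_open(file, example_seq_show, NULL);
        if (err)
            module_put(THIS_MODULE);    /* undo the reference on failure */
        return err;
    }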

@@ -850,6 +850,7 @@ int drbd_connected(struct drbd_conf *mdev)
    err = drbd_send_current_state(mdev);
    clear_bit(USE_DEGR_WFC_T, &mdev->flags);
    clear_bit(RESIZE_PENDING, &mdev->flags);
    atomic_set(&mdev->ap_in_flight, 0);
    mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
    return err;
}
@@ -2266,7 +2267,7 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
        drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
        peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
        peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
        drbd_al_begin_io(mdev, &peer_req->i);
        drbd_al_begin_io(mdev, &peer_req->i, true);
    }

    err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -2662,7 +2663,6 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
    if (hg == -1 && mdev->state.role == R_PRIMARY) {
        enum drbd_state_rv rv2;

        drbd_set_role(mdev, R_SECONDARY, 0);
        /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
         * we might be here in C_WF_REPORT_PARAMS which is transient.
         * we do not need to wait for the after state change work either. */
@@ -3993,7 +3993,7 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)

    clear_bit(DISCARD_MY_DATA, &mdev->flags);

    drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
    drbd_md_sync(mdev); /* update connected indicator, la_size_sect, ... */

    return 0;
}
@@ -4660,8 +4660,8 @@ static int drbd_do_features(struct drbd_tconn *tconn)
#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
static int drbd_do_auth(struct drbd_tconn *tconn)
{
    dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
    dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
    conn_err(tconn, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
    conn_err(tconn, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
    return -1;
}
#else
@@ -5258,9 +5258,11 @@ int drbd_asender(struct drbd_thread *thi)
    bool ping_timeout_active = false;
    struct net_conf *nc;
    int ping_timeo, tcp_cork, ping_int;
    struct sched_param param = { .sched_priority = 2 };

    current->policy = SCHED_RR;  /* Make this a realtime task! */
    current->rt_priority = 2;    /* more important than all other tasks */
    rv = sched_setscheduler(current, SCHED_RR, &param);
    if (rv < 0)
        conn_err(tconn, "drbd_asender: ERROR set priority, ret=%d\n", rv);

    while (get_t_state(thi) == RUNNING) {
        drbd_thread_current_set_cpu(thi);
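The asender hunk replaces direct pokes at the task's policy fields with the proper scheduler API, and treats failure as a warning rather than an error. The same call exists in userspace; a standalone sketch (not driver code; needs CAP_SYS_NICE to actually succeed):

    #include <stdio.h>
    #include <sched.h>

    int main(void)
    {
        struct sched_param param = { .sched_priority = 2 };

        /* ask for round-robin realtime scheduling for this process */
        if (sched_setscheduler(0, SCHED_RR, &param) < 0)
            perror("sched_setscheduler");   /* non-fatal, like the driver */
        return 0;
    }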

+161 -31
@@ -34,14 +34,14 @@
static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size);

/* Update disk stats at start of I/O request */
static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
{
    const int rw = bio_data_dir(bio);
    const int rw = bio_data_dir(req->master_bio);
    int cpu;
    cpu = part_stat_lock();
    part_round_stats(cpu, &mdev->vdisk->part0);
    part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
    part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
    part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9);
    (void) cpu; /* The macro invocations above want the cpu argument, I do not like
                   the compiler warning about cpu only assigned but never used... */
    part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -263,8 +263,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
        else
            root = &mdev->read_requests;
        drbd_remove_request_interval(root, req);
    } else if (!(s & RQ_POSTPONED))
        D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
    }

    /* Before we can signal completion to the upper layers,
     * we may need to close the current transfer log epoch.
@@ -755,6 +754,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
        D_ASSERT(req->rq_state & RQ_NET_PENDING);
        mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
        break;

    case QUEUE_AS_DRBD_BARRIER:
        start_new_tl_epoch(mdev->tconn);
        mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
        break;
    };

    return rv;
@@ -861,8 +865,10 @@ static void maybe_pull_ahead(struct drbd_conf *mdev)
    bool congested = false;
    enum drbd_on_congestion on_congestion;

    rcu_read_lock();
    nc = rcu_dereference(tconn->net_conf);
    on_congestion = nc ? nc->on_congestion : OC_BLOCK;
    rcu_read_unlock();
    if (on_congestion == OC_BLOCK ||
        tconn->agreed_pro_version < 96)
        return;
@@ -956,14 +962,8 @@ static int drbd_process_write_request(struct drbd_request *req)
    struct drbd_conf *mdev = req->w.mdev;
    int remote, send_oos;

    rcu_read_lock();
    remote = drbd_should_do_remote(mdev->state);
    if (remote) {
        maybe_pull_ahead(mdev);
        remote = drbd_should_do_remote(mdev->state);
    }
    send_oos = drbd_should_send_out_of_sync(mdev->state);
    rcu_read_unlock();

    /* Need to replicate writes.  Unless it is an empty flush,
     * which is better mapped to a DRBD P_BARRIER packet,
@@ -975,8 +975,8 @@ static int drbd_process_write_request(struct drbd_request *req)
        /* The only size==0 bios we expect are empty flushes. */
        D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH);
        if (remote)
            start_new_tl_epoch(mdev->tconn);
        return 0;
        _req_mod(req, QUEUE_AS_DRBD_BARRIER);
        return remote;
    }

    if (!remote && !send_oos)
@@ -1020,12 +1020,24 @@ drbd_submit_req_private_bio(struct drbd_request *req)
        bio_endio(bio, -EIO);
}

void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
static void drbd_queue_write(struct drbd_conf *mdev, struct drbd_request *req)
{
    const int rw = bio_rw(bio);
    struct bio_and_error m = { NULL, };
    spin_lock(&mdev->submit.lock);
    list_add_tail(&req->tl_requests, &mdev->submit.writes);
    spin_unlock(&mdev->submit.lock);
    queue_work(mdev->submit.wq, &mdev->submit.worker);
}

/* returns the new drbd_request pointer, if the caller is expected to
 * drbd_send_and_submit() it (to save latency), or NULL if we queued the
 * request on the submitter thread.
 * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
 */
struct drbd_request *
drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
{
    const int rw = bio_data_dir(bio);
    struct drbd_request *req;
    bool no_remote = false;

    /* allocate outside of all locks; */
    req = drbd_req_new(mdev, bio);
@@ -1035,7 +1047,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
         * if user cannot handle io errors, that's not our business. */
        dev_err(DEV, "could not kmalloc() req\n");
        bio_endio(bio, -ENOMEM);
        return;
        return ERR_PTR(-ENOMEM);
    }
    req->start_time = start_time;

@@ -1044,28 +1056,40 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
        req->private_bio = NULL;
    }

    /* For WRITES going to the local disk, grab a reference on the target
     * extent.  This waits for any resync activity in the corresponding
     * resync extent to finish, and, if necessary, pulls in the target
     * extent into the activity log, which involves further disk io because
     * of transactional on-disk meta data updates.
     * Empty flushes don't need to go into the activity log, they can only
     * flush data for pending writes which are already in there. */
    /* Update disk stats */
    _drbd_start_io_acct(mdev, req);

    if (rw == WRITE && req->private_bio && req->i.size
    && !test_bit(AL_SUSPENDED, &mdev->flags)) {
        if (!drbd_al_begin_io_fastpath(mdev, &req->i)) {
            drbd_queue_write(mdev, req);
            return NULL;
        }
        req->rq_state |= RQ_IN_ACT_LOG;
        drbd_al_begin_io(mdev, &req->i);
    }

    return req;
}

static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req)
{
    const int rw = bio_rw(req->master_bio);
    struct bio_and_error m = { NULL, };
    bool no_remote = false;

    spin_lock_irq(&mdev->tconn->req_lock);
    if (rw == WRITE) {
        /* This may temporarily give up the req_lock,
         * but will re-aquire it before it returns here.
         * Needs to be before the check on drbd_suspended() */
        complete_conflicting_writes(req);
        /* no more giving up req_lock from now on! */

        /* check for congestion, and potentially stop sending
         * full data updates, but start sending "dirty bits" only. */
        maybe_pull_ahead(mdev);
    }

    /* no more giving up req_lock from now on! */

    if (drbd_suspended(mdev)) {
        /* push back and retry: */
@@ -1078,9 +1102,6 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
        goto out;
    }

    /* Update disk stats */
    _drbd_start_io_acct(mdev, req, bio);

    /* We fail READ/READA early, if we can not serve it.
     * We must do this before req is registered on any lists.
     * Otherwise, drbd_req_complete() will queue failed READ for retry. */
@@ -1137,7 +1158,116 @@ out:

    if (m.bio)
        complete_master_bio(mdev, &m);
    return;
}

void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
{
    struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time);
    if (IS_ERR_OR_NULL(req))
        return;
    drbd_send_and_submit(mdev, req);
}
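The prepare/submit split relies on a tri-state return: a valid pointer means "caller submits now", NULL means "handed off to the submitter thread", and an encoded errno means failure. A standalone userspace sketch of that convention (the ERR_PTR/IS_ERR_OR_NULL macros below are simplified re-creations of the kernel's, for illustration only):

    #include <stdio.h>
    #include <errno.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define ERR_PTR(e)        ((void *)(intptr_t)(e))
    #define IS_ERR_OR_NULL(p) ((p) == NULL || (uintptr_t)(p) >= (uintptr_t)-4095)

    static void *prepare(int fail, int fastpath)
    {
        if (fail)
            return ERR_PTR(-ENOMEM);      /* allocation failure */
        if (!fastpath)
            return NULL;                  /* queued for the submitter thread */
        return malloc(16);                /* caller sends and submits */
    }

    int main(void)
    {
        void *req = prepare(0, 1);
        if (!IS_ERR_OR_NULL(req)) {       /* only the fast path submits here */
            puts("send and submit");
            free(req);
        }
        return 0;
    }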

static void submit_fast_path(struct drbd_conf *mdev, struct list_head *incoming)
{
    struct drbd_request *req, *tmp;
    list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
        const int rw = bio_data_dir(req->master_bio);

        if (rw == WRITE /* rw != WRITE should not even end up here! */
        && req->private_bio && req->i.size
        && !test_bit(AL_SUSPENDED, &mdev->flags)) {
            if (!drbd_al_begin_io_fastpath(mdev, &req->i))
                continue;

            req->rq_state |= RQ_IN_ACT_LOG;
        }

        list_del_init(&req->tl_requests);
        drbd_send_and_submit(mdev, req);
    }
}

static bool prepare_al_transaction_nonblock(struct drbd_conf *mdev,
                                            struct list_head *incoming,
                                            struct list_head *pending)
{
    struct drbd_request *req, *tmp;
    int wake = 0;
    int err;

    spin_lock_irq(&mdev->al_lock);
    list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
        err = drbd_al_begin_io_nonblock(mdev, &req->i);
        if (err == -EBUSY)
            wake = 1;
        if (err)
            continue;
        req->rq_state |= RQ_IN_ACT_LOG;
        list_move_tail(&req->tl_requests, pending);
    }
    spin_unlock_irq(&mdev->al_lock);
    if (wake)
        wake_up(&mdev->al_wait);

    return !list_empty(pending);
}

void do_submit(struct work_struct *ws)
{
    struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker);
    LIST_HEAD(incoming);
    LIST_HEAD(pending);
    struct drbd_request *req, *tmp;

    for (;;) {
        spin_lock(&mdev->submit.lock);
        list_splice_tail_init(&mdev->submit.writes, &incoming);
        spin_unlock(&mdev->submit.lock);

        submit_fast_path(mdev, &incoming);
        if (list_empty(&incoming))
            break;

        wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending));
        /* Maybe more was queued, while we prepared the transaction?
         * Try to stuff them into this transaction as well.
         * Be strictly non-blocking here, no wait_event, we already
         * have something to commit.
         * Stop if we don't make any more progres.
         */
        for (;;) {
            LIST_HEAD(more_pending);
            LIST_HEAD(more_incoming);
            bool made_progress;

            /* It is ok to look outside the lock,
             * it's only an optimization anyways */
            if (list_empty(&mdev->submit.writes))
                break;

            spin_lock(&mdev->submit.lock);
            list_splice_tail_init(&mdev->submit.writes, &more_incoming);
            spin_unlock(&mdev->submit.lock);

            if (list_empty(&more_incoming))
                break;

            made_progress = prepare_al_transaction_nonblock(mdev, &more_incoming, &more_pending);

            list_splice_tail_init(&more_pending, &pending);
            list_splice_tail_init(&more_incoming, &incoming);

            if (!made_progress)
                break;
        }
        drbd_al_begin_io_commit(mdev, false);

        list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
            list_del_init(&req->tl_requests);
            drbd_send_and_submit(mdev, req);
        }
    }
}
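do_submit() is built around a drain-and-batch pattern: splice the shared list to a private one under the lock, prepare as much as possible without blocking, loop to pick up anything that arrived meanwhile, and commit once per batch. A standalone sketch of just that loop shape (not driver code; the shared counter stands in for submit.writes):

    #include <stdio.h>
    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int shared_writes = 5;       /* stand-in for mdev->submit.writes */

    int main(void)
    {
        for (;;) {
            int batch;

            pthread_mutex_lock(&lock);  /* splice_tail_init equivalent */
            batch = shared_writes;
            shared_writes = 0;
            pthread_mutex_unlock(&lock);

            if (!batch)
                break;
            /* one commit covers the whole batch, amortizing the cost */
            printf("committing batch of %d\n", batch);
        }
        return 0;
    }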

void drbd_make_request(struct request_queue *q, struct bio *bio)

@@ -88,6 +88,14 @@ enum drbd_req_event {
    QUEUE_FOR_NET_READ,
    QUEUE_FOR_SEND_OOS,

    /* An empty flush is queued as P_BARRIER,
     * which will cause it to complete "successfully",
     * even if the local disk flush failed.
     *
     * Just like "real" requests, empty flushes (blkdev_issue_flush()) will
     * only see an error if neither local nor remote data is reachable. */
    QUEUE_AS_DRBD_BARRIER,

    SEND_CANCELED,
    SEND_FAILED,
    HANDED_OVER_TO_NETWORK,

@@ -570,6 +570,13 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
         mdev->tconn->agreed_pro_version < 88)
        rv = SS_NOT_SUPPORTED;

    else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
        rv = SS_NO_UP_TO_DATE_DISK;

    else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
         ns.pdsk == D_UNKNOWN)
        rv = SS_NEED_CONNECTION;

    else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
        rv = SS_CONNECTED_OUTDATES;

@@ -635,6 +642,10 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_t
        && os.conn < C_WF_REPORT_PARAMS)
        rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

    if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED &&
        os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)
        rv = SS_OUTDATE_WO_CONN;

    return rv;
}

@@ -1377,13 +1388,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
                &drbd_bmio_set_n_write, &abw_start_sync,
                "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);

    /* We are invalidating our self... */
    if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
        os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
        /* other bitmap operation expected during this phase */
        drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
            "set_n_write from invalidate", BM_LOCKED_MASK);

    /* first half of local IO error, failure to attach,
     * or administrative detach */
    if (os.disk != D_FAILED && ns.disk == D_FAILED) {
@@ -1748,13 +1752,9 @@ _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state
    if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags))
        return SS_CW_FAILED_BY_PEER;

    rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR;

    if (rv == SS_UNKNOWN_ERROR)
        rv = conn_is_valid_transition(tconn, mask, val, 0);

    if (rv == SS_SUCCESS)
        rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
    rv = conn_is_valid_transition(tconn, mask, val, 0);
    if (rv == SS_SUCCESS && tconn->cstate == C_WF_REPORT_PARAMS)
        rv = SS_UNKNOWN_ERROR; /* continue waiting */

    return rv;
}

@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = {
    [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
    [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
    [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
    [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer",
    [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
};


@@ -89,7 +89,8 @@ void drbd_md_io_complete(struct bio *bio, int error)
    md_io->done = 1;
    wake_up(&mdev->misc_wait);
    bio_put(bio);
    put_ldev(mdev);
    if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
        put_ldev(mdev);
}

/* reads on behalf of the partner,
@@ -1410,7 +1411,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
    struct drbd_conf *mdev = w->mdev;

    if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
        drbd_al_begin_io(mdev, &req->i);
        drbd_al_begin_io(mdev, &req->i, false);

    drbd_req_make_private_bio(req, req->master_bio);
    req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
@@ -1425,7 +1426,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev)
    int resync_after;

    while (1) {
        if (!odev->ldev)
        if (!odev->ldev || odev->state.disk == D_DISKLESS)
            return 1;
        rcu_read_lock();
        resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
@@ -1433,7 +1434,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev)
        if (resync_after == -1)
            return 1;
        odev = minor_to_mdev(resync_after);
        if (!expect(odev))
        if (!odev)
            return 1;
        if ((odev->state.conn >= C_SYNC_SOURCE &&
             odev->state.conn <= C_PAUSED_SYNC_T) ||
@@ -1515,7 +1516,7 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)

    if (o_minor == -1)
        return NO_ERROR;
    if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
    if (o_minor < -1 || o_minor > MINORMASK)
        return ERR_RESYNC_AFTER;

    /* check for loops */
@@ -1524,6 +1525,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
        if (odev == mdev)
            return ERR_RESYNC_AFTER_CYCLE;

        /* You are free to depend on diskless, non-existing,
         * or not yet/no longer existing minors.
         * We only reject dependency loops.
         * We cannot follow the dependency chain beyond a detached or
         * missing minor.
         */
        if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
            return NO_ERROR;

        rcu_read_lock();
        resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
        rcu_read_unlock();
@@ -1652,7 +1662,9 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
    clear_bit(B_RS_H_DONE, &mdev->flags);

    write_lock_irq(&global_state_lock);
    if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
    /* Did some connection breakage or IO error race with us? */
    if (mdev->state.conn < C_CONNECTED
    || !get_ldev_if_state(mdev, D_NEGOTIATING)) {
        write_unlock_irq(&global_state_lock);
        mutex_unlock(mdev->state_mutex);
        return;

@@ -780,6 +780,7 @@ static const struct block_device_operations mg_disk_ops = {
|
||||
.getgeo = mg_getgeo
|
||||
};
|
||||
|
||||
#ifdef CONFIG_PM_SLEEP
|
||||
static int mg_suspend(struct device *dev)
|
||||
{
|
||||
struct mg_drv_data *prv_data = dev->platform_data;
|
||||
@@ -824,6 +825,7 @@ static int mg_resume(struct device *dev)
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume);
|
||||
|
||||
|
||||
@@ -728,7 +728,10 @@ static void mtip_async_complete(struct mtip_port *port,
|
||||
atomic_set(&port->commands[tag].active, 0);
|
||||
release_slot(port, tag);
|
||||
|
||||
up(&port->cmd_slot);
|
||||
if (unlikely(command->unaligned))
|
||||
up(&port->cmd_slot_unal);
|
||||
else
|
||||
up(&port->cmd_slot);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1560,10 +1563,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef MTIP_TRIM /* Disabling TRIM support temporarily */
|
||||
/* Demux ID.DRAT & ID.RZAT to determine trim support */
|
||||
if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5))
|
||||
port->dd->trim_supp = true;
|
||||
else
|
||||
#endif
|
||||
port->dd->trim_supp = false;
|
||||
|
||||
/* Set the identify buffer as valid. */
|
||||
@@ -2557,7 +2562,7 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
|
||||
*/
|
||||
static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
|
||||
int nsect, int nents, int tag, void *callback,
|
||||
void *data, int dir)
|
||||
void *data, int dir, int unaligned)
|
||||
{
|
||||
struct host_to_dev_fis *fis;
|
||||
struct mtip_port *port = dd->port;
|
||||
@@ -2570,6 +2575,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
|
||||
|
||||
command->scatter_ents = nents;
|
||||
|
||||
command->unaligned = unaligned;
|
||||
/*
|
||||
* The number of retries for this command before it is
|
||||
* reported as a failure to the upper layers.
|
||||
@@ -2598,6 +2604,9 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
|
||||
fis->res3 = 0;
|
||||
fill_command_sg(dd, command, nents);
|
||||
|
||||
if (unaligned)
|
||||
fis->device |= 1 << 7;
|
||||
|
||||
/* Populate the command header */
|
||||
command->command_header->opts =
|
||||
__force_bit2int cpu_to_le32(
|
||||
@@ -2644,9 +2653,13 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
|
||||
* return value
|
||||
* None
|
||||
*/
|
||||
static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag)
|
||||
static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag,
|
||||
int unaligned)
|
||||
{
|
||||
struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal :
|
||||
&dd->port->cmd_slot;
|
||||
release_slot(dd->port, tag);
|
||||
up(sem);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2661,22 +2674,25 @@ static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag)
|
||||
* or NULL if no command slots are available.
|
||||
*/
|
||||
static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd,
|
||||
int *tag)
|
||||
int *tag, int unaligned)
|
||||
{
|
||||
struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal :
|
||||
&dd->port->cmd_slot;
|
||||
|
||||
/*
|
||||
* It is possible that, even with this semaphore, a thread
|
||||
* may think that no command slots are available. Therefore, we
|
||||
* need to make an attempt to get_slot().
|
||||
*/
|
||||
down(&dd->port->cmd_slot);
|
||||
down(sem);
|
||||
*tag = get_slot(dd->port);
|
||||
|
||||
if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
|
||||
up(&dd->port->cmd_slot);
|
||||
up(sem);
|
||||
return NULL;
|
||||
}
|
||||
if (unlikely(*tag < 0)) {
|
||||
up(&dd->port->cmd_slot);
|
||||
up(sem);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -3010,6 +3026,11 @@ static inline void hba_setup(struct driver_data *dd)
 			dd->mmio + HOST_HSORG);
 }
 
+static int mtip_device_unaligned_constrained(struct driver_data *dd)
+{
+	return (dd->pdev->device == P420M_DEVICE_ID ? 1 : 0);
+}
+
 /*
  * Detect the details of the product, and store anything needed
  * into the driver data structure.  This includes product type and
@@ -3232,8 +3253,15 @@ static int mtip_hw_init(struct driver_data *dd)
 	for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
 		dd->work[i].port = dd->port;
 
+	/* Enable unaligned IO constraints for some devices */
+	if (mtip_device_unaligned_constrained(dd))
+		dd->unal_qdepth = MTIP_MAX_UNALIGNED_SLOTS;
+	else
+		dd->unal_qdepth = 0;
+
 	/* Counting semaphore to track command slot usage */
-	sema_init(&dd->port->cmd_slot, num_command_slots - 1);
+	sema_init(&dd->port->cmd_slot, num_command_slots - 1 - dd->unal_qdepth);
+	sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth);
 
 	/* Spinlock to prevent concurrent issue */
 	for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
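Note that the two sema_init() calls split one fixed slot budget rather than adding capacity: the unaligned pool is carved out of the general one. A worked example of the arithmetic, assuming the usual 32 command slots with one reserved for internal commands (the exact tag count depends on the hardware):

    /* Depth split, assuming 32 tags with one internal:
     *   constrained (P420m): general = 32 - 1 - 8 = 23, unaligned = 8
     *   other devices:       general = 32 - 1 - 0 = 31, unaligned = 0
     */
    #define NUM_COMMAND_SLOTS   32
    #define MAX_UNALIGNED_SLOTS  8

    static void split_depths(int constrained, int *general, int *unaligned)
    {
        *unaligned = constrained ? MAX_UNALIGNED_SLOTS : 0;
        *general = NUM_COMMAND_SLOTS - 1 - *unaligned;
    }

On unconstrained devices unal_qdepth is 0, so the split degenerates to the old single-pool behavior.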
@@ -3836,7 +3864,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
 	struct scatterlist *sg;
 	struct bio_vec *bvec;
 	int nents = 0;
-	int tag = 0;
+	int tag = 0, unaligned = 0;
 
 	if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
 		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
@@ -3872,7 +3900,15 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
 		return;
 	}
 
-	sg = mtip_hw_get_scatterlist(dd, &tag);
+	if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 &&
+							dd->unal_qdepth) {
+		if (bio->bi_sector % 8 != 0) /* Unaligned on 4k boundaries */
+			unaligned = 1;
+		else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */
+			unaligned = 1;
+	}
+
+	sg = mtip_hw_get_scatterlist(dd, &tag, unaligned);
 	if (likely(sg != NULL)) {
 		blk_queue_bounce(queue, &bio);
 
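With 512-byte sectors, a multiple of 8 sectors is a multiple of 4 KiB, so the two % 8 tests catch writes that either start off a 4 KiB boundary or are not a whole number of 4 KiB blocks. Only small writes (<= 64 sectors, i.e. 32 KiB) are diverted, and only when the device advertises an unaligned queue. A standalone sketch of the same predicate (hypothetical helper, sector counts in 512-byte units):

    #include <stdbool.h>
    #include <stdint.h>

    /* A write is routed to the throttled "unaligned" pool when it is
     * small (<= 64 sectors == 32 KiB) and either starts off a 4 KiB
     * boundary or has a length that is not a multiple of 4 KiB.
     */
    static bool write_is_unaligned(uint64_t start_sector, uint32_t nr_sectors)
    {
        if (nr_sectors > 64)
            return false;        /* large IO: general pool */
        return (start_sector % 8 != 0) || (nr_sectors % 8 != 0);
    }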
@@ -3880,7 +3916,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
 			dev_warn(&dd->pdev->dev,
 				"Maximum number of SGL entries exceeded\n");
 			bio_io_error(bio);
-			mtip_hw_release_scatterlist(dd, tag);
+			mtip_hw_release_scatterlist(dd, tag, unaligned);
 			return;
 		}
 
@@ -3900,7 +3936,8 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
 				tag,
 				bio_endio,
 				bio,
-				bio_data_dir(bio));
+				bio_data_dir(bio),
+				unaligned);
 	} else
 		bio_io_error(bio);
 }
@@ -4156,26 +4193,24 @@ static int mtip_block_remove(struct driver_data *dd)
  */
 static int mtip_block_shutdown(struct driver_data *dd)
 {
-	dev_info(&dd->pdev->dev,
-		"Shutting down %s ...\n", dd->disk->disk_name);
-
 	/* Delete our gendisk structure, and cleanup the blk queue. */
 	if (dd->disk) {
-		if (dd->disk->queue)
-			del_gendisk(dd->disk);
-		else
-			put_disk(dd->disk);
-	}
+		dev_info(&dd->pdev->dev,
+			"Shutting down %s ...\n", dd->disk->disk_name);
+
+		if (dd->disk->queue) {
+			del_gendisk(dd->disk);
+			blk_cleanup_queue(dd->queue);
+		} else
+			put_disk(dd->disk);
+		dd->disk = NULL;
+		dd->queue = NULL;
+	}
 
 	spin_lock(&rssd_index_lock);
 	ida_remove(&rssd_index_ida, dd->index);
 	spin_unlock(&rssd_index_lock);
 
-	blk_cleanup_queue(dd->queue);
-	dd->disk = NULL;
-	dd->queue = NULL;
-
 	mtip_hw_shutdown(dd);
 	return 0;
 }
 
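The shutdown reorder is a NULL-safety fix as much as a cleanup: the old code dereferenced dd->disk->disk_name for the log message before checking dd->disk, and tore down the queue outside the guard. The new order keeps every use of the disk, including blk_cleanup_queue(), inside the if (dd->disk) block and clears the pointers before leaving it. A compressed sketch of the corrected pattern (hypothetical types and stubs, not the kernel API):

    struct disk;
    struct queue;

    extern void log_shutdown(const char *name);
    extern const char *name_of(struct disk *d);
    extern int has_queue(struct disk *d);
    extern void delete_disk(struct disk *d);
    extern void drop_disk(struct disk *d);
    extern void cleanup_queue(struct queue *q);

    /* Every dereference of *disk, and the queue teardown that depends
     * on it, stays inside the guard; pointers are cleared before exit
     * so later paths cannot touch freed state.
     */
    static void shutdown_disk(struct disk **disk, struct queue **queue)
    {
        if (*disk) {
            log_shutdown(name_of(*disk));
            if (has_queue(*disk)) {
                delete_disk(*disk);
                cleanup_queue(*queue);
            } else
                drop_disk(*disk);
            *disk = NULL;
            *queue = NULL;
        }
    }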
@@ -52,6 +52,9 @@
 #define MTIP_FTL_REBUILD_MAGIC		0xED51
 #define MTIP_FTL_REBUILD_TIMEOUT_MS	2400000
 
+/* unaligned IO handling */
+#define MTIP_MAX_UNALIGNED_SLOTS	8
+
 /* Macro to extract the tag bit number from a tag value. */
 #define MTIP_TAG_BIT(tag)	(tag & 0x1F)
 
@@ -333,6 +336,8 @@ struct mtip_cmd {
 
 	int scatter_ents; /* Number of scatter list entries used */
 
+	int unaligned; /* command is unaligned on 4k boundary */
+
 	struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
 
 	int retries; /* The number of retries left for this command. */
@@ -452,6 +457,10 @@ struct mtip_port {
 	 * command slots available.
 	 */
 	struct semaphore cmd_slot;
+
+	/* Semaphore to control queue depth of unaligned IOs */
+	struct semaphore cmd_slot_unal;
+
 	/* Spinlock for working around command-issue bug. */
 	spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS];
 };
@@ -502,6 +511,8 @@ struct driver_data {
 
 	int isr_binding;
 
+	int unal_qdepth; /* qdepth of unaligned IO queue */
+
 	struct list_head online_list; /* linkage for online list */
 
 	struct list_head remove_list; /* linkage for removing list */
Some files were not shown because too many files have changed in this diff.