mirror of
https://github.com/linux-apfs/apfstests.git
synced 2026-05-01 15:01:44 -07:00
fa85aa6497
[BUG]
When btrfs/011 is executed on a fast enough system (e.g. a fully
memory-backed VM whose test device uses the unsafe cache mode), the test
can fail like this:
btrfs/011 43s ... [failed, exit status 1]- output mismatch (see /home/adam/xfstests-dev/results//btrfs/011.out.bad)
--- tests/btrfs/011.out 2019-07-22 14:13:44.643333326 +0800
+++ /home/adam/xfstests-dev/results//btrfs/011.out.bad 2019-09-18 14:49:28.308798022 +0800
@@ -1,3 +1,4 @@
QA output created by 011
*** test btrfs replace
-*** done
+failed: '/usr/bin/btrfs replace cancel /mnt/scratch'
+(see /home/adam/xfstests-dev/results//btrfs/011.full for details)
...
[CAUSE]
Looking into the full output, it shows:
...
Replace from /dev/mapper/test-scratch1 to /dev/mapper/test-scratch2
# /usr/bin/btrfs replace start -f /dev/mapper/test-scratch1 /dev/mapper/test-scratch2 /mnt/scratch
# /usr/bin/btrfs replace cancel /mnt/scratch
INFO: ioctl(DEV_REPLACE_CANCEL)"/mnt/scratch": not started
failed: '/usr/bin/btrfs replace cancel /mnt/scratch'
So this means the replace had already finished before we tried to cancel it.
On fast systems this is very common.
[FIX]
In fill_scratch() after all the original file creations, do a timer
based direct IO write.
The extra write runs for 2 * $wait_time. Because it uses direct IO with a
64K block size, its throughput should be comparable to (although a
little faster than) the replace throughput.
So the later cancel should be able to actually cancel the dev-replace
before it finishes.
Also, do extra check about the above write. If we hit ENOSPC we just
skip the test as the system is really too fast and the fs is not large
enough.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Eryu Guan <guaneryu@gmail.com>
247 lines
7.5 KiB
Bash
Executable File
247 lines
7.5 KiB
Bash
Executable File
#! /bin/bash
|
|
# SPDX-License-Identifier: GPL-2.0
|
|
# Copyright (C) 2013 STRATO. All rights reserved.
|
|
#
|
|
# FSQA Test No. btrfs/011
|
|
#
|
|
# Test of the btrfs replace operation.
|
|
#
|
|
# The amount of tests done depends on the number of devices in the
|
|
# SCRATCH_DEV_POOL. For full test coverage, at least 5 devices should
|
|
# be available (e.g. 5 partitions).
|
|
#
|
|
# The source and target devices for the replace operation are
|
|
# arbitrarily chosen out of SCRATCH_DEV_POOL. Since the target device
|
|
# mustn't be smaller than the source device, the requirement for this
|
|
# test is that all devices have _exactly_ the same size. If this is
|
|
# not the case, this test is not run.
|
|
#
|
|
# To check the filesystems after replacing a device, a scrub run is
|
|
# performed, a btrfsck run, and finally the filesystem is remounted.
|
|
#
|
|
# Test identity and bookkeeping used by the rest of the script.
# "$0" is quoted so that a script path containing whitespace still yields
# the correct base name.
seq=$(basename "$0")
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"

here=$(pwd)
tmp=/tmp/$$
# Exit status reported by the exit trap; set to 0 only at the very end.
status=1
# PID of the background noise writer; 0 means none is running.
noise_pid=0
|
|
|
|
# Exit handler: stop the background noise writer if one is still alive,
# reap all children, remove this test's temp files and unmount scratch.
_cleanup()
{
	if [[ $noise_pid -ne 0 ]] && ps -p $noise_pid | grep -q $noise_pid; then
		kill -TERM $noise_pid
	fi
	wait
	rm -f $tmp.*

	# Unmount here instead of leaving it to the next test's
	# _require_scratch: this test replaces SCRATCH_DEV, which
	# _require_scratch needs in order to unmount the filesystem.
	_scratch_unmount &> /dev/null
}
|
|
# Run _cleanup and exit with $status on normal exit and on HUP, INT, QUIT
# and TERM. The single quotes defer expansion of $status until the trap
# actually fires.
trap '_cleanup; exit $status' EXIT HUP INT QUIT TERM

# get standard environment, filters and checks
. ./common/rc
. ./common/filter
|
|
|
|
# real QA test starts here

# Requirement checks: btrfs only, a scratch device, a scratch device pool
# and the wipefs command.
_supported_fs btrfs
_require_scratch_nocheck
_require_scratch_dev_pool 5
_require_scratch_dev_pool_equal_size
_require_command "$WIPEFS_PROG" wipefs

# Start from a clean slate: drop the previous full log and stale temp files.
rm -f $seqres.full $tmp.*

echo "*** test btrfs replace"

# Base delay in seconds. fill_scratch writes filler data for twice this
# long, and btrfs_replace_test sleeps this long before cancelling a replace.
wait_time=1
|
|
|
|
# Populate the filesystem mounted at $SCRATCH_MNT with test data.
#
# Arguments:
#   $1 - number of copies of the 1M seed file t0 to create (t1 .. t$1)
#
# Also writes a direct-IO filler file for 2 * $wait_time seconds. If that
# write reported "No space left", the test is skipped via _notrun.
fill_scratch()
{
	local fssize=$1
	local filler_pid

	# Small files, intended to end up as inline extents.
	for i in $(seq 1 500); do
		_ddt of=$SCRATCH_MNT/s$i bs=3800 count=1
	done &> /dev/null

	# Slightly larger files, intended to end up as regular data extents.
	for i in $(seq 1 500); do
		_ddt of=$SCRATCH_MNT/l$i bs=16385 count=1
	done &> /dev/null

	# One 1M seed file, then $fssize copies of it.
	_ddt of=$SCRATCH_MNT/t0 bs=1M count=1 &> /dev/null
	for i in $(seq $fssize); do
		if ! cp $SCRATCH_MNT/t0 $SCRATCH_MNT/t$i; then
			_fail "cp failed"
		fi
	done > /dev/null 2>> $seqres.full

	# Make sure there is enough data that dev-replace takes at least
	# 2 * $wait_time, so that a running replace can still be cancelled.
	# Notes:
	# - XFS_IO_PROG is used instead of dd because fstests wraps dd, which
	#   makes it hard to kill the real dd pid.
	# - 64K direct IO is used because 64K is the stripe size used by
	#   replace/scrub, and direct IO keeps the write speed near the device
	#   limit and comparable to the replace speed.
	$XFS_IO_PROG -f -d -c "pwrite -b 64k 0 1E" "$SCRATCH_MNT/t_filler" \
		> $tmp.filler_result 2>&1 &
	filler_pid=$!
	sleep $((wait_time * 2))
	kill -KILL $filler_pid > /dev/null 2>&1
	wait $filler_pid > /dev/null 2>&1

	# System too fast and filesystem too small: skip the test.
	if grep -q "No space left" $tmp.filler_result; then
		{
			ls -alh $SCRATCH_MNT
			cat $tmp.filler_result
		} >> $seqres.full
		_notrun "fs too small for this test"
	fi

	# Anything else the filler printed goes to the test's stdout.
	cat $tmp.filler_result
	sync
	sync
}
|
|
|
|
# Run one replace scenario: create a fresh filesystem on part of the device
# pool, fill it, and replace the first pool device with the spare device.
# For raid1/raid5/raid6/raid10 profiles without cancel, the spare is then
# replaced back by the original device using the "-r" option.
#
# Arguments:
#   $1 - options handed to mkfs (e.g. "-m raid1 -d raid1")
#   $2 - number of pool devices to take for the filesystem
#   $3 - "cancel" to cancel the replace while it is running; any other
#        value lets it run to completion
#   $4 - number of 1M files passed on to fill_scratch; any value other than
#        64 selects the "thorough" variant of btrfs_replace_test
workout()
{
	local mkfs_options="$1"
	local num_devs4raid="$2"
	local with_cancel="$3"
	local fssize="$4"
	local source_dev="$(echo ${SCRATCH_DEV_POOL} | awk '{print $1}')"
	local quick="quick"

	[[ $fssize != 64 ]] && quick="thorough"

	echo -e "\\n---------workout \"$1\" $2 $3 $4-----------" >> $seqres.full

	$WIPEFS_PROG -a $SCRATCH_DEV_POOL > /dev/null 2>&1
	_scratch_dev_pool_get $num_devs4raid
	_spare_dev_get

	_scratch_pool_mkfs $mkfs_options >> $seqres.full 2>&1 ||\
		_fail "mkfs failed"

	_scratch_mount
	# 2 * 512 * 1024 = 1048576.
	# NOTE(review): this line used to be annotated "2.5G", which does not
	# match the value if _require_fs_space counts 1K units - confirm.
	_require_fs_space $SCRATCH_MNT $((2 * 512 * 1024))

	fill_scratch $fssize
	_run_btrfs_util_prog filesystem show -m $SCRATCH_MNT

	echo -e "Replace from $source_dev to $SPARE_DEV\\n" >> $seqres.full
	btrfs_replace_test $source_dev $SPARE_DEV "" $with_cancel $quick

	_run_btrfs_util_prog filesystem show -m $SCRATCH_MNT

	# Skip -r test for configs without mirror OR replace cancel.
	# "grep -E" is used instead of "egrep": egrep is obsolescent in GNU
	# grep and prints a warning on stderr, which is not redirected here.
	if echo $mkfs_options | grep -Eqv "raid1|raid5|raid6|raid10" || \
	   [ "${with_cancel}Q" = "cancelQ" ]; then
		_scratch_unmount > /dev/null 2>&1
		_scratch_dev_pool_put
		_spare_dev_put
		return 0
	fi

	# Due to above replace, now SPARE_DEV is part of the FS, check that.
	$BTRFS_UTIL_PROG filesystem show -m $SCRATCH_MNT |\
		grep -qs "${SPARE_DEV}\$" ||\
		_fail "$SPARE_DEV is not part of SCRATCH_FS"

	btrfs_replace_test $SPARE_DEV $source_dev "-r" $with_cancel $quick

	_scratch_unmount > /dev/null 2>&1
	_scratch_dev_pool_put
	_spare_dev_put
}
|
|
|
|
# Replace one device of the mounted scratch filesystem while background
# writes are in flight, then scrub, check and remount the filesystem.
#
# Arguments:
#   $1 - device to be replaced
#   $2 - device that takes its place
#   $3 - extra options for "btrfs replace start" (may be empty)
#   $4 - "cancel" to cancel the running replace; any other value lets it
#        run to completion
#   $5 - "thorough" to force a sync in the middle of a non-cancelled replace
#
# Globals:
#   noise_pid - set while the background writer runs (so _cleanup can kill
#               it) and reset to 0 once it has been stopped here
btrfs_replace_test()
{
	local source_dev="$1"
	local target_dev="$2"
	local replace_options="$3"
	local with_cancel="$4"
	local quick="$5"
	local expected_state

	# Generate some (slow) background traffic in parallel to the replace
	# operation. It is not a problem if it fails early with ENOSPC.
	cat /dev/urandom | od > $SCRATCH_MNT/noise 2>> $seqres.full &
	noise_pid=$!

	if [[ "$with_cancel" == "cancel" ]]; then
		# Without '-B' the replace runs in the background, so it can be
		# cancelled after $wait_time seconds.
		_run_btrfs_util_prog replace start -f $replace_options $source_dev $target_dev $SCRATCH_MNT
		sleep $wait_time
		_run_btrfs_util_prog replace cancel $SCRATCH_MNT
		expected_state="canceled"
	else
		if [[ "$quick" == "thorough" ]]; then
			# The thorough test runs around 2 * $wait_time seconds,
			# which is a chance to force a sync in the middle of
			# the replace operation.
			(sleep $wait_time; sync) &> /dev/null &
		fi
		_run_btrfs_util_prog replace start -Bf $replace_options $source_dev $target_dev $SCRATCH_MNT
		expected_state="finished"
	fi

	# 'replace status' waits for the replace operation to finish before
	# the status is printed.
	$BTRFS_UTIL_PROG replace status $SCRATCH_MNT &> $tmp.tmp
	cat $tmp.tmp >> $seqres.full
	if ! grep -q $expected_state $tmp.tmp; then
		_fail "btrfs replace status ($expected_state) failed"
	fi

	# Stop the background writer if it is still alive, then reap children.
	if ps -p $noise_pid | grep -q $noise_pid; then
		kill -TERM $noise_pid 2> /dev/null
	fi
	noise_pid=0
	wait

	# Scrub tests on-disk data, hence the sync. With '-B' (don't
	# background) any type of error causes an exit value != 0, including
	# detected correctable and uncorrectable errors on the device.
	sync
	sync
	_run_btrfs_util_prog scrub start -B $SCRATCH_MNT

	# Two tests follow: btrfsck the filesystem, then mount it.
	# Usually _check_btrfs_filesystem would perform the mount test itself,
	# but it gets confused by mount output that shows SCRATCH_MNT mounted
	# from a device other than SCRATCH_DEV. That happens because
	# /proc/mounts shows the 2nd device of the filesystem after the
	# replace. So the mount test is done manually once
	# _check_btrfs_filesystem has finished.
	_scratch_unmount &> /dev/null
	if [[ "$with_cancel" == "cancel" ]]; then
		_check_btrfs_filesystem $source_dev
		_scratch_mount
	else
		# after the replace operation, use the target_dev for everything
		echo "_check_btrfs_filesystem $target_dev" >> $seqres.full
		_check_btrfs_filesystem $target_dev
		_mount -t $FSTYP $(_scratch_mount_options | sed "s&${SCRATCH_DEV}&${target_dev}&")
	fi
}
|
|
|
|
# Test matrix. Each entry is "<mkfs options>|<devices>|<cancel>|<fssize>"
# and maps onto the four arguments of workout, in this order.
replace_cases=(
	"-m single -d single|1|no|64"
	"-m single -d single -M|1|no|64"
	"-m dup -d single|1|no|64"
	"-m dup -d single|1|cancel|1024"
	"-m dup -d dup -M|1|no|64"
	"-m raid0 -d raid0|2|no|64"
	"-m raid1 -d raid1|2|no|2048"
	"-m raid5 -d raid5|2|no|64"
	"-m raid6 -d raid6|3|no|64"
	"-m raid10 -d raid10|4|no|64"
)
for replace_case in "${replace_cases[@]}"; do
	# The here-string feeds only this read; workout keeps the script's stdin.
	IFS='|' read -r case_opts case_devs case_cancel case_size <<< "$replace_case"
	workout "$case_opts" "$case_devs" "$case_cancel" "$case_size"
done

echo "*** done"
# Mark success; the exit trap exits with $status.
status=0
exit
|