Bug 1180225. Make convolver more like upstream. r=seth

This fixes uninitialised reads.
This commit is contained in:
Jeff Muizelaar 2015-08-27 16:06:37 -04:00
parent 6020ad60de
commit f2c966fa6f
4 changed files with 93 additions and 86 deletions

View File

@ -175,11 +175,11 @@ class CircularRowBuffer {
// |src_data| and continues for the [begin, end) of the filter.
template<bool has_alpha>
void ConvolveHorizontally(const unsigned char* src_data,
int begin, int end,
const ConvolutionFilter1D& filter,
unsigned char* out_row) {
int num_values = filter.num_values();
// Loop over each pixel on this row in the output image.
for (int out_x = begin; out_x < end; out_x++) {
for (int out_x = 0; out_x < num_values; out_x++) {
// Get the filter that determines the current output pixel.
int filter_offset, filter_length;
const ConvolutionFilter1D::Fixed* filter_values =
@ -220,17 +220,18 @@ void ConvolveHorizontally(const unsigned char* src_data,
// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |source_data_rows| array, with each row
// being |end - begin| wide.
// being |pixel_width| wide.
//
// The output must have room for |(end - begin) * 4| bytes.
// The output must have room for |pixel_width * 4| bytes.
template<bool has_alpha>
void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
int filter_length,
unsigned char* const* source_data_rows,
int begin, int end, unsigned char* out_row) {
int pixel_width,
unsigned char* out_row) {
// We go through each column in the output and do a vertical convolution,
// generating one output pixel each time.
for (int out_x = begin; out_x < end; out_x++) {
for (int out_x = 0; out_x < pixel_width; out_x++) {
// Compute the number of bytes over in each row that the current column
// we're convolving starts at. The pixel will cover the next 4 bytes.
int byte_offset = out_x * 4;
@ -288,28 +289,29 @@ void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
int filter_length,
unsigned char* const* source_data_rows,
int width, unsigned char* out_row,
int pixel_width, unsigned char* out_row,
bool has_alpha, bool use_simd) {
int processed = 0;
#if defined(USE_SSE2) || defined(_MIPS_ARCH_LOONGSON3A)
// If the binary was not built with SSE2 support, we had to fallback to C version.
int simd_width = width & ~3;
if (use_simd && simd_width) {
if (use_simd) {
ConvolveVertically_SIMD(filter_values, filter_length,
source_data_rows, 0, simd_width,
source_data_rows,
pixel_width,
out_row, has_alpha);
processed = simd_width;
}
} else
#endif
if (width > processed) {
{
if (has_alpha) {
ConvolveVertically<true>(filter_values, filter_length, source_data_rows,
processed, width, out_row);
ConvolveVertically<true>(filter_values, filter_length,
source_data_rows,
pixel_width,
out_row);
} else {
ConvolveVertically<false>(filter_values, filter_length, source_data_rows,
processed, width, out_row);
ConvolveVertically<false>(filter_values, filter_length,
source_data_rows,
pixel_width,
out_row);
}
}
}
@ -326,16 +328,16 @@ void ConvolveHorizontally(const unsigned char* src_data,
// SIMD implementation works with 4 pixels at a time.
// Therefore we process as much as we can using SSE and then use
// C implementation for leftovers
ConvolveHorizontally_SSE2(src_data, 0, simd_width, filter, out_row);
ConvolveHorizontally_SSE2(src_data, filter, out_row);
processed = simd_width;
}
#endif
if (width > processed) {
if (has_alpha) {
ConvolveHorizontally<true>(src_data, processed, width, filter, out_row);
ConvolveHorizontally<true>(src_data, filter, out_row);
} else {
ConvolveHorizontally<false>(src_data, processed, width, filter, out_row);
ConvolveHorizontally<false>(src_data, filter, out_row);
}
}
}
@ -457,9 +459,23 @@ void BGRAConvolve2D(const unsigned char* source_data,
int num_output_rows = filter_y.num_values();
int pixel_width = filter_x.num_values();
// We need to check which is the last line to convolve before we advance 4
// lines in one iteration.
int last_filter_offset, last_filter_length;
// SSE2 can access up to 3 extra pixels past the end of the
// buffer. At the bottom of the image, we have to be careful
// not to access data past the end of the buffer. Normally
// we fall back to the C++ implementation for the last row.
// If the last row is less than 3 pixels wide, we may have to fall
// back to the C++ version for more rows. Compute how many
// rows we need to avoid the SSE implementation for here.
filter_x.FilterForValue(filter_x.num_values() - 1, &last_filter_offset,
&last_filter_length);
#if defined(USE_SSE2) || defined(_MIPS_ARCH_LOONGSON3A)
int avoid_simd_rows = 1 + 3 /
(last_filter_offset + last_filter_length);
#endif
filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
&last_filter_length);
@ -473,36 +489,32 @@ void BGRAConvolve2D(const unsigned char* source_data,
// We don't want to process too much rows in batches of 4 because
// we can go out-of-bounds at the end
while (next_x_row < filter_offset + filter_length) {
if (next_x_row + 3 < last_filter_offset + last_filter_length - 3) {
if (next_x_row + 3 < last_filter_offset + last_filter_length -
avoid_simd_rows) {
const unsigned char* src[4];
unsigned char* out_row[4];
for (int i = 0; i < 4; ++i) {
src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
out_row[i] = row_buffer.AdvanceRow();
}
ConvolveHorizontally4_SIMD(src, 0, pixel_width, filter_x, out_row);
ConvolveHorizontally4_SIMD(src, filter_x, out_row);
next_x_row += 4;
} else {
unsigned char* buffer = row_buffer.AdvanceRow();
// For last rows, SSE2 load possibly to access data beyond the
// image area. therefore we use cobined C+SSE version here
int simd_width = pixel_width & ~3;
if (simd_width) {
// Check if we need to avoid SSE2 for this row.
if (next_x_row < last_filter_offset + last_filter_length -
avoid_simd_rows) {
ConvolveHorizontally_SIMD(
&source_data[next_x_row * source_byte_row_stride],
0, simd_width, filter_x, buffer);
}
if (pixel_width > simd_width) {
filter_x, row_buffer.AdvanceRow());
} else {
if (source_has_alpha) {
ConvolveHorizontally<true>(
&source_data[next_x_row * source_byte_row_stride],
simd_width, pixel_width, filter_x, buffer);
filter_x, row_buffer.AdvanceRow());
} else {
ConvolveHorizontally<false>(
&source_data[next_x_row * source_byte_row_stride],
simd_width, pixel_width, filter_x, buffer);
filter_x, row_buffer.AdvanceRow());
}
}
next_x_row++;
@ -513,12 +525,12 @@ void BGRAConvolve2D(const unsigned char* source_data,
while (next_x_row < filter_offset + filter_length) {
if (source_has_alpha) {
ConvolveHorizontally<true>(
&source_data[next_x_row * source_byte_row_stride],
0, pixel_width, filter_x, row_buffer.AdvanceRow());
&source_data[next_x_row * source_byte_row_stride],
filter_x, row_buffer.AdvanceRow());
} else {
ConvolveHorizontally<false>(
&source_data[next_x_row * source_byte_row_stride],
0, pixel_width, filter_x, row_buffer.AdvanceRow());
&source_data[next_x_row * source_byte_row_stride],
filter_x, row_buffer.AdvanceRow());
}
next_x_row++;
}

View File

@ -35,26 +35,24 @@
namespace skia {
// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the [begin, end) of the filter.
// |src_data| and continues for the num_values() of the filter.
void ConvolveHorizontally_SSE2(const unsigned char* src_data,
int begin, int end,
const ConvolutionFilter1D& filter,
unsigned char* out_row) {
int num_values = filter.num_values();
int filter_offset, filter_length;
__m128i zero = _mm_setzero_si128();
__m128i mask[3];
__m128i mask[4];
// |mask| will be used to decimate all extra filter coefficients that are
// loaded by SIMD when |filter_length| is not divisible by 4.
mask[0] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
// This buffer is used for tails
__m128i buffer;
// mask[0] is not used in following algorithm.
mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
// Output one pixel each iteration, calculating all channels (RGBA) together.
for (int out_x = begin; out_x < end; out_x++) {
for (int out_x = 0; out_x < num_values; out_x++) {
const ConvolutionFilter1D::Fixed* filter_values =
filter.FilterForValue(out_x, &filter_offset, &filter_length);
@ -117,22 +115,21 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
// When |filter_length| is not divisible by 4, we need to decimate some of
// the filter coefficient that was loaded incorrectly to zero; Other than
// that the algorithm is same with above, except that the 4th pixel will be
// that the algorithm is same with above, exceot that the 4th pixel will be
// always absent.
int r = filter_length & 3;
int r = filter_length&3;
if (r) {
memcpy(&buffer, row_to_filter, r * 4);
// Note: filter_values must be padded to align_up(filter_offset, 8).
__m128i coeff, coeff16;
coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
// Mask out extra filter taps.
coeff = _mm_and_si128(coeff, mask[r-1]);
coeff = _mm_and_si128(coeff, mask[r]);
coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
// Note: line buffer must be padded to align_up(filter_offset, 16).
// We resolve this by temporary buffer
__m128i src8 = _mm_loadu_si128(&buffer);
// We resolve this by use C-version for the last horizontal line.
__m128i src8 = _mm_loadu_si128(row_to_filter);
__m128i src16 = _mm_unpacklo_epi8(src8, zero);
__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
@ -165,24 +162,26 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
}
// Convolves horizontally along four rows. The row data is given in
// |src_data| and continues for the [begin, end) of the filter.
// |src_data| and continues for the num_values() of the filter.
// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
// refer to that function for detailed comments.
void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
int begin, int end,
const ConvolutionFilter1D& filter,
unsigned char* out_row[4]) {
int num_values = filter.num_values();
int filter_offset, filter_length;
__m128i zero = _mm_setzero_si128();
__m128i mask[3];
__m128i mask[4];
// |mask| will be used to decimate all extra filter coefficients that are
// loaded by SIMD when |filter_length| is not divisible by 4.
mask[0] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
// mask[0] is not used in following algorithm.
mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
// Output one pixel each iteration, calculating all channels (RGBA) together.
for (int out_x = begin; out_x < end; out_x++) {
for (int out_x = 0; out_x < num_values; out_x++) {
const ConvolutionFilter1D::Fixed* filter_values =
filter.FilterForValue(out_x, &filter_offset, &filter_length);
@ -240,7 +239,7 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
__m128i coeff;
coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
// Mask out extra filter taps.
coeff = _mm_and_si128(coeff, mask[r-1]);
coeff = _mm_and_si128(coeff, mask[r]);
__m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
/* c1 c1 c1 c1 c0 c0 c0 c0 */
@ -284,21 +283,22 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |source_data_rows| array, with each row
// being |end - begin| wide.
// being |pixel_width| wide.
//
// The output must have room for |(end - begin) * 4| bytes.
// The output must have room for |pixel_width * 4| bytes.
template<bool has_alpha>
void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_values,
int filter_length,
unsigned char* const* source_data_rows,
int begin, int end,
int pixel_width,
unsigned char* out_row) {
int width = pixel_width & ~3;
__m128i zero = _mm_setzero_si128();
__m128i accum0, accum1, accum2, accum3, coeff16;
const __m128i* src;
int out_x;
// Output four pixels per iteration (16 bytes).
for (out_x = begin; out_x + 3 < end; out_x += 4) {
for (int out_x = 0; out_x < width; out_x += 4) {
// Accumulated result for each pixel. 32 bits per RGBA channel.
accum0 = _mm_setzero_si128();
@ -391,11 +391,7 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
// When the width of the output is not divisible by 4, We need to save one
// pixel (4 bytes) each time. And also the fourth pixel is always absent.
int r = end - out_x;
if (r > 0) {
// Since accum3 is never used here, we'll use it as a buffer
__m128i *buffer = &accum3;
if (pixel_width & 3) {
accum0 = _mm_setzero_si128();
accum1 = _mm_setzero_si128();
accum2 = _mm_setzero_si128();
@ -403,9 +399,8 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
coeff16 = _mm_set1_epi16(filter_values[filter_y]);
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
src = reinterpret_cast<const __m128i*>(
&source_data_rows[filter_y][out_x * 4]);
memcpy(buffer, src, r * 4);
__m128i src8 = _mm_loadu_si128(buffer);
&source_data_rows[filter_y][width<<2]);
__m128i src8 = _mm_loadu_si128(src);
// [16] a1 b1 g1 r1 a0 b0 g0 r0
__m128i src16 = _mm_unpacklo_epi8(src8, zero);
__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
@ -451,7 +446,7 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
accum0 = _mm_or_si128(accum0, mask);
}
for (; out_x < end; out_x++) {
for (int out_x = width; out_x < pixel_width; out_x++) {
*(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
accum0 = _mm_srli_si128(accum0, 4);
out_row += 4;
@ -462,14 +457,14 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
int filter_length,
unsigned char* const* source_data_rows,
int begin, int end,
int pixel_width,
unsigned char* out_row, bool has_alpha) {
if (has_alpha) {
ConvolveVertically_SSE2_impl<true>(filter_values, filter_length,
source_data_rows, begin, end, out_row);
source_data_rows, pixel_width, out_row);
} else {
ConvolveVertically_SSE2_impl<false>(filter_values, filter_length,
source_data_rows, begin, end, out_row);
source_data_rows, pixel_width, out_row);
}
}

View File

@ -40,7 +40,6 @@ namespace skia {
// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the [begin, end) of the filter.
void ConvolveHorizontally_SSE2(const unsigned char* src_data,
int begin, int end,
const ConvolutionFilter1D& filter,
unsigned char* out_row);
@ -49,7 +48,6 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
// refer to that function for detailed comments.
void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
int begin, int end,
const ConvolutionFilter1D& filter,
unsigned char* out_row[4]);
@ -62,7 +60,7 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
int filter_length,
unsigned char* const* source_data_rows,
int begin, int end,
int pixel_width,
unsigned char* out_row, bool has_alpha);
} // namespace skia

View File

@ -91,7 +91,8 @@ Downscaler::BeginFrame(const nsIntSize& aOriginalSize,
mYFilter.get());
// Allocate the buffer, which contains scanlines of the original image.
mRowBuffer = MakeUnique<uint8_t[]>(mOriginalSize.width * sizeof(uint32_t));
// pad by 15 to handle overreads by the simd code
mRowBuffer = MakeUnique<uint8_t[]>(mOriginalSize.width * sizeof(uint32_t) + 15);
if (MOZ_UNLIKELY(!mRowBuffer)) {
return NS_ERROR_OUT_OF_MEMORY;
}
@ -106,7 +107,8 @@ Downscaler::BeginFrame(const nsIntSize& aOriginalSize,
}
bool anyAllocationFailed = false;
const int rowSize = mTargetSize.width * sizeof(uint32_t);
// pad by 15 to handle overreads by the simd code
const int rowSize = mTargetSize.width * sizeof(uint32_t) + 15;
for (int32_t i = 0; i < mWindowCapacity; ++i) {
mWindow[i] = new uint8_t[rowSize];
anyAllocationFailed = anyAllocationFailed || mWindow[i] == nullptr;