[x265] [PATCH] review: improve filterVertical_p_p and filterHorizontal_p_p
Min Chen
chenm003 at 163.com
Fri Sep 6 15:55:04 CEST 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1378475470 -28800
# Node ID b340a72eb0c7af60ba0473eabb482085221dff7f
# Parent 63364b91b72a183ed18d2e9d22a4e7070b3bae60
review: improve filterVertical_p_p and filterHorizontal_p_p
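
For context: the main change to filterVertical_p_p is to drop the eight per-tap _mm_mullo_epi16 multiplies on zero-extended pixels and instead interleave adjacent source rows byte-wise and feed them to _mm_maddubs_epi16, so one instruction applies two taps at once. A minimal sketch of that pairing for one 8-column group of a 4-tap filter, assuming 8-bit pixels and coefficients that fit in a signed byte; the helper name and signature below are illustrative, not from the patch:

    #include <tmmintrin.h>   // SSSE3: _mm_maddubs_epi16
    #include <stdint.h>

    // Partial sums for 8 output columns of a 4-tap vertical filter.
    // rowK points to 8 unsigned 8-bit pixels of source row K; c[] holds the taps.
    static inline __m128i vtap4_8cols(const uint8_t *row0, const uint8_t *row1,
                                      const uint8_t *row2, const uint8_t *row3,
                                      const int8_t c[4])
    {
        // Interleave row pairs byte-wise: r0[0],r1[0], r0[1],r1[1], ...
        __m128i r01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)row0),
                                        _mm_loadl_epi64((const __m128i*)row1));
        __m128i r23 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)row2),
                                        _mm_loadl_epi64((const __m128i*)row3));

        // Broadcast coefficient pairs to every byte pair: c0,c1,c0,c1,...
        __m128i c01 = _mm_set1_epi16((int16_t)(((uint8_t)c[1] << 8) | (uint8_t)c[0]));
        __m128i c23 = _mm_set1_epi16((int16_t)(((uint8_t)c[3] << 8) | (uint8_t)c[2]));

        // maddubs: u8*s8 + u8*s8 -> s16, i.e. two taps per 16-bit lane
        __m128i t01 = _mm_maddubs_epi16(r01, c01);
        __m128i t23 = _mm_maddubs_epi16(r23, c23);
        return _mm_add_epi16(t01, t23);  // still needs (+offset) >> IF_FILTER_PREC and packus
    }

The patch builds the equivalent coefficient-pair vectors straight from the coeff array with _mm_shuffle_epi32 plus _mm_packs_epi16, and for N == 4 it reuses rows 1..4 to produce a second output row in the same iteration.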
diff -r 63364b91b72a -r b340a72eb0c7 source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc Fri Sep 06 01:45:16 2013 -0500
+++ b/source/common/vec/ipfilter8.inc Fri Sep 06 21:51:10 2013 +0800
@@ -669,366 +669,302 @@
const short *coeff)
{
int offset;
- int shift = IF_FILTER_PREC;
+ const int shift = IF_FILTER_PREC;
src -= (N / 2 - 1) * srcStride;
offset = 1 << (shift - 1);
- __m128i coeffTemp = _mm_loadu_si128((__m128i const*)coeff);
-
- __m128i vm0 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
- __m128i vm1 = _mm_setr_epi8(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3);
- __m128i vm2 = _mm_setr_epi8(4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5);
- __m128i vm3 = _mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7);
- __m128i vm4 = _mm_setr_epi8(8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9);
- __m128i vm5 = _mm_setr_epi8(10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11);
- __m128i vm6 = _mm_setr_epi8(12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13);
- __m128i vm7 = _mm_setr_epi8(14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15);
-
- __m128i coeff0 = _mm_shuffle_epi8(coeffTemp, vm0);
- __m128i coeff1 = _mm_shuffle_epi8(coeffTemp, vm1);
- __m128i coeff2 = _mm_shuffle_epi8(coeffTemp, vm2);
- __m128i coeff3 = _mm_shuffle_epi8(coeffTemp, vm3);
- __m128i coeff4 = _mm_shuffle_epi8(coeffTemp, vm4);
- __m128i coeff5 = _mm_shuffle_epi8(coeffTemp, vm5);
- __m128i coeff6 = _mm_shuffle_epi8(coeffTemp, vm6);
- __m128i coeff7 = _mm_shuffle_epi8(coeffTemp, vm7);
-
- __m128i mask7 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ const __m128i coeffTemp = _mm_loadu_si128((__m128i const*)coeff);
+ const __m128i sumOffset = _mm_set1_epi16(offset);
int row, col;
- for (row = 0; row < height; row++)
+ assert(height % 2 == 0);
+
+ uint32_t leftCols = (8 - (width & 7)) * 8;
+ uint32_t mask_shift = ((uint32_t)~0 >> leftCols);
+ uint32_t mask0 = (width & 7) <= 4 ? mask_shift : ~0;
+ uint32_t mask1 = (width & 7) <= 4 ? 0 : mask_shift;
+ __m128i leftmask = _mm_setr_epi32(mask0, mask1, 0, 0);
+
+ if (N == 8)
{
- for (col = 0; col < (width - 15); col += 16)
+ __m128i vm01 = _mm_shuffle_epi32(coeffTemp, 0x00);
+ __m128i vm23 = _mm_shuffle_epi32(coeffTemp, 0x55);
+ __m128i vm45 = _mm_shuffle_epi32(coeffTemp, 0xAA);
+ __m128i vm67 = _mm_shuffle_epi32(coeffTemp, 0xFF);
+ vm01 = _mm_packs_epi16(vm01, vm01);
+ vm23 = _mm_packs_epi16(vm23, vm23);
+ vm45 = _mm_packs_epi16(vm45, vm45);
+ vm67 = _mm_packs_epi16(vm67, vm67);
+
+ __m128i T00, T01, T02, T03, T04, T05, T06, T07/*, T08*/;
+ __m128i T10, T11, T12, T13;
+ for (row = 0; row < height; row += 1)
{
- __m128i srcCoeff = _mm_loadu_si128((__m128i*)&src[col]);
- __m128i srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff0);
- srcCoeff = _mm_srli_si128(srcCoeff, 8);
- srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T01 = _mm_mullo_epi16(srcCoeff, coeff0);
+ for (col = 0; col < (width & ~7); col += 8)
+ {
+ T00 = _mm_loadl_epi64((__m128i*)&src[(0) * srcStride + col]);
+ T01 = _mm_loadl_epi64((__m128i*)&src[(1) * srcStride + col]);
+ T02 = _mm_loadl_epi64((__m128i*)&src[(2) * srcStride + col]);
+ T03 = _mm_loadl_epi64((__m128i*)&src[(3) * srcStride + col]);
+ T04 = _mm_loadl_epi64((__m128i*)&src[(4) * srcStride + col]);
+ T05 = _mm_loadl_epi64((__m128i*)&src[(5) * srcStride + col]);
+ T06 = _mm_loadl_epi64((__m128i*)&src[(6) * srcStride + col]);
+ T07 = _mm_loadl_epi64((__m128i*)&src[(7) * srcStride + col]);
- srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + srcStride]));
- __m128i srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff1);
- srcCoeff = _mm_srli_si128(srcCoeff, 8);
- srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T11 = _mm_mullo_epi16(srcCoeff, coeff1);
+ T10 = _mm_unpacklo_epi8(T00, T01);
+ T11 = _mm_unpacklo_epi8(T02, T03);
+ T12 = _mm_unpacklo_epi8(T04, T05);
+ T13 = _mm_unpacklo_epi8(T06, T07);
- srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 2 * srcStride]));
- __m128i srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff2);
- srcCoeff = _mm_srli_si128(srcCoeff, 8);
- srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T21 = _mm_mullo_epi16(srcCoeff, coeff2);
-
- srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 3 * srcStride]));
- __m128i srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff3);
- srcCoeff = _mm_srli_si128(srcCoeff, 8);
- srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T31 = _mm_mullo_epi16(srcCoeff, coeff3);
-
- __m128i sum0 = _mm_add_epi16(T00, T10);
- __m128i sum1 = _mm_add_epi16(T20, T30);
- __m128i sumlo = _mm_add_epi16(sum0, sum1);
-
- __m128i sum2 = _mm_add_epi16(T01, T11);
- __m128i sum3 = _mm_add_epi16(T21, T31);
- __m128i sumhi = _mm_add_epi16(sum2, sum3);
-
- if (N == 8)
- {
- srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 4 * srcStride]));
- srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
- T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff4);
- srcCoeff = _mm_srli_si128(srcCoeff, 8);
- srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
- T01 = _mm_mullo_epi16(srcCoeff, coeff4);
-
- srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 5 * srcStride]));
- srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
- T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff5);
- srcCoeff = _mm_srli_si128(srcCoeff, 8);
- srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
- T11 = _mm_mullo_epi16(srcCoeff, coeff5);
-
- srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 6 * srcStride]));
- srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
- T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff6);
- srcCoeff = _mm_srli_si128(srcCoeff, 8);
- srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
- T21 = _mm_mullo_epi16(srcCoeff, coeff6);
-
- srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 7 * srcStride]));
- srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
- T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff7);
- srcCoeff = _mm_srli_si128(srcCoeff, 8);
- srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
- T31 = _mm_mullo_epi16(srcCoeff, coeff7);
-
- sum0 = _mm_add_epi16(T00, T10);
- sum1 = _mm_add_epi16(T20, T30);
- sumlo = _mm_add_epi16(sumlo, _mm_add_epi16(sum0, sum1));
-
- sum2 = _mm_add_epi16(T01, T11);
- sum3 = _mm_add_epi16(T21, T31);
- sumhi = _mm_add_epi16(sumhi, _mm_add_epi16(sum2, sum3));
+ T10 = _mm_maddubs_epi16(T10, vm01);
+ T11 = _mm_maddubs_epi16(T11, vm23);
+ T12 = _mm_maddubs_epi16(T12, vm45);
+ T13 = _mm_maddubs_epi16(T13, vm67);
+ T10 = _mm_add_epi16(T10, T11);
+ T11 = _mm_add_epi16(T12, T13);
+ T10 = _mm_add_epi16(T10, T11);
+ T10 = _mm_srai_epi16(_mm_add_epi16(T10, sumOffset), shift);
+ T10 = _mm_packus_epi16(T10, T10);
+ _mm_storel_epi64((__m128i*)&dst[0 * dstStride + col], T10);
}
- __m128i sumOffset = _mm_set1_epi16(offset);
+ assert((width - col) < 8);
+ if (col != width)
+ {
+ T00 = _mm_loadl_epi64((__m128i*)&src[(0) * srcStride + col]);
+ T01 = _mm_loadl_epi64((__m128i*)&src[(1) * srcStride + col]);
+ T02 = _mm_loadl_epi64((__m128i*)&src[(2) * srcStride + col]);
+ T03 = _mm_loadl_epi64((__m128i*)&src[(3) * srcStride + col]);
+ T04 = _mm_loadl_epi64((__m128i*)&src[(4) * srcStride + col]);
+ T05 = _mm_loadl_epi64((__m128i*)&src[(5) * srcStride + col]);
+ T06 = _mm_loadl_epi64((__m128i*)&src[(6) * srcStride + col]);
+ T07 = _mm_loadl_epi64((__m128i*)&src[(7) * srcStride + col]);
- __m128i val1 = _mm_add_epi16(sumlo, sumOffset);
- val1 = _mm_srai_epi16(val1, shift);
+ T10 = _mm_unpacklo_epi8(T00, T01);
+ T11 = _mm_unpacklo_epi8(T02, T03);
+ T12 = _mm_unpacklo_epi8(T04, T05);
+ T13 = _mm_unpacklo_epi8(T06, T07);
- __m128i val2 = _mm_add_epi16(sumhi, sumOffset);
- val2 = _mm_srai_epi16(val2, shift);
+ T10 = _mm_maddubs_epi16(T10, vm01);
+ T11 = _mm_maddubs_epi16(T11, vm23);
+ T12 = _mm_maddubs_epi16(T12, vm45);
+ T13 = _mm_maddubs_epi16(T13, vm67);
+ T10 = _mm_add_epi16(T10, T11);
+ T11 = _mm_add_epi16(T12, T13);
+ T10 = _mm_add_epi16(T10, T11);
+ T10 = _mm_srai_epi16(_mm_add_epi16(T10, sumOffset), shift);
+ T10 = _mm_packus_epi16(T10, T10);
+ _mm_maskmoveu_si128(T10, leftmask, (char*)&dst[(0) * dstStride + col]);
+ }
- __m128i res = _mm_packus_epi16(val1, val2);
- _mm_storeu_si128((__m128i*)&dst[col], res);
- }
+ src += 1 * srcStride;
+ dst += 1 * dstStride;
+ } // end of row loop
+ } // end of N==8
- for (; col < (width - 7); col += 8)
+ if (N == 4)
+ {
+ __m128i vm01 = _mm_shuffle_epi32(coeffTemp, 0x00);
+ __m128i vm23 = _mm_shuffle_epi32(coeffTemp, 0x55);
+ vm01 = _mm_packs_epi16(vm01, vm01);
+ vm23 = _mm_packs_epi16(vm23, vm23);
+
+ __m128i T00, T01, T02, T03, T04;
+ __m128i T10, T11;
+ __m128i T20, T21;
+ for (row = 0; row < height; row += 2)
{
- __m128i srcCoeff = _mm_loadl_epi64((__m128i*)&src[col]);
- __m128i srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff0);
+ for (col = 0; col < (width & ~7); col += 8)
+ {
+ T00 = _mm_loadl_epi64((__m128i*)&src[(0) * srcStride + col]);
+ T01 = _mm_loadl_epi64((__m128i*)&src[(1) * srcStride + col]);
+ T02 = _mm_loadl_epi64((__m128i*)&src[(2) * srcStride + col]);
+ T03 = _mm_loadl_epi64((__m128i*)&src[(3) * srcStride + col]);
+ T04 = _mm_loadl_epi64((__m128i*)&src[(4) * srcStride + col]);
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + srcStride]));
- __m128i srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff1);
+ T10 = _mm_unpacklo_epi8(T00, T01);
+ T11 = _mm_unpacklo_epi8(T02, T03);
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 2 * srcStride]));
- __m128i srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff2);
+ T10 = _mm_maddubs_epi16(T10, vm01);
+ T11 = _mm_maddubs_epi16(T11, vm23);
+ T10 = _mm_add_epi16(T10, T11);
+ T10 = _mm_srai_epi16(_mm_add_epi16(T10, sumOffset), shift);
+ T10 = _mm_packus_epi16(T10, T10);
+ _mm_storel_epi64((__m128i*)&dst[0 * dstStride + col], T10);
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 3 * srcStride]));
- __m128i srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff3);
+ T20 = _mm_unpacklo_epi8(T01, T02);
+ T21 = _mm_unpacklo_epi8(T03, T04);
- __m128i sum0 = _mm_add_epi16(T00, T10);
- __m128i sum1 = _mm_add_epi16(T20, T30);
- __m128i sumlo = _mm_add_epi16(sum0, sum1);
+ T20 = _mm_maddubs_epi16(T20, vm01);
+ T21 = _mm_maddubs_epi16(T21, vm23);
+ T20 = _mm_add_epi16(T20, T21);
+ T20 = _mm_srai_epi16(_mm_add_epi16(T20, sumOffset), shift);
+ T20 = _mm_packus_epi16(T20, T20);
+ _mm_storel_epi64((__m128i*)&dst[1 * dstStride + col], T20);
+ }
- if (N == 8)
+ assert((width - col) < 8);
+ if (col != width)
{
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 4 * srcStride]));
- srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
- T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff4);
+ T00 = _mm_loadl_epi64((__m128i*)&src[(0) * srcStride + col]);
+ T01 = _mm_loadl_epi64((__m128i*)&src[(1) * srcStride + col]);
+ T02 = _mm_loadl_epi64((__m128i*)&src[(2) * srcStride + col]);
+ T03 = _mm_loadl_epi64((__m128i*)&src[(3) * srcStride + col]);
+ T04 = _mm_loadl_epi64((__m128i*)&src[(4) * srcStride + col]);
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 5 * srcStride]));
- srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
- T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff5);
+ T10 = _mm_unpacklo_epi8(T00, T01);
+ T11 = _mm_unpacklo_epi8(T02, T03);
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 6 * srcStride]));
- srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
- T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff6);
+ T10 = _mm_maddubs_epi16(T10, vm01);
+ T11 = _mm_maddubs_epi16(T11, vm23);
+ T10 = _mm_add_epi16(T10, T11);
+ T10 = _mm_srai_epi16(_mm_add_epi16(T10, sumOffset), shift);
+ T10 = _mm_packus_epi16(T10, T10);
+ _mm_maskmoveu_si128(T10, leftmask, (char*)&dst[(0) * dstStride + col]);
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 7 * srcStride]));
- srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
- T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff7);
+ T20 = _mm_unpacklo_epi8(T01, T02);
+ T21 = _mm_unpacklo_epi8(T03, T04);
- sum0 = _mm_add_epi16(T00, T10);
- sum1 = _mm_add_epi16(T20, T30);
- sumlo = _mm_add_epi16(sumlo, _mm_add_epi16(sum0, sum1));
+ T20 = _mm_maddubs_epi16(T20, vm01);
+ T21 = _mm_maddubs_epi16(T21, vm23);
+ T20 = _mm_add_epi16(T20, T21);
+ T20 = _mm_srai_epi16(_mm_add_epi16(T20, sumOffset), shift);
+ T20 = _mm_packus_epi16(T20, T20);
+ _mm_maskmoveu_si128(T20, leftmask, (char*)&dst[(1) * dstStride + col]);
}
- __m128i zero = _mm_set1_epi16(0);
- __m128i sumOffset = _mm_set1_epi16(offset);
- __m128i val1 = _mm_add_epi16(sumlo, sumOffset);
- val1 = _mm_srai_epi16(val1, shift);
-
- __m128i res = _mm_packus_epi16(val1, zero);
- _mm_storel_epi64((__m128i*)&dst[col], res);
- }
-
- for (; col < width; col += 8)
- {
- __m128i srcCoeff = _mm_loadl_epi64((__m128i*)&src[col]);
- __m128i srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff0);
-
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + srcStride]));
- __m128i srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff1);
-
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 2 * srcStride]));
- __m128i srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff2);
-
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 3 * srcStride]));
- __m128i srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
- __m128i T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff3);
-
- __m128i sum0 = _mm_add_epi16(T00, T10);
- __m128i sum1 = _mm_add_epi16(T20, T30);
- __m128i sumlo = _mm_add_epi16(sum0, sum1);
-
- if (N == 8)
- {
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 4 * srcStride]));
- srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
- T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff4);
-
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 5 * srcStride]));
- srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
- T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff5);
-
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 6 * srcStride]));
- srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
- T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff6);
-
- srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 7 * srcStride]));
- srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
- T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff7);
-
- sum0 = _mm_add_epi16(T00, T10);
- sum1 = _mm_add_epi16(T20, T30);
- sumlo = _mm_add_epi16(sumlo, _mm_add_epi16(sum0, sum1));
- }
- __m128i zero = _mm_set1_epi16(0);
- __m128i sumOffset = _mm_set1_epi16(offset);
-
- __m128i val1 = _mm_add_epi16(sumlo, sumOffset);
- val1 = _mm_srai_epi16(val1, shift);
-
- __m128i res = _mm_packus_epi16(val1, zero);
-
- int n = width - col;
- __m128i mask1, mask2, mask3, mask4, mask5, mask6;
-
- switch (n) // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst
- {
- case 1: mask1 = _mm_srli_si128(mask7, 6);
- _mm_maskmoveu_si128(res, mask1, (char*)&dst[col]);
- break;
-
- case 2: mask2 = _mm_srli_si128(mask7, 5);
- _mm_maskmoveu_si128(res, mask2, (char*)&dst[col]);
- break;
-
- case 3: mask3 = _mm_srli_si128(mask7, 4);
- _mm_maskmoveu_si128(res, mask3, (char*)&dst[col]);
- break;
-
- case 4: mask4 = _mm_srli_si128(mask7, 3);
- _mm_maskmoveu_si128(res, mask4, (char*)&dst[col]);
- break;
-
- case 5: mask5 = _mm_srli_si128(mask7, 2);
- _mm_maskmoveu_si128(res, mask5, (char*)&dst[col]);
- break;
-
- case 6: mask6 = _mm_srli_si128(mask7, 1);
- _mm_maskmoveu_si128(res, mask6, (char*)&dst[col]);
- break;
-
- case 7: _mm_maskmoveu_si128(res, mask7, (char*)&dst[col]);
- break;
- }
- }
-
- src += srcStride;
- dst += dstStride;
- }
+ src += 2 * srcStride;
+ dst += 2 * dstStride;
+ } // end of row loop
+ } // end of N==4
}
#endif /* if INSTRSET >= X265_CPU_LEVEL_SSSE3 */
#if INSTRSET >= X265_CPU_LEVEL_SSE41
+ALIGN_VAR_32(const uint8_t, Tm[][16]) =
+{
+ // TODO: merge row0-3 into ipfilterH_0[0-3]
+ {0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8},
+ {2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10},
+ {4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12},
+ {6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14},
+ {0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6},
+ {4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10}
+};
+
template<int N>
void filterHorizontal_p_p(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
{
int row, col;
- int offset;
+ uint32_t offset;
short maxVal;
- int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
+ const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
offset = (1 << (headRoom - 1));
maxVal = (1 << X265_DEPTH) - 1;
src -= (N / 2 - 1);
__m128i a = _mm_loadu_si128((__m128i*)coeff);
- __m128i T10 = _mm_packs_epi16(a, a);
+ __m128i coef2 = _mm_packs_epi16(a, a);
+ __m128i sumOffset = _mm_shuffle_epi32(_mm_cvtsi32_si128(offset), 0);
+ sumOffset = _mm_packs_epi16(sumOffset, sumOffset);
- __m128i S1 = _mm_slli_si128(T10, 12);
- __m128i S2 = _mm_srli_si128(S1, 4);
- __m128i S3 = _mm_srli_si128(S2, 4);
- __m128i S4 = _mm_srli_si128(S3, 4);
- __m128i S = _mm_add_epi8(S1, _mm_add_epi8(S2, S3));
- S = _mm_add_epi8(S, S4);
+ const __m128i S = _mm_shuffle_epi32(coef2, 0);
- __m128i Tm1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8);
- __m128i Tm2 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10);
- __m128i Tm3 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12);
- __m128i Tm4 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14);
- __m128i Tm5 = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
- __m128i Tm6 = _mm_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10);
+ uint32_t leftCols = (8 - (width & 7)) * 8;
+ uint32_t mask_shift = ((uint32_t)~0 >> leftCols);
+ uint32_t mask0 = (width & 7) <= 4 ? mask_shift : ~0;
+ uint32_t mask1 = (width & 7) <= 4 ? 0 : mask_shift;
+ __m128i leftmask = _mm_setr_epi32(mask0, mask1, 0, 0);
+ // TODO: unroll
for (row = 0; row < height; row++)
{
- col = 0;
- for (; col < (width - 7); col += 8)
+ for (col = 0; col < (width & ~7); col += 8)
{
+ __m128i sum;
__m128i srcCoeff = _mm_loadu_si128((__m128i*)(src + col));
- __m128i sum;
if (N == 4)
{
- __m128i T00 = _mm_shuffle_epi8(srcCoeff, Tm5);
+ __m128i T00 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[4]));
__m128i T20 = _mm_maddubs_epi16(T00, S);
- __m128i T30 = _mm_shuffle_epi8(srcCoeff, Tm6);
+ __m128i T30 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[5]));
__m128i T40 = _mm_maddubs_epi16(T30, S);
sum = _mm_hadd_epi16(T20, T40);
}
-
else // (N == 8)
{
- __m128i T00 = _mm_shuffle_epi8(srcCoeff, Tm1);
- __m128i T20 = _mm_maddubs_epi16(T00, T10);
+ __m128i T00 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[0]));
+ __m128i T20 = _mm_maddubs_epi16(T00, coef2);
- __m128i T30 = _mm_shuffle_epi8(srcCoeff, Tm2);
- __m128i T40 = _mm_maddubs_epi16(T30, T10);
+ __m128i T30 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[1]));
+ __m128i T40 = _mm_maddubs_epi16(T30, coef2);
- __m128i T50 = _mm_shuffle_epi8(srcCoeff, Tm3);
- __m128i T60 = _mm_maddubs_epi16(T50, T10);
+ __m128i T50 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[2]));
+ __m128i T60 = _mm_maddubs_epi16(T50, coef2);
- __m128i T70 = _mm_shuffle_epi8(srcCoeff, Tm4);
- __m128i T80 = _mm_maddubs_epi16(T70, T10);
+ __m128i T70 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[3]));
+ __m128i T80 = _mm_maddubs_epi16(T70, coef2);
__m128i s1 = _mm_hadd_epi16(T20, T40);
__m128i s2 = _mm_hadd_epi16(T60, T80);
sum = _mm_hadd_epi16(s1, s2);
}
- __m128i sumOffset = _mm_set1_epi16(offset);
- __m128i zero = _mm_set1_epi16(0);
__m128i val = _mm_add_epi16(sum, sumOffset);
-
val = _mm_srai_epi16(val, headRoom);
- val = _mm_packus_epi16(val, zero);
+ val = _mm_packus_epi16(val, val);
_mm_storel_epi64((__m128i*)&dst[col], val);
}
- for (; col < width; col++) // Remaining iterations
+ assert((width - col) < 8);
+
+ if (col != width)
{
- __m128i NewSrc = _mm_loadl_epi64((__m128i*)(src + col));
- __m128i T00 = _mm_maddubs_epi16(NewSrc, T10);
- __m128i add = _mm_hadd_epi16(T00, T00);
- short sum = _mm_extract_epi16(add, 0);
+ __m128i sum;
+ __m128i srcCoeff = _mm_loadu_si128((__m128i*)(src + col));
- if (N == 8)
+ if (N == 4)
{
- add = _mm_hadd_epi16(add, add);
- sum = _mm_extract_epi16(add, 0);
+ __m128i T00 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[4]));
+ __m128i T20 = _mm_maddubs_epi16(T00, S);
+
+ __m128i T30 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[5]));
+ __m128i T40 = _mm_maddubs_epi16(T30, S);
+
+ sum = _mm_hadd_epi16(T20, T40);
}
- short val = (short)(sum + offset) >> headRoom;
- val = (val < 0) ? 0 : val;
- val = (val > maxVal) ? maxVal : val;
- dst[col] = (pixel)val;
+ else // (N == 8)
+ {
+ __m128i T00 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[0]));
+ __m128i T20 = _mm_maddubs_epi16(T00, coef2);
+
+ __m128i T30 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[1]));
+ __m128i T40 = _mm_maddubs_epi16(T30, coef2);
+
+ __m128i T50 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[2]));
+ __m128i T60 = _mm_maddubs_epi16(T50, coef2);
+
+ __m128i T70 = _mm_shuffle_epi8(srcCoeff, _mm_load_si128((__m128i*)Tm[3]));
+ __m128i T80 = _mm_maddubs_epi16(T70, coef2);
+
+ __m128i s1 = _mm_hadd_epi16(T20, T40);
+ __m128i s2 = _mm_hadd_epi16(T60, T80);
+ sum = _mm_hadd_epi16(s1, s2);
+ }
+
+ __m128i val = _mm_add_epi16(sum, sumOffset);
+ val = _mm_srai_epi16(val, headRoom);
+ val = _mm_packus_epi16(val, val);
+
+ // TODO: optimize: in real encodes the remaining width here is always 4
+ _mm_maskmoveu_si128(val, leftmask, (char*)&dst[col]);
}
src += srcStride;
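
A note on the tail handling above: both routines now finish the width % 8 remainder with a single _mm_maskmoveu_si128 through the precomputed leftmask instead of a seven-way switch over shifted masks. The patch derives that mask with a 32-bit right shift whose count reaches 32..56 when (width & 7) <= 4 (and 64 when width is a multiple of 8, though the mask is unused then); x86's SHR masks the count so this works in practice, but it is formally undefined behavior in C. A portable way to build the same mask, shown only as an illustrative sketch and not as the patch's code:

    #include <emmintrin.h>   // SSE2: _mm_maskmoveu_si128, _mm_loadl_epi64
    #include <stdint.h>

    // Mask whose first r bytes (r = 0..7) have their MSB set, for _mm_maskmoveu_si128.
    static inline __m128i partial_store_mask(int r)
    {
        uint64_t lo = (r <= 0) ? 0 : (~0ULL >> ((8 - r) * 8));  // r low bytes = 0xFF
        return _mm_loadl_epi64((const __m128i*)&lo);            // upper 64 bits zeroed
    }

    // Write only the first r bytes of v to dst; bytes whose mask MSB is clear are skipped.
    static inline void store_partial8(uint8_t *dst, __m128i v, int r)
    {
        _mm_maskmoveu_si128(v, partial_store_mask(r), (char*)dst);
    }

Worth remembering that MASKMOVDQU stores carry a non-temporal hint, so masked tails that are re-read shortly afterwards may end up slower than a scalar fallback.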
diff -r 63364b91b72a -r b340a72eb0c7 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Fri Sep 06 01:45:16 2013 -0500
+++ b/source/test/ipfilterharness.cpp Fri Sep 06 21:51:10 2013 +0800
@@ -78,7 +78,10 @@
{
int rand_height = rand() % 100; // Randomly generated Height
int rand_width = rand() % 100; // Randomly generated Width
- short rand_val, rand_srcStride, rand_dstStride;
+ int rand_val, rand_srcStride, rand_dstStride;
+
+ if (rand_height % 2)
+ rand_height++;
for (int i = 0; i <= 100; i++)
{
@@ -89,6 +92,12 @@
rand_srcStride = rand() % 100; // Randomly generated srcStride
rand_dstStride = rand() % 100; // Randomly generated dstStride
+ if (rand_srcStride < rand_width)
+ rand_srcStride = rand_width;
+
+ if (rand_dstStride < rand_width)
+ rand_dstStride = rand_width;
+
opt(pixel_buff + 3 * rand_srcStride,
rand_srcStride,
IPF_vec_output_p,
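
The harness tweaks above keep the randomized strides at least as wide as the block and round the random height up to even, matching the assert(height % 2 == 0) added to the vertical kernel, whose N == 4 path now emits two output rows per iteration. For reference, the scalar computation the vertical kernel is meant to match looks roughly like this (a sketch assuming 8-bit pixels and IF_FILTER_PREC == 6; not x265's actual C primitive):

    #include <stdint.h>

    // Scalar N-tap vertical filter on 8-bit pixels:
    // dst[x] = clip255((sum_k src[(k - N/2 + 1) * srcStride + x] * coeff[k] + 32) >> 6)
    static void filterVertical_ref(const uint8_t *src, intptr_t srcStride,
                                   uint8_t *dst, intptr_t dstStride,
                                   int width, int height,
                                   const int16_t *coeff, int N)
    {
        const int shift  = 6;                 // IF_FILTER_PREC
        const int offset = 1 << (shift - 1);
        src -= (N / 2 - 1) * srcStride;       // center the taps, as the SIMD code does

        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int sum = 0;
                for (int k = 0; k < N; k++)
                    sum += src[k * srcStride + x] * coeff[k];
                sum = (sum + offset) >> shift;
                dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
            }
            src += srcStride;
            dst += dstStride;
        }
    }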