[x265] [PATCH] Close to +2x performance improvement for filterVertical_s_p
Steve Borho
steve at borho.org
Thu Aug 1 08:44:34 CEST 2013
On Thu, Aug 1, 2013 at 1:03 AM, <praveen at multicorewareinc.com> wrote:
> # HG changeset patch
> # User praveentiwari
> # Date 1375336983 -19800
> # Node ID 3b15b5834e894b9869d86280a93ebdf154057912
> # Parent 8eebc88f86e03f61d865c1211cee3247df1abb95
> Close to +2x performance improvement for filterVertical_s_p
>
Looks great; I'll push this after some quick smoke tests.
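
For anyone else reviewing: the scalar reference below is just my own sketch of what this kernel computes per output pixel, using the same offset/shift/clamp setup as the function in the patch (it is not code from the patch). It is what I checked the intrinsics against:

    // Reference-only sketch: N-tap vertical filter from 16-bit intermediates
    // to clamped pixels; src has already been rewound by (N / 2 - 1) rows.
    for (int row = 0; row < height; row++)
    {
        for (int col = 0; col < width; col++)
        {
            int sum = 0;
            for (int i = 0; i < N; i++)                 // N is 4 or 8 taps
                sum += src[col + i * srcStride] * coeff[i];

            int val = (sum + offset) >> shift;          // offset/shift as set up in the function
            if (val < 0)      val = 0;                  // clamp to [0, maxVal]
            if (val > maxVal) val = maxVal;
            dst[col] = (pixel)val;
        }

        src += srcStride;
        dst += dstStride;
    }
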
> diff -r 8eebc88f86e0 -r 3b15b5834e89 source/common/vec/ipfilter.inc
> --- a/source/common/vec/ipfilter.inc Wed Jul 31 15:43:37 2013 -0500
> +++ b/source/common/vec/ipfilter.inc Thu Aug 01 11:33:03 2013 +0530
> @@ -51,8 +51,10 @@
> p.ipfilter_pp[FILTER_V_P_P_4] = filterVertical_p_p<4>;
> p.ipfilter_pp[FILTER_V_P_P_8] = filterVertical_p_p<8>;
>
> +#if INSTRSET >= 4
> p.ipfilter_sp[FILTER_V_S_P_4] = filterVertical_s_p<4>;
> p.ipfilter_sp[FILTER_V_S_P_8] = filterVertical_s_p<8>;
> +#endif
>
> p.ipfilter_p2s = filterConvertPelToShort;
> p.ipfilter_s2p = filterConvertShortToPel;
> diff -r 8eebc88f86e0 -r 3b15b5834e89 source/common/vec/ipfilter8.inc
> --- a/source/common/vec/ipfilter8.inc Wed Jul 31 15:43:37 2013 -0500
> +++ b/source/common/vec/ipfilter8.inc Thu Aug 01 11:33:03 2013 +0530
> @@ -24,14 +24,12 @@
> * For more information, contact us at licensing at multicorewareinc.com.
>
> *****************************************************************************/
>
> +#if INSTRSET >= 4
> template<int N>
> -void filterVertical_s_p(short *src, int srcStride,
> - pixel *dst, int dstStride, int block_width,
> - int block_height, const short *coeff)
> +void filterVertical_s_p(short *src, int srcStride, pixel *dst, int dstStride, int width, int height, short const *coeff)
> {
> - int row, col;
> + src -= (N / 2 - 1) * srcStride;
>
> - src -= (N / 2 - 1) * srcStride;
> int offset;
> short maxVal;
> int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
> @@ -40,166 +38,188 @@
> offset = 1 << (shift - 1);
> offset += IF_INTERNAL_OFFS << IF_FILTER_PREC;
> maxVal = (1 << X265_DEPTH) - 1;
> - Vec4i cm0(coeff[0]), cm1(coeff[1]), cm2(coeff[2]), cm3(coeff[3]), cm4(coeff[4]), cm5(coeff[5]), cm6(coeff[6]), cm7(coeff[7]);
> - Vec16uc sum_uc;
> - Vec8s vec_zero(0);
>
> - for (row = 0; row < block_height; row++)
> + __m128i filterCoeff0 = _mm_set1_epi32(coeff[0]);
> + __m128i filterCoeff1 = _mm_set1_epi32(coeff[1]);
> + __m128i filterCoeff2 = _mm_set1_epi32(coeff[2]);
> + __m128i filterCoeff3 = _mm_set1_epi32(coeff[3]);
> + __m128i filterCoeff4 = _mm_set1_epi32(coeff[4]);
> + __m128i filterCoeff5 = _mm_set1_epi32(coeff[5]);
> + __m128i filterCoeff6 = _mm_set1_epi32(coeff[6]);
> + __m128i filterCoeff7 = _mm_set1_epi32(coeff[7]);
> +
> + int row, col;
> +
> + for (row = 0; row < height; row++)
> {
> - for (col = 0; col < block_width - 7; col += 8)
> + col = 0;
> + for (; col < (width - 7); col += 8)
> {
> - Vec8s row0, row1, row2, row3, row4, row5, row6, row7, sum;
> - Vec4i row0_first, row0_last, row1_first, row1_last, sum_first, sum_last;
> - Vec4i c0, c1, c2, c3, c4, c5, c6, c7;
> + __m128i srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col]));
> + __m128i srcCoeffTemp1 = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T00 = _mm_mullo_epi32(srcCoeffTemp1, filterCoeff0);
> + srcCoeff = _mm_srli_si128(srcCoeff, 8);
> + srcCoeff = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T01 = _mm_mullo_epi32(srcCoeff, filterCoeff0);
>
> - row0.load(&src[col]);
> - row1.load(&src[col + srcStride]);
> + srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + srcStride]));
> + __m128i srcCoeffTemp2 = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T10 = _mm_mullo_epi32(srcCoeffTemp2, filterCoeff1);
> + srcCoeff = _mm_srli_si128(srcCoeff, 8);
> + srcCoeff = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T11 = _mm_mullo_epi32(srcCoeff, filterCoeff1);
>
> - c0 = cm0;
> - c1 = cm1;
> + srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 2 * srcStride]));
> + __m128i srcCoeffTemp3 = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T20 = _mm_mullo_epi32(srcCoeffTemp3, filterCoeff2);
> + srcCoeff = _mm_srli_si128(srcCoeff, 8);
> + srcCoeff = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T21 = _mm_mullo_epi32(srcCoeff, filterCoeff2);
>
> - row0_first = extend_low(row0);
> - row1_first = extend_low(row1);
> - row0_last = extend_high(row0);
> - row1_last = extend_high(row1);
> + srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 3 * srcStride]));
> + __m128i srcCoeffTemp4 = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T30 = _mm_mullo_epi32(srcCoeffTemp4, filterCoeff3);
> + srcCoeff = _mm_srli_si128(srcCoeff, 8);
> + srcCoeff = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T31 = _mm_mullo_epi32(srcCoeff, filterCoeff3);
>
> - row0_first = row0_first * c0;
> - row1_first = row1_first * c1;
> - row0_last = row0_last * c0;
> - row1_last = row1_last * c1;
> + __m128i sum01 = _mm_add_epi32(T00, T10);
> + __m128i sum23 = _mm_add_epi32(T20, T30);
> + __m128i sumlo0123 = _mm_add_epi32(sum01, sum23);
>
> - sum_first = row0_first + row1_first;
> - sum_last = row0_last + row1_last;
> -
> - row2.load(&src[col + 2 * srcStride]);
> - row3.load(&src[col + 3 * srcStride]);
> -
> - c2 = cm2;
> - c3 = cm3;
> -
> - row0_first = extend_low(row2);
> - row0_last = extend_high(row2);
> - row0_first = row0_first * c2;
> - row0_last = row0_last * c2;
> - row1_first = extend_low(row3);
> - row1_last = extend_high(row3);
> - row1_first = row1_first * c3;
> - row1_last = row1_last * c3;
> - sum_first += row0_first + row1_first;
> - sum_last += row0_last + row1_last;
> + __m128i sum45 = _mm_add_epi32(T01, T11);
> + __m128i sum67 = _mm_add_epi32(T21, T31);
> + __m128i sumhi0123 = _mm_add_epi32(sum45, sum67);
>
> if (N == 8)
> {
> - row4.load(&src[col + 4 * srcStride]);
> - row5.load(&src[col + 5 * srcStride]);
> + srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 4 * srcStride]));
> + srcCoeffTemp1 = _mm_cvtepi16_epi32(srcCoeff);
> + T00 = _mm_mullo_epi32(srcCoeffTemp1, filterCoeff4);
> + srcCoeff = _mm_srli_si128(srcCoeff, 8);
> + srcCoeff = _mm_cvtepi16_epi32(srcCoeff);
> + T01 = _mm_mullo_epi32(srcCoeff, filterCoeff4);
>
> - c4 = cm4;
> - c5 = cm5;
> + srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 5 * srcStride]));
> + srcCoeffTemp2 = _mm_cvtepi16_epi32(srcCoeff);
> + T10 = _mm_mullo_epi32(srcCoeffTemp2, filterCoeff5);
> + srcCoeff = _mm_srli_si128(srcCoeff, 8);
> + srcCoeff = _mm_cvtepi16_epi32(srcCoeff);
> + T11 = _mm_mullo_epi32(srcCoeff, filterCoeff5);
>
> - row0_first = extend_low(row4);
> - row0_last = extend_high(row4);
> - row0_first = row0_first * c4;
> - row0_last = row0_last * c4;
> - row1_first = extend_low(row5);
> - row1_last = extend_high(row5);
> - row1_first = row1_first * c5;
> - row1_last = row1_last * c5;
> - sum_first += row0_first + row1_first;
> - sum_last += row0_last + row1_last;
> + srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 6 * srcStride]));
> + srcCoeffTemp3 = _mm_cvtepi16_epi32(srcCoeff);
> + T20 = _mm_mullo_epi32(srcCoeffTemp3, filterCoeff6);
> + srcCoeff = _mm_srli_si128(srcCoeff, 8);
> + srcCoeff = _mm_cvtepi16_epi32(srcCoeff);
> + T21 = _mm_mullo_epi32(srcCoeff, filterCoeff6);
>
> - row6.load(&src[col + 6 * srcStride]);
> - row7.load(&src[col + 7 * srcStride]);
> + srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 7 * srcStride]));
> + srcCoeffTemp4 = _mm_cvtepi16_epi32(srcCoeff);
> + T30 = _mm_mullo_epi32(srcCoeffTemp4, filterCoeff7);
> + srcCoeff = _mm_srli_si128(srcCoeff, 8);
> + srcCoeff = _mm_cvtepi16_epi32(srcCoeff);
> + T31 = _mm_mullo_epi32(srcCoeff, filterCoeff7);
>
> - c6 = cm6;
> - c7 = cm7;
> + sum01 = _mm_add_epi32(T00, T10);
> + sum23 = _mm_add_epi32(T20, T30);
> + sumlo0123 = _mm_add_epi32(sumlo0123, _mm_add_epi32(sum01, sum23));
>
> - row0_first = extend_low(row6);
> - row0_last = extend_high(row6);
> - row0_first = row0_first * c6;
> - row0_last = row0_last * c6;
> - row1_first = extend_low(row7);
> - row1_last = extend_high(row7);
> - row1_first = row1_first * c7;
> - row1_last = row1_last * c7;
> - sum_first += row0_first + row1_first;
> - sum_last += row0_last + row1_last;
> + sum45 = _mm_add_epi32(T01, T11);
> + sum67 = _mm_add_epi32(T21, T31);
> + sumhi0123 = _mm_add_epi32(sumhi0123, _mm_add_epi32(sum45, sum67));
> }
> - sum_first = (sum_first + offset) >> shift;
> - sum_last = (sum_last + offset) >> shift;
> - Vec4i zero(0);
> - sum = compress(sum_first, sum_last);
> - sum = max(sum, 0);
> - Vec8s maxVal_v(maxVal);
> - sum = min(sum, maxVal_v);
> - sum_uc = compress(sum, vec_zero);
> - sum_uc.store_partial(8, dst + col);
> + __m128i zero = _mm_set1_epi16(0);
> + __m128i sumOffset = _mm_set1_epi32(offset);
> +
> + __m128i val1 = _mm_add_epi32(sumlo0123, sumOffset);
> + val1 = _mm_srai_epi32(val1, shift);
> +
> + __m128i val2 = _mm_add_epi32(sumhi0123, sumOffset);
> + val2 = _mm_srai_epi32(val2, shift);
> +
> + __m128i val = _mm_packs_epi32(val1, val2);
> + __m128i res = _mm_packus_epi16(val, zero);
> + _mm_storel_epi64((__m128i*)&dst[col], res);
> }
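
One note on the store path, nothing that needs changing: the old code clamped explicitly with max(sum, 0) / min(sum, maxVal), while here the clamp comes from saturation in the pack instructions, so as far as I can see maxVal is no longer referenced. If I am reading it right, the scalar equivalent of the packs/packus pair is:

    // _mm_packs_epi32 saturates the 32-bit sums to [-32768, 32767], then
    // _mm_packus_epi16 saturates to [0, 255]; for 8-bit pixels (X265_DEPTH == 8,
    // so maxVal == 255) this matches the old explicit clamp.
    int t = (sum + offset) >> shift;
    if (t < 0)   t = 0;
    if (t > 255) t = 255;
    dst[col] = (pixel)t;
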
>
> - //Handle the case when block_width is not multiple of 8
> - for (; col < block_width; col += 4)
> + for (; col < width; col += 4)
> {
> - Vec8s row0, row1, row2, row3, row4, row5, row6, row7, sum;
> - Vec4i row0_first, row0_last, row1_first, row1_last, sum_first, sum_last;
> - Vec4i c0, c1, c2, c3, c4, c5, c6, c7;
> + __m128i srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col]));
> + __m128i srcCoeffTemp1 = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T00 = _mm_mullo_epi32(srcCoeffTemp1, filterCoeff0);
>
> - row0.load(&src[col]);
> - row1.load(&src[col + srcStride]);
> + srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + srcStride]));
> + __m128i srcCoeffTemp2 = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T10 = _mm_mullo_epi32(srcCoeffTemp2, filterCoeff1);
>
> - c0 = cm0;
> - c1 = cm1;
> + srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 2 * srcStride]));
> + __m128i srcCoeffTemp3 = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T20 = _mm_mullo_epi32(srcCoeffTemp3, filterCoeff2);
>
> - row0_first = extend_low(row0);
> - row1_first = extend_low(row1);
> - row0_first = row0_first * c0;
> - row1_first = row1_first * c1;
> + srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 3 * srcStride]));
> + __m128i srcCoeffTemp4 = _mm_cvtepi16_epi32(srcCoeff);
> + __m128i T30 = _mm_mullo_epi32(srcCoeffTemp4, filterCoeff3);
>
> - sum_first = row0_first + row1_first;
> + __m128i sum01 = _mm_add_epi32(T00, T10);
> + __m128i sum23 = _mm_add_epi32(T20, T30);
> + __m128i sumlo0123 = _mm_add_epi32(sum01, sum23);
>
> - row2.load(&src[col + 2 * srcStride]);
> - row3.load(&src[col + 3 * srcStride]);
> -
> - c2 = cm2;
> - c3 = cm3;
> -
> - row0_first = extend_low(row2);
> - row0_first = row0_first * c2;
> - row1_first = extend_low(row3);
> - row1_first = row1_first * c3;
> - sum_first += row0_first + row1_first;
> if (N == 8)
> {
> - row4.load(&src[col + 4 * srcStride]);
> - row5.load(&src[col + 5 * srcStride]);
> + srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 4 * srcStride]));
> + srcCoeffTemp1 = _mm_cvtepi16_epi32(srcCoeff);
> + T00 = _mm_mullo_epi32(srcCoeffTemp1, filterCoeff4);
>
> - c4 = cm4;
> - c5 = cm5;
> + srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 5 * srcStride]));
> + srcCoeffTemp2 = _mm_cvtepi16_epi32(srcCoeff);
> + T10 = _mm_mullo_epi32(srcCoeffTemp2, filterCoeff5);
>
> - row0_first = extend_low(row4);
> - row0_first = row0_first * c4;
> - row1_first = extend_low(row5);
> - row1_first = row1_first * c5;
> - sum_first += row0_first + row1_first;
> + srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 6 * srcStride]));
> + srcCoeffTemp3 = _mm_cvtepi16_epi32(srcCoeff);
> + T20 = _mm_mullo_epi32(srcCoeffTemp3, filterCoeff6);
>
> - row6.load(&src[col + 6 * srcStride]);
> - row7.load(&src[col + 7 * srcStride]);
> + srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 7 * srcStride]));
> + srcCoeffTemp4 = _mm_cvtepi16_epi32(srcCoeff);
> + T30 = _mm_mullo_epi32(srcCoeffTemp4, filterCoeff7);
>
> - c6 = cm6;
> - c7 = cm7;
> + sum01 = _mm_add_epi32(T00, T10);
> + sum23 = _mm_add_epi32(T20, T30);
> + sumlo0123 = _mm_add_epi32(sumlo0123, _mm_add_epi32(sum01, sum23));
> + }
>
> - row0_first = extend_low(row6);
> - row0_first = row0_first * c6;
> - row1_first = extend_low(row7);
> - row1_first = row1_first * c7;
> - sum_first += row0_first + row1_first;
> + __m128i zero16 = _mm_set1_epi16(0);
> + __m128i zero32 = _mm_set1_epi32(0);
> + __m128i sumOffset = _mm_set1_epi32(offset);
> +
> + __m128i val1 = _mm_add_epi32(sumlo0123, sumOffset);
> + val1 = _mm_srai_epi32(val1, shift);
> +
> + __m128i val = _mm_packs_epi32(val1, zero32);
> + __m128i res = _mm_packus_epi16(val, zero16);
> +
> + int n = width - col;
> +
> + switch (n) // store either 1, 2, 3 or 4 8-bit results in dst
> + {
> + case 1: dst[col] = _mm_extract_epi8(res, 0);
> + break;
> +
> + case 2: dst[col] = _mm_extract_epi8(res, 0);
> + dst[col + 1] = _mm_extract_epi8(res, 1);
> + break;
> +
> + case 3: dst[col] = _mm_extract_epi8(res, 0);
> + dst[col + 1] = _mm_extract_epi8(res, 1);
> + dst[col + 2] = _mm_extract_epi8(res, 2);
> + break;
> +
> + default: dst[col] = _mm_extract_epi8(res, 0);
> + dst[col + 1] = _mm_extract_epi8(res, 1);
> + dst[col + 2] = _mm_extract_epi8(res, 2);
> + dst[col + 3] = _mm_extract_epi8(res, 3);
> + break;
> }
> - sum_first = (sum_first + offset) >> shift;
> - Vec4i zero(0);
> - sum = compress(sum_first, zero);
> - sum = max(sum, 0);
> - Vec8s maxVal_v(maxVal);
> - sum = min(sum, maxVal_v);
> - sum_uc = compress(sum, vec_zero);
> - sum_uc.store_partial(block_width - col, dst + col);
> }
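
On the tail store switch above: the per-lane _mm_extract_epi8 stores look correct and take the place of the old store_partial(). Just for reference, an equivalent way to write it would be to spill the packed result and copy the valid bytes; the buffer name here is mine, purely illustrative:

    // Illustrative alternative to the switch (not part of the patch):
    unsigned char tmp[16];
    _mm_storeu_si128((__m128i*)tmp, res);
    for (int k = 0; k < n && k < 4; k++)   // only the low 4 bytes of res are valid
        dst[col + k] = tmp[k];
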
>
> src += srcStride;
> @@ -207,6 +227,8 @@
> }
> }
>
> +#endif /* if INSTRSET >= 4 */
> +
> /*
> Please refer Fig 7 in HEVC Overview document to familiarize with variables' naming convention
> Input: Subpel from the Horizontal filter - 'src'
--
Steve Borho