[x265-commits] [x265] testbench: update for blockcopy, these function use dynam...

Min Chen chenm003 at 163.com
Fri Nov 8 09:58:47 CET 2013


details:   http://hg.videolan.org/x265/rev/c5e633516217
branches:  
changeset: 4945:c5e633516217
user:      Min Chen <chenm003 at 163.com>
date:      Fri Nov 08 15:53:53 2013 +0800
description:
testbench: update for blockcopy; these functions use dynamic range [0,255]
Subject: [x265] fix bug in sse_sp12

details:   http://hg.videolan.org/x265/rev/f76b591b7aef
branches:  
changeset: 4946:f76b591b7aef
user:      Min Chen <chenm003 at 163.com>
date:      Fri Nov 08 15:55:50 2013 +0800
description:
fix bug in sse_sp12
Subject: [x265] linux: re-enable sse_12x16 for clang and gcc

details:   http://hg.videolan.org/x265/rev/94cba84de8dd
branches:  
changeset: 4947:94cba84de8dd
user:      Steve Borho <steve at borho.org>
date:      Fri Nov 08 02:01:52 2013 -0600
description:
linux: re-enable sse_12x16 for clang and gcc
Subject: [x265] primitives: remove unused ipfilter_pp_t funcdef and C and intrinsic primitives

details:   http://hg.videolan.org/x265/rev/fef74c2e329d
branches:  
changeset: 4948:fef74c2e329d
user:      Steve Borho <steve at borho.org>
date:      Fri Nov 08 02:57:47 2013 -0600
description:
primitives: remove unused ipfilter_pp_t funcdef and C and intrinsic primitives

These are now completely replaced by block-based assembly code.

diffstat:

 source/common/ipfilter.cpp           |   86 ----
 source/common/primitives.h           |   13 +-
 source/common/vec/ipfilter-sse41.cpp |  607 -----------------------------------
 source/common/vec/pixel-sse41.cpp    |    3 +-
 source/test/ipfilterharness.cpp      |   74 ----
 source/test/ipfilterharness.h        |    1 -
 source/test/pixelharness.cpp         |   26 +-
 source/test/pixelharness.h           |    2 +-
 8 files changed, 18 insertions(+), 794 deletions(-)

diffs (truncated from 1039 to 300 lines):

diff -r fd721a5ba063 -r fef74c2e329d source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Thu Nov 07 21:27:45 2013 -0600
+++ b/source/common/ipfilter.cpp	Fri Nov 08 02:57:47 2013 -0600
@@ -80,47 +80,6 @@ void filterVertical_sp_c(int16_t *src, i
 }
 
 template<int N>
-void filterHorizontal_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
-{
-    int headRoom = IF_FILTER_PREC;
-    int offset =  (1 << (headRoom - 1));
-    uint16_t maxVal = (1 << X265_DEPTH) - 1;
-    const int cStride = 1;
-
-    src -= (N / 2 - 1) * cStride;
-
-    int row, col;
-    for (row = 0; row < height; row++)
-    {
-        for (col = 0; col < width; col++)
-        {
-            int sum;
-
-            sum  = src[col + 0 * cStride] * coeff[0];
-            sum += src[col + 1 * cStride] * coeff[1];
-            sum += src[col + 2 * cStride] * coeff[2];
-            sum += src[col + 3 * cStride] * coeff[3];
-            if (N == 8)
-            {
-                sum += src[col + 4 * cStride] * coeff[4];
-                sum += src[col + 5 * cStride] * coeff[5];
-                sum += src[col + 6 * cStride] * coeff[6];
-                sum += src[col + 7 * cStride] * coeff[7];
-            }
-
-            int16_t val = (int16_t)(sum + offset) >> headRoom;
-
-            if (val < 0) val = 0;
-            if (val > maxVal) val = maxVal;
-            dst[col] = (pixel)val;
-        }
-
-        src += srcStride;
-        dst += dstStride;
-    }
-}
-
-template<int N>
 void filterVertical_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int coefIdx)
 {
     const int16_t *const c = (N == 8 ? g_lumaFilter[coefIdx] : g_chromaFilter[coefIdx]);
@@ -289,47 +248,6 @@ void filterConvertPelToShort_c(pixel *sr
     }
 }
 
-template<int N>
-void filterVertical_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *c)
-{
-    int shift = IF_FILTER_PREC;
-    int offset = 1 << (shift - 1);
-    uint16_t maxVal = (1 << X265_DEPTH) - 1;
-
-    src -= (N / 2 - 1) * srcStride;
-
-    int row, col;
-
-    for (row = 0; row < height; row++)
-    {
-        for (col = 0; col < width; col++)
-        {
-            int sum;
-
-            sum  = src[col + 0 * srcStride] * c[0];
-            sum += src[col + 1 * srcStride] * c[1];
-            sum += src[col + 2 * srcStride] * c[2];
-            sum += src[col + 3 * srcStride] * c[3];
-            if (N == 8)
-            {
-                sum += src[col + 4 * srcStride] * c[4];
-                sum += src[col + 5 * srcStride] * c[5];
-                sum += src[col + 6 * srcStride] * c[6];
-                sum += src[col + 7 * srcStride] * c[7];
-            }
-
-            int16_t val = (int16_t)((sum + offset) >> shift);
-            val = (val < 0) ? 0 : val;
-            val = (val > maxVal) ? maxVal : val;
-
-            dst[col] = (pixel)val;
-        }
-
-        src += srcStride;
-        dst += dstStride;
-    }
-}
-
 void extendCURowColBorder(pixel* txt, intptr_t stride, int width, int height, int marginX)
 {
     for (int y = 0; y < height; y++)
@@ -586,16 +504,12 @@ void Setup_C_IPFilterPrimitives(EncoderP
     LUMA(16, 64);
     CHROMA(8, 32);
 
-    p.ipfilter_pp[FILTER_H_P_P_8] = filterHorizontal_pp_c<8>;
     p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_ps_c<8>;
     p.ipfilter_ps[FILTER_V_P_S_8] = filterVertical_ps_c<8>;
     p.ipfilter_sp[FILTER_V_S_P_8] = filterVertical_sp_c<8>;
-    p.ipfilter_pp[FILTER_H_P_P_4] = filterHorizontal_pp_c<4>;
     p.ipfilter_ps[FILTER_H_P_S_4] = filterHorizontal_ps_c<4>;
     p.ipfilter_ps[FILTER_V_P_S_4] = filterVertical_ps_c<4>;
     p.ipfilter_sp[FILTER_V_S_P_4] = filterVertical_sp_c<4>;
-    p.ipfilter_pp[FILTER_V_P_P_8] = filterVertical_pp_c<8>;
-    p.ipfilter_pp[FILTER_V_P_P_4] = filterVertical_pp_c<4>;
     p.ipfilter_ss[FILTER_V_S_S_8] = filterVertical_ss_c<8>;
     p.ipfilter_ss[FILTER_V_S_S_4] = filterVertical_ss_c<4>;
 
diff -r fd721a5ba063 -r fef74c2e329d source/common/primitives.h
--- a/source/common/primitives.h	Thu Nov 07 21:27:45 2013 -0600
+++ b/source/common/primitives.h	Fri Nov 08 02:57:47 2013 -0600
@@ -114,15 +114,6 @@ enum IDcts
     NUM_IDCTS
 };
 
-enum IPFilterConf_P_P
-{
-    FILTER_H_P_P_8,
-    FILTER_H_P_P_4,
-    FILTER_V_P_P_8,
-    FILTER_V_P_P_4,
-    NUM_IPFILTER_P_P
-};
-
 enum IPFilterConf_P_S
 {
     FILTER_H_P_S_8,
@@ -163,7 +154,6 @@ typedef int  (*pixelcmp_ss_t)(int16_t *f
 typedef int  (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride);
 typedef void (*pixelcmp_x4_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res);
 typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res);
-typedef void (*ipfilter_pp_t)(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
 typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
 typedef void (*ipfilter_sp_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
 typedef void (*ipfilter_ss_t)(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
@@ -244,13 +234,11 @@ struct EncoderPrimitives
     copy_sp_t       luma_copy_sp[NUM_LUMA_PARTITIONS];
     copy_sp_t       chroma_copy_sp[NUM_CHROMA_PARTITIONS];
 
-    ipfilter_pp_t   ipfilter_pp[NUM_IPFILTER_P_P];
     ipfilter_ps_t   ipfilter_ps[NUM_IPFILTER_P_S];
     ipfilter_sp_t   ipfilter_sp[NUM_IPFILTER_S_P];
     ipfilter_ss_t   ipfilter_ss[NUM_IPFILTER_S_S];
     ipfilter_p2s_t  ipfilter_p2s;
     ipfilter_s2p_t  ipfilter_s2p;
-    extendCURowBorder_t extendRowBorder;
     filter_pp_t     chroma_hpp[NUM_CHROMA_PARTITIONS];
     filter_pp_t     luma_hpp[NUM_LUMA_PARTITIONS];
     filter_ps_t     luma_hps[NUM_LUMA_PARTITIONS];
@@ -260,6 +248,7 @@ struct EncoderPrimitives
     filter_hv_pp_t  luma_hvpp[NUM_LUMA_PARTITIONS];
     filter_p2s_t    luma_p2s;
     filter_p2s_t    chroma_p2s;
+    extendCURowBorder_t extendRowBorder;
 
     intra_dc_t      intra_pred_dc;
     intra_planar_t  intra_pred_planar;
diff -r fd721a5ba063 -r fef74c2e329d source/common/vec/ipfilter-sse41.cpp
--- a/source/common/vec/ipfilter-sse41.cpp	Thu Nov 07 21:27:45 2013 -0600
+++ b/source/common/vec/ipfilter-sse41.cpp	Fri Nov 08 02:57:47 2013 -0600
@@ -371,310 +371,7 @@ void filterVertical_sp(int16_t *src, int
         sump = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
         *(uint32_t*)(dstP + row * dstStride + col) = _mm_cvtsi128_si32(sump); \
 }
-
-template<int N>
-void filterVertical_pp(pixel *src, intptr_t srcStride,
-                       pixel *dst, intptr_t dstStride,
-                       int width, int height,
-                       const int16_t *coeff)
-{
-    src -= (N / 2 - 1) * srcStride;
-
-    const __m128i coeffTemp = _mm_loadu_si128((__m128i const*)coeff);
-
-    int row, col;
-
-    if (N == 4)
-        assert(height % 2 == 0);
-
-    uint32_t leftCols = (8 - (width & 7)) * 8;
-    uint32_t mask_shift = ((uint32_t) ~0 >> leftCols);
-    uint32_t mask0 = (width & 7) <= 4 ? mask_shift : ~0;
-    uint32_t mask1 = (width & 7) <= 4 ? 0 : mask_shift;
-    __m128i leftmask = _mm_setr_epi32(mask0, mask1, 0, 0);
-
-    if (N == 8)
-    {
-        __m128i vm01 = _mm_shuffle_epi32(coeffTemp, 0x00);
-        __m128i vm23 = _mm_shuffle_epi32(coeffTemp, 0x55);
-        __m128i vm45 = _mm_shuffle_epi32(coeffTemp, 0xAA);
-        __m128i vm67 = _mm_shuffle_epi32(coeffTemp, 0xFF);
-        vm01 = _mm_packs_epi16(vm01, vm01);
-        vm23 = _mm_packs_epi16(vm23, vm23);
-        vm45 = _mm_packs_epi16(vm45, vm45);
-        vm67 = _mm_packs_epi16(vm67, vm67);
-
-        __m128i T00, T01, T02, T03, T04, T05, T06, T07 /*, T08*/;
-        __m128i T10, T11, T12, T13;
-        for (row = 0; row < height; row += 1)
-        {
-            for (col = 0; col < (width & ~7); col += 8)
-            {
-                T00 = _mm_loadl_epi64((__m128i*)&src[(0) * srcStride + col]);
-                T01 = _mm_loadl_epi64((__m128i*)&src[(1) * srcStride + col]);
-                T02 = _mm_loadl_epi64((__m128i*)&src[(2) * srcStride + col]);
-                T03 = _mm_loadl_epi64((__m128i*)&src[(3) * srcStride + col]);
-                T04 = _mm_loadl_epi64((__m128i*)&src[(4) * srcStride + col]);
-                T05 = _mm_loadl_epi64((__m128i*)&src[(5) * srcStride + col]);
-                T06 = _mm_loadl_epi64((__m128i*)&src[(6) * srcStride + col]);
-                T07 = _mm_loadl_epi64((__m128i*)&src[(7) * srcStride + col]);
-
-                T10 = _mm_unpacklo_epi8(T00, T01);
-                T11 = _mm_unpacklo_epi8(T02, T03);
-                T12 = _mm_unpacklo_epi8(T04, T05);
-                T13 = _mm_unpacklo_epi8(T06, T07);
-
-                T10 = _mm_maddubs_epi16(T10, vm01);
-                T11 = _mm_maddubs_epi16(T11, vm23);
-                T12 = _mm_maddubs_epi16(T12, vm45);
-                T13 = _mm_maddubs_epi16(T13, vm67);
-                T10 = _mm_add_epi16(T10, T11);
-                T11 = _mm_add_epi16(T12, T13);
-                T10 = _mm_add_epi16(T10, T11);
-                T10 = _mm_mulhrs_epi16(T10, _mm_load_si128((__m128i*)c_512));
-                T10 = _mm_packus_epi16(T10, T10);
-                _mm_storel_epi64((__m128i*)&dst[0 * dstStride + col], T10);
-            }
-
-            assert((width - col) < 8);
-            if (col != width)
-            {
-                T00 = _mm_loadl_epi64((__m128i*)&src[(0) * srcStride + col]);
-                T01 = _mm_loadl_epi64((__m128i*)&src[(1) * srcStride + col]);
-                T02 = _mm_loadl_epi64((__m128i*)&src[(2) * srcStride + col]);
-                T03 = _mm_loadl_epi64((__m128i*)&src[(3) * srcStride + col]);
-                T04 = _mm_loadl_epi64((__m128i*)&src[(4) * srcStride + col]);
-                T05 = _mm_loadl_epi64((__m128i*)&src[(5) * srcStride + col]);
-                T06 = _mm_loadl_epi64((__m128i*)&src[(6) * srcStride + col]);
-                T07 = _mm_loadl_epi64((__m128i*)&src[(7) * srcStride + col]);
-
-                T10 = _mm_unpacklo_epi8(T00, T01);
-                T11 = _mm_unpacklo_epi8(T02, T03);
-                T12 = _mm_unpacklo_epi8(T04, T05);
-                T13 = _mm_unpacklo_epi8(T06, T07);
-
-                T10 = _mm_maddubs_epi16(T10, vm01);
-                T11 = _mm_maddubs_epi16(T11, vm23);
-                T12 = _mm_maddubs_epi16(T12, vm45);
-                T13 = _mm_maddubs_epi16(T13, vm67);
-                T10 = _mm_add_epi16(T10, T11);
-                T11 = _mm_add_epi16(T12, T13);
-                T10 = _mm_add_epi16(T10, T11);
-                T10 = _mm_mulhrs_epi16(T10, _mm_load_si128((__m128i*)c_512));
-                T10 = _mm_packus_epi16(T10, T10);
-                _mm_maskmoveu_si128(T10, leftmask, (char*)&dst[(0) * dstStride + col]);
-            }
-
-            src += 1 * srcStride;
-            dst += 1 * dstStride;
-        }   // end of row loop
-    } // end of N==8
-
-    if (N == 4)
-    {
-        __m128i vm01 = _mm_shuffle_epi32(coeffTemp, 0x00);
-        __m128i vm23 = _mm_shuffle_epi32(coeffTemp, 0x55);
-        vm01 = _mm_packs_epi16(vm01, vm01);
-        vm23 = _mm_packs_epi16(vm23, vm23);
-
-        __m128i T00, T01, T02, T03, T04;
-        __m128i T10, T11;
-        __m128i T20, T21;
-        for (row = 0; row < height; row += 2)
-        {
-            for (col = 0; col < (width & ~7); col += 8)
-            {
-                T00 = _mm_loadl_epi64((__m128i*)&src[(0) * srcStride + col]);
-                T01 = _mm_loadl_epi64((__m128i*)&src[(1) * srcStride + col]);
-                T02 = _mm_loadl_epi64((__m128i*)&src[(2) * srcStride + col]);
-                T03 = _mm_loadl_epi64((__m128i*)&src[(3) * srcStride + col]);
-                T04 = _mm_loadl_epi64((__m128i*)&src[(4) * srcStride + col]);
-
-                T10 = _mm_unpacklo_epi8(T00, T01);
-                T11 = _mm_unpacklo_epi8(T02, T03);
-
-                T10 = _mm_maddubs_epi16(T10, vm01);
-                T11 = _mm_maddubs_epi16(T11, vm23);
-                T10 = _mm_add_epi16(T10, T11);
-                T10 = _mm_mulhrs_epi16(T10, _mm_load_si128((__m128i*)c_512));
-                T10 = _mm_packus_epi16(T10, T10);
-                _mm_storel_epi64((__m128i*)&dst[0 * dstStride + col], T10);


More information about the x265-commits mailing list