[x265] [PATCH] primitives: rename luma_p2s to convert_p2s and move into PU

Steve Borho steve at borho.org
Fri Apr 3 17:50:09 CEST 2015


On 04/03, rajesh at multicorewareinc.com wrote:
> # HG changeset patch
> # User Rajesh Paulraj<rajesh at multicorewareinc.com>
> # Date 1428065328 -19800
> #      Fri Apr 03 18:18:48 2015 +0530
> # Node ID 24c96db729600c88a278c06cce4b8bf041d1357a
> # Parent  9a5fa67583feb6ffb7668f82632f7e93e5ec9415
> primitives: rename luma_p2s to convert_p2s and move into PU

This series is queued for testing.

> diff -r 9a5fa67583fe -r 24c96db72960 source/common/ipfilter.cpp
> --- a/source/common/ipfilter.cpp	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/common/ipfilter.cpp	Fri Apr 03 18:18:48 2015 +0530
> @@ -34,27 +34,8 @@
>  #endif
>  
>  namespace {
> -template<int dstStride, int width, int height>
> -void pixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst)
> -{
> -    int shift = IF_INTERNAL_PREC - X265_DEPTH;
> -    int row, col;
> -
> -    for (row = 0; row < height; row++)
> -    {
> -        for (col = 0; col < width; col++)
> -        {
> -            int16_t val = src[col] << shift;
> -            dst[col] = val - (int16_t)IF_INTERNAL_OFFS;
> -        }
> -
> -        src += srcStride;
> -        dst += dstStride;
> -    }
> -}
> -
> -template<int dstStride>
> -void filterPixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height)
> +template<int width, int height>
> +void filterPixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
>  {
>      int shift = IF_INTERNAL_PREC - X265_DEPTH;
>      int row, col;
> @@ -398,7 +379,7 @@
>      p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
>      p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
>      p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
> -    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].chroma_p2s = pixelToShort_c<MAX_CU_SIZE / 2, W, H>; 
> +    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
>  
>  #define CHROMA_422(W, H) \
>      p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
> @@ -407,7 +388,7 @@
>      p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
>      p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
>      p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
> -    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].chroma_p2s = pixelToShort_c<MAX_CU_SIZE / 2, W, H>; 
> +    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
>  
>  #define CHROMA_444(W, H) \
>      p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
> @@ -416,7 +397,7 @@
>      p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
>      p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
>      p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
> -    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].chroma_p2s = pixelToShort_c<MAX_CU_SIZE, W, H>; 
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
>  
>  #define LUMA(W, H) \
>      p.pu[LUMA_ ## W ## x ## H].luma_hpp     = interp_horiz_pp_c<8, W, H>; \
> @@ -426,7 +407,7 @@
>      p.pu[LUMA_ ## W ## x ## H].luma_vsp     = interp_vert_sp_c<8, W, H>;  \
>      p.pu[LUMA_ ## W ## x ## H].luma_vss     = interp_vert_ss_c<8, W, H>;  \
>      p.pu[LUMA_ ## W ## x ## H].luma_hvpp    = interp_hv_pp_c<8, W, H>; \
> -    p.pu[LUMA_ ## W ## x ## H].filter_p2s = pixelToShort_c<MAX_CU_SIZE, W, H>
> +    p.pu[LUMA_ ## W ## x ## H].convert_p2s = filterPixelToShort_c<W, H>;
>  
>  void setupFilterPrimitives_c(EncoderPrimitives& p)
>  {
> @@ -530,11 +511,6 @@
>      CHROMA_444(48, 64);
>      CHROMA_444(64, 16);
>      CHROMA_444(16, 64);
> -    p.luma_p2s = filterPixelToShort_c<MAX_CU_SIZE>;
> -
> -    p.chroma[X265_CSP_I444].p2s = filterPixelToShort_c<MAX_CU_SIZE>;
> -    p.chroma[X265_CSP_I420].p2s = filterPixelToShort_c<MAX_CU_SIZE / 2>;
> -    p.chroma[X265_CSP_I422].p2s = filterPixelToShort_c<MAX_CU_SIZE / 2>;
>  
>      p.extendRowBorder = extendCURowColBorder;
>  }
> diff -r 9a5fa67583fe -r 24c96db72960 source/common/predict.cpp
> --- a/source/common/predict.cpp	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/common/predict.cpp	Fri Apr 03 18:18:48 2015 +0530
> @@ -273,7 +273,7 @@
>  void Predict::predInterLumaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
>  {
>      int16_t* dst = dstSYuv.getLumaAddr(pu.puAbsPartIdx);
> -    int dstStride = dstSYuv.m_size;
> +    intptr_t dstStride = dstSYuv.m_size;
>  
>      intptr_t srcStride = refPic.m_stride;
>      intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
> @@ -288,7 +288,7 @@
>      X265_CHECK(dstStride == MAX_CU_SIZE, "stride expected to be max cu size\n");
>  
>      if (!(yFrac | xFrac))
> -        primitives.luma_p2s(src, srcStride, dst, pu.width, pu.height);
> +        primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
>      else if (!yFrac)
>          primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
>      else if (!xFrac)
> @@ -375,14 +375,13 @@
>      int partEnum = partitionFromSizes(pu.width, pu.height);
>      
>      uint32_t cxWidth  = pu.width >> m_hChromaShift;
> -    uint32_t cxHeight = pu.height >> m_vChromaShift;
>  
> -    X265_CHECK(((cxWidth | cxHeight) % 2) == 0, "chroma block size expected to be multiple of 2\n");
> +    X265_CHECK(((cxWidth | (pu.height >> m_vChromaShift)) % 2) == 0, "chroma block size expected to be multiple of 2\n");
>  
>      if (!(yFrac | xFrac))
>      {
> -        primitives.chroma[m_csp].p2s(refCb, refStride, dstCb, cxWidth, cxHeight);
> -        primitives.chroma[m_csp].p2s(refCr, refStride, dstCr, cxWidth, cxHeight);
> +        primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
> +        primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
>      }
>      else if (!yFrac)
>      {
> diff -r 9a5fa67583fe -r 24c96db72960 source/common/primitives.cpp
> --- a/source/common/primitives.cpp	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/common/primitives.cpp	Fri Apr 03 18:18:48 2015 +0530
> @@ -90,7 +90,6 @@
>  
>      /* alias chroma 4:4:4 from luma primitives (all but chroma filters) */
>  
> -    p.chroma[X265_CSP_I444].p2s = p.luma_p2s;
>      p.chroma[X265_CSP_I444].cu[BLOCK_4x4].sa8d = NULL;
>  
>      for (int i = 0; i < NUM_PU_SIZES; i++)
> @@ -98,7 +97,7 @@
>          p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
>          p.chroma[X265_CSP_I444].pu[i].addAvg  = p.pu[i].addAvg;
>          p.chroma[X265_CSP_I444].pu[i].satd    = p.pu[i].satd;
> -        p.chroma[X265_CSP_I444].pu[i].chroma_p2s = p.pu[i].filter_p2s;
> +        p.chroma[X265_CSP_I444].pu[i].p2s = p.pu[i].convert_p2s;
>      }
>  
>      for (int i = 0; i < NUM_CU_SIZES; i++)
> diff -r 9a5fa67583fe -r 24c96db72960 source/common/primitives.h
> --- a/source/common/primitives.h	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/common/primitives.h	Fri Apr 03 18:18:48 2015 +0530
> @@ -156,8 +156,7 @@
>  typedef void (*filter_sp_t) (const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
>  typedef void (*filter_ss_t) (const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>  typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
> -typedef void (*filter_p2s_wxh_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
> -typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst);
> +typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>  
>  typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned
>  typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> @@ -211,7 +210,7 @@
>          addAvg_t       addAvg;      // bidir motion compensation, uses 16bit values
>  
>          copy_pp_t      copy_pp;
> -        filter_p2s_t   filter_p2s;
> +        filter_p2s_t   convert_p2s;
>      }
>      pu[NUM_PU_SIZES];
>  
> @@ -290,7 +289,6 @@
>      weightp_sp_t          weight_sp;
>      weightp_pp_t          weight_pp;
>  
> -    filter_p2s_wxh_t      luma_p2s;
>  
>      findPosLast_t         findPosLast;
>  
> @@ -317,7 +315,7 @@
>              filter_hps_t filter_hps;
>              addAvg_t     addAvg;
>              copy_pp_t    copy_pp;
> -            filter_p2s_t chroma_p2s;
> +            filter_p2s_t p2s;
>  
>          }
>          pu[NUM_PU_SIZES];
> @@ -337,7 +335,6 @@
>          }
>          cu[NUM_CU_SIZES];
>  
> -        filter_p2s_wxh_t p2s; // takes width/height as arguments
>      }
>      chroma[X265_CSP_COUNT];
>  };
> diff -r 9a5fa67583fe -r 24c96db72960 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/common/x86/asm-primitives.cpp	Fri Apr 03 18:18:48 2015 +0530
> @@ -859,9 +859,6 @@
>          PIXEL_AVG_W4(mmx2);
>          LUMA_VAR(sse2);
>  
> -        p.luma_p2s = x265_luma_p2s_sse2;
> -        p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_sse2;
> -        p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2;
>  
>          ALL_LUMA_TU(blockfill_s, blockfill_s, sse2);
>          ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
> @@ -1273,31 +1270,6 @@
>          ASSIGN_SSE_PP(ssse3);
>          p.cu[BLOCK_4x4].sse_pp = x265_pixel_ssd_4x4_ssse3;
>          p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = x265_pixel_ssd_4x8_ssse3;
> -        p.pu[LUMA_4x4].filter_p2s = x265_pixelToShort_4x4_ssse3;
> -        p.pu[LUMA_4x8].filter_p2s = x265_pixelToShort_4x8_ssse3;
> -        p.pu[LUMA_4x16].filter_p2s = x265_pixelToShort_4x16_ssse3;
> -        p.pu[LUMA_8x4].filter_p2s = x265_pixelToShort_8x4_ssse3;
> -        p.pu[LUMA_8x8].filter_p2s = x265_pixelToShort_8x8_ssse3;
> -        p.pu[LUMA_8x16].filter_p2s = x265_pixelToShort_8x16_ssse3;
> -        p.pu[LUMA_8x32].filter_p2s = x265_pixelToShort_8x32_ssse3;
> -        p.pu[LUMA_16x4].filter_p2s = x265_pixelToShort_16x4_ssse3;
> -        p.pu[LUMA_16x8].filter_p2s = x265_pixelToShort_16x8_ssse3;
> -        p.pu[LUMA_16x12].filter_p2s = x265_pixelToShort_16x12_ssse3;
> -        p.pu[LUMA_16x16].filter_p2s = x265_pixelToShort_16x16_ssse3;
> -        p.pu[LUMA_16x32].filter_p2s = x265_pixelToShort_16x32_ssse3;
> -        p.pu[LUMA_16x64].filter_p2s = x265_pixelToShort_16x64_ssse3;
> -        p.pu[LUMA_32x8].filter_p2s = x265_pixelToShort_32x8_ssse3;
> -        p.pu[LUMA_32x16].filter_p2s = x265_pixelToShort_32x16_ssse3;
> -        p.pu[LUMA_32x24].filter_p2s = x265_pixelToShort_32x24_ssse3;
> -        p.pu[LUMA_32x32].filter_p2s = x265_pixelToShort_32x32_ssse3;
> -        p.pu[LUMA_32x64].filter_p2s = x265_pixelToShort_32x64_ssse3;
> -        p.pu[LUMA_64x16].filter_p2s = x265_pixelToShort_64x16_ssse3;
> -        p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_ssse3;
> -        p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_ssse3;
> -        p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_ssse3;
> -
> -        p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
> -        p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
>  
>          p.dst4x4 = x265_dst4_ssse3;
>          p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
> diff -r 9a5fa67583fe -r 24c96db72960 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/common/x86/ipfilter8.asm	Fri Apr 03 18:18:48 2015 +0530
> @@ -7740,320 +7740,6 @@
>  FILTER_V4_W16n_H2 64, 48
>  FILTER_V4_W16n_H2 48, 64
>  FILTER_V4_W16n_H2 64, 16
> -;-----------------------------------------------------------------------------
> -; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
> -;-----------------------------------------------------------------------------
> -%macro PIXEL_WH_4xN 2
> -INIT_XMM ssse3
> -cglobal pixelToShort_%1x%2, 3, 7, 6
> -
> -    ; load width and height
> -    mov         r3d, %1
> -    mov         r4d, %2
> -    ; load constant
> -    mova        m4, [pb_128]
> -    mova        m5, [tab_c_64_n64]
> -.loopH:
> -    xor         r5d, r5d
> -
> -.loopW:
> -    mov         r6, r0
> -    movh        m0, [r6]
> -    punpcklbw   m0, m4
> -    pmaddubsw   m0, m5
> -
> -    movh        m1, [r6 + r1]
> -    punpcklbw   m1, m4
> -    pmaddubsw   m1, m5
> -
> -    movh        m2, [r6 + r1 * 2]
> -    punpcklbw   m2, m4
> -    pmaddubsw   m2, m5
> -
> -    lea         r6, [r6 + r1 * 2]
> -    movh        m3, [r6 + r1]
> -    punpcklbw   m3, m4
> -    pmaddubsw   m3, m5
> -
> -    add         r5, 8
> -    cmp         r5, r3
> -    jg          .width4
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
> -    je          .nextH
> -    jmp         .loopW
> -
> -.width4:
> -    movh        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
> -    movh        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
> -    movh        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
> -    movh        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
> -
> -.nextH:
> -    lea         r0, [r0 + r1 * 4]
> -    add         r2, FENC_STRIDE * 8
> -
> -    sub         r4d, 4
> -    jnz         .loopH
> -    RET
> -%endmacro
> -PIXEL_WH_4xN 4, 4
> -PIXEL_WH_4xN 4, 8
> -PIXEL_WH_4xN 4, 16
> -
> -;-----------------------------------------------------------------------------
> -; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
> -;-----------------------------------------------------------------------------
> -%macro PIXEL_WH_8xN 2
> -INIT_XMM ssse3
> -cglobal pixelToShort_%1x%2, 3, 7, 6
> -
> -    ; load width and height
> -    mov         r3d, %1
> -    mov         r4d, %2
> -
> -    ; load constant
> -    mova        m4, [pb_128]
> -    mova        m5, [tab_c_64_n64]
> -
> -.loopH
> -    xor         r5d, r5d
> -.loopW
> -    lea         r6, [r0 + r5]
> -
> -    movh        m0, [r6]
> -    punpcklbw   m0, m4
> -    pmaddubsw   m0, m5
> -
> -    movh        m1, [r6 + r1]
> -    punpcklbw   m1, m4
> -    pmaddubsw   m1, m5
> -
> -    movh        m2, [r6 + r1 * 2]
> -    punpcklbw   m2, m4
> -    pmaddubsw   m2, m5
> -
> -    lea         r6, [r6 + r1 * 2]
> -    movh        m3, [r6 + r1]
> -    punpcklbw   m3, m4
> -    pmaddubsw   m3, m5
> -
> -    add         r5, 8
> -    cmp         r5, r3
> -
> -    movu        [r2 + FENC_STRIDE * 0], m0
> -    movu        [r2 + FENC_STRIDE * 2], m1
> -    movu        [r2 + FENC_STRIDE * 4], m2
> -    movu        [r2 + FENC_STRIDE * 6], m3
> -
> -    je          .nextH
> -    jmp         .loopW
> -
> -
> -.nextH:
> -    lea         r0, [r0 + r1 * 4]
> -    add         r2, FENC_STRIDE * 8
> -
> -    sub         r4d, 4
> -    jnz         .loopH
> -    RET
> -%endmacro
> -PIXEL_WH_8xN 8, 8
> -PIXEL_WH_8xN 8, 4
> -PIXEL_WH_8xN 8, 16
> -PIXEL_WH_8xN 8, 32
> -
> -
> -;-----------------------------------------------------------------------------
> -; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
> -;-----------------------------------------------------------------------------
> -%macro PIXEL_WH_16xN 2
> -INIT_XMM ssse3
> -cglobal pixelToShort_%1x%2, 3, 7, 6
> -
> -    ; load width and height
> -    mov         r3d, %1
> -    mov         r4d, %2
> -
> -    ; load constant
> -    mova        m4, [pb_128]
> -    mova        m5, [tab_c_64_n64]
> -
> -.loopH:
> -    xor         r5d, r5d
> -.loopW:
> -    lea         r6, [r0 + r5]
> -
> -    movh        m0, [r6]
> -    punpcklbw   m0, m4
> -    pmaddubsw   m0, m5
> -
> -    movh        m1, [r6 + r1]
> -    punpcklbw   m1, m4
> -    pmaddubsw   m1, m5
> -
> -    movh        m2, [r6 + r1 * 2]
> -    punpcklbw   m2, m4
> -    pmaddubsw   m2, m5
> -
> -    lea         r6, [r6 + r1 * 2]
> -    movh        m3, [r6 + r1]
> -    punpcklbw   m3, m4
> -    pmaddubsw   m3, m5
> -
> -    add         r5, 8
> -    cmp         r5, r3
> -
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
> -    je          .nextH
> -    jmp         .loopW
> -
> -
> -.nextH:
> -    lea         r0, [r0 + r1 * 4]
> -    add         r2, FENC_STRIDE * 8
> -
> -    sub         r4d, 4
> -    jnz         .loopH
> -
> -    RET
> -%endmacro
> -PIXEL_WH_16xN 16, 16
> -PIXEL_WH_16xN 16, 8
> -PIXEL_WH_16xN 16, 4
> -PIXEL_WH_16xN 16, 12
> -PIXEL_WH_16xN 16, 32
> -PIXEL_WH_16xN 16, 64
> -
> -;-----------------------------------------------------------------------------
> -; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
> -;-----------------------------------------------------------------------------
> -%macro PIXEL_WH_32xN 2
> -INIT_XMM ssse3
> -cglobal pixelToShort_%1x%2, 3, 7, 6
> -
> -    ; load width and height
> -    mov         r3d, %1
> -    mov         r4d, %2
> -
> -    ; load constant
> -    mova        m4, [pb_128]
> -    mova        m5, [tab_c_64_n64]
> -
> -.loopH:
> -    xor         r5d, r5d
> -.loopW:
> -    lea         r6, [r0 + r5]
> -
> -    movh        m0, [r6]
> -    punpcklbw   m0, m4
> -    pmaddubsw   m0, m5
> -
> -    movh        m1, [r6 + r1]
> -    punpcklbw   m1, m4
> -    pmaddubsw   m1, m5
> -
> -    movh        m2, [r6 + r1 * 2]
> -    punpcklbw   m2, m4
> -    pmaddubsw   m2, m5
> -
> -    lea         r6, [r6 + r1 * 2]
> -    movh        m3, [r6 + r1]
> -    punpcklbw   m3, m4
> -    pmaddubsw   m3, m5
> -
> -    add         r5, 8
> -    cmp         r5, r3
> -
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
> -    je          .nextH
> -    jmp         .loopW
> -
> -
> -.nextH:
> -    lea         r0, [r0 + r1 * 4]
> -    add         r2, FENC_STRIDE * 8
> -
> -    sub         r4d, 4
> -    jnz         .loopH
> -
> -    RET
> -%endmacro
> -PIXEL_WH_32xN 32, 32
> -PIXEL_WH_32xN 32, 8
> -PIXEL_WH_32xN 32, 16
> -PIXEL_WH_32xN 32, 24
> -PIXEL_WH_32xN 32, 64
> -
> -;-----------------------------------------------------------------------------
> -; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
> -;-----------------------------------------------------------------------------
> -%macro PIXEL_WH_64xN 2
> -INIT_XMM ssse3
> -cglobal pixelToShort_%1x%2, 3, 7, 6
> -
> -    ; load width and height
> -    mov         r3d, %1
> -    mov         r4d, %2
> -
> -    ; load constant
> -    mova        m4, [pb_128]
> -    mova        m5, [tab_c_64_n64]
> -
> -.loopH:
> -    xor         r5d, r5d
> -.loopW:
> -    lea         r6, [r0 + r5]
> -
> -    movh        m0, [r6]
> -    punpcklbw   m0, m4
> -    pmaddubsw   m0, m5
> -
> -    movh        m1, [r6 + r1]
> -    punpcklbw   m1, m4
> -    pmaddubsw   m1, m5
> -
> -    movh        m2, [r6 + r1 * 2]
> -    punpcklbw   m2, m4
> -    pmaddubsw   m2, m5
> -
> -    lea         r6, [r6 + r1 * 2]
> -    movh        m3, [r6 + r1]
> -    punpcklbw   m3, m4
> -    pmaddubsw   m3, m5
> -
> -    add         r5, 8
> -    cmp         r5, r3
> -
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
> -    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
> -    je          .nextH
> -    jmp         .loopW
> -
> -
> -.nextH:
> -    lea         r0, [r0 + r1 * 4]
> -    add         r2, FENC_STRIDE * 8
> -
> -    sub         r4d, 4
> -    jnz         .loopH
> -
> -    RET
> -%endmacro
> -PIXEL_WH_64xN 64, 64
> -PIXEL_WH_64xN 64, 16
> -PIXEL_WH_64xN 64, 32
> -PIXEL_WH_64xN 64, 48
>  
>  %macro PROCESS_LUMA_W4_4R 0
>      movd        m0, [r0]
> diff -r 9a5fa67583fe -r 24c96db72960 source/common/x86/ipfilter8.h
> --- a/source/common/x86/ipfilter8.h	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/common/x86/ipfilter8.h	Fri Apr 03 18:18:48 2015 +0530
> @@ -289,8 +289,6 @@
>      SETUP_CHROMA_420_HORIZ_FUNC_DEF(64, 16, cpu); \
>      SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 64, cpu)
>  
> -void x265_chroma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
> -void x265_luma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
>  
>  CHROMA_420_VERT_FILTERS(_sse2);
>  CHROMA_420_HORIZ_FILTERS(_sse4);
> @@ -594,7 +592,6 @@
>  CHROMA_444_SP_FILTERS(_sse4);
>  CHROMA_444_SS_FILTERS(_sse2);
>  
> -void x265_chroma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
>  
>  #undef SETUP_CHROMA_FUNC_DEF
>  #undef SETUP_CHROMA_SP_FUNC_DEF
> @@ -624,28 +621,6 @@
>  LUMA_SP_FILTERS(_avx2);
>  LUMA_SS_FILTERS(_avx2);
>  void x265_interp_8tap_hv_pp_8x8_sse4(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
> -void x265_pixelToShort_4x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_4x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_4x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_8x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_8x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_8x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_8x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_16x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_16x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_16x12_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_16x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_16x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_16x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_32x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_32x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_32x24_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_32x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_32x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_64x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_64x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_64x48_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> -void x265_pixelToShort_64x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
>  #undef LUMA_FILTERS
>  #undef LUMA_SP_FILTERS
>  #undef LUMA_SS_FILTERS
> diff -r 9a5fa67583fe -r 24c96db72960 source/test/ipfilterharness.cpp
> --- a/source/test/ipfilterharness.cpp	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/test/ipfilterharness.cpp	Fri Apr 03 18:18:48 2015 +0530
> @@ -61,55 +61,6 @@
>      }
>  }
>  
> -bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_wxh_t ref, filter_p2s_wxh_t opt, int isChroma, int csp)
> -{
> -    intptr_t rand_srcStride;
> -    int min_size = isChroma ? 2 : 4;
> -    int max_size = isChroma ? (MAX_CU_SIZE >> 1) : MAX_CU_SIZE;
> -
> -    if (isChroma && (csp == X265_CSP_I444))
> -    {
> -        min_size = 4;
> -        max_size = MAX_CU_SIZE;
> -    }
> -
> -    for (int i = 0; i < ITERS; i++)
> -    {
> -        int index = i % TEST_CASES;
> -        int rand_height = (int16_t)rand() % 100;
> -        int rand_width = (int16_t)rand() % 100;
> -
> -        rand_srcStride = rand_width + rand() % 100;
> -        if (rand_srcStride < rand_width)
> -            rand_srcStride = rand_width;
> -
> -        rand_width &= ~(min_size - 1);
> -        rand_width = x265_clip3(min_size, max_size, rand_width);
> -
> -        rand_height &= ~(min_size - 1);
> -        rand_height = x265_clip3(min_size, max_size, rand_height);
> -
> -        ref(pixel_test_buff[index],
> -            rand_srcStride,
> -            IPF_C_output_s,
> -            rand_width,
> -            rand_height);
> -
> -        checked(opt, pixel_test_buff[index],
> -                rand_srcStride,
> -                IPF_vec_output_s,
> -                rand_width,
> -                rand_height);
> -
> -        if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
> -            return false;
> -
> -        reportfail();
> -    }
> -
> -    return true;
> -}
> -
>  bool IPFilterHarness::check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt)
>  {
>      intptr_t rand_srcStride, rand_dstStride;
> @@ -518,12 +469,13 @@
>      {
>          intptr_t rand_srcStride = rand() % 100;
>          int index = i % TEST_CASES;
> +        intptr_t dstStride = rand() % 100 + 64;
>  
> -        ref(pixel_test_buff[index] + i, rand_srcStride, IPF_C_output_s);
> +        ref(pixel_test_buff[index] + i, rand_srcStride, IPF_C_output_s, dstStride);
>  
> -        checked(opt, pixel_test_buff[index] + i, rand_srcStride, IPF_vec_output_s);
> +        checked(opt, pixel_test_buff[index] + i, rand_srcStride, IPF_vec_output_s, dstStride);
>  
> -        if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(pixel)))
> +        if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
>              return false;
>  
>          reportfail();
> @@ -538,12 +490,13 @@
>      {
>          intptr_t rand_srcStride = rand() % 100;
>          int index = i % TEST_CASES;
> +        intptr_t dstStride = rand() % 100 + 64;
>  
> -        ref(pixel_test_buff[index] + i, rand_srcStride, IPF_C_output_s);
> +        ref(pixel_test_buff[index] + i, rand_srcStride, IPF_C_output_s, dstStride);
>  
> -        checked(opt, pixel_test_buff[index] + i, rand_srcStride, IPF_vec_output_s);
> +        checked(opt, pixel_test_buff[index] + i, rand_srcStride, IPF_vec_output_s, dstStride);
>  
> -        if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(pixel)))
> +        if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
>              return false;
>  
>          reportfail();
> @@ -554,15 +507,6 @@
>  
>  bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
>  {
> -    if (opt.luma_p2s)
> -    {
> -        // last parameter does not matter in case of luma
> -        if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s, 0, 1))
> -        {
> -            printf("luma_p2s failed\n");
> -            return false;
> -        }
> -    }
>  
>      for (int value = 0; value < NUM_PU_SIZES; value++)
>      {
> @@ -622,11 +566,11 @@
>                  return false;
>              }
>          }
> -        if (opt.pu[value].filter_p2s)
> +        if (opt.pu[value].convert_p2s)
>          {
> -            if (!check_IPFilterLumaP2S_primitive(ref.pu[value].filter_p2s, opt.pu[value].filter_p2s))
> +            if (!check_IPFilterLumaP2S_primitive(ref.pu[value].convert_p2s, opt.pu[value].convert_p2s))
>              {
> -                printf("filter_p2s[%s]", lumaPartStr[value]);
> +                printf("convert_p2s[%s]", lumaPartStr[value]);
>                  return false;
>              }
>          }
> @@ -634,14 +578,6 @@
>  
>      for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
>      {
> -        if (opt.chroma[csp].p2s)
> -        {
> -            if (!check_IPFilter_primitive(ref.chroma[csp].p2s, opt.chroma[csp].p2s, 1, csp))
> -            {
> -                printf("chroma_p2s[%s]", x265_source_csp_names[csp]);
> -                return false;
> -            }
> -        }
>          for (int value = 0; value < NUM_PU_SIZES; value++)
>          {
>              if (opt.chroma[csp].pu[value].filter_hpp)
> @@ -692,9 +628,9 @@
>                      return false;
>                  }
>              }
> -            if (opt.chroma[csp].pu[value].chroma_p2s)
> +            if (opt.chroma[csp].pu[value].p2s)
>              {
> -                if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].chroma_p2s, opt.chroma[csp].pu[value].chroma_p2s))
> +                if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].p2s, opt.chroma[csp].pu[value].p2s))
>                  {
>                      printf("chroma_p2s[%s]", chromaPartStr[csp][value]);
>                      return false;
> @@ -708,19 +644,10 @@
>  
>  void IPFilterHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
>  {
> -    int height = 64;
> -    int width = 64;
>      int16_t srcStride = 96;
>      int16_t dstStride = 96;
>      int maxVerticalfilterHalfDistance = 3;
>  
> -    if (opt.luma_p2s)
> -    {
> -        printf("luma_p2s\t");
> -        REPORT_SPEEDUP(opt.luma_p2s, ref.luma_p2s,
> -                       pixel_buff, srcStride, IPF_vec_output_s, width, height);
> -    }
> -
>      for (int value = 0; value < NUM_PU_SIZES; value++)
>      {
>          if (opt.pu[value].luma_hpp)
> @@ -777,23 +704,18 @@
>                             pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
>          }
>  
> -        if (opt.pu[value].filter_p2s)
> +        if (opt.pu[value].convert_p2s)
>          {
> -            printf("filter_p2s [%s]\t", lumaPartStr[value]);
> -            REPORT_SPEEDUP(opt.pu[value].filter_p2s, ref.pu[value].filter_p2s,
> -                           pixel_buff, srcStride, IPF_vec_output_s);
> +            printf("convert_p2s[%s]\t", lumaPartStr[value]);
> +            REPORT_SPEEDUP(opt.pu[value].convert_p2s, ref.pu[value].convert_p2s,
> +                               pixel_buff, srcStride,
> +                               IPF_vec_output_s, dstStride);
>          }
>      }
>  
>      for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
>      {
>          printf("= Color Space %s =\n", x265_source_csp_names[csp]);
> -        if (opt.chroma[csp].p2s)
> -        {
> -            printf("chroma_p2s\t");
> -            REPORT_SPEEDUP(opt.chroma[csp].p2s, ref.chroma[csp].p2s,
> -                           pixel_buff, srcStride, IPF_vec_output_s, width, height);
> -        }
>          for (int value = 0; value < NUM_PU_SIZES; value++)
>          {
>              if (opt.chroma[csp].pu[value].filter_hpp)
> @@ -836,13 +758,11 @@
>                                 short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
>                                 IPF_vec_output_s, dstStride, 1);
>              }
> -
> -            if (opt.chroma[csp].pu[value].chroma_p2s)
> +            if (opt.chroma[csp].pu[value].p2s)
>              {
>                  printf("chroma_p2s[%s]\t", chromaPartStr[csp][value]);
> -                REPORT_SPEEDUP(opt.chroma[csp].pu[value].chroma_p2s, ref.chroma[csp].pu[value].chroma_p2s,
> -                               pixel_buff, srcStride,
> -                               IPF_vec_output_s);
> +                REPORT_SPEEDUP(opt.chroma[csp].pu[value].p2s, ref.chroma[csp].pu[value].p2s,
> +                               pixel_buff, srcStride, IPF_vec_output_s, dstStride);
>              }
>          }
>      }
> diff -r 9a5fa67583fe -r 24c96db72960 source/test/ipfilterharness.h
> --- a/source/test/ipfilterharness.h	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/test/ipfilterharness.h	Fri Apr 03 18:18:48 2015 +0530
> @@ -50,7 +50,6 @@
>      pixel   pixel_test_buff[TEST_CASES][TEST_BUF_SIZE];
>      int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
>  
> -    bool check_IPFilter_primitive(filter_p2s_wxh_t ref, filter_p2s_wxh_t opt, int isChroma, int csp);
>      bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
>      bool check_IPFilterChroma_ps_primitive(filter_ps_t ref, filter_ps_t opt);
>      bool check_IPFilterChroma_hps_primitive(filter_hps_t ref, filter_hps_t opt);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list