[x265] [PATCH 1 of 5] Refactor EncoderPrimitives under common

Steve Borho steve at borho.org
Fri Jan 9 11:19:24 CET 2015


On 01/09, Kevin Wu wrote:
> # HG changeset patch
> # User Kevin Wu <kevin at multicorewareinc.com>
> # Date 1420752218 21600
> #      Thu Jan 08 15:23:38 2015 -0600
> # Node ID c6ca0fd54aa7c50119c9e5bdbbd02d49abb45559
> # Parent  1924c460d1304d9ce775f35864712dd98f758f9f
> Refactor EncoderPrimitives under common.

This series is queued for testing.
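
For anyone skimming the list: the series boils down to inverting the indexing of the primitive tables. Instead of one flat function-pointer array per primitive, indexed by partition size (p.satd[LUMA_4x4]), there is now one struct per PU/CU size holding all of that size's primitives (p.pu[LUMA_4x4].satd). Below is a minimal, self-contained sketch of that access-pattern change; the names and signatures are simplified for illustration and are not the actual x265 headers:

    #include <cstdio>

    typedef int (*pixelcmp_t)(const unsigned char*, int, const unsigned char*, int);

    // Stub cost functions, standing in for the real C primitives.
    static int satd_4x4_c(const unsigned char*, int, const unsigned char*, int) { return 0; }
    static int sad_4x4_c(const unsigned char*, int, const unsigned char*, int)  { return 0; }

    enum { LUMA_4x4, NUM_LUMA_PARTITIONS };

    // Before: one flat table per primitive, each indexed by partition.
    struct OldPrimitives
    {
        pixelcmp_t sad[NUM_LUMA_PARTITIONS];
        pixelcmp_t satd[NUM_LUMA_PARTITIONS];
    };

    // After: one struct per PU size, grouping every primitive for that size.
    struct NewPrimitives
    {
        struct PU
        {
            pixelcmp_t sad;
            pixelcmp_t satd;
        } pu[NUM_LUMA_PARTITIONS];
    };

    int main()
    {
        OldPrimitives oldp = {};
        NewPrimitives newp = {};

        oldp.satd[LUMA_4x4]    = satd_4x4_c;  // old style: p.satd[LUMA_4x4]
        newp.pu[LUMA_4x4].satd = satd_4x4_c;  // new style: p.pu[LUMA_4x4].satd
        newp.pu[LUMA_4x4].sad  = sad_4x4_c;

        // Call sites change the same way: pick the partition once, then the
        // primitive, so everything for one block size sits together.
        unsigned char a[16] = {}, b[16] = {};
        int c1 = oldp.satd[LUMA_4x4](a, 4, b, 4);
        int c2 = newp.pu[LUMA_4x4].satd(a, 4, b, 4);
        printf("%d %d\n", c1, c2);
        return 0;
    }

The practical win is locality: every primitive for a given block size is declared and assigned in one place, which is what the new struct PU / struct CU blocks in primitives.h provide.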

> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/dct.cpp
> --- a/source/common/dct.cpp	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/dct.cpp	Thu Jan 08 15:23:38 2015 -0600
> @@ -765,22 +765,22 @@
>      p.dequant_normal = dequant_normal_c;
>      p.quant = quant_c;
>      p.nquant = nquant_c;
> -    p.dct[DST_4x4] = dst4_c;
> -    p.dct[DCT_4x4] = dct4_c;
> -    p.dct[DCT_8x8] = dct8_c;
> -    p.dct[DCT_16x16] = dct16_c;
> -    p.dct[DCT_32x32] = dct32_c;
> -    p.idct[IDST_4x4] = idst4_c;
> -    p.idct[IDCT_4x4] = idct4_c;
> -    p.idct[IDCT_8x8] = idct8_c;
> -    p.idct[IDCT_16x16] = idct16_c;
> -    p.idct[IDCT_32x32] = idct32_c;
> +    p.dst4x4 = dst4_c;
> +    p.cu[BLOCK_4x4].dct   = dct4_c;
> +    p.cu[BLOCK_8x8].dct   = dct8_c;
> +    p.cu[BLOCK_16x16].dct = dct16_c;
> +    p.cu[BLOCK_32x32].dct = dct32_c;
> +    p.idst4x4 = idst4_c;
> +    p.cu[BLOCK_4x4].idct   = idct4_c;
> +    p.cu[BLOCK_8x8].idct   = idct8_c;
> +    p.cu[BLOCK_16x16].idct = idct16_c;
> +    p.cu[BLOCK_32x32].idct = idct32_c;
>      p.count_nonzero = count_nonzero_c;
>      p.denoiseDct = denoiseDct_c;
>  
> -    p.copy_cnt[BLOCK_4x4] = copy_count<4>;
> -    p.copy_cnt[BLOCK_8x8] = copy_count<8>;
> -    p.copy_cnt[BLOCK_16x16] = copy_count<16>;
> -    p.copy_cnt[BLOCK_32x32] = copy_count<32>;
> +    p.cu[BLOCK_4x4].copy_cnt   = copy_count<4>;
> +    p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
> +    p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
> +    p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
>  }
>  }
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/ipfilter.cpp
> --- a/source/common/ipfilter.cpp	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/ipfilter.cpp	Thu Jan 08 15:23:38 2015 -0600
> @@ -373,37 +373,37 @@
>  // x265 private namespace
>  
>  #define CHROMA_420(W, H) \
> -    p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \
> -    p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \
> -    p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>;  \
> -    p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>;  \
> -    p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>;  \
> -    p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>;  \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
>  
>  #define CHROMA_422(W, H) \
> -    p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \
> -    p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \
> -    p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>;  \
> -    p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>;  \
> -    p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>;  \
> -    p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>;  \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
>  
>  #define CHROMA_444(W, H) \
> -    p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \
> -    p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \
> -    p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>;  \
> -    p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>;  \
> -    p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>;  \
> -    p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>;
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>;  \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
>  
>  #define LUMA(W, H) \
> -    p.luma_hpp[LUMA_ ## W ## x ## H]     = interp_horiz_pp_c<8, W, H>; \
> -    p.luma_hps[LUMA_ ## W ## x ## H]     = interp_horiz_ps_c<8, W, H>; \
> -    p.luma_vpp[LUMA_ ## W ## x ## H]     = interp_vert_pp_c<8, W, H>;  \
> -    p.luma_vps[LUMA_ ## W ## x ## H]     = interp_vert_ps_c<8, W, H>;  \
> -    p.luma_vsp[LUMA_ ## W ## x ## H]     = interp_vert_sp_c<8, W, H>;  \
> -    p.luma_vss[LUMA_ ## W ## x ## H]     = interp_vert_ss_c<8, W, H>;  \
> -    p.luma_hvpp[LUMA_ ## W ## x ## H]    = interp_hv_pp_c<8, W, H>;
> +    p.pu[LUMA_ ## W ## x ## H].luma_hpp     = interp_horiz_pp_c<8, W, H>; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_hps     = interp_horiz_ps_c<8, W, H>; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_vpp     = interp_vert_pp_c<8, W, H>;  \
> +    p.pu[LUMA_ ## W ## x ## H].luma_vps     = interp_vert_ps_c<8, W, H>;  \
> +    p.pu[LUMA_ ## W ## x ## H].luma_vsp     = interp_vert_sp_c<8, W, H>;  \
> +    p.pu[LUMA_ ## W ## x ## H].luma_vss     = interp_vert_ss_c<8, W, H>;  \
> +    p.pu[LUMA_ ## W ## x ## H].luma_hvpp    = interp_hv_pp_c<8, W, H>;
>  
>  void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
>  {
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/lowres.h
> --- a/source/common/lowres.h	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/lowres.h	Thu Jan 08 15:23:38 2015 -0600
> @@ -69,7 +69,7 @@
>              int qmvy = qmv.y + (qmv.y & 1);
>              int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
>              pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
> -            primitives.pixelavg_pp[LUMA_8x8](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
> +            primitives.pu[LUMA_8x8].pixelavg_pp(buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
>              return buf;
>          }
>          else
> @@ -91,7 +91,7 @@
>              int qmvy = qmv.y + (qmv.y & 1);
>              int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
>              pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
> -            primitives.pixelavg_pp[LUMA_8x8](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
> +            primitives.pu[LUMA_8x8].pixelavg_pp(subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
>              return comp(fenc, FENC_STRIDE, subpelbuf, 8);
>          }
>          else
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/pixel.cpp
> --- a/source/common/pixel.cpp	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/pixel.cpp	Thu Jan 08 15:23:38 2015 -0600
> @@ -33,58 +33,58 @@
>  using namespace x265;
>  
>  #define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
> -    p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_8x8]   = FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_8x4]   = FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_4x8]   = FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x8]  = FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_8x16]  = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x4]  = FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_4x16]  = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_32x8]  = FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_8x32]  = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
> +    p.pu[LUMA_4x4].FUNC_PREFIX   = FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_8x8].FUNC_PREFIX   = FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_8x4].FUNC_PREFIX   = FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_4x8].FUNC_PREFIX   = FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_16x16].FUNC_PREFIX = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_16x8].FUNC_PREFIX  = FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_8x16].FUNC_PREFIX  = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_16x12].FUNC_PREFIX = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_12x16].FUNC_PREFIX = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_16x4].FUNC_PREFIX  = FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_4x16].FUNC_PREFIX  = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_32x32].FUNC_PREFIX = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_32x16].FUNC_PREFIX = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_16x32].FUNC_PREFIX = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_32x24].FUNC_PREFIX = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_24x32].FUNC_PREFIX = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_32x8].FUNC_PREFIX  = FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_8x32].FUNC_PREFIX  = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_64x64].FUNC_PREFIX = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_64x32].FUNC_PREFIX = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_32x64].FUNC_PREFIX = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_64x48].FUNC_PREFIX = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_48x64].FUNC_PREFIX = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_64x16].FUNC_PREFIX = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.pu[LUMA_16x64].FUNC_PREFIX = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
>  
>  #define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
> -    p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX<4,  4>; \
> -    p.FUNC_PREFIX[LUMA_8x8]   = FUNC_PREFIX<8,  8>; \
> -    p.FUNC_PREFIX[LUMA_8x4]   = FUNC_PREFIX<8,  4>; \
> -    p.FUNC_PREFIX[LUMA_4x8]   = FUNC_PREFIX<4,  8>; \
> -    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \
> -    p.FUNC_PREFIX[LUMA_16x8]  = FUNC_PREFIX<16,  8>; \
> -    p.FUNC_PREFIX[LUMA_8x16]  = FUNC_PREFIX<8, 16>; \
> -    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \
> -    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \
> -    p.FUNC_PREFIX[LUMA_16x4]  = FUNC_PREFIX<16,  4>; \
> -    p.FUNC_PREFIX[LUMA_4x16]  = FUNC_PREFIX<4, 16>; \
> -    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \
> -    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \
> -    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \
> -    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \
> -    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 32>; \
> -    p.FUNC_PREFIX[LUMA_32x8]  = FUNC_PREFIX<32,  8>; \
> -    p.FUNC_PREFIX[LUMA_8x32]  = FUNC_PREFIX<8, 32>; \
> -    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \
> -    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \
> -    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \
> -    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \
> -    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \
> -    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \
> -    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>;
> +    p.pu[LUMA_4x4].FUNC_PREFIX   = FUNC_PREFIX<4,  4>; \
> +    p.pu[LUMA_8x8].FUNC_PREFIX   = FUNC_PREFIX<8,  8>; \
> +    p.pu[LUMA_8x4].FUNC_PREFIX   = FUNC_PREFIX<8,  4>; \
> +    p.pu[LUMA_4x8].FUNC_PREFIX   = FUNC_PREFIX<4,  8>; \
> +    p.pu[LUMA_16x16].FUNC_PREFIX = FUNC_PREFIX<16, 16>; \
> +    p.pu[LUMA_16x8].FUNC_PREFIX  = FUNC_PREFIX<16,  8>; \
> +    p.pu[LUMA_8x16].FUNC_PREFIX  = FUNC_PREFIX<8, 16>; \
> +    p.pu[LUMA_16x12].FUNC_PREFIX = FUNC_PREFIX<16, 12>; \
> +    p.pu[LUMA_12x16].FUNC_PREFIX = FUNC_PREFIX<12, 16>; \
> +    p.pu[LUMA_16x4].FUNC_PREFIX  = FUNC_PREFIX<16,  4>; \
> +    p.pu[LUMA_4x16].FUNC_PREFIX  = FUNC_PREFIX<4, 16>; \
> +    p.pu[LUMA_32x32].FUNC_PREFIX = FUNC_PREFIX<32, 32>; \
> +    p.pu[LUMA_32x16].FUNC_PREFIX = FUNC_PREFIX<32, 16>; \
> +    p.pu[LUMA_16x32].FUNC_PREFIX = FUNC_PREFIX<16, 32>; \
> +    p.pu[LUMA_32x24].FUNC_PREFIX = FUNC_PREFIX<32, 24>; \
> +    p.pu[LUMA_24x32].FUNC_PREFIX = FUNC_PREFIX<24, 32>; \
> +    p.pu[LUMA_32x8].FUNC_PREFIX  = FUNC_PREFIX<32,  8>; \
> +    p.pu[LUMA_8x32].FUNC_PREFIX  = FUNC_PREFIX<8, 32>; \
> +    p.pu[LUMA_64x64].FUNC_PREFIX = FUNC_PREFIX<64, 64>; \
> +    p.pu[LUMA_64x32].FUNC_PREFIX = FUNC_PREFIX<64, 32>; \
> +    p.pu[LUMA_32x64].FUNC_PREFIX = FUNC_PREFIX<32, 64>; \
> +    p.pu[LUMA_64x48].FUNC_PREFIX = FUNC_PREFIX<64, 48>; \
> +    p.pu[LUMA_48x64].FUNC_PREFIX = FUNC_PREFIX<48, 64>; \
> +    p.pu[LUMA_64x16].FUNC_PREFIX = FUNC_PREFIX<64, 16>; \
> +    p.pu[LUMA_16x64].FUNC_PREFIX = FUNC_PREFIX<16, 64>;
>  
>  namespace {
>  // place functions in anonymous namespace (file static)
> @@ -1019,132 +1019,132 @@
>      SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp)
>  
>      // satd
> -    p.satd[LUMA_4x4]   = satd_4x4;
> -    p.satd[LUMA_8x8]   = satd8<8, 8>;
> -    p.satd[LUMA_8x4]   = satd_8x4;
> -    p.satd[LUMA_4x8]   = satd4<4, 8>;
> -    p.satd[LUMA_16x16] = satd8<16, 16>;
> -    p.satd[LUMA_16x8]  = satd8<16, 8>;
> -    p.satd[LUMA_8x16]  = satd8<8, 16>;
> -    p.satd[LUMA_16x12] = satd8<16, 12>;
> -    p.satd[LUMA_12x16] = satd4<12, 16>;
> -    p.satd[LUMA_16x4]  = satd8<16, 4>;
> -    p.satd[LUMA_4x16]  = satd4<4, 16>;
> -    p.satd[LUMA_32x32] = satd8<32, 32>;
> -    p.satd[LUMA_32x16] = satd8<32, 16>;
> -    p.satd[LUMA_16x32] = satd8<16, 32>;
> -    p.satd[LUMA_32x24] = satd8<32, 24>;
> -    p.satd[LUMA_24x32] = satd8<24, 32>;
> -    p.satd[LUMA_32x8]  = satd8<32, 8>;
> -    p.satd[LUMA_8x32]  = satd8<8, 32>;
> -    p.satd[LUMA_64x64] = satd8<64, 64>;
> -    p.satd[LUMA_64x32] = satd8<64, 32>;
> -    p.satd[LUMA_32x64] = satd8<32, 64>;
> -    p.satd[LUMA_64x48] = satd8<64, 48>;
> -    p.satd[LUMA_48x64] = satd8<48, 64>;
> -    p.satd[LUMA_64x16] = satd8<64, 16>;
> -    p.satd[LUMA_16x64] = satd8<16, 64>;
> +    p.pu[LUMA_4x4].satd   = satd_4x4;
> +    p.pu[LUMA_8x8].satd   = satd8<8, 8>;
> +    p.pu[LUMA_8x4].satd   = satd_8x4;
> +    p.pu[LUMA_4x8].satd   = satd4<4, 8>;
> +    p.pu[LUMA_16x16].satd = satd8<16, 16>;
> +    p.pu[LUMA_16x8].satd  = satd8<16, 8>;
> +    p.pu[LUMA_8x16].satd  = satd8<8, 16>;
> +    p.pu[LUMA_16x12].satd = satd8<16, 12>;
> +    p.pu[LUMA_12x16].satd = satd4<12, 16>;
> +    p.pu[LUMA_16x4].satd  = satd8<16, 4>;
> +    p.pu[LUMA_4x16].satd  = satd4<4, 16>;
> +    p.pu[LUMA_32x32].satd = satd8<32, 32>;
> +    p.pu[LUMA_32x16].satd = satd8<32, 16>;
> +    p.pu[LUMA_16x32].satd = satd8<16, 32>;
> +    p.pu[LUMA_32x24].satd = satd8<32, 24>;
> +    p.pu[LUMA_24x32].satd = satd8<24, 32>;
> +    p.pu[LUMA_32x8].satd  = satd8<32, 8>;
> +    p.pu[LUMA_8x32].satd  = satd8<8, 32>;
> +    p.pu[LUMA_64x64].satd = satd8<64, 64>;
> +    p.pu[LUMA_64x32].satd = satd8<64, 32>;
> +    p.pu[LUMA_32x64].satd = satd8<32, 64>;
> +    p.pu[LUMA_64x48].satd = satd8<64, 48>;
> +    p.pu[LUMA_48x64].satd = satd8<48, 64>;
> +    p.pu[LUMA_64x16].satd = satd8<64, 16>;
> +    p.pu[LUMA_16x64].satd = satd8<16, 64>;
>  
> -    p.chroma[X265_CSP_I420].satd[CHROMA_2x2]   = NULL;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_4x4]   = satd_4x4;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_8x8]   = satd8<8, 8>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_2x2].satd   = NULL;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_4x4].satd   = satd_4x4;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_8x8].satd   = satd8<8, 8>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_16x16].satd = satd8<16, 16>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_32x32].satd = satd8<32, 32>;
>  
> -    p.chroma[X265_CSP_I420].satd[CHROMA_4x2]   = NULL;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_2x4]   = NULL;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_8x4]   = satd_8x4;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_4x8]   = satd4<4, 8>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_16x8]  = satd8<16, 8>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_8x16]  = satd8<8, 16>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_4x2].satd   = NULL;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_2x4].satd   = NULL;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_8x4].satd   = satd_8x4;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_4x8].satd   = satd4<4, 8>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_16x8].satd  = satd8<16, 8>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_8x16].satd  = satd8<8, 16>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_32x16].satd = satd8<32, 16>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_16x32].satd = satd8<16, 32>;
>  
> -    p.chroma[X265_CSP_I420].satd[CHROMA_8x6]   = NULL;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_6x8]   = NULL;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_8x2]   = NULL;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_2x8]   = NULL;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_16x4]  = satd4<16, 4>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_4x16]  = satd4<4, 16>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_32x8]  = satd8<32, 8>;
> -    p.chroma[X265_CSP_I420].satd[CHROMA_8x32]  = satd8<8, 32>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_8x6].satd   = NULL;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_6x8].satd   = NULL;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_8x2].satd   = NULL;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_2x8].satd   = NULL;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_16x12].satd = satd4<16, 12>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_12x16].satd = satd4<12, 16>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_16x4].satd  = satd4<16, 4>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_4x16].satd  = satd4<4, 16>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_32x24].satd = satd8<32, 24>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_24x32].satd = satd8<24, 32>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_32x8].satd  = satd8<32, 8>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_8x32].satd  = satd8<8, 32>;
>  
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_2x4]   = NULL;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8]   = satd4<4, 8>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16]  = satd8<8, 16>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_2x4].satd   = NULL;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_4x8].satd   = satd4<4, 8>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_8x16].satd  = satd8<8, 16>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_16x32].satd = satd8<16, 32>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_32x64].satd = satd8<32, 64>;
>  
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4]   = satd_4x4;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_2x8]   = NULL;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8]   = satd8<8, 8>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16]  = satd4<4, 16>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32]  = satd8<8, 32>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_4x4].satd   = satd_4x4;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_2x8].satd   = NULL;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_8x8].satd   = satd8<8, 8>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_4x16].satd  = satd4<4, 16>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_16x16].satd = satd8<16, 16>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_8x32].satd  = satd8<8, 32>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_32x32].satd = satd8<32, 32>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_16x64].satd = satd8<16, 64>;
>  
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_8x12]  = satd4<8, 12>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_6x16]  = NULL;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4]   = satd4<8, 4>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_2x16]  = NULL;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8]  = satd8<16, 8>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_4x32]  = satd4<4, 32>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_8x64]  = satd8<8, 64>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_8x12].satd  = satd4<8, 12>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_6x16].satd  = NULL;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_8x4].satd   = satd4<8, 4>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_2x16].satd  = NULL;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_16x24].satd = satd8<16, 24>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_12x32].satd = satd4<12, 32>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_16x8].satd  = satd8<16, 8>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_4x32].satd  = satd4<4, 32>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_32x48].satd = satd8<32, 48>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_24x64].satd = satd8<24, 64>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_32x16].satd = satd8<32, 16>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_8x64].satd  = satd8<8, 64>;
>  
>  #define CHROMA_420(W, H) \
> -    p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H]  = addAvg<W, H>;         \
> -    p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
> -    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
> -    p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
> -    p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].addAvg  = addAvg<W, H>;         \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>;
>  
>  #define CHROMA_422(W, H) \
> -    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H]  = addAvg<W, H>;         \
> -    p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
> -    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
> -    p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
> -    p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].addAvg  = addAvg<W, H>;         \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>;
>  
>  #define CHROMA_444(W, H) \
> -    p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H]    = p.satd[LUMA_ ## W ## x ## H]; \
> -    p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H]  = addAvg<W, H>; \
> -    p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
> -    p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
> -    p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
> -    p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].satd    = p.pu[LUMA_ ## W ## x ## H].satd; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].addAvg  = addAvg<W, H>; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>;
>  
>  #define LUMA(W, H) \
> -    p.luma_addAvg[LUMA_ ## W ## x ## H]  = addAvg<W, H>; \
> -    p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
> -    p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
> -    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
> -    p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
> +    p.pu[LUMA_ ## W ## x ## H].luma_addAvg  = addAvg<W, H>; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_copy_pp = blockcopy_pp_c<W, H>; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_copy_sp = blockcopy_sp_c<W, H>; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_copy_ps = blockcopy_ps_c<W, H>; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_copy_ss = blockcopy_ss_c<W, H>;
>  
>  #define LUMA_PIXELSUB(W, H) \
> -    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
> -    p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
> +    p.pu[LUMA_ ## W ## x ## H].luma_sub_ps = pixel_sub_ps_c<W, H>; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_add_ps = pixel_add_ps_c<W, H>;
>  
>  #define CHROMA_PIXELSUB_420(W, H) \
> -    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;  \
> -    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
> +    p.chroma[X265_CSP_I420].cu[CHROMA_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>;  \
> +    p.chroma[X265_CSP_I420].cu[CHROMA_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
>  
>  #define CHROMA_PIXELSUB_422(W, H) \
> -    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
> -    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
> +    p.chroma[X265_CSP_I422].cu[CHROMA422_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
> +    p.chroma[X265_CSP_I422].cu[CHROMA422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
>  
>  #define CHROMA_PIXELSUB_444(W, H) \
> -    p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
> -    p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
> +    p.chroma[X265_CSP_I444].cu[LUMA_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
> +    p.chroma[X265_CSP_I444].cu[LUMA_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
>  
>      LUMA(4, 4);
>      LUMA(8, 8);
> @@ -1269,89 +1269,89 @@
>      SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
>      SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
>  
> -    p.blockfill_s[BLOCK_4x4]   = blockfil_s_c<4>;
> -    p.blockfill_s[BLOCK_8x8]   = blockfil_s_c<8>;
> -    p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>;
> -    p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
> -    p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
> +    p.cu[BLOCK_4x4].blockfill_s   = blockfil_s_c<4>;
> +    p.cu[BLOCK_8x8].blockfill_s   = blockfil_s_c<8>;
> +    p.cu[BLOCK_16x16].blockfill_s = blockfil_s_c<16>;
> +    p.cu[BLOCK_32x32].blockfill_s = blockfil_s_c<32>;
> +    p.cu[BLOCK_64x64].blockfill_s = blockfil_s_c<64>;
>  
> -    p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
> -    p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
> -    p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
> -    p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
> -    p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
> -    p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
> -    p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
> -    p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
> -    p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
> -    p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
> -    p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
> -    p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
> -    p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
> -    p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
> -    p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
> -    p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
> +    p.cu[BLOCK_4x4].cpy2Dto1D_shl   = cpy2Dto1D_shl<4>;
> +    p.cu[BLOCK_8x8].cpy2Dto1D_shl   = cpy2Dto1D_shl<8>;
> +    p.cu[BLOCK_16x16].cpy2Dto1D_shl = cpy2Dto1D_shl<16>;
> +    p.cu[BLOCK_32x32].cpy2Dto1D_shl = cpy2Dto1D_shl<32>;
> +    p.cu[BLOCK_4x4].cpy2Dto1D_shr   = cpy2Dto1D_shr<4>;
> +    p.cu[BLOCK_8x8].cpy2Dto1D_shr   = cpy2Dto1D_shr<8>;
> +    p.cu[BLOCK_16x16].cpy2Dto1D_shr = cpy2Dto1D_shr<16>;
> +    p.cu[BLOCK_32x32].cpy2Dto1D_shr = cpy2Dto1D_shr<32>;
> +    p.cu[BLOCK_4x4].cpy1Dto2D_shl   = cpy1Dto2D_shl<4>;
> +    p.cu[BLOCK_8x8].cpy1Dto2D_shl   = cpy1Dto2D_shl<8>;
> +    p.cu[BLOCK_16x16].cpy1Dto2D_shl = cpy1Dto2D_shl<16>;
> +    p.cu[BLOCK_32x32].cpy1Dto2D_shl = cpy1Dto2D_shl<32>;
> +    p.cu[BLOCK_4x4].cpy1Dto2D_shr   = cpy1Dto2D_shr<4>;
> +    p.cu[BLOCK_8x8].cpy1Dto2D_shr   = cpy1Dto2D_shr<8>;
> +    p.cu[BLOCK_16x16].cpy1Dto2D_shr = cpy1Dto2D_shr<16>;
> +    p.cu[BLOCK_32x32].cpy1Dto2D_shr = cpy1Dto2D_shr<32>;
>  
> -    p.sa8d[BLOCK_4x4]   = satd_4x4;
> -    p.sa8d[BLOCK_8x8]   = sa8d_8x8;
> -    p.sa8d[BLOCK_16x16] = sa8d_16x16;
> -    p.sa8d[BLOCK_32x32] = sa8d16<32, 32>;
> -    p.sa8d[BLOCK_64x64] = sa8d16<64, 64>;
> +    p.cu[BLOCK_4x4].sa8d   = satd_4x4;
> +    p.cu[BLOCK_8x8].sa8d   = sa8d_8x8;
> +    p.cu[BLOCK_16x16].sa8d = sa8d_16x16;
> +    p.cu[BLOCK_32x32].sa8d = sa8d16<32, 32>;
> +    p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
>  
> -    p.psy_cost_pp[BLOCK_4x4] = psyCost_pp<BLOCK_4x4>;
> -    p.psy_cost_pp[BLOCK_8x8] = psyCost_pp<BLOCK_8x8>;
> -    p.psy_cost_pp[BLOCK_16x16] = psyCost_pp<BLOCK_16x16>;
> -    p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
> -    p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;
> +    p.cu[BLOCK_4x4].psy_cost_pp   = psyCost_pp<BLOCK_4x4>;
> +    p.cu[BLOCK_8x8].psy_cost_pp   = psyCost_pp<BLOCK_8x8>;
> +    p.cu[BLOCK_16x16].psy_cost_pp = psyCost_pp<BLOCK_16x16>;
> +    p.cu[BLOCK_32x32].psy_cost_pp = psyCost_pp<BLOCK_32x32>;
> +    p.cu[BLOCK_64x64].psy_cost_pp = psyCost_pp<BLOCK_64x64>;
>  
> -    p.psy_cost_ss[BLOCK_4x4] = psyCost_ss<BLOCK_4x4>;
> -    p.psy_cost_ss[BLOCK_8x8] = psyCost_ss<BLOCK_8x8>;
> -    p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
> -    p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
> -    p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;
> +    p.cu[BLOCK_4x4].psy_cost_ss   = psyCost_ss<BLOCK_4x4>;
> +    p.cu[BLOCK_8x8].psy_cost_ss   = psyCost_ss<BLOCK_8x8>;
> +    p.cu[BLOCK_16x16].psy_cost_ss = psyCost_ss<BLOCK_16x16>;
> +    p.cu[BLOCK_32x32].psy_cost_ss = psyCost_ss<BLOCK_32x32>;
> +    p.cu[BLOCK_64x64].psy_cost_ss = psyCost_ss<BLOCK_64x64>;
>  
> -    p.sa8d_inter[LUMA_4x4]   = satd_4x4;
> -    p.sa8d_inter[LUMA_8x8]   = sa8d_8x8;
> -    p.sa8d_inter[LUMA_8x4]   = satd_8x4;
> -    p.sa8d_inter[LUMA_4x8]   = satd4<4, 8>;
> -    p.sa8d_inter[LUMA_16x16] = sa8d_16x16;
> -    p.sa8d_inter[LUMA_16x8]  = sa8d8<16, 8>;
> -    p.sa8d_inter[LUMA_8x16]  = sa8d8<8, 16>;
> -    p.sa8d_inter[LUMA_16x12] = satd8<16, 12>;
> -    p.sa8d_inter[LUMA_12x16] = satd4<12, 16>;
> -    p.sa8d_inter[LUMA_4x16]  = satd4<4, 16>;
> -    p.sa8d_inter[LUMA_16x4]  = satd8<16, 4>;
> -    p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>;
> -    p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>;
> -    p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>;
> -    p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>;
> -    p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>;
> -    p.sa8d_inter[LUMA_32x8]  = sa8d8<32, 8>;
> -    p.sa8d_inter[LUMA_8x32]  = sa8d8<8, 32>;
> -    p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>;
> -    p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>;
> -    p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>;
> -    p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>;
> -    p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>;
> -    p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>;
> -    p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>;
> +    p.pu[LUMA_4x4].sa8d_inter   = satd_4x4;
> +    p.pu[LUMA_8x8].sa8d_inter   = sa8d_8x8;
> +    p.pu[LUMA_8x4].sa8d_inter   = satd_8x4;
> +    p.pu[LUMA_4x8].sa8d_inter   = satd4<4, 8>;
> +    p.pu[LUMA_16x16].sa8d_inter = sa8d_16x16;
> +    p.pu[LUMA_16x8].sa8d_inter  = sa8d8<16, 8>;
> +    p.pu[LUMA_8x16].sa8d_inter  = sa8d8<8, 16>;
> +    p.pu[LUMA_16x12].sa8d_inter = satd8<16, 12>;
> +    p.pu[LUMA_12x16].sa8d_inter = satd4<12, 16>;
> +    p.pu[LUMA_4x16].sa8d_inter  = satd4<4, 16>;
> +    p.pu[LUMA_16x4].sa8d_inter  = satd8<16, 4>;
> +    p.pu[LUMA_32x32].sa8d_inter = sa8d16<32, 32>;
> +    p.pu[LUMA_32x16].sa8d_inter = sa8d16<32, 16>;
> +    p.pu[LUMA_16x32].sa8d_inter = sa8d16<16, 32>;
> +    p.pu[LUMA_32x24].sa8d_inter = sa8d8<32, 24>;
> +    p.pu[LUMA_24x32].sa8d_inter = sa8d8<24, 32>;
> +    p.pu[LUMA_32x8].sa8d_inter  = sa8d8<32, 8>;
> +    p.pu[LUMA_8x32].sa8d_inter  = sa8d8<8, 32>;
> +    p.pu[LUMA_64x64].sa8d_inter = sa8d16<64, 64>;
> +    p.pu[LUMA_64x32].sa8d_inter = sa8d16<64, 32>;
> +    p.pu[LUMA_32x64].sa8d_inter = sa8d16<32, 64>;
> +    p.pu[LUMA_64x48].sa8d_inter = sa8d16<64, 48>;
> +    p.pu[LUMA_48x64].sa8d_inter = sa8d16<48, 64>;
> +    p.pu[LUMA_64x16].sa8d_inter = sa8d16<64, 16>;
> +    p.pu[LUMA_16x64].sa8d_inter = sa8d16<16, 64>;
>  
> -    p.calcresidual[BLOCK_4x4] = getResidual<4>;
> -    p.calcresidual[BLOCK_8x8] = getResidual<8>;
> -    p.calcresidual[BLOCK_16x16] = getResidual<16>;
> -    p.calcresidual[BLOCK_32x32] = getResidual<32>;
> -    p.calcresidual[BLOCK_64x64] = NULL;
> +    p.cu[BLOCK_4x4].calcresidual   = getResidual<4>;
> +    p.cu[BLOCK_8x8].calcresidual   = getResidual<8>;
> +    p.cu[BLOCK_16x16].calcresidual = getResidual<16>;
> +    p.cu[BLOCK_32x32].calcresidual = getResidual<32>;
> +    p.cu[BLOCK_64x64].calcresidual = NULL;
>  
> -    p.transpose[BLOCK_4x4] = transpose<4>;
> -    p.transpose[BLOCK_8x8] = transpose<8>;
> -    p.transpose[BLOCK_16x16] = transpose<16>;
> -    p.transpose[BLOCK_32x32] = transpose<32>;
> -    p.transpose[BLOCK_64x64] = transpose<64>;
> +    p.cu[BLOCK_4x4].transpose   = transpose<4>;
> +    p.cu[BLOCK_8x8].transpose   = transpose<8>;
> +    p.cu[BLOCK_16x16].transpose = transpose<16>;
> +    p.cu[BLOCK_32x32].transpose = transpose<32>;
> +    p.cu[BLOCK_64x64].transpose = transpose<64>;
>  
> -    p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
> -    p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
> -    p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
> -    p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;
> +    p.cu[BLOCK_4x4].ssd_s   = pixel_ssd_s_c<4>;
> +    p.cu[BLOCK_8x8].ssd_s   = pixel_ssd_s_c<8>;
> +    p.cu[BLOCK_16x16].ssd_s = pixel_ssd_s_c<16>;
> +    p.cu[BLOCK_32x32].ssd_s = pixel_ssd_s_c<32>;
>  
>      p.weight_pp = weight_pp_c;
>      p.weight_sp = weight_sp_c;
> @@ -1362,10 +1362,10 @@
>      p.ssim_4x4x2_core = ssim_4x4x2_core;
>      p.ssim_end_4 = ssim_end_4;
>  
> -    p.var[BLOCK_8x8] = pixel_var<8>;
> -    p.var[BLOCK_16x16] = pixel_var<16>;
> -    p.var[BLOCK_32x32] = pixel_var<32>;
> -    p.var[BLOCK_64x64] = pixel_var<64>;
> +    p.cu[BLOCK_8x8].var   = pixel_var<8>;
> +    p.cu[BLOCK_16x16].var = pixel_var<16>;
> +    p.cu[BLOCK_32x32].var = pixel_var<32>;
> +    p.cu[BLOCK_64x64].var = pixel_var<64>;
>      p.planecopy_cp = planecopy_cp_c;
>      p.planecopy_sp = planecopy_sp_c;
>      p.propagateCost = estimateCUPropagateCost;
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/predict.cpp
> --- a/source/common/predict.cpp	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/predict.cpp	Thu Jan 08 15:23:38 2015 -0600
> @@ -334,13 +334,13 @@
>      int yFrac = mv.y & 0x3;
>  
>      if (!(yFrac | xFrac))
> -        primitives.luma_copy_pp[partEnum](dst, dstStride, src, srcStride);
> +        primitives.pu[partEnum].luma_copy_pp(dst, dstStride, src, srcStride);
>      else if (!yFrac)
> -        primitives.luma_hpp[partEnum](src, srcStride, dst, dstStride, xFrac);
> +        primitives.pu[partEnum].luma_hpp(src, srcStride, dst, dstStride, xFrac);
>      else if (!xFrac)
> -        primitives.luma_vpp[partEnum](src, srcStride, dst, dstStride, yFrac);
> +        primitives.pu[partEnum].luma_vpp(src, srcStride, dst, dstStride, yFrac);
>      else
> -        primitives.luma_hvpp[partEnum](src, srcStride, dst, dstStride, xFrac, yFrac);
> +        primitives.pu[partEnum].luma_hvpp(src, srcStride, dst, dstStride, xFrac, yFrac);
>  }
>  
>  void Predict::predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
> @@ -363,16 +363,16 @@
>      if (!(yFrac | xFrac))
>          primitives.luma_p2s(src, srcStride, dst, m_puWidth, m_puHeight);
>      else if (!yFrac)
> -        primitives.luma_hps[partEnum](src, srcStride, dst, dstStride, xFrac, 0);
> +        primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
>      else if (!xFrac)
> -        primitives.luma_vps[partEnum](src, srcStride, dst, dstStride, yFrac);
> +        primitives.pu[partEnum].luma_vps(src, srcStride, dst, dstStride, yFrac);
>      else
>      {
>          int tmpStride = m_puWidth;
>          int filterSize = NTAPS_LUMA;
>          int halfFilterSize = (filterSize >> 1);
> -        primitives.luma_hps[partEnum](src, srcStride, m_immedVals, tmpStride, xFrac, 1);
> -        primitives.luma_vss[partEnum](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
> +        primitives.pu[partEnum].luma_hps(src, srcStride, m_immedVals, tmpStride, xFrac, 1);
> +        primitives.pu[partEnum].luma_vss(m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
>      }
>  }
>  
> @@ -399,18 +399,18 @@
>      
>      if (!(yFrac | xFrac))
>      {
> -        primitives.chroma[m_csp].copy_pp[partEnum](dstCb, dstStride, refCb, refStride);
> -        primitives.chroma[m_csp].copy_pp[partEnum](dstCr, dstStride, refCr, refStride);
> +        primitives.chroma[m_csp].pu[partEnum].copy_pp(dstCb, dstStride, refCb, refStride);
> +        primitives.chroma[m_csp].pu[partEnum].copy_pp(dstCr, dstStride, refCr, refStride);
>      }
>      else if (!yFrac)
>      {
> -        primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift));
> -        primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift));
> +        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift));
> +        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift));
>      }
>      else if (!xFrac)
>      {
> -        primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
> -        primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
> +        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
> +        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
>      }
>      else
>      {
> @@ -418,11 +418,11 @@
>          int filterSize = NTAPS_CHROMA;
>          int halfFilterSize = (filterSize >> 1);
>  
> -        primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
> -        primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
> +        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
> +        primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
>  
> -        primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
> -        primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
> +        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
> +        primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
>      }
>  }
>  
> @@ -459,23 +459,23 @@
>      }
>      else if (!yFrac)
>      {
> -        primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift), 0);
> -        primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift), 0);
> +        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift), 0);
> +        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift), 0);
>      }
>      else if (!xFrac)
>      {
> -        primitives.chroma[m_csp].filter_vps[partEnum](refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
> -        primitives.chroma[m_csp].filter_vps[partEnum](refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
> +        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
> +        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
>      }
>      else
>      {
>          int extStride = cxWidth;
>          int filterSize = NTAPS_CHROMA;
>          int halfFilterSize = (filterSize >> 1);
> -        primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
> -        primitives.chroma[m_csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
> -        primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
> -        primitives.chroma[m_csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
> +        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
> +        primitives.chroma[m_csp].pu[partEnum].filter_vss(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
> +        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
> +        primitives.chroma[m_csp].pu[partEnum].filter_vss(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
>      }
>  }
>  
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/primitives.cpp
> --- a/source/common/primitives.cpp	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/primitives.cpp	Thu Jan 08 15:23:38 2015 -0600
> @@ -71,79 +71,79 @@
>      /* copy reusable luma primitives to chroma 4:4:4 */
>      for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
>      {
> -        p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];
> -        p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
> -        p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
> -        p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
> -        p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
> -        p.chroma[X265_CSP_I444].satd[i] = p.satd[i];
> +        p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].luma_copy_pp;
> +        p.chroma[X265_CSP_I444].pu[i].copy_ps = p.pu[i].luma_copy_ps;
> +        p.chroma[X265_CSP_I444].pu[i].copy_sp = p.pu[i].luma_copy_sp;
> +        p.chroma[X265_CSP_I444].pu[i].copy_ss = p.pu[i].luma_copy_ss;
> +        p.chroma[X265_CSP_I444].pu[i].addAvg  = p.pu[i].luma_addAvg;
> +        p.chroma[X265_CSP_I444].pu[i].satd    = p.pu[i].satd;
>      }
>  
>      for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
>      {
> -        p.chroma[X265_CSP_I444].add_ps[i]  = p.luma_add_ps[i];
> -        p.chroma[X265_CSP_I444].sub_ps[i]  = p.luma_sub_ps[i];
> +        p.chroma[X265_CSP_I444].cu[i].add_ps  = p.pu[i].luma_add_ps;
> +        p.chroma[X265_CSP_I444].cu[i].sub_ps  = p.pu[i].luma_sub_ps;
>      }
>  
> -    primitives.sa8d[BLOCK_4x4]   = primitives.satd[LUMA_4x4];
> -    primitives.sa8d[BLOCK_8x8]   = primitives.sa8d_inter[LUMA_8x8];
> -    primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16];
> -    primitives.sa8d[BLOCK_32x32] = primitives.sa8d_inter[LUMA_32x32];
> -    primitives.sa8d[BLOCK_64x64] = primitives.sa8d_inter[LUMA_64x64];
> +    primitives.cu[BLOCK_4x4].sa8d   = primitives.pu[LUMA_4x4].satd;
> +    primitives.cu[BLOCK_8x8].sa8d   = primitives.pu[LUMA_8x8].sa8d_inter;
> +    primitives.cu[BLOCK_16x16].sa8d = primitives.pu[LUMA_16x16].sa8d_inter;
> +    primitives.cu[BLOCK_32x32].sa8d = primitives.pu[LUMA_32x32].sa8d_inter;
> +    primitives.cu[BLOCK_64x64].sa8d = primitives.pu[LUMA_64x64].sa8d_inter;
>  
>      // SA8D devolves to SATD for blocks not even multiples of 8x8
> -    primitives.sa8d_inter[LUMA_4x4]   = primitives.satd[LUMA_4x4];
> -    primitives.sa8d_inter[LUMA_4x8]   = primitives.satd[LUMA_4x8];
> -    primitives.sa8d_inter[LUMA_4x16]  = primitives.satd[LUMA_4x16];
> -    primitives.sa8d_inter[LUMA_8x4]   = primitives.satd[LUMA_8x4];
> -    primitives.sa8d_inter[LUMA_16x4]  = primitives.satd[LUMA_16x4];
> -    primitives.sa8d_inter[LUMA_16x12] = primitives.satd[LUMA_16x12];
> -    primitives.sa8d_inter[LUMA_12x16] = primitives.satd[LUMA_12x16];
> +    primitives.pu[LUMA_4x4].sa8d_inter   = primitives.pu[LUMA_4x4].satd;
> +    primitives.pu[LUMA_4x8].sa8d_inter   = primitives.pu[LUMA_4x8].satd;
> +    primitives.pu[LUMA_4x16].sa8d_inter  = primitives.pu[LUMA_4x16].satd;
> +    primitives.pu[LUMA_8x4].sa8d_inter   = primitives.pu[LUMA_8x4].satd;
> +    primitives.pu[LUMA_16x4].sa8d_inter  = primitives.pu[LUMA_16x4].satd;
> +    primitives.pu[LUMA_16x12].sa8d_inter = primitives.pu[LUMA_16x12].satd;
> +    primitives.pu[LUMA_12x16].sa8d_inter = primitives.pu[LUMA_12x16].satd;
>  
>      // Chroma SATD can often reuse luma primitives
> -    p.chroma[X265_CSP_I420].satd[CHROMA_4x4]   = primitives.satd[LUMA_4x4];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_8x8]   = primitives.satd[LUMA_8x8];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = primitives.satd[LUMA_16x16];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = primitives.satd[LUMA_32x32];
> +    p.chroma[X265_CSP_I420].pu[CHROMA_4x4].satd   = primitives.pu[LUMA_4x4].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_8x8].satd   = primitives.pu[LUMA_8x8].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_16x16].satd = primitives.pu[LUMA_16x16].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_32x32].satd = primitives.pu[LUMA_32x32].satd;
>  
> -    p.chroma[X265_CSP_I420].satd[CHROMA_8x4]   = primitives.satd[LUMA_8x4];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_4x8]   = primitives.satd[LUMA_4x8];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_16x8]  = primitives.satd[LUMA_16x8];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_8x16]  = primitives.satd[LUMA_8x16];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = primitives.satd[LUMA_32x16];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = primitives.satd[LUMA_16x32];
> +    p.chroma[X265_CSP_I420].pu[CHROMA_8x4].satd   = primitives.pu[LUMA_8x4].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_4x8].satd   = primitives.pu[LUMA_4x8].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_16x8].satd  = primitives.pu[LUMA_16x8].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_8x16].satd  = primitives.pu[LUMA_8x16].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_32x16].satd = primitives.pu[LUMA_32x16].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_16x32].satd = primitives.pu[LUMA_16x32].satd;
>  
> -    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = primitives.satd[LUMA_16x12];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = primitives.satd[LUMA_12x16];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_16x4]  = primitives.satd[LUMA_16x4];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_4x16]  = primitives.satd[LUMA_4x16];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = primitives.satd[LUMA_32x24];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = primitives.satd[LUMA_24x32];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_32x8]  = primitives.satd[LUMA_32x8];
> -    p.chroma[X265_CSP_I420].satd[CHROMA_8x32]  = primitives.satd[LUMA_8x32];
> +    p.chroma[X265_CSP_I420].pu[CHROMA_16x12].satd = primitives.pu[LUMA_16x12].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_12x16].satd = primitives.pu[LUMA_12x16].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_16x4].satd  = primitives.pu[LUMA_16x4].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_4x16].satd  = primitives.pu[LUMA_4x16].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_32x24].satd = primitives.pu[LUMA_32x24].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_24x32].satd = primitives.pu[LUMA_24x32].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_32x8].satd  = primitives.pu[LUMA_32x8].satd;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_8x32].satd  = primitives.pu[LUMA_8x32].satd;
>  
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8]   = primitives.satd[LUMA_4x8];
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16]  = primitives.satd[LUMA_8x16];
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = primitives.satd[LUMA_16x32];
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = primitives.satd[LUMA_32x64];
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_4x8].satd   = primitives.pu[LUMA_4x8].satd;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_8x16].satd  = primitives.pu[LUMA_8x16].satd;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_16x32].satd = primitives.pu[LUMA_16x32].satd;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_32x64].satd = primitives.pu[LUMA_32x64].satd;
>  
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4]   = primitives.satd[LUMA_4x4];
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8]   = primitives.satd[LUMA_8x8];
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16]  = primitives.satd[LUMA_4x16];
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = primitives.satd[LUMA_16x16];
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32]  = primitives.satd[LUMA_8x32];
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = primitives.satd[LUMA_32x32];
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = primitives.satd[LUMA_16x64];
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_4x4].satd   = primitives.pu[LUMA_4x4].satd;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_8x8].satd   = primitives.pu[LUMA_8x8].satd;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_4x16].satd  = primitives.pu[LUMA_4x16].satd;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_16x16].satd = primitives.pu[LUMA_16x16].satd;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_8x32].satd  = primitives.pu[LUMA_8x32].satd;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_32x32].satd = primitives.pu[LUMA_32x32].satd;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_16x64].satd = primitives.pu[LUMA_16x64].satd;
>  
>      //p.chroma[X265_CSP_I422].satd[CHROMA422_8x12]  = satd4<8, 12>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4]   = primitives.satd[LUMA_8x4];
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_8x4].satd  = primitives.pu[LUMA_8x4].satd;
>      //p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
>      //p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8]  = primitives.satd[LUMA_16x8];
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_16x8].satd = primitives.pu[LUMA_16x8].satd;
>      //p.chroma[X265_CSP_I422].satd[CHROMA422_4x32]  = satd4<4, 32>;
>      //p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
>      //p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
> -    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = primitives.satd[LUMA_32x16];
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_32x16].satd = primitives.pu[LUMA_32x16].satd;
>      //p.chroma[X265_CSP_I422].satd[CHROMA422_8x64]  = satd8<8, 64>;
>  }
>  }
> @@ -158,7 +158,7 @@
>          cpuid = x265::cpu_detect();
>  
>      // initialize global variables
> -    if (!primitives.sad[0])
> +    if (!primitives.pu[0].sad)
>      {
>          Setup_C_Primitives(primitives);
>  
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/primitives.h
> --- a/source/common/primitives.h	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/primitives.h	Thu Jan 08 15:23:38 2015 -0600
> @@ -42,7 +42,7 @@
>      LUMA_4x4,   LUMA_8x8,   LUMA_16x16, LUMA_32x32, LUMA_64x64,
>      // Rectangular
>      LUMA_8x4,   LUMA_4x8,
> -    LUMA_16x8,  LUMA_8x16,  
> +    LUMA_16x8,  LUMA_8x16,
>      LUMA_32x16, LUMA_16x32,
>      LUMA_64x32, LUMA_32x64,
>      // Asymmetrical (0.75, 0.25)
> @@ -206,42 +206,76 @@
>   * a vectorized primitive, or a C function. */
>  struct EncoderPrimitives
>  {
> -    pixelcmp_t            sad[NUM_LUMA_PARTITIONS];        // Sum of Differences for each size
> -    pixelcmp_x3_t         sad_x3[NUM_LUMA_PARTITIONS];     // Sum of Differences 3x for each size
> -    pixelcmp_x4_t         sad_x4[NUM_LUMA_PARTITIONS];     // Sum of Differences 4x for each size
> -    pixelcmp_t            sse_pp[NUM_LUMA_PARTITIONS];     // Sum of Square Error (pixel, pixel) fenc alignment not assumed
> -    pixelcmp_ss_t         sse_ss[NUM_LUMA_PARTITIONS];     // Sum of Square Error (short, short) fenc alignment not assumed
> -    pixelcmp_sp_t         sse_sp[NUM_LUMA_PARTITIONS];     // Sum of Square Error (short, pixel) fenc alignment not assumed
> -    pixel_ssd_s_t         ssd_s[NUM_SQUARE_BLOCKS - 1];    // Sum of Square Error (short) fenc alignment not assumed
> -    pixelcmp_t            satd[NUM_LUMA_PARTITIONS];       // Sum of Transformed differences (HADAMARD)
> -    pixelcmp_t            sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions
> -    pixelcmp_t            sa8d[NUM_SQUARE_BLOCKS];         // sa8d primitives for square intra blocks
> -    pixelcmp_t            psy_cost_pp[NUM_SQUARE_BLOCKS];  // difference in AC energy between two blocks
> -    pixelcmp_ss_t         psy_cost_ss[NUM_SQUARE_BLOCKS];
> +  struct PU
> +  {
> +    pixelcmp_t            sad;        // Sum of Absolute Differences
> +    pixelcmp_x3_t         sad_x3;     // SAD of 3 candidate blocks in one call
> +    pixelcmp_x4_t         sad_x4;     // SAD of 4 candidate blocks in one call
> +    pixelcmp_t            sse_pp;     // Sum of Square Error (pixel, pixel) fenc alignment not assumed
> +    pixelcmp_ss_t         sse_ss;     // Sum of Square Error (short, short) fenc alignment not assumed
> +    pixelcmp_sp_t         sse_sp;     // Sum of Square Error (short, pixel) fenc alignment not assumed
> +    pixelcmp_t            satd;       // Sum of Transformed differences (HADAMARD)
> +    pixelcmp_t            sa8d_inter; // sa8d primitives for motion search partitions
>  
> -    dct_t                 dct[NUM_DCTS];
> -    idct_t                idct[NUM_IDCTS];
> +    pixelavg_pp_t         pixelavg_pp;
> +    addAvg_t              luma_addAvg;
> +
> +    filter_pp_t           luma_hpp;
> +    filter_hps_t          luma_hps;
> +    filter_pp_t           luma_vpp;
> +    filter_ps_t           luma_vps;
> +    filter_sp_t           luma_vsp;
> +    filter_ss_t           luma_vss;
> +    filter_hv_pp_t        luma_hvpp;
> +
> +    copy_pp_t             luma_copy_pp;
> +    copy_sp_t             luma_copy_sp;
> +    copy_ps_t             luma_copy_ps;
> +    copy_ss_t             luma_copy_ss;
> +
> +    pixel_sub_ps_t        luma_sub_ps;
> +    pixel_add_ps_t        luma_add_ps;
> +
> +  } pu[NUM_LUMA_PARTITIONS];
> +
> +  struct CU
> +  {
> +    dct_t                 dct;
> +    idct_t                idct;
> +    calcresidual_t        calcresidual;
> +    blockfill_s_t         blockfill_s;  // block fill with value
> +    cpy2Dto1D_shl_t       cpy2Dto1D_shl;
> +    cpy2Dto1D_shr_t       cpy2Dto1D_shr;
> +    cpy1Dto2D_shl_t       cpy1Dto2D_shl;
> +    cpy1Dto2D_shr_t       cpy1Dto2D_shr;
> +    copy_cnt_t            copy_cnt;
> +
> +    transpose_t           transpose;
> +
> +    var_t                 var;
> +
> +    pixelcmp_t            sa8d;         // sa8d primitives for square intra blocks
> +    pixel_ssd_s_t         ssd_s;        // Sum of Square Error (short) fenc alignment not assumed
> +    pixelcmp_t            psy_cost_pp;  // difference in AC energy between two blocks
> +    pixelcmp_ss_t         psy_cost_ss;
> +
> +  } cu[NUM_SQUARE_BLOCKS];
> +
> +    dct_t                 dst4x4;
> +    idct_t                idst4x4;
> +
>      quant_t               quant;
>      nquant_t              nquant;
>      dequant_scaling_t     dequant_scaling;
>      dequant_normal_t      dequant_normal;
>      count_nonzero_t       count_nonzero;
>      denoiseDct_t          denoiseDct;
> -    calcresidual_t        calcresidual[NUM_SQUARE_BLOCKS];
> -    blockfill_s_t         blockfill_s[NUM_SQUARE_BLOCKS];  // block fill with value
> -    cpy2Dto1D_shl_t       cpy2Dto1D_shl[NUM_SQUARE_BLOCKS - 1];
> -    cpy2Dto1D_shr_t       cpy2Dto1D_shr[NUM_SQUARE_BLOCKS - 1];
> -    cpy1Dto2D_shl_t       cpy1Dto2D_shl[NUM_SQUARE_BLOCKS - 1];
> -    cpy1Dto2D_shr_t       cpy1Dto2D_shr[NUM_SQUARE_BLOCKS - 1];
> -    copy_cnt_t            copy_cnt[NUM_SQUARE_BLOCKS - 1];
>  
>      intra_pred_t          intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE];
>      intra_allangs_t       intra_pred_allangs[NUM_TR_SIZE];
> -    transpose_t           transpose[NUM_SQUARE_BLOCKS];
>      scale_t               scale1D_128to64;
>      scale_t               scale2D_64to32;
>  
> -    var_t                 var[NUM_SQUARE_BLOCKS];
>      ssim_4x4x2_core_t     ssim_4x4x2_core;
>      ssim_end4_t           ssim_end_4;
>  
> @@ -261,42 +295,36 @@
>  
>      weightp_sp_t          weight_sp;
>      weightp_pp_t          weight_pp;
> -    pixelavg_pp_t         pixelavg_pp[NUM_LUMA_PARTITIONS];
> -    addAvg_t              luma_addAvg[NUM_LUMA_PARTITIONS];
>  
> -    filter_pp_t           luma_hpp[NUM_LUMA_PARTITIONS];
> -    filter_hps_t          luma_hps[NUM_LUMA_PARTITIONS];
> -    filter_pp_t           luma_vpp[NUM_LUMA_PARTITIONS];
> -    filter_ps_t           luma_vps[NUM_LUMA_PARTITIONS];
> -    filter_sp_t           luma_vsp[NUM_LUMA_PARTITIONS];
> -    filter_ss_t           luma_vss[NUM_LUMA_PARTITIONS];
> -    filter_hv_pp_t        luma_hvpp[NUM_LUMA_PARTITIONS];
>      filter_p2s_t          luma_p2s;
>  
> -    copy_pp_t             luma_copy_pp[NUM_LUMA_PARTITIONS];
> -    copy_sp_t             luma_copy_sp[NUM_LUMA_PARTITIONS];
> -    copy_ps_t             luma_copy_ps[NUM_LUMA_PARTITIONS];
> -    copy_ss_t             luma_copy_ss[NUM_LUMA_PARTITIONS];
> -    pixel_sub_ps_t        luma_sub_ps[NUM_SQUARE_BLOCKS];
> -    pixel_add_ps_t        luma_add_ps[NUM_SQUARE_BLOCKS];
> +    struct Chroma
> +    {
> +      struct PUChroma
> +      {
> +        // ME and MC
> +        pixelcmp_t      satd;
> +        filter_pp_t     filter_vpp;
> +        filter_ps_t     filter_vps;
> +        filter_sp_t     filter_vsp;
> +        filter_ss_t     filter_vss;
> +        filter_pp_t     filter_hpp;
> +        filter_hps_t    filter_hps;
> +        addAvg_t        addAvg;
> +        copy_pp_t       copy_pp;
> +        copy_sp_t       copy_sp;
> +        copy_ps_t       copy_ps;
> +        copy_ss_t       copy_ss;
> +      } pu[NUM_LUMA_PARTITIONS];
>  
> -    struct
> -    {
> -        pixelcmp_t      satd[NUM_LUMA_PARTITIONS];
> -        filter_pp_t     filter_vpp[NUM_LUMA_PARTITIONS];
> -        filter_ps_t     filter_vps[NUM_LUMA_PARTITIONS];
> -        filter_sp_t     filter_vsp[NUM_LUMA_PARTITIONS];
> -        filter_ss_t     filter_vss[NUM_LUMA_PARTITIONS];
> -        filter_pp_t     filter_hpp[NUM_LUMA_PARTITIONS];
> -        filter_hps_t    filter_hps[NUM_LUMA_PARTITIONS];
> -        addAvg_t        addAvg[NUM_LUMA_PARTITIONS];
> -        copy_pp_t       copy_pp[NUM_LUMA_PARTITIONS];
> -        copy_sp_t       copy_sp[NUM_LUMA_PARTITIONS];
> -        copy_ps_t       copy_ps[NUM_LUMA_PARTITIONS];
> -        copy_ss_t       copy_ss[NUM_LUMA_PARTITIONS];
> -        pixel_sub_ps_t  sub_ps[NUM_SQUARE_BLOCKS];
> -        pixel_add_ps_t  add_ps[NUM_SQUARE_BLOCKS];
> -        filter_p2s_t    p2s;
> +      struct CUChroma
> +      {
> +        pixelcmp_t sa8d;
> +        pixel_sub_ps_t  sub_ps;
> +        pixel_add_ps_t  add_ps;
> +      } cu[NUM_SQUARE_BLOCKS];
> +
> +      filter_p2s_t    p2s;
>      } chroma[X265_CSP_COUNT];
>  };
>  
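
for anyone reading along, this primitives.h hunk is the heart of the
series: the parallel flat arrays (one array per primitive, indexed by
partition) become arrays of structs (one struct per partition, holding
all of its primitives). call sites change like this (sketch, using the
pixelcmp_t signature from this header; variable names are illustrative):

    /* before: primitive first, partition second */
    int cost = primitives.satd[LUMA_16x16](fenc, fencStride, pred, predStride);

    /* after: partition first, primitive second */
    int cost = primitives.pu[LUMA_16x16].satd(fenc, fencStride, pred, predStride);

a useful side effect is that size-generic code can hoist the lookup once
and then touch a single contiguous struct:

    const EncoderPrimitives::PU& pu = primitives.pu[partEnum];
    int sad  = pu.sad(fenc, fencStride, ref, refStride);
    int satd = pu.satd(fenc, fencStride, ref, refStride);
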
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/quant.cpp
> --- a/source/common/quant.cpp	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/quant.cpp	Thu Jan 08 15:23:38 2015 -0600
> @@ -329,7 +329,7 @@
>      if (cu.m_tqBypass[absPartIdx])
>      {
>          X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
> -        return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride);
> +        return primitives.cu[sizeIdx].copy_cnt(coeff, residual, resiStride);
>      }
>  
>      bool isLuma  = ttype == TEXT_LUMA;
> @@ -341,21 +341,21 @@
>      {
>  #if X265_DEPTH <= 10
>          X265_CHECK(transformShift >= 0, "invalid transformShift\n");
> -        primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
> +        primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift);
>  #else
>          if (transformShift >= 0)
> -            primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
> +            primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift);
>          else
> -            primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift);
> +            primitives.cu[sizeIdx].cpy2Dto1D_shr(m_resiDctCoeff, residual, resiStride, -transformShift);
>  #endif
>      }
>      else
>      {
>          bool isIntra = cu.isIntra(absPartIdx);
>          int useDST = !sizeIdx && isLuma && isIntra;
> -        int index = DCT_4x4 + sizeIdx - useDST;
> +        int index = BLOCK_4x4 + sizeIdx - useDST;
>  
> -        primitives.dct[index](residual, m_resiDctCoeff, resiStride);
> +        primitives.cu[index].dct(residual, m_resiDctCoeff, resiStride);
>  
>          /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
>           * there is no risk of performing this DCT unnecessarily */
> @@ -363,8 +363,8 @@
>          {
>              int trSize = 1 << log2TrSize;
>              /* perform DCT on source pixels for psy-rdoq */
> -            primitives.luma_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
> -            primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
> +            primitives.pu[sizeIdx].luma_copy_ps(m_fencShortBuf, trSize, fenc, fencStride);
> +            primitives.cu[index].dct(m_fencShortBuf, m_fencDctCoeff, trSize);
>          }
>  
>          if (m_nr)
> @@ -411,7 +411,7 @@
>      const uint32_t sizeIdx = log2TrSize - 2;
>      if (transQuantBypass)
>      {
> -        primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0);
> +        primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, coeff, resiStride, 0);
>          return;
>      }
>  
> @@ -438,12 +438,12 @@
>      {
>  #if X265_DEPTH <= 10
>          X265_CHECK(transformShift > 0, "invalid transformShift\n");
> -        primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
> +        primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
>  #else
>          if (transformShift > 0)
> -            primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
> +            primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
>          else
> -            primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift);
> +            primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, m_resiDctCoeff, resiStride, -transformShift);
>  #endif
>      }
>      else
> @@ -461,11 +461,11 @@
>              const int add_2nd = 1 << (shift_2nd - 1);
>  
>              int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
> -            primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val);
> +            primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
>              return;
>          }
>  
> -        primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride);
> +        primitives.cu[BLOCK_4x4 + sizeIdx - useDST].idct(m_resiDctCoeff, residual, resiStride);
>      }
>  }
>  
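
one thing to double-check in this quant.cpp hunk: in the old enum
DCT_4x4 was 1 and slot 0 was DST_4x4, so 'DCT_4x4 + sizeIdx - useDST'
could never go negative. BLOCK_4x4 is 0, so for a 4x4 intra luma TU
(useDST == 1) the new 'BLOCK_4x4 + sizeIdx - useDST' evaluates to -1,
and primitives.cu[-1].dct reads before the start of the array; the
inverse path has the same problem with idct. since dst4x4/idst4x4 are
now standalone members, an explicit dispatch looks safer (untested
sketch, same arguments as the existing calls; the psy-rdoq dct call
site would need the same treatment):

    if (useDST)
        primitives.dst4x4(residual, m_resiDctCoeff, resiStride);
    else
        primitives.cu[sizeIdx].dct(residual, m_resiDctCoeff, resiStride);
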
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/shortyuv.cpp
> --- a/source/common/shortyuv.cpp	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/shortyuv.cpp	Thu Jan 08 15:23:38 2015 -0600
> @@ -74,9 +74,9 @@
>  void ShortYuv::subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size)
>  {
>      const int sizeIdx = log2Size - 2;
> -    primitives.luma_sub_ps[sizeIdx](m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
> -    primitives.chroma[m_csp].sub_ps[sizeIdx](m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
> -    primitives.chroma[m_csp].sub_ps[sizeIdx](m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
> +    primitives.pu[sizeIdx].luma_sub_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
> +    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
> +    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
>  }
>  
>  void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const
> @@ -84,7 +84,7 @@
>      const int16_t* src = getLumaAddr(absPartIdx);
>      int16_t* dst = dstYuv.getLumaAddr(absPartIdx);
>  
> -    primitives.luma_copy_ss[log2Size - 2](dst, dstYuv.m_size, src, m_size);
> +    primitives.pu[log2Size - 2].luma_copy_ss(dst, dstYuv.m_size, src, m_size);
>  }
>  
>  void ShortYuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const
> @@ -92,7 +92,7 @@
>      const int16_t* src = getLumaAddr(absPartIdx);
>      pixel* dst = dstYuv.getLumaAddr(absPartIdx);
>  
> -    primitives.luma_copy_sp[log2Size - 2](dst, dstYuv.m_size, src, m_size);
> +    primitives.pu[log2Size - 2].luma_copy_sp(dst, dstYuv.m_size, src, m_size);
>  }
>  
>  void ShortYuv::copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
> @@ -103,8 +103,8 @@
>      int16_t* dstU = dstYuv.getCbAddr(absPartIdx);
>      int16_t* dstV = dstYuv.getCrAddr(absPartIdx);
>  
> -    primitives.chroma[m_csp].copy_ss[part](dstU, dstYuv.m_csize, srcU, m_csize);
> -    primitives.chroma[m_csp].copy_ss[part](dstV, dstYuv.m_csize, srcV, m_csize);
> +    primitives.chroma[m_csp].pu[part].copy_ss(dstU, dstYuv.m_csize, srcU, m_csize);
> +    primitives.chroma[m_csp].pu[part].copy_ss(dstV, dstYuv.m_csize, srcV, m_csize);
>  }
>  
>  void ShortYuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
> @@ -115,6 +115,6 @@
>      pixel* dstU = dstYuv.getCbAddr(absPartIdx);
>      pixel* dstV = dstYuv.getCrAddr(absPartIdx);
>  
> -    primitives.chroma[m_csp].copy_sp[part](dstU, dstYuv.m_csize, srcU, m_csize);
> -    primitives.chroma[m_csp].copy_sp[part](dstV, dstYuv.m_csize, srcV, m_csize);
> +    primitives.chroma[m_csp].pu[part].copy_sp(dstU, dstYuv.m_csize, srcU, m_csize);
> +    primitives.chroma[m_csp].pu[part].copy_sp(dstV, dstYuv.m_csize, srcV, m_csize);
>  }
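
two small observations on shortyuv.cpp. first, pu[] is being indexed
with 'log2Size - 2', a square-block index; that only works because the
square partitions happen to occupy slots 0..4 of the LUMA_* enum (see
the primitives.h hunk above). worth pinning that invariant down so a
future enum reorder cannot silently break it -- a sketch, using
static_assert where available (or an X265_CHECK at init time otherwise):

    /* pu[log2Size - 2] style indexing requires the square partitions
     * to stay at the front of the LUMA_* enum */
    static_assert(LUMA_4x4 == 0 && LUMA_8x8 == 1 && LUMA_16x16 == 2 &&
                  LUMA_32x32 == 3 && LUMA_64x64 == 4,
                  "square LUMA partitions must lead the enum");

second, luma_sub_ps/luma_add_ps are only ever used on square blocks, so
they arguably belong in cu[] rather than pu[] -- the chroma side of this
patch already keeps sub_ps/add_ps in its cu[] struct.
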
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/vec/dct-sse3.cpp
> --- a/source/common/vec/dct-sse3.cpp	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/vec/dct-sse3.cpp	Thu Jan 08 15:23:38 2015 -0600
> @@ -1402,9 +1402,9 @@
>       * still somewhat rare on end-user PCs we still compile and link these SSE3
>       * intrinsic SIMD functions */
>  #if !HIGH_BIT_DEPTH
> -    p.idct[IDCT_8x8] = idct8;
> -    p.idct[IDCT_16x16] = idct16;
> -    p.idct[IDCT_32x32] = idct32;
> +    p.cu[BLOCK_8x8].idct   = idct8;
> +    p.cu[BLOCK_16x16].idct = idct16;
> +    p.cu[BLOCK_32x32].idct = idct32;
>  #endif
>  }
>  }
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/vec/dct-ssse3.cpp
> --- a/source/common/vec/dct-ssse3.cpp	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/vec/dct-ssse3.cpp	Thu Jan 08 15:23:38 2015 -0600
> @@ -1111,8 +1111,8 @@
>       * still somewhat rare on end-user PCs we still compile and link these SSSE3
>       * intrinsic SIMD functions */
>  #if !HIGH_BIT_DEPTH
> -    p.dct[DCT_16x16] = dct16;
> -    p.dct[DCT_32x32] = dct32;
> +    p.cu[BLOCK_16x16].dct = dct16;
> +    p.cu[BLOCK_32x32].dct = dct32;
>  #endif
>  }
>  }
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/x86/asm-primitives.cpp	Thu Jan 08 15:23:38 2015 -0600
> @@ -46,29 +46,29 @@
>      const int filterSize = NTAPS_LUMA;
>      const int halfFilterSize = filterSize >> 1;
>  
> -    x265::primitives.luma_hps[size](src, srcStride, immed, MAX_CU_SIZE, idxX, 1);
> -    x265::primitives.luma_vsp[size](immed + (halfFilterSize - 1) * MAX_CU_SIZE, MAX_CU_SIZE, dst, dstStride, idxY);
> +    x265::primitives.pu[size].luma_hps(src, srcStride, immed, MAX_CU_SIZE, idxX, 1);
> +    x265::primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * MAX_CU_SIZE, MAX_CU_SIZE, dst, dstStride, idxY);
>  }
>  
>  #define INIT2_NAME(name1, name2, cpu) \
> -    p.name1[LUMA_16x16] = x265_pixel_ ## name2 ## _16x16 ## cpu; \
> -    p.name1[LUMA_16x8]  = x265_pixel_ ## name2 ## _16x8 ## cpu;
> +    p.pu[LUMA_16x16].name1 = x265_pixel_ ## name2 ## _16x16 ## cpu; \
> +    p.pu[LUMA_16x8].name1  = x265_pixel_ ## name2 ## _16x8 ## cpu;
>  #define INIT4_NAME(name1, name2, cpu) \
>      INIT2_NAME(name1, name2, cpu) \
> -    p.name1[LUMA_8x16]  = x265_pixel_ ## name2 ## _8x16 ## cpu; \
> -    p.name1[LUMA_8x8]   = x265_pixel_ ## name2 ## _8x8 ## cpu;
> +    p.pu[LUMA_8x16].name1  = x265_pixel_ ## name2 ## _8x16 ## cpu; \
> +    p.pu[LUMA_8x8].name1   = x265_pixel_ ## name2 ## _8x8 ## cpu;
>  #define INIT5_NAME(name1, name2, cpu) \
>      INIT4_NAME(name1, name2, cpu) \
> -    p.name1[LUMA_8x4]   = x265_pixel_ ## name2 ## _8x4 ## cpu;
> +    p.pu[LUMA_8x4].name1   = x265_pixel_ ## name2 ## _8x4 ## cpu;
>  #define INIT6_NAME(name1, name2, cpu) \
>      INIT5_NAME(name1, name2, cpu) \
> -    p.name1[LUMA_4x8]   = x265_pixel_ ## name2 ## _4x8 ## cpu;
> +    p.pu[LUMA_4x8].name1   = x265_pixel_ ## name2 ## _4x8 ## cpu;
>  #define INIT7_NAME(name1, name2, cpu) \
>      INIT6_NAME(name1, name2, cpu) \
> -    p.name1[LUMA_4x4]   = x265_pixel_ ## name2 ## _4x4 ## cpu;
> +    p.pu[LUMA_4x4].name1   = x265_pixel_ ## name2 ## _4x4 ## cpu;
>  #define INIT8_NAME(name1, name2, cpu) \
>      INIT7_NAME(name1, name2, cpu) \
> -    p.name1[LUMA_4x16]  = x265_pixel_ ## name2 ## _4x16 ## cpu;
> +    p.pu[LUMA_4x16].name1  = x265_pixel_ ## name2 ## _4x16 ## cpu;
>  #define INIT2(name, cpu) INIT2_NAME(name, name, cpu)
>  #define INIT4(name, cpu) INIT4_NAME(name, name, cpu)
>  #define INIT5(name, cpu) INIT5_NAME(name, name, cpu)
> @@ -77,220 +77,220 @@
>  #define INIT8(name, cpu) INIT8_NAME(name, name, cpu)
>  
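
the INITn helpers read much better with the partition as the array index
and the primitive as the member. for example the INIT6(satd, _sse2)
invocation later in this file now expands to (first two entries shown):

    p.pu[LUMA_16x16].satd = x265_pixel_satd_16x16_sse2;
    p.pu[LUMA_16x8].satd  = x265_pixel_satd_16x8_sse2;
    /* ... and so on down through LUMA_4x8 */
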
>  #define HEVC_SATD(cpu) \
> -    p.satd[LUMA_4x8]   = x265_pixel_satd_4x8_ ## cpu; \
> -    p.satd[LUMA_4x16]   = x265_pixel_satd_4x16_ ## cpu; \
> -    p.satd[LUMA_8x4]   = x265_pixel_satd_8x4_ ## cpu; \
> -    p.satd[LUMA_8x8]   = x265_pixel_satd_8x8_ ## cpu; \
> -    p.satd[LUMA_8x16]   = x265_pixel_satd_8x16_ ## cpu; \
> -    p.satd[LUMA_8x32]   = x265_pixel_satd_8x32_ ## cpu; \
> -    p.satd[LUMA_12x16]   = x265_pixel_satd_12x16_ ## cpu; \
> -    p.satd[LUMA_16x4]   = x265_pixel_satd_16x4_ ## cpu; \
> -    p.satd[LUMA_16x8]   = x265_pixel_satd_16x8_ ## cpu; \
> -    p.satd[LUMA_16x12]   = x265_pixel_satd_16x12_ ## cpu; \
> -    p.satd[LUMA_16x16]   = x265_pixel_satd_16x16_ ## cpu; \
> -    p.satd[LUMA_16x32]   = x265_pixel_satd_16x32_ ## cpu; \
> -    p.satd[LUMA_16x64]   = x265_pixel_satd_16x64_ ## cpu; \
> -    p.satd[LUMA_24x32]   = x265_pixel_satd_24x32_ ## cpu; \
> -    p.satd[LUMA_32x8]   = x265_pixel_satd_32x8_ ## cpu; \
> -    p.satd[LUMA_32x16]   = x265_pixel_satd_32x16_ ## cpu; \
> -    p.satd[LUMA_32x24]   = x265_pixel_satd_32x24_ ## cpu; \
> -    p.satd[LUMA_32x32]   = x265_pixel_satd_32x32_ ## cpu; \
> -    p.satd[LUMA_32x64]   = x265_pixel_satd_32x64_ ## cpu; \
> -    p.satd[LUMA_48x64]   = x265_pixel_satd_48x64_ ## cpu; \
> -    p.satd[LUMA_64x16]   = x265_pixel_satd_64x16_ ## cpu; \
> -    p.satd[LUMA_64x32]   = x265_pixel_satd_64x32_ ## cpu; \
> -    p.satd[LUMA_64x48]   = x265_pixel_satd_64x48_ ## cpu; \
> -    p.satd[LUMA_64x64]   = x265_pixel_satd_64x64_ ## cpu;
> +    p.pu[LUMA_4x8].satd   = x265_pixel_satd_4x8_ ## cpu; \
> +    p.pu[LUMA_4x16].satd  = x265_pixel_satd_4x16_ ## cpu; \
> +    p.pu[LUMA_8x4].satd   = x265_pixel_satd_8x4_ ## cpu; \
> +    p.pu[LUMA_8x8].satd   = x265_pixel_satd_8x8_ ## cpu; \
> +    p.pu[LUMA_8x16].satd  = x265_pixel_satd_8x16_ ## cpu; \
> +    p.pu[LUMA_8x32].satd  = x265_pixel_satd_8x32_ ## cpu; \
> +    p.pu[LUMA_12x16].satd = x265_pixel_satd_12x16_ ## cpu; \
> +    p.pu[LUMA_16x4].satd  = x265_pixel_satd_16x4_ ## cpu; \
> +    p.pu[LUMA_16x8].satd  = x265_pixel_satd_16x8_ ## cpu; \
> +    p.pu[LUMA_16x12].satd = x265_pixel_satd_16x12_ ## cpu; \
> +    p.pu[LUMA_16x16].satd = x265_pixel_satd_16x16_ ## cpu; \
> +    p.pu[LUMA_16x32].satd = x265_pixel_satd_16x32_ ## cpu; \
> +    p.pu[LUMA_16x64].satd = x265_pixel_satd_16x64_ ## cpu; \
> +    p.pu[LUMA_24x32].satd = x265_pixel_satd_24x32_ ## cpu; \
> +    p.pu[LUMA_32x8].satd  = x265_pixel_satd_32x8_ ## cpu; \
> +    p.pu[LUMA_32x16].satd = x265_pixel_satd_32x16_ ## cpu; \
> +    p.pu[LUMA_32x24].satd = x265_pixel_satd_32x24_ ## cpu; \
> +    p.pu[LUMA_32x32].satd = x265_pixel_satd_32x32_ ## cpu; \
> +    p.pu[LUMA_32x64].satd = x265_pixel_satd_32x64_ ## cpu; \
> +    p.pu[LUMA_48x64].satd = x265_pixel_satd_48x64_ ## cpu; \
> +    p.pu[LUMA_64x16].satd = x265_pixel_satd_64x16_ ## cpu; \
> +    p.pu[LUMA_64x32].satd = x265_pixel_satd_64x32_ ## cpu; \
> +    p.pu[LUMA_64x48].satd = x265_pixel_satd_64x48_ ## cpu; \
> +    p.pu[LUMA_64x64].satd = x265_pixel_satd_64x64_ ## cpu;
>  
>  #define SAD_X3(cpu) \
> -    p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \
> -    p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ ## cpu; \
> -    p.sad_x3[LUMA_16x16] = x265_pixel_sad_x3_16x16_ ## cpu; \
> -    p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ ## cpu; \
> -    p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ ## cpu; \
> -    p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ ## cpu; \
> -    p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ ## cpu; \
> -    p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ ## cpu; \
> -    p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ ## cpu; \
> -    p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ ## cpu; \
> -    p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ ## cpu; \
> -    p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_ ## cpu; \
> -    p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_ ## cpu; \
> -    p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_ ## cpu; \
> -    p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ ## cpu; \
> -    p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ ## cpu
> +    p.pu[LUMA_16x8].sad_x3  = x265_pixel_sad_x3_16x8_ ## cpu; \
> +    p.pu[LUMA_16x12].sad_x3 = x265_pixel_sad_x3_16x12_ ## cpu; \
> +    p.pu[LUMA_16x16].sad_x3 = x265_pixel_sad_x3_16x16_ ## cpu; \
> +    p.pu[LUMA_16x32].sad_x3 = x265_pixel_sad_x3_16x32_ ## cpu; \
> +    p.pu[LUMA_16x64].sad_x3 = x265_pixel_sad_x3_16x64_ ## cpu; \
> +    p.pu[LUMA_32x8].sad_x3  = x265_pixel_sad_x3_32x8_ ## cpu; \
> +    p.pu[LUMA_32x16].sad_x3 = x265_pixel_sad_x3_32x16_ ## cpu; \
> +    p.pu[LUMA_32x24].sad_x3 = x265_pixel_sad_x3_32x24_ ## cpu; \
> +    p.pu[LUMA_32x32].sad_x3 = x265_pixel_sad_x3_32x32_ ## cpu; \
> +    p.pu[LUMA_32x64].sad_x3 = x265_pixel_sad_x3_32x64_ ## cpu; \
> +    p.pu[LUMA_24x32].sad_x3 = x265_pixel_sad_x3_24x32_ ## cpu; \
> +    p.pu[LUMA_48x64].sad_x3 = x265_pixel_sad_x3_48x64_ ## cpu; \
> +    p.pu[LUMA_64x16].sad_x3 = x265_pixel_sad_x3_64x16_ ## cpu; \
> +    p.pu[LUMA_64x32].sad_x3 = x265_pixel_sad_x3_64x32_ ## cpu; \
> +    p.pu[LUMA_64x48].sad_x3 = x265_pixel_sad_x3_64x48_ ## cpu; \
> +    p.pu[LUMA_64x64].sad_x3 = x265_pixel_sad_x3_64x64_ ## cpu
>  
>  #define SAD_X4(cpu) \
> -    p.sad_x4[LUMA_16x8] = x265_pixel_sad_x4_16x8_ ## cpu; \
> -    p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ ## cpu; \
> -    p.sad_x4[LUMA_16x16] = x265_pixel_sad_x4_16x16_ ## cpu; \
> -    p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ ## cpu; \
> -    p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ ## cpu; \
> -    p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ ## cpu; \
> -    p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ ## cpu; \
> -    p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ ## cpu; \
> -    p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ ## cpu; \
> -    p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ ## cpu; \
> -    p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ ## cpu; \
> -    p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ ## cpu; \
> -    p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ ## cpu; \
> -    p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ ## cpu; \
> -    p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ ## cpu; \
> -    p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_ ## cpu
> +    p.pu[LUMA_16x8].sad_x4  = x265_pixel_sad_x4_16x8_ ## cpu; \
> +    p.pu[LUMA_16x12].sad_x4 = x265_pixel_sad_x4_16x12_ ## cpu; \
> +    p.pu[LUMA_16x16].sad_x4 = x265_pixel_sad_x4_16x16_ ## cpu; \
> +    p.pu[LUMA_16x32].sad_x4 = x265_pixel_sad_x4_16x32_ ## cpu; \
> +    p.pu[LUMA_16x64].sad_x4 = x265_pixel_sad_x4_16x64_ ## cpu; \
> +    p.pu[LUMA_32x8].sad_x4  = x265_pixel_sad_x4_32x8_ ## cpu; \
> +    p.pu[LUMA_32x16].sad_x4 = x265_pixel_sad_x4_32x16_ ## cpu; \
> +    p.pu[LUMA_32x24].sad_x4 = x265_pixel_sad_x4_32x24_ ## cpu; \
> +    p.pu[LUMA_32x32].sad_x4 = x265_pixel_sad_x4_32x32_ ## cpu; \
> +    p.pu[LUMA_32x64].sad_x4 = x265_pixel_sad_x4_32x64_ ## cpu; \
> +    p.pu[LUMA_24x32].sad_x4 = x265_pixel_sad_x4_24x32_ ## cpu; \
> +    p.pu[LUMA_48x64].sad_x4 = x265_pixel_sad_x4_48x64_ ## cpu; \
> +    p.pu[LUMA_64x16].sad_x4 = x265_pixel_sad_x4_64x16_ ## cpu; \
> +    p.pu[LUMA_64x32].sad_x4 = x265_pixel_sad_x4_64x32_ ## cpu; \
> +    p.pu[LUMA_64x48].sad_x4 = x265_pixel_sad_x4_64x48_ ## cpu; \
> +    p.pu[LUMA_64x64].sad_x4 = x265_pixel_sad_x4_64x64_ ## cpu
>  
>  #define SAD(cpu) \
> -    p.sad[LUMA_8x32]  = x265_pixel_sad_8x32_ ## cpu; \
> -    p.sad[LUMA_16x4]  = x265_pixel_sad_16x4_ ## cpu; \
> -    p.sad[LUMA_16x12] = x265_pixel_sad_16x12_ ## cpu; \
> -    p.sad[LUMA_16x32] = x265_pixel_sad_16x32_ ## cpu; \
> -    p.sad[LUMA_16x64] = x265_pixel_sad_16x64_ ## cpu; \
> -    p.sad[LUMA_32x8]  = x265_pixel_sad_32x8_ ## cpu; \
> -    p.sad[LUMA_32x16] = x265_pixel_sad_32x16_ ## cpu; \
> -    p.sad[LUMA_32x24] = x265_pixel_sad_32x24_ ## cpu; \
> -    p.sad[LUMA_32x32] = x265_pixel_sad_32x32_ ## cpu; \
> -    p.sad[LUMA_32x64] = x265_pixel_sad_32x64_ ## cpu; \
> -    p.sad[LUMA_64x16] = x265_pixel_sad_64x16_ ## cpu; \
> -    p.sad[LUMA_64x32] = x265_pixel_sad_64x32_ ## cpu; \
> -    p.sad[LUMA_64x48] = x265_pixel_sad_64x48_ ## cpu; \
> -    p.sad[LUMA_64x64] = x265_pixel_sad_64x64_ ## cpu; \
> -    p.sad[LUMA_48x64] = x265_pixel_sad_48x64_ ## cpu; \
> -    p.sad[LUMA_24x32] = x265_pixel_sad_24x32_ ## cpu; \
> -    p.sad[LUMA_12x16] = x265_pixel_sad_12x16_ ## cpu
> +    p.pu[LUMA_8x32].sad  = x265_pixel_sad_8x32_ ## cpu; \
> +    p.pu[LUMA_16x4].sad  = x265_pixel_sad_16x4_ ## cpu; \
> +    p.pu[LUMA_16x12].sad = x265_pixel_sad_16x12_ ## cpu; \
> +    p.pu[LUMA_16x32].sad = x265_pixel_sad_16x32_ ## cpu; \
> +    p.pu[LUMA_16x64].sad = x265_pixel_sad_16x64_ ## cpu; \
> +    p.pu[LUMA_32x8].sad  = x265_pixel_sad_32x8_ ## cpu; \
> +    p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_ ## cpu; \
> +    p.pu[LUMA_32x24].sad = x265_pixel_sad_32x24_ ## cpu; \
> +    p.pu[LUMA_32x32].sad = x265_pixel_sad_32x32_ ## cpu; \
> +    p.pu[LUMA_32x64].sad = x265_pixel_sad_32x64_ ## cpu; \
> +    p.pu[LUMA_64x16].sad = x265_pixel_sad_64x16_ ## cpu; \
> +    p.pu[LUMA_64x32].sad = x265_pixel_sad_64x32_ ## cpu; \
> +    p.pu[LUMA_64x48].sad = x265_pixel_sad_64x48_ ## cpu; \
> +    p.pu[LUMA_64x64].sad = x265_pixel_sad_64x64_ ## cpu; \
> +    p.pu[LUMA_48x64].sad = x265_pixel_sad_48x64_ ## cpu; \
> +    p.pu[LUMA_24x32].sad = x265_pixel_sad_24x32_ ## cpu; \
> +    p.pu[LUMA_12x16].sad = x265_pixel_sad_12x16_ ## cpu
>  
>  #define ASSGN_SSE(cpu) \
> -    p.sse_pp[LUMA_8x8]   = x265_pixel_ssd_8x8_ ## cpu; \
> -    p.sse_pp[LUMA_8x4]   = x265_pixel_ssd_8x4_ ## cpu; \
> -    p.sse_pp[LUMA_16x16] = x265_pixel_ssd_16x16_ ## cpu; \
> -    p.sse_pp[LUMA_16x4]  = x265_pixel_ssd_16x4_ ## cpu; \
> -    p.sse_pp[LUMA_16x8]  = x265_pixel_ssd_16x8_ ## cpu; \
> -    p.sse_pp[LUMA_8x16]  = x265_pixel_ssd_8x16_ ## cpu; \
> -    p.sse_pp[LUMA_16x12] = x265_pixel_ssd_16x12_ ## cpu; \
> -    p.sse_pp[LUMA_32x32] = x265_pixel_ssd_32x32_ ## cpu; \
> -    p.sse_pp[LUMA_32x16] = x265_pixel_ssd_32x16_ ## cpu; \
> -    p.sse_pp[LUMA_16x32] = x265_pixel_ssd_16x32_ ## cpu; \
> -    p.sse_pp[LUMA_8x32]  = x265_pixel_ssd_8x32_ ## cpu; \
> -    p.sse_pp[LUMA_32x8]  = x265_pixel_ssd_32x8_ ## cpu; \
> -    p.sse_pp[LUMA_32x24] = x265_pixel_ssd_32x24_ ## cpu; \
> -    p.sse_pp[LUMA_32x64] = x265_pixel_ssd_32x64_ ## cpu; \
> -    p.sse_pp[LUMA_16x64] = x265_pixel_ssd_16x64_ ## cpu
> +    p.pu[LUMA_8x8].sse_pp   = x265_pixel_ssd_8x8_ ## cpu; \
> +    p.pu[LUMA_8x4].sse_pp   = x265_pixel_ssd_8x4_ ## cpu; \
> +    p.pu[LUMA_16x16].sse_pp = x265_pixel_ssd_16x16_ ## cpu; \
> +    p.pu[LUMA_16x4].sse_pp  = x265_pixel_ssd_16x4_ ## cpu; \
> +    p.pu[LUMA_16x8].sse_pp  = x265_pixel_ssd_16x8_ ## cpu; \
> +    p.pu[LUMA_8x16].sse_pp  = x265_pixel_ssd_8x16_ ## cpu; \
> +    p.pu[LUMA_16x12].sse_pp = x265_pixel_ssd_16x12_ ## cpu; \
> +    p.pu[LUMA_32x32].sse_pp = x265_pixel_ssd_32x32_ ## cpu; \
> +    p.pu[LUMA_32x16].sse_pp = x265_pixel_ssd_32x16_ ## cpu; \
> +    p.pu[LUMA_16x32].sse_pp = x265_pixel_ssd_16x32_ ## cpu; \
> +    p.pu[LUMA_8x32].sse_pp  = x265_pixel_ssd_8x32_ ## cpu; \
> +    p.pu[LUMA_32x8].sse_pp  = x265_pixel_ssd_32x8_ ## cpu; \
> +    p.pu[LUMA_32x24].sse_pp = x265_pixel_ssd_32x24_ ## cpu; \
> +    p.pu[LUMA_32x64].sse_pp = x265_pixel_ssd_32x64_ ## cpu; \
> +    p.pu[LUMA_16x64].sse_pp = x265_pixel_ssd_16x64_ ## cpu
>  
>  #define ASSGN_SSE_SS(cpu) \
> -    p.sse_ss[LUMA_4x4]   = x265_pixel_ssd_ss_4x4_ ## cpu; \
> -    p.sse_ss[LUMA_4x8]   = x265_pixel_ssd_ss_4x8_ ## cpu; \
> -    p.sse_ss[LUMA_4x16]   = x265_pixel_ssd_ss_4x16_ ## cpu; \
> -    p.sse_ss[LUMA_8x4]   = x265_pixel_ssd_ss_8x4_ ## cpu; \
> -    p.sse_ss[LUMA_8x8]   = x265_pixel_ssd_ss_8x8_ ## cpu; \
> -    p.sse_ss[LUMA_8x16]   = x265_pixel_ssd_ss_8x16_ ## cpu; \
> -    p.sse_ss[LUMA_8x32]   = x265_pixel_ssd_ss_8x32_ ## cpu; \
> -    p.sse_ss[LUMA_12x16]   = x265_pixel_ssd_ss_12x16_ ## cpu; \
> -    p.sse_ss[LUMA_16x4]   = x265_pixel_ssd_ss_16x4_ ## cpu; \
> -    p.sse_ss[LUMA_16x8]   = x265_pixel_ssd_ss_16x8_ ## cpu; \
> -    p.sse_ss[LUMA_16x12]   = x265_pixel_ssd_ss_16x12_ ## cpu; \
> -    p.sse_ss[LUMA_16x16]   = x265_pixel_ssd_ss_16x16_ ## cpu; \
> -    p.sse_ss[LUMA_16x32]   = x265_pixel_ssd_ss_16x32_ ## cpu; \
> -    p.sse_ss[LUMA_16x64]   = x265_pixel_ssd_ss_16x64_ ## cpu; \
> -    p.sse_ss[LUMA_24x32]   = x265_pixel_ssd_ss_24x32_ ## cpu; \
> -    p.sse_ss[LUMA_32x8]   = x265_pixel_ssd_ss_32x8_ ## cpu; \
> -    p.sse_ss[LUMA_32x16]   = x265_pixel_ssd_ss_32x16_ ## cpu; \
> -    p.sse_ss[LUMA_32x24]   = x265_pixel_ssd_ss_32x24_ ## cpu; \
> -    p.sse_ss[LUMA_32x32]   = x265_pixel_ssd_ss_32x32_ ## cpu; \
> -    p.sse_ss[LUMA_32x64]   = x265_pixel_ssd_ss_32x64_ ## cpu; \
> -    p.sse_ss[LUMA_48x64]   = x265_pixel_ssd_ss_48x64_ ## cpu; \
> -    p.sse_ss[LUMA_64x16]   = x265_pixel_ssd_ss_64x16_ ## cpu; \
> -    p.sse_ss[LUMA_64x32]   = x265_pixel_ssd_ss_64x32_ ## cpu; \
> -    p.sse_ss[LUMA_64x48]   = x265_pixel_ssd_ss_64x48_ ## cpu; \
> -    p.sse_ss[LUMA_64x64]   = x265_pixel_ssd_ss_64x64_ ## cpu;
> +    p.pu[LUMA_4x4].sse_ss   = x265_pixel_ssd_ss_4x4_ ## cpu; \
> +    p.pu[LUMA_4x8].sse_ss   = x265_pixel_ssd_ss_4x8_ ## cpu; \
> +    p.pu[LUMA_4x16].sse_ss  = x265_pixel_ssd_ss_4x16_ ## cpu; \
> +    p.pu[LUMA_8x4].sse_ss   = x265_pixel_ssd_ss_8x4_ ## cpu; \
> +    p.pu[LUMA_8x8].sse_ss   = x265_pixel_ssd_ss_8x8_ ## cpu; \
> +    p.pu[LUMA_8x16].sse_ss  = x265_pixel_ssd_ss_8x16_ ## cpu; \
> +    p.pu[LUMA_8x32].sse_ss  = x265_pixel_ssd_ss_8x32_ ## cpu; \
> +    p.pu[LUMA_12x16].sse_ss = x265_pixel_ssd_ss_12x16_ ## cpu; \
> +    p.pu[LUMA_16x4].sse_ss  = x265_pixel_ssd_ss_16x4_ ## cpu; \
> +    p.pu[LUMA_16x8].sse_ss  = x265_pixel_ssd_ss_16x8_ ## cpu; \
> +    p.pu[LUMA_16x12].sse_ss = x265_pixel_ssd_ss_16x12_ ## cpu; \
> +    p.pu[LUMA_16x16].sse_ss = x265_pixel_ssd_ss_16x16_ ## cpu; \
> +    p.pu[LUMA_16x32].sse_ss = x265_pixel_ssd_ss_16x32_ ## cpu; \
> +    p.pu[LUMA_16x64].sse_ss = x265_pixel_ssd_ss_16x64_ ## cpu; \
> +    p.pu[LUMA_24x32].sse_ss = x265_pixel_ssd_ss_24x32_ ## cpu; \
> +    p.pu[LUMA_32x8].sse_ss  = x265_pixel_ssd_ss_32x8_ ## cpu; \
> +    p.pu[LUMA_32x16].sse_ss = x265_pixel_ssd_ss_32x16_ ## cpu; \
> +    p.pu[LUMA_32x24].sse_ss = x265_pixel_ssd_ss_32x24_ ## cpu; \
> +    p.pu[LUMA_32x32].sse_ss = x265_pixel_ssd_ss_32x32_ ## cpu; \
> +    p.pu[LUMA_32x64].sse_ss = x265_pixel_ssd_ss_32x64_ ## cpu; \
> +    p.pu[LUMA_48x64].sse_ss = x265_pixel_ssd_ss_48x64_ ## cpu; \
> +    p.pu[LUMA_64x16].sse_ss = x265_pixel_ssd_ss_64x16_ ## cpu; \
> +    p.pu[LUMA_64x32].sse_ss = x265_pixel_ssd_ss_64x32_ ## cpu; \
> +    p.pu[LUMA_64x48].sse_ss = x265_pixel_ssd_ss_64x48_ ## cpu; \
> +    p.pu[LUMA_64x64].sse_ss = x265_pixel_ssd_ss_64x64_ ## cpu;
>  
>  #define SA8D_INTER_FROM_BLOCK(cpu) \
> -    p.sa8d_inter[LUMA_4x8]   = x265_pixel_satd_4x8_ ## cpu; \
> -    p.sa8d_inter[LUMA_8x4]   = x265_pixel_satd_8x4_ ## cpu; \
> -    p.sa8d_inter[LUMA_4x16]  = x265_pixel_satd_4x16_ ## cpu; \
> -    p.sa8d_inter[LUMA_16x4]  = x265_pixel_satd_16x4_ ## cpu; \
> -    p.sa8d_inter[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \
> -    p.sa8d_inter[LUMA_8x8]   = x265_pixel_sa8d_8x8_ ## cpu; \
> -    p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_ ## cpu; \
> -    p.sa8d_inter[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \
> -    p.sa8d_inter[LUMA_16x8]  = x265_pixel_sa8d_16x8_ ## cpu; \
> -    p.sa8d_inter[LUMA_8x16]  = x265_pixel_sa8d_8x16_ ## cpu; \
> -    p.sa8d_inter[LUMA_32x24] = x265_pixel_sa8d_32x24_ ## cpu; \
> -    p.sa8d_inter[LUMA_24x32] = x265_pixel_sa8d_24x32_ ## cpu; \
> -    p.sa8d_inter[LUMA_32x8]  = x265_pixel_sa8d_32x8_ ## cpu; \
> -    p.sa8d_inter[LUMA_8x32]  = x265_pixel_sa8d_8x32_ ## cpu; \
> -    p.sa8d_inter[LUMA_32x32] = x265_pixel_sa8d_32x32_ ## cpu; \
> -    p.sa8d_inter[LUMA_32x16] = x265_pixel_sa8d_32x16_ ## cpu; \
> -    p.sa8d_inter[LUMA_16x32] = x265_pixel_sa8d_16x32_ ## cpu; \
> -    p.sa8d_inter[LUMA_64x64] = x265_pixel_sa8d_64x64_ ## cpu; \
> -    p.sa8d_inter[LUMA_64x32] = x265_pixel_sa8d_64x32_ ## cpu; \
> -    p.sa8d_inter[LUMA_32x64] = x265_pixel_sa8d_32x64_ ## cpu; \
> -    p.sa8d_inter[LUMA_64x48] = x265_pixel_sa8d_64x48_ ## cpu; \
> -    p.sa8d_inter[LUMA_48x64] = x265_pixel_sa8d_48x64_ ## cpu; \
> -    p.sa8d_inter[LUMA_64x16] = x265_pixel_sa8d_64x16_ ## cpu; \
> -    p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_ ## cpu;
> +    p.pu[LUMA_4x8].sa8d_inter   = x265_pixel_satd_4x8_ ## cpu; \
> +    p.pu[LUMA_8x4].sa8d_inter   = x265_pixel_satd_8x4_ ## cpu; \
> +    p.pu[LUMA_4x16].sa8d_inter  = x265_pixel_satd_4x16_ ## cpu; \
> +    p.pu[LUMA_16x4].sa8d_inter  = x265_pixel_satd_16x4_ ## cpu; \
> +    p.pu[LUMA_12x16].sa8d_inter = x265_pixel_satd_12x16_ ## cpu; \
> +    p.pu[LUMA_8x8].sa8d_inter   = x265_pixel_sa8d_8x8_ ## cpu; \
> +    p.pu[LUMA_16x16].sa8d_inter = x265_pixel_sa8d_16x16_ ## cpu; \
> +    p.pu[LUMA_16x12].sa8d_inter = x265_pixel_satd_16x12_ ## cpu; \
> +    p.pu[LUMA_16x8].sa8d_inter  = x265_pixel_sa8d_16x8_ ## cpu; \
> +    p.pu[LUMA_8x16].sa8d_inter  = x265_pixel_sa8d_8x16_ ## cpu; \
> +    p.pu[LUMA_32x24].sa8d_inter = x265_pixel_sa8d_32x24_ ## cpu; \
> +    p.pu[LUMA_24x32].sa8d_inter = x265_pixel_sa8d_24x32_ ## cpu; \
> +    p.pu[LUMA_32x8].sa8d_inter  = x265_pixel_sa8d_32x8_ ## cpu; \
> +    p.pu[LUMA_8x32].sa8d_inter  = x265_pixel_sa8d_8x32_ ## cpu; \
> +    p.pu[LUMA_32x32].sa8d_inter = x265_pixel_sa8d_32x32_ ## cpu; \
> +    p.pu[LUMA_32x16].sa8d_inter = x265_pixel_sa8d_32x16_ ## cpu; \
> +    p.pu[LUMA_16x32].sa8d_inter = x265_pixel_sa8d_16x32_ ## cpu; \
> +    p.pu[LUMA_64x64].sa8d_inter = x265_pixel_sa8d_64x64_ ## cpu; \
> +    p.pu[LUMA_64x32].sa8d_inter = x265_pixel_sa8d_64x32_ ## cpu; \
> +    p.pu[LUMA_32x64].sa8d_inter = x265_pixel_sa8d_32x64_ ## cpu; \
> +    p.pu[LUMA_64x48].sa8d_inter = x265_pixel_sa8d_64x48_ ## cpu; \
> +    p.pu[LUMA_48x64].sa8d_inter = x265_pixel_sa8d_48x64_ ## cpu; \
> +    p.pu[LUMA_64x16].sa8d_inter = x265_pixel_sa8d_64x16_ ## cpu; \
> +    p.pu[LUMA_16x64].sa8d_inter = x265_pixel_sa8d_16x64_ ## cpu;
>  
>  #define PIXEL_AVG(cpu) \
> -    p.pixelavg_pp[LUMA_64x64] = x265_pixel_avg_64x64_ ## cpu; \
> -    p.pixelavg_pp[LUMA_64x48] = x265_pixel_avg_64x48_ ## cpu; \
> -    p.pixelavg_pp[LUMA_64x32] = x265_pixel_avg_64x32_ ## cpu; \
> -    p.pixelavg_pp[LUMA_64x16] = x265_pixel_avg_64x16_ ## cpu; \
> -    p.pixelavg_pp[LUMA_48x64] = x265_pixel_avg_48x64_ ## cpu; \
> -    p.pixelavg_pp[LUMA_32x64] = x265_pixel_avg_32x64_ ## cpu; \
> -    p.pixelavg_pp[LUMA_32x32] = x265_pixel_avg_32x32_ ## cpu; \
> -    p.pixelavg_pp[LUMA_32x24] = x265_pixel_avg_32x24_ ## cpu; \
> -    p.pixelavg_pp[LUMA_32x16] = x265_pixel_avg_32x16_ ## cpu; \
> -    p.pixelavg_pp[LUMA_32x8] = x265_pixel_avg_32x8_ ## cpu; \
> -    p.pixelavg_pp[LUMA_24x32] = x265_pixel_avg_24x32_ ## cpu; \
> -    p.pixelavg_pp[LUMA_16x64] = x265_pixel_avg_16x64_ ## cpu; \
> -    p.pixelavg_pp[LUMA_16x32] = x265_pixel_avg_16x32_ ## cpu; \
> -    p.pixelavg_pp[LUMA_16x16] = x265_pixel_avg_16x16_ ## cpu; \
> -    p.pixelavg_pp[LUMA_16x12]  = x265_pixel_avg_16x12_ ## cpu; \
> -    p.pixelavg_pp[LUMA_16x8]  = x265_pixel_avg_16x8_ ## cpu; \
> -    p.pixelavg_pp[LUMA_16x4]  = x265_pixel_avg_16x4_ ## cpu; \
> -    p.pixelavg_pp[LUMA_12x16] = x265_pixel_avg_12x16_ ## cpu; \
> -    p.pixelavg_pp[LUMA_8x32]  = x265_pixel_avg_8x32_ ## cpu; \
> -    p.pixelavg_pp[LUMA_8x16]  = x265_pixel_avg_8x16_ ## cpu; \
> -    p.pixelavg_pp[LUMA_8x8]   = x265_pixel_avg_8x8_ ## cpu; \
> -    p.pixelavg_pp[LUMA_8x4]   = x265_pixel_avg_8x4_ ## cpu;
> +    p.pu[LUMA_64x64].pixelavg_pp = x265_pixel_avg_64x64_ ## cpu; \
> +    p.pu[LUMA_64x48].pixelavg_pp = x265_pixel_avg_64x48_ ## cpu; \
> +    p.pu[LUMA_64x32].pixelavg_pp = x265_pixel_avg_64x32_ ## cpu; \
> +    p.pu[LUMA_64x16].pixelavg_pp = x265_pixel_avg_64x16_ ## cpu; \
> +    p.pu[LUMA_48x64].pixelavg_pp = x265_pixel_avg_48x64_ ## cpu; \
> +    p.pu[LUMA_32x64].pixelavg_pp = x265_pixel_avg_32x64_ ## cpu; \
> +    p.pu[LUMA_32x32].pixelavg_pp = x265_pixel_avg_32x32_ ## cpu; \
> +    p.pu[LUMA_32x24].pixelavg_pp = x265_pixel_avg_32x24_ ## cpu; \
> +    p.pu[LUMA_32x16].pixelavg_pp = x265_pixel_avg_32x16_ ## cpu; \
> +    p.pu[LUMA_32x8].pixelavg_pp  = x265_pixel_avg_32x8_ ## cpu; \
> +    p.pu[LUMA_24x32].pixelavg_pp = x265_pixel_avg_24x32_ ## cpu; \
> +    p.pu[LUMA_16x64].pixelavg_pp = x265_pixel_avg_16x64_ ## cpu; \
> +    p.pu[LUMA_16x32].pixelavg_pp = x265_pixel_avg_16x32_ ## cpu; \
> +    p.pu[LUMA_16x16].pixelavg_pp = x265_pixel_avg_16x16_ ## cpu; \
> +    p.pu[LUMA_16x12].pixelavg_pp = x265_pixel_avg_16x12_ ## cpu; \
> +    p.pu[LUMA_16x8].pixelavg_pp  = x265_pixel_avg_16x8_ ## cpu; \
> +    p.pu[LUMA_16x4].pixelavg_pp  = x265_pixel_avg_16x4_ ## cpu; \
> +    p.pu[LUMA_12x16].pixelavg_pp = x265_pixel_avg_12x16_ ## cpu; \
> +    p.pu[LUMA_8x32].pixelavg_pp  = x265_pixel_avg_8x32_ ## cpu; \
> +    p.pu[LUMA_8x16].pixelavg_pp  = x265_pixel_avg_8x16_ ## cpu; \
> +    p.pu[LUMA_8x8].pixelavg_pp   = x265_pixel_avg_8x8_ ## cpu; \
> +    p.pu[LUMA_8x4].pixelavg_pp   = x265_pixel_avg_8x4_ ## cpu;
>  
>  #define PIXEL_AVG_W4(cpu) \
> -    p.pixelavg_pp[LUMA_4x4]  = x265_pixel_avg_4x4_ ## cpu; \
> -    p.pixelavg_pp[LUMA_4x8]  = x265_pixel_avg_4x8_ ## cpu; \
> -    p.pixelavg_pp[LUMA_4x16] = x265_pixel_avg_4x16_ ## cpu;
> +    p.pu[LUMA_4x4].pixelavg_pp  = x265_pixel_avg_4x4_ ## cpu; \
> +    p.pu[LUMA_4x8].pixelavg_pp  = x265_pixel_avg_4x8_ ## cpu; \
> +    p.pu[LUMA_4x16].pixelavg_pp = x265_pixel_avg_4x16_ ## cpu;
>  
>  #define SETUP_CHROMA_FUNC_DEF_420(W, H, cpu) \
> -    p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
>  
>  #define SETUP_CHROMA_FUNC_DEF_422(W, H, cpu) \
> -    p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
>  
>  #define SETUP_CHROMA_FUNC_DEF_444(W, H, cpu) \
> -    p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
>  
>  #define SETUP_CHROMA_SP_FUNC_DEF_420(W, H, cpu) \
> -    p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
>  
>  #define SETUP_CHROMA_SP_FUNC_DEF_422(W, H, cpu) \
> -    p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
>  
>  #define SETUP_CHROMA_SP_FUNC_DEF_444(W, H, cpu) \
> -    p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
>  
>  #define SETUP_CHROMA_SS_FUNC_DEF_420(W, H, cpu) \
> -    p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
>  
>  #define SETUP_CHROMA_SS_FUNC_DEF_422(W, H, cpu) \
> -    p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
>  
>  #define SETUP_CHROMA_SS_FUNC_DEF_444(W, H, cpu) \
> -    p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
>  
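
the chroma setup macros paste the CSP-specific partition enums the same
way. with a hypothetical _sse4 cpu argument (shown only to make the
token pasting concrete), SETUP_CHROMA_FUNC_DEF_420(4, 4, _sse4) expands
to:

    p.chroma[X265_CSP_I420].pu[CHROMA_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_sse4;
    p.chroma[X265_CSP_I420].pu[CHROMA_4x4].filter_hps = x265_interp_4tap_horiz_ps_4x4_sse4;
    p.chroma[X265_CSP_I420].pu[CHROMA_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_sse4;
    p.chroma[X265_CSP_I420].pu[CHROMA_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_sse4;
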
>  #define CHROMA_FILTERS_420(cpu) \
>      SETUP_CHROMA_FUNC_DEF_420(4, 4, cpu); \
> @@ -538,37 +538,37 @@
>  
>  #if HIGH_BIT_DEPTH    // temporary, until all 10bit functions are completed
>  #define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
> -    p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
> -    p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
> -    p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
> -    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
> -    p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu; \
> -    p.luma_hvpp[LUMA_ ## W ## x ## H] = interp_8tap_hv_pp_cpu<LUMA_ ## W ## x ## H>;
> +    p.pu[LUMA_ ## W ## x ## H].luma_hpp = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_hps = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_vpp = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_vps = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_vsp = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_ ## W ## x ## H>;
>  #else
>  #define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
> -    p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
> -    p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
> -    p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
> -    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
> -    p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu; \
> -    p.luma_hvpp[LUMA_ ## W ## x ## H] = interp_8tap_hv_pp_cpu<LUMA_ ## W ## x ## H>;
> +    p.pu[LUMA_ ## W ## x ## H].luma_hpp = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_hps = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_vpp = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_vps = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_vsp = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_ ## W ## x ## H>;
>  #endif // if HIGH_BIT_DEPTH
>  
>  #define SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu) \
> -    p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
> -    p.luma_add_ps[LUMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
> +    p.pu[LUMA_ ## W ## x ## H].luma_sub_ps = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
> +    p.pu[LUMA_ ## W ## x ## H].luma_add_ps = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
>  
>  #define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
> -    p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
> +    p.pu[LUMA_ ## W ## x ## H].luma_vsp = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
>  
>  #define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
> -    p.luma_vss[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu;
> +    p.pu[LUMA_ ## W ## x ## H].luma_vss = x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu;
>  
>  #define SETUP_LUMA_BLOCKCOPY(type, W, H, cpu) \
> -    p.luma_copy_ ## type[LUMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
> +    p.pu[LUMA_ ## W ## x ## H].luma_copy_ ## type = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
>  
>  #define SETUP_CHROMA_BLOCKCOPY(type, W, H, cpu) \
> -    p.chroma[X265_CSP_I420].copy_ ## type[CHROMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_ ## type = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_BLOCKCOPY(type, cpu) \
>      SETUP_CHROMA_BLOCKCOPY(type, 2,  4,  cpu); \
> @@ -597,7 +597,7 @@
>      SETUP_CHROMA_BLOCKCOPY(type, 32, 32, cpu);
>  
>  #define SETUP_CHROMA_BLOCKCOPY_422(type, W, H, cpu) \
> -    p.chroma[X265_CSP_I422].copy_ ## type[CHROMA422_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_ ## type = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_BLOCKCOPY_422(type, cpu) \
>      SETUP_CHROMA_BLOCKCOPY_422(type, 2,  8,  cpu); \
> @@ -653,7 +653,7 @@
>      SETUP_LUMA_BLOCKCOPY(type, 16, 64, cpu);
>  
>  #define SETUP_CHROMA_BLOCKCOPY_SP(W, H, cpu) \
> -    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_sp = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_BLOCKCOPY_SP(cpu) \
>      SETUP_CHROMA_BLOCKCOPY_SP(2,  4,  cpu); \
> @@ -682,7 +682,7 @@
>      SETUP_CHROMA_BLOCKCOPY_SP(32, 32, cpu);
>  
>  #define SETUP_CHROMA_BLOCKCOPY_SP_422(W, H, cpu) \
> -    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_sp = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_BLOCKCOPY_SP_422(cpu) \
>      SETUP_CHROMA_BLOCKCOPY_SP_422(2,  8,  cpu); \
> @@ -711,8 +711,8 @@
>      SETUP_CHROMA_BLOCKCOPY_SP_422(32, 64, cpu);
>  
>  #define SETUP_CHROMA_PIXELSUB(W, H, cpu) \
> -    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I420].cu[CHROMA_ ## W ## x ## H].sub_ps = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I420].cu[CHROMA_ ## W ## x ## H].add_ps = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_PIXELSUB_PS(cpu) \
>      SETUP_CHROMA_PIXELSUB(4,  4,  cpu); \
> @@ -721,8 +721,8 @@
>      SETUP_CHROMA_PIXELSUB(32, 32, cpu);
>  
>  #define SETUP_CHROMA_PIXELSUB_422(W, H, cpu) \
> -    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I422].cu[CHROMA422_ ## W ## x ## H].sub_ps = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I422].cu[CHROMA422_ ## W ## x ## H].add_ps = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_PIXELSUB_PS_422(cpu) \
>      SETUP_CHROMA_PIXELSUB_422(4,  8,  cpu); \
> @@ -819,7 +819,7 @@
>      SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu);
>  
>  #define SETUP_PIXEL_VAR_DEF(W, H, cpu) \
> -    p.var[BLOCK_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
> +    p.cu[BLOCK_ ## W ## x ## H].var = x265_pixel_var_ ## W ## x ## H ## cpu;
>  
>  #define LUMA_VAR(cpu) \
>      SETUP_PIXEL_VAR_DEF(8,   8, cpu); \
> @@ -828,7 +828,7 @@
>      SETUP_PIXEL_VAR_DEF(64, 64, cpu);
>  
>  #define SETUP_PIXEL_SSE_SP_DEF(W, H, cpu) \
> -    p.sse_sp[LUMA_ ## W ## x ## H] = x265_pixel_ssd_sp_ ## W ## x ## H ## cpu;
> +    p.pu[LUMA_ ## W ## x ## H].sse_sp = x265_pixel_ssd_sp_ ## W ## x ## H ## cpu;
>  
>  #define LUMA_SSE_SP(cpu) \
>      SETUP_PIXEL_SSE_SP_DEF(4,   4, cpu); \
> @@ -858,7 +858,7 @@
>      SETUP_PIXEL_SSE_SP_DEF(16, 64, cpu);
>  
>  #define SETUP_LUMA_ADDAVG_FUNC_DEF(W, H, cpu) \
> -    p.luma_addAvg[LUMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
> +    p.pu[LUMA_ ## W ## x ## H].luma_addAvg = x265_addAvg_ ## W ## x ## H ## cpu;
>  
>  #define LUMA_ADDAVG(cpu) \
>      SETUP_LUMA_ADDAVG_FUNC_DEF(4,  4,  cpu); \
> @@ -888,7 +888,7 @@
>      SETUP_LUMA_ADDAVG_FUNC_DEF(64, 64, cpu); \
>  
>  #define SETUP_CHROMA_ADDAVG_FUNC_DEF(W, H, cpu) \
> -    p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].addAvg = x265_addAvg_ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_ADDAVG(cpu) \
>      SETUP_CHROMA_ADDAVG_FUNC_DEF(2,  4,  cpu); \
> @@ -917,7 +917,7 @@
>      SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
>  
>  #define SETUP_CHROMA_ADDAVG_FUNC_DEF_422(W, H, cpu) \
> -    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].addAvg = x265_addAvg_ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_ADDAVG_422(cpu) \
>      SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2,  8,  cpu); \
> @@ -1054,10 +1054,10 @@
>      SETUP_INTRA_ANG16_32(33, 33, cpu);
>  
>  #define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \
> -    p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_VERT_FILTERS(cpu) \
>      SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
> @@ -1088,10 +1088,10 @@
>      SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu);
>  
>  #define SETUP_CHROMA_VERT_FUNC_DEF_422(W, H, cpu) \
> -    p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_VERT_FILTERS_422(cpu) \
>      SETUP_CHROMA_VERT_FUNC_DEF_422(4, 8, cpu); \
> @@ -1122,10 +1122,10 @@
>      SETUP_CHROMA_VERT_FUNC_DEF_422(6, 16, cpu);
>  
>  #define SETUP_CHROMA_VERT_FUNC_DEF_444(W, H, cpu) \
> -    p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_VERT_FILTERS_444(cpu) \
>      SETUP_CHROMA_VERT_FUNC_DEF_444(8, 8, cpu); \
> @@ -1154,8 +1154,8 @@
>      SETUP_CHROMA_VERT_FUNC_DEF_444(16, 64, cpu);
>  
>  #define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
> -    p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_HORIZ_FILTERS(cpu) \
>      SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
> @@ -1184,8 +1184,8 @@
>      SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu);
>  
>  #define SETUP_CHROMA_HORIZ_FUNC_DEF_422(W, H, cpu) \
> -    p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
>  
>  #define CHROMA_HORIZ_FILTERS_422(cpu) \
>      SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 8, cpu); \
> @@ -1214,8 +1214,8 @@
>      SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 64, cpu);
>  
>  #define SETUP_CHROMA_HORIZ_FUNC_DEF_444(W, H, cpu) \
> -    p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> -    p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
> +    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
>  
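note that the i444 setup indexes the chroma pu[] table with the luma
partition enum, which is correct since 4:4:4 chroma blocks have the same
dimensions as luma. For example, SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 8, _sse4)
pastes out to

    p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hpp = x265_interp_4tap_horiz_pp_8x8_sse4;
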
>  #define CHROMA_HORIZ_FILTERS_444(cpu) \
>      SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 8, cpu); \
> @@ -1257,44 +1257,44 @@
>  
>          INIT6(satd, _sse2);
>          HEVC_SATD(sse2);
> -        p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
> +        p.pu[LUMA_4x4].satd = x265_pixel_satd_4x4_mmx2;
>  
> -        p.sa8d_inter[LUMA_4x4]  = x265_pixel_satd_4x4_mmx2;
> +        p.pu[LUMA_4x4].sa8d_inter  = x265_pixel_satd_4x4_mmx2;
>          SA8D_INTER_FROM_BLOCK(sse2);
> -        p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
> -        p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
> +        p.pu[LUMA_8x8].sa8d_inter = x265_pixel_sa8d_8x8_sse2;
> +        p.pu[LUMA_16x16].sa8d_inter = x265_pixel_sa8d_16x16_sse2;
>  
> -        p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_mmx2;
> -        p.sse_ss[LUMA_4x8] = x265_pixel_ssd_ss_4x8_mmx2;
> -        p.sse_ss[LUMA_4x16] = x265_pixel_ssd_ss_4x16_mmx2;
> -        p.sse_ss[LUMA_8x4] = x265_pixel_ssd_ss_8x4_sse2;
> -        p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_sse2;
> -        p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_sse2;
> -        p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_sse2;
> -        p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_sse2;
> -        p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_sse2;
> -        p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_sse2;
> -        p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_sse2;
> -        p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_sse2;
> -        p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_sse2;
> -        p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_sse2;
> -        p.sse_ss[LUMA_24x32] = x265_pixel_ssd_ss_24x32_sse2;
> -        p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_sse2;
> -        p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_sse2;
> -        p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_sse2;
> -        p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2;
> -        p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2;
> -        p.sse_ss[LUMA_48x64] = x265_pixel_ssd_ss_48x64_sse2;
> -        p.sse_ss[LUMA_64x16] = x265_pixel_ssd_ss_64x16_sse2;
> -        p.sse_ss[LUMA_64x32] = x265_pixel_ssd_ss_64x32_sse2;
> -        p.sse_ss[LUMA_64x48] = x265_pixel_ssd_ss_64x48_sse2;
> -        p.sse_ss[LUMA_64x64] = x265_pixel_ssd_ss_64x64_sse2;
> +        p.pu[LUMA_4x4].sse_ss   = x265_pixel_ssd_ss_4x4_mmx2;
> +        p.pu[LUMA_4x8].sse_ss   = x265_pixel_ssd_ss_4x8_mmx2;
> +        p.pu[LUMA_4x16].sse_ss  = x265_pixel_ssd_ss_4x16_mmx2;
> +        p.pu[LUMA_8x4].sse_ss   = x265_pixel_ssd_ss_8x4_sse2;
> +        p.pu[LUMA_8x8].sse_ss   = x265_pixel_ssd_ss_8x8_sse2;
> +        p.pu[LUMA_8x16].sse_ss  = x265_pixel_ssd_ss_8x16_sse2;
> +        p.pu[LUMA_8x32].sse_ss  = x265_pixel_ssd_ss_8x32_sse2;
> +        p.pu[LUMA_12x16].sse_ss = x265_pixel_ssd_ss_12x16_sse2;
> +        p.pu[LUMA_16x4].sse_ss  = x265_pixel_ssd_ss_16x4_sse2;
> +        p.pu[LUMA_16x8].sse_ss  = x265_pixel_ssd_ss_16x8_sse2;
> +        p.pu[LUMA_16x12].sse_ss = x265_pixel_ssd_ss_16x12_sse2;
> +        p.pu[LUMA_16x16].sse_ss = x265_pixel_ssd_ss_16x16_sse2;
> +        p.pu[LUMA_16x32].sse_ss = x265_pixel_ssd_ss_16x32_sse2;
> +        p.pu[LUMA_16x64].sse_ss = x265_pixel_ssd_ss_16x64_sse2;
> +        p.pu[LUMA_24x32].sse_ss = x265_pixel_ssd_ss_24x32_sse2;
> +        p.pu[LUMA_32x8].sse_ss  = x265_pixel_ssd_ss_32x8_sse2;
> +        p.pu[LUMA_32x16].sse_ss = x265_pixel_ssd_ss_32x16_sse2;
> +        p.pu[LUMA_32x24].sse_ss = x265_pixel_ssd_ss_32x24_sse2;
> +        p.pu[LUMA_32x32].sse_ss = x265_pixel_ssd_ss_32x32_sse2;
> +        p.pu[LUMA_32x64].sse_ss = x265_pixel_ssd_ss_32x64_sse2;
> +        p.pu[LUMA_48x64].sse_ss = x265_pixel_ssd_ss_48x64_sse2;
> +        p.pu[LUMA_64x16].sse_ss = x265_pixel_ssd_ss_64x16_sse2;
> +        p.pu[LUMA_64x32].sse_ss = x265_pixel_ssd_ss_64x32_sse2;
> +        p.pu[LUMA_64x48].sse_ss = x265_pixel_ssd_ss_64x48_sse2;
> +        p.pu[LUMA_64x64].sse_ss = x265_pixel_ssd_ss_64x64_sse2;
>  
> -        p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
> -        p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
> -        p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
> -        p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
> -        p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
> +        p.cu[BLOCK_4x4].transpose   = x265_transpose4_sse2;
> +        p.cu[BLOCK_8x8].transpose   = x265_transpose8_sse2;
> +        p.cu[BLOCK_16x16].transpose = x265_transpose16_sse2;
> +        p.cu[BLOCK_32x32].transpose = x265_transpose32_sse2;
> +        p.cu[BLOCK_64x64].transpose = x265_transpose64_sse2;
>  
>          p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
>          p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
> @@ -1303,43 +1303,43 @@
>          LUMA_VAR(_sse2);
>  
>          SAD_X3(sse2);
> -        p.sad_x3[LUMA_4x4] = x265_pixel_sad_x3_4x4_mmx2;
> -        p.sad_x3[LUMA_4x8] = x265_pixel_sad_x3_4x8_mmx2;
> -        p.sad_x3[LUMA_4x16] = x265_pixel_sad_x3_4x16_mmx2;
> -        p.sad_x3[LUMA_8x4] = x265_pixel_sad_x3_8x4_sse2;
> -        p.sad_x3[LUMA_8x8] = x265_pixel_sad_x3_8x8_sse2;
> -        p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_sse2;
> -        p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_sse2;
> -        p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_sse2;
> -        p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_mmx2;
> +        p.pu[LUMA_4x4].sad_x3   = x265_pixel_sad_x3_4x4_mmx2;
> +        p.pu[LUMA_4x8].sad_x3   = x265_pixel_sad_x3_4x8_mmx2;
> +        p.pu[LUMA_4x16].sad_x3  = x265_pixel_sad_x3_4x16_mmx2;
> +        p.pu[LUMA_8x4].sad_x3   = x265_pixel_sad_x3_8x4_sse2;
> +        p.pu[LUMA_8x8].sad_x3   = x265_pixel_sad_x3_8x8_sse2;
> +        p.pu[LUMA_8x16].sad_x3  = x265_pixel_sad_x3_8x16_sse2;
> +        p.pu[LUMA_8x32].sad_x3  = x265_pixel_sad_x3_8x32_sse2;
> +        p.pu[LUMA_16x4].sad_x3  = x265_pixel_sad_x3_16x4_sse2;
> +        p.pu[LUMA_12x16].sad_x3 = x265_pixel_sad_x3_12x16_mmx2;
>  
>          SAD_X4(sse2);
> -        p.sad_x4[LUMA_4x4] = x265_pixel_sad_x4_4x4_mmx2;
> -        p.sad_x4[LUMA_4x8] = x265_pixel_sad_x4_4x8_mmx2;
> -        p.sad_x4[LUMA_4x16] = x265_pixel_sad_x4_4x16_mmx2;
> -        p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_sse2;
> -        p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_sse2;
> -        p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_sse2;
> -        p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_sse2;
> -        p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
> -        p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
> +        p.pu[LUMA_4x4].sad_x4   = x265_pixel_sad_x4_4x4_mmx2;
> +        p.pu[LUMA_4x8].sad_x4   = x265_pixel_sad_x4_4x8_mmx2;
> +        p.pu[LUMA_4x16].sad_x4  = x265_pixel_sad_x4_4x16_mmx2;
> +        p.pu[LUMA_8x4].sad_x4   = x265_pixel_sad_x4_8x4_sse2;
> +        p.pu[LUMA_8x8].sad_x4   = x265_pixel_sad_x4_8x8_sse2;
> +        p.pu[LUMA_8x16].sad_x4  = x265_pixel_sad_x4_8x16_sse2;
> +        p.pu[LUMA_8x32].sad_x4  = x265_pixel_sad_x4_8x32_sse2;
> +        p.pu[LUMA_16x4].sad_x4  = x265_pixel_sad_x4_16x4_sse2;
> +        p.pu[LUMA_12x16].sad_x4 = x265_pixel_sad_x4_12x16_mmx2;
>  
> -        p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
> -        p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
> -        p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
> -        p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
> -        p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
> -        p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
> -        p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
> -        p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
> -        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
> -        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
> -        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
> -        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
> -        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
> -        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
> -        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
> -        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
> +        p.cu[BLOCK_4x4].cpy2Dto1D_shl   = x265_cpy2Dto1D_shl_4_sse2;
> +        p.cu[BLOCK_8x8].cpy2Dto1D_shl   = x265_cpy2Dto1D_shl_8_sse2;
> +        p.cu[BLOCK_16x16].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_16_sse2;
> +        p.cu[BLOCK_32x32].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_32_sse2;
> +        p.cu[BLOCK_4x4].cpy2Dto1D_shr   = x265_cpy2Dto1D_shr_4_sse2;
> +        p.cu[BLOCK_8x8].cpy2Dto1D_shr   = x265_cpy2Dto1D_shr_8_sse2;
> +        p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_sse2;
> +        p.cu[BLOCK_32x32].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_32_sse2;
> +        p.cu[BLOCK_4x4].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_4_sse2;
> +        p.cu[BLOCK_8x8].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_8_sse2;
> +        p.cu[BLOCK_16x16].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_16_sse2;
> +        p.cu[BLOCK_32x32].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_32_sse2;
> +        p.cu[BLOCK_4x4].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_4_sse2;
> +        p.cu[BLOCK_8x8].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_8_sse2;
> +        p.cu[BLOCK_16x16].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_16_sse2;
> +        p.cu[BLOCK_32x32].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_32_sse2;
>  
>          CHROMA_PIXELSUB_PS(_sse2);
>          CHROMA_PIXELSUB_PS_422(_sse2);
> @@ -1357,28 +1357,28 @@
>          p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2;
>          p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_sse2; // for i444, chroma_p2s can be replaced by luma_p2s
>  
> -        p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
> -        p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
> -        p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
> -        p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
> +        p.cu[BLOCK_4x4].blockfill_s = x265_blockfill_s_4x4_sse2;
> +        p.cu[BLOCK_8x8].blockfill_s = x265_blockfill_s_8x8_sse2;
> +        p.cu[BLOCK_16x16].blockfill_s = x265_blockfill_s_16x16_sse2;
> +        p.cu[BLOCK_32x32].blockfill_s = x265_blockfill_s_32x32_sse2;
>  
>          // TODO: overflow in 12-bit mode!
> -        p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
> -        p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
> -        p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
> -        p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
> +        p.cu[BLOCK_4x4].ssd_s   = x265_pixel_ssd_s_4_sse2;
> +        p.cu[BLOCK_8x8].ssd_s   = x265_pixel_ssd_s_8_sse2;
> +        p.cu[BLOCK_16x16].ssd_s = x265_pixel_ssd_s_16_sse2;
> +        p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_sse2;
>  
> -        p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
> -        p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
> -        p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2;
> -        p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse2;
> +        p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
> +        p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
> +        p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_sse2;
> +        p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_sse2;
>  
> -        p.dct[DCT_4x4] = x265_dct4_sse2;
> -        p.idct[IDCT_4x4] = x265_idct4_sse2;
> +        p.cu[BLOCK_4x4].dct = x265_dct4_sse2;
> +        p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
>  #if X86_64
> -        p.idct[IDCT_8x8] = x265_idct8_sse2;
> +        p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
>  #endif
> -        p.idct[IDST_4x4] = x265_idst4_sse2;
> +        p.idst4x4 = x265_idst4_sse2;
>  
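the DST special-casing above is a nice simplification: HEVC only defines
the DST for 4x4 (intra luma), so giving it dedicated p.dst4x4 and
p.idst4x4 slots instead of carrying DST_4x4/IDST_4x4 entries in the
shared dct[]/idct[] tables keeps the per-size cu[] struct uniform:

    p.idct[IDST_4x4] = x265_idst4_sse2;   // before: folded into idct[]
    p.idst4x4        = x265_idst4_sse2;   // after: dedicated slot
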
>          LUMA_SS_FILTERS(_sse2);
>      }
> @@ -1389,8 +1389,8 @@
>  
>          INTRA_ANG_SSSE3(ssse3);
>  
> -        p.dct[DST_4x4] = x265_dst4_ssse3;
> -        p.idct[IDCT_8x8] = x265_idct8_ssse3;
> +        p.dst4x4 = x265_dst4_ssse3;
> +        p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
>          p.count_nonzero = x265_count_nonzero_ssse3;
>      }
>      if (cpuMask & X265_CPU_SSE4)
> @@ -1405,7 +1405,7 @@
>          CHROMA_VERT_FILTERS_SSE4_422(_sse4);
>          CHROMA_HORIZ_FILTERS_444(_sse4);
>  
> -        p.dct[DCT_8x8] = x265_dct8_sse4;
> +        p.cu[BLOCK_8x8].dct = x265_dct8_sse4;
>          p.quant = x265_quant_sse4;
>          p.nquant = x265_nquant_sse4;
>          p.dequant_normal = x265_dequant_normal_sse4;
> @@ -1423,12 +1423,12 @@
>          INTRA_ANG_SSE4_COMMON(sse4);
>          INTRA_ANG_SSE4_HIGH(sse4);
>  
> -        p.psy_cost_pp[BLOCK_4x4] = x265_psyCost_pp_4x4_sse4;
> +        p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
>  #if X86_64
> -        p.psy_cost_pp[BLOCK_8x8] = x265_psyCost_pp_8x8_sse4;
> -        p.psy_cost_pp[BLOCK_16x16] = x265_psyCost_pp_16x16_sse4;
> -        p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;
> -        p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;
> +        p.cu[BLOCK_8x8].psy_cost_pp = x265_psyCost_pp_8x8_sse4;
> +        p.cu[BLOCK_16x16].psy_cost_pp = x265_psyCost_pp_16x16_sse4;
> +        p.cu[BLOCK_32x32].psy_cost_pp = x265_psyCost_pp_32x32_sse4;
> +        p.cu[BLOCK_64x64].psy_cost_pp = x265_psyCost_pp_64x64_sse4;
>  #endif
>      }
>      if (cpuMask & X265_CPU_XOP)
> @@ -1440,59 +1440,59 @@
>      }
>      if (cpuMask & X265_CPU_AVX2)
>      {
> -        p.dct[DCT_4x4] = x265_dct4_avx2;
> +        p.cu[BLOCK_4x4].dct = x265_dct4_avx2;
>          p.quant = x265_quant_avx2;
>          p.nquant = x265_nquant_avx2;
> -        p.dequant_normal = x265_dequant_normal_avx2;
> +        p.dequant_normal  = x265_dequant_normal_avx2;
>          p.scale1D_128to64 = x265_scale1D_128to64_avx2;
> -        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
> -        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
> -        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
> -        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
> -        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
> -        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
> -        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
> -        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
> +        p.cu[BLOCK_4x4].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_4_avx2;
> +        p.cu[BLOCK_8x8].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_8_avx2;
> +        p.cu[BLOCK_16x16].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_16_avx2;
> +        p.cu[BLOCK_32x32].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_32_avx2;
> +        p.cu[BLOCK_4x4].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_4_avx2;
> +        p.cu[BLOCK_8x8].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_8_avx2;
> +        p.cu[BLOCK_16x16].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_16_avx2;
> +        p.cu[BLOCK_32x32].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_32_avx2;
>  #if X86_64
> -        p.dct[DCT_8x8] = x265_dct8_avx2;
> -        p.dct[DCT_16x16] = x265_dct16_avx2;
> -        p.dct[DCT_32x32] = x265_dct32_avx2;
> -        p.idct[IDCT_4x4] = x265_idct4_avx2;
> -        p.idct[IDCT_8x8] = x265_idct8_avx2;
> -        p.idct[IDCT_16x16] = x265_idct16_avx2;
> -        p.idct[IDCT_32x32] = x265_idct32_avx2;
> -        p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
> -        p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
> -        p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
> -        p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
> +        p.cu[BLOCK_8x8].dct   = x265_dct8_avx2;
> +        p.cu[BLOCK_16x16].dct = x265_dct16_avx2;
> +        p.cu[BLOCK_32x32].dct = x265_dct32_avx2;
> +        p.cu[BLOCK_4x4].idct  = x265_idct4_avx2;
> +        p.cu[BLOCK_8x8].idct  = x265_idct8_avx2;
> +        p.cu[BLOCK_16x16].idct = x265_idct16_avx2;
> +        p.cu[BLOCK_32x32].idct = x265_idct32_avx2;
> +        p.cu[BLOCK_8x8].transpose = x265_transpose8_avx2;
> +        p.cu[BLOCK_16x16].transpose = x265_transpose16_avx2;
> +        p.cu[BLOCK_32x32].transpose = x265_transpose32_avx2;
> +        p.cu[BLOCK_64x64].transpose = x265_transpose64_avx2;
>  #endif
>      }
>      /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
>      for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
>      {
> -        p.sse_pp[i] = (pixelcmp_t)p.sse_ss[i];
> -        p.sse_sp[i] = (pixelcmp_sp_t)p.sse_ss[i];
> +        p.pu[i].sse_pp = (pixelcmp_t)p.pu[i].sse_ss;
> +        p.pu[i].sse_sp = (pixelcmp_sp_t)p.pu[i].sse_ss;
>      }
>  
>      for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
>      {
> -        p.luma_copy_ps[i] = (copy_ps_t)p.luma_copy_ss[i];
> -        p.luma_copy_sp[i] = (copy_sp_t)p.luma_copy_ss[i];
> -        p.luma_copy_pp[i] = (copy_pp_t)p.luma_copy_ss[i];
> +        p.pu[i].luma_copy_ps = (copy_ps_t)p.pu[i].luma_copy_ss;
> +        p.pu[i].luma_copy_sp = (copy_sp_t)p.pu[i].luma_copy_ss;
> +        p.pu[i].luma_copy_pp = (copy_pp_t)p.pu[i].luma_copy_ss;
>      }
>  
>      for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
>      {
> -        p.chroma[X265_CSP_I420].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I420].copy_ss[i];
> -        p.chroma[X265_CSP_I420].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I420].copy_ss[i];
> -        p.chroma[X265_CSP_I420].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I420].copy_ss[i];
> +        p.chroma[X265_CSP_I420].pu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I420].pu[i].copy_ss;
> +        p.chroma[X265_CSP_I420].pu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I420].pu[i].copy_ss;
> +        p.chroma[X265_CSP_I420].pu[i].copy_pp = (copy_pp_t)p.chroma[X265_CSP_I420].pu[i].copy_ss;
>      }
>  
>      for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
>      {
> -        p.chroma[X265_CSP_I422].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I422].copy_ss[i];
> -        p.chroma[X265_CSP_I422].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I422].copy_ss[i];
> -        p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i];
> +        p.chroma[X265_CSP_I422].pu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I422].pu[i].copy_ss;
> +        p.chroma[X265_CSP_I422].pu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I422].pu[i].copy_ss;
> +        p.chroma[X265_CSP_I422].pu[i].copy_pp = (copy_pp_t)p.chroma[X265_CSP_I422].pu[i].copy_ss;
>      }
>  
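worth restating why the casts in these loops are safe: at HIGH_BIT_DEPTH,
pixel is a 16-bit type, so a kernel compiled for int16_t operands has the
same load/store width as its pixel flavor, and the function pointer can
simply be reinterpreted:

    /* pixel and int16_t are both 16 bits wide in this build, so the
     * short/short SSD kernel doubles as the pixel/pixel one */
    p.pu[i].sse_pp = (pixelcmp_t)p.pu[i].sse_ss;
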
>  #else // if HIGH_BIT_DEPTH
> @@ -1502,7 +1502,7 @@
>          INIT8(sad, _mmx2);
>          INIT8(sad_x3, _mmx2);
>          INIT8(sad_x4, _mmx2);
> -        p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
> +        p.pu[LUMA_4x4].satd = x265_pixel_satd_4x4_mmx2;
>          p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
>  
>          PIXEL_AVG(sse2);
> @@ -1541,52 +1541,52 @@
>          // until all partitions are coded and commit smaller patches, easier to
>          // review.
>  
> -        p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
> -        p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
> -        p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
> -        p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
> +        p.cu[BLOCK_4x4].blockfill_s = x265_blockfill_s_4x4_sse2;
> +        p.cu[BLOCK_8x8].blockfill_s = x265_blockfill_s_8x8_sse2;
> +        p.cu[BLOCK_16x16].blockfill_s = x265_blockfill_s_16x16_sse2;
> +        p.cu[BLOCK_32x32].blockfill_s = x265_blockfill_s_32x32_sse2;
>  
> -        p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
> -        p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
> -        p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
> -        p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
> +        p.cu[BLOCK_4x4].ssd_s = x265_pixel_ssd_s_4_sse2;
> +        p.cu[BLOCK_8x8].ssd_s = x265_pixel_ssd_s_8_sse2;
> +        p.cu[BLOCK_16x16].ssd_s = x265_pixel_ssd_s_16_sse2;
> +        p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_sse2;
>  
>          p.frameInitLowres = x265_frame_init_lowres_core_sse2;
>          SA8D_INTER_FROM_BLOCK(sse2);
>  
> -        p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
> -        p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
> -        p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
> -        p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
> -        p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
> -        p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
> -        p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
> -        p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
> -        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
> -        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
> -        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
> -        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
> -        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
> -        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
> -        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
> -        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
> +        p.cu[BLOCK_4x4].cpy2Dto1D_shl   = x265_cpy2Dto1D_shl_4_sse2;
> +        p.cu[BLOCK_8x8].cpy2Dto1D_shl   = x265_cpy2Dto1D_shl_8_sse2;
> +        p.cu[BLOCK_16x16].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_16_sse2;
> +        p.cu[BLOCK_32x32].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_32_sse2;
> +        p.cu[BLOCK_4x4].cpy2Dto1D_shr   = x265_cpy2Dto1D_shr_4_sse2;
> +        p.cu[BLOCK_8x8].cpy2Dto1D_shr   = x265_cpy2Dto1D_shr_8_sse2;
> +        p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_sse2;
> +        p.cu[BLOCK_32x32].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_32_sse2;
> +        p.cu[BLOCK_4x4].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_4_sse2;
> +        p.cu[BLOCK_8x8].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_8_sse2;
> +        p.cu[BLOCK_16x16].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_16_sse2;
> +        p.cu[BLOCK_32x32].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_32_sse2;
> +        p.cu[BLOCK_4x4].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_4_sse2;
> +        p.cu[BLOCK_8x8].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_8_sse2;
> +        p.cu[BLOCK_16x16].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_16_sse2;
> +        p.cu[BLOCK_32x32].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_32_sse2;
>  
> -        p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
> -        p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
> -        p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
> -        p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
> -        p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
> -        p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
> -        p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
> +        p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
> +        p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
> +        p.cu[BLOCK_4x4].transpose = x265_transpose4_sse2;
> +        p.cu[BLOCK_8x8].transpose = x265_transpose8_sse2;
> +        p.cu[BLOCK_16x16].transpose = x265_transpose16_sse2;
> +        p.cu[BLOCK_32x32].transpose = x265_transpose32_sse2;
> +        p.cu[BLOCK_64x64].transpose = x265_transpose64_sse2;
>          p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
>          p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
>  
> -        p.dct[DCT_4x4] = x265_dct4_sse2;
> -        p.idct[IDCT_4x4] = x265_idct4_sse2;
> +        p.cu[BLOCK_4x4].dct = x265_dct4_sse2;
> +        p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
>  #if X86_64
> -        p.idct[IDCT_8x8] = x265_idct8_sse2;
> +        p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
>  #endif
> -        p.idct[IDST_4x4] = x265_idst4_sse2;
> +        p.idst4x4 = x265_idst4_sse2;
>  
>          p.planecopy_sp = x265_downShift_16_sse2;
>      }
> @@ -1594,7 +1594,7 @@
>      {
>          p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
>          SA8D_INTER_FROM_BLOCK(ssse3);
> -        p.sse_pp[LUMA_4x4] = x265_pixel_ssd_4x4_ssse3;
> +        p.pu[LUMA_4x4].sse_pp = x265_pixel_ssd_4x4_ssse3;
>          ASSGN_SSE(ssse3);
>          PIXEL_AVG(ssse3);
>          PIXEL_AVG_W4(ssse3);
> @@ -1605,23 +1605,23 @@
>          p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
>          SAD_X3(ssse3);
>          SAD_X4(ssse3);
> -        p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
> -        p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
> -        p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3;
> -        p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_ssse3;
> -        p.sad_x3[LUMA_8x32]  = x265_pixel_sad_x3_8x32_ssse3;
> -        p.sad_x4[LUMA_8x32]  = x265_pixel_sad_x4_8x32_ssse3;
> +        p.pu[LUMA_8x4].sad_x4  = x265_pixel_sad_x4_8x4_ssse3;
> +        p.pu[LUMA_8x8].sad_x4  = x265_pixel_sad_x4_8x8_ssse3;
> +        p.pu[LUMA_8x16].sad_x3 = x265_pixel_sad_x3_8x16_ssse3;
> +        p.pu[LUMA_8x16].sad_x4 = x265_pixel_sad_x4_8x16_ssse3;
> +        p.pu[LUMA_8x32].sad_x3 = x265_pixel_sad_x3_8x32_ssse3;
> +        p.pu[LUMA_8x32].sad_x4 = x265_pixel_sad_x4_8x32_ssse3;
>  
> -        p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
> -        p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
> +        p.pu[LUMA_12x16].sad_x3 = x265_pixel_sad_x3_12x16_ssse3;
> +        p.pu[LUMA_12x16].sad_x4 = x265_pixel_sad_x4_12x16_ssse3;
>  
>          p.luma_p2s = x265_luma_p2s_ssse3;
>          p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
>          p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
>          p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_ssse3; // for i444, chroma_p2s can use luma_p2s
>  
> -        p.dct[DST_4x4] = x265_dst4_ssse3;
> -        p.idct[IDCT_8x8] = x265_idct8_ssse3;
> +        p.dst4x4 = x265_dst4_ssse3;
> +        p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
>          p.count_nonzero = x265_count_nonzero_ssse3;
>      }
>      if (cpuMask & X265_CPU_SSE4)
> @@ -1638,21 +1638,21 @@
>          CHROMA_ADDAVG_422(_sse4);
>  
>          // TODO: check POPCNT flag!
> -        p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4;
> -        p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_sse4;
> -        p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_sse4;
> -        p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_sse4;
> +        p.cu[BLOCK_4x4].copy_cnt = x265_copy_cnt_4_sse4;
> +        p.cu[BLOCK_8x8].copy_cnt = x265_copy_cnt_8_sse4;
> +        p.cu[BLOCK_16x16].copy_cnt = x265_copy_cnt_16_sse4;
> +        p.cu[BLOCK_32x32].copy_cnt = x265_copy_cnt_32_sse4;
>  
>          HEVC_SATD(sse4);
>          SA8D_INTER_FROM_BLOCK(sse4);
>  
> -        p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_sse4;
> -        p.sse_pp[LUMA_24x32] = x265_pixel_ssd_24x32_sse4;
> -        p.sse_pp[LUMA_48x64] = x265_pixel_ssd_48x64_sse4;
> -        p.sse_pp[LUMA_64x16] = x265_pixel_ssd_64x16_sse4;
> -        p.sse_pp[LUMA_64x32] = x265_pixel_ssd_64x32_sse4;
> -        p.sse_pp[LUMA_64x48] = x265_pixel_ssd_64x48_sse4;
> -        p.sse_pp[LUMA_64x64] = x265_pixel_ssd_64x64_sse4;
> +        p.pu[LUMA_12x16].sse_pp = x265_pixel_ssd_12x16_sse4;
> +        p.pu[LUMA_24x32].sse_pp = x265_pixel_ssd_24x32_sse4;
> +        p.pu[LUMA_48x64].sse_pp = x265_pixel_ssd_48x64_sse4;
> +        p.pu[LUMA_64x16].sse_pp = x265_pixel_ssd_64x16_sse4;
> +        p.pu[LUMA_64x32].sse_pp = x265_pixel_ssd_64x32_sse4;
> +        p.pu[LUMA_64x48].sse_pp = x265_pixel_ssd_64x48_sse4;
> +        p.pu[LUMA_64x64].sse_pp = x265_pixel_ssd_64x64_sse4;
>  
>          LUMA_SSE_SP(_sse4);
>  
> @@ -1673,17 +1673,17 @@
>          ASSGN_SSE_SS(sse4);
>  
>          // MUST be done after LUMA_FILTERS() to overwrite default version
> -        p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_sse4;
> +        p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_sse4;
>  
> -        p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;
> -        p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
> -        p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_2x4].copy_sp = x265_blockcopy_sp_2x4_sse4;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_2x8].copy_sp = x265_blockcopy_sp_2x8_sse4;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_6x8].copy_sp = x265_blockcopy_sp_6x8_sse4;
>          CHROMA_BLOCKCOPY(ps, _sse4);
>          CHROMA_BLOCKCOPY_422(ps, _sse4);
>          LUMA_BLOCKCOPY(ps, _sse4);
>  
> -        p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
> -        p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
> +        p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_sse4;
> +        p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_sse4;
>          p.quant = x265_quant_sse4;
>          p.nquant = x265_nquant_sse4;
>          p.dequant_normal = x265_dequant_normal_sse4;
> @@ -1707,14 +1707,14 @@
>          INTRA_ANG_SSE4_COMMON(sse4);
>          INTRA_ANG_SSE4(sse4);
>  
> -        p.dct[DCT_8x8] = x265_dct8_sse4;
> +        p.cu[BLOCK_8x8].dct = x265_dct8_sse4;
>          p.denoiseDct = x265_denoise_dct_sse4;
> -        p.psy_cost_pp[BLOCK_4x4] = x265_psyCost_pp_4x4_sse4;
> +        p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
>  #if X86_64
> -        p.psy_cost_pp[BLOCK_8x8] = x265_psyCost_pp_8x8_sse4;
> -        p.psy_cost_pp[BLOCK_16x16] = x265_psyCost_pp_16x16_sse4;
> -        p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;
> -        p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;
> +        p.cu[BLOCK_8x8].psy_cost_pp = x265_psyCost_pp_8x8_sse4;
> +        p.cu[BLOCK_16x16].psy_cost_pp = x265_psyCost_pp_16x16_sse4;
> +        p.cu[BLOCK_32x32].psy_cost_pp = x265_psyCost_pp_32x32_sse4;
> +        p.cu[BLOCK_64x64].psy_cost_pp = x265_psyCost_pp_64x64_sse4;
>  #endif
>      }
>      if (cpuMask & X265_CPU_AVX)
> @@ -1727,36 +1727,36 @@
>          ASSGN_SSE_SS(avx);
>          SAD_X3(avx);
>          SAD_X4(avx);
> -        p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
> -        p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
> -        p.sad_x3[LUMA_16x4]  = x265_pixel_sad_x3_16x4_avx;
> -        p.sad_x4[LUMA_16x4]  = x265_pixel_sad_x4_16x4_avx;
> +        p.pu[LUMA_12x16].sad_x3 = x265_pixel_sad_x3_12x16_avx;
> +        p.pu[LUMA_12x16].sad_x4 = x265_pixel_sad_x4_12x16_avx;
> +        p.pu[LUMA_16x4].sad_x3  = x265_pixel_sad_x3_16x4_avx;
> +        p.pu[LUMA_16x4].sad_x4  = x265_pixel_sad_x4_16x4_avx;
>  
>          p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
>          p.ssim_end_4 = x265_pixel_ssim_end4_avx;
> -        p.luma_copy_ss[LUMA_64x16] = x265_blockcopy_ss_64x16_avx;
> -        p.luma_copy_ss[LUMA_64x32] = x265_blockcopy_ss_64x32_avx;
> -        p.luma_copy_ss[LUMA_64x48] = x265_blockcopy_ss_64x48_avx;
> -        p.luma_copy_ss[LUMA_64x64] = x265_blockcopy_ss_64x64_avx;
> +        p.pu[LUMA_64x16].luma_copy_ss = x265_blockcopy_ss_64x16_avx;
> +        p.pu[LUMA_64x32].luma_copy_ss = x265_blockcopy_ss_64x32_avx;
> +        p.pu[LUMA_64x48].luma_copy_ss = x265_blockcopy_ss_64x48_avx;
> +        p.pu[LUMA_64x64].luma_copy_ss = x265_blockcopy_ss_64x64_avx;
>  
> -        p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x8] = x265_blockcopy_pp_32x8_avx;
> -        p.luma_copy_pp[LUMA_32x8] = x265_blockcopy_pp_32x8_avx;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_32x8].copy_pp = x265_blockcopy_pp_32x8_avx;
> +        p.pu[LUMA_32x8].luma_copy_pp = x265_blockcopy_pp_32x8_avx;
>  
> -        p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x16] = x265_blockcopy_pp_32x16_avx;
> -        p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x16] = x265_blockcopy_pp_32x16_avx;
> -        p.luma_copy_pp[LUMA_32x16] = x265_blockcopy_pp_32x16_avx;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_32x16].copy_pp = x265_blockcopy_pp_32x16_avx;
> +        p.chroma[X265_CSP_I422].pu[CHROMA422_32x16].copy_pp = x265_blockcopy_pp_32x16_avx;
> +        p.pu[LUMA_32x16].luma_copy_pp = x265_blockcopy_pp_32x16_avx;
>  
> -        p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x24] = x265_blockcopy_pp_32x24_avx;
> -        p.luma_copy_pp[LUMA_32x24] = x265_blockcopy_pp_32x24_avx;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_32x24].copy_pp = x265_blockcopy_pp_32x24_avx;
> +        p.pu[LUMA_32x24].luma_copy_pp = x265_blockcopy_pp_32x24_avx;
>  
> -        p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x32] = x265_blockcopy_pp_32x32_avx;
> -        p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x32] = x265_blockcopy_pp_32x32_avx;
> -        p.luma_copy_pp[LUMA_32x32]  = x265_blockcopy_pp_32x32_avx;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_32x32].copy_pp = x265_blockcopy_pp_32x32_avx;
> +        p.chroma[X265_CSP_I422].pu[CHROMA422_32x32].copy_pp = x265_blockcopy_pp_32x32_avx;
> +        p.pu[LUMA_32x32].luma_copy_pp  = x265_blockcopy_pp_32x32_avx;
>  
> -        p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x48] = x265_blockcopy_pp_32x48_avx;
> +        p.chroma[X265_CSP_I422].pu[CHROMA422_32x48].copy_pp = x265_blockcopy_pp_32x48_avx;
>  
> -        p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x64] = x265_blockcopy_pp_32x64_avx;
> -        p.luma_copy_pp[LUMA_32x64]  = x265_blockcopy_pp_32x64_avx;
> +        p.chroma[X265_CSP_I422].pu[CHROMA422_32x64].copy_pp = x265_blockcopy_pp_32x64_avx;
> +        p.pu[LUMA_32x64].luma_copy_pp = x265_blockcopy_pp_32x64_avx;
>      }
>      if (cpuMask & X265_CPU_XOP)
>      {
> @@ -1771,139 +1771,139 @@
>          INIT2(sad_x4, _avx2);
>          INIT4(satd, _avx2);
>          INIT2_NAME(sse_pp, ssd, _avx2);
> -        p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx2;
> -        p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
> -        p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2;
> +        p.pu[LUMA_16x12].sad_x4 = x265_pixel_sad_x4_16x12_avx2;
> +        p.pu[LUMA_16x32].sad_x4 = x265_pixel_sad_x4_16x32_avx2;
> +        p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2;
>  
>          /* Need to update assembly code as per changed interface of the copy_cnt primitive, once
>           * code is updated, avx2 version will be enabled */
>  
> -        p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2;
> -        p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
> -        p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
> +        p.cu[BLOCK_8x8].copy_cnt = x265_copy_cnt_8_avx2;
> +        p.cu[BLOCK_16x16].copy_cnt = x265_copy_cnt_16_avx2;
> +        p.cu[BLOCK_32x32].copy_cnt = x265_copy_cnt_32_avx2;
>  
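nit: the comment above says the avx2 copy_cnt versions will be enabled
once the assembly is updated for the changed interface, but the three
assignments directly below it are live. Either the comment is stale or
these hookups should wait; please reconcile one or the other.
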
> -        p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
> -        p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
> +        p.cu[BLOCK_16x16].blockfill_s = x265_blockfill_s_16x16_avx2;
> +        p.cu[BLOCK_32x32].blockfill_s = x265_blockfill_s_32x32_avx2;
>  
> -        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
> -        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
> -        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
> -        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
> -        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
> -        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
> -        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
> -        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
> +        p.cu[BLOCK_4x4].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_4_avx2;
> +        p.cu[BLOCK_8x8].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_8_avx2;
> +        p.cu[BLOCK_16x16].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_16_avx2;
> +        p.cu[BLOCK_32x32].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_32_avx2;
> +        p.cu[BLOCK_4x4].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_4_avx2;
> +        p.cu[BLOCK_8x8].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_8_avx2;
> +        p.cu[BLOCK_16x16].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_16_avx2;
> +        p.cu[BLOCK_32x32].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_32_avx2;
>  
>          p.denoiseDct = x265_denoise_dct_avx2;
> -        p.dct[DCT_4x4] = x265_dct4_avx2;
> +        p.cu[BLOCK_4x4].dct = x265_dct4_avx2;
>          p.quant = x265_quant_avx2;
>          p.nquant = x265_nquant_avx2;
>          p.dequant_normal = x265_dequant_normal_avx2;
>  
> -        p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x4] = x265_blockcopy_ss_16x4_avx;
> -        p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x12] = x265_blockcopy_ss_16x12_avx;
> -        p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x8] = x265_blockcopy_ss_16x8_avx;
> -        p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x16] = x265_blockcopy_ss_16x16_avx;
> -        p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x32] = x265_blockcopy_ss_16x32_avx;
> -        p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x8] = x265_blockcopy_ss_16x8_avx;
> -        p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x16] = x265_blockcopy_ss_16x16_avx;
> -        p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x24] = x265_blockcopy_ss_16x24_avx;
> -        p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x32] = x265_blockcopy_ss_16x32_avx;
> -        p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x64] = x265_blockcopy_ss_16x64_avx;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_16x4].copy_ss  = x265_blockcopy_ss_16x4_avx;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_16x12].copy_ss = x265_blockcopy_ss_16x12_avx;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_16x8].copy_ss  = x265_blockcopy_ss_16x8_avx;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
> +        p.chroma[X265_CSP_I422].pu[CHROMA422_16x8].copy_ss  = x265_blockcopy_ss_16x8_avx;
> +        p.chroma[X265_CSP_I422].pu[CHROMA422_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
> +        p.chroma[X265_CSP_I422].pu[CHROMA422_16x24].copy_ss = x265_blockcopy_ss_16x24_avx;
> +        p.chroma[X265_CSP_I422].pu[CHROMA422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
> +        p.chroma[X265_CSP_I422].pu[CHROMA422_16x64].copy_ss = x265_blockcopy_ss_16x64_avx;
>          p.scale1D_128to64 = x265_scale1D_128to64_avx2;
>  
>          p.weight_pp = x265_weight_pp_avx2;
>  
>  #if X86_64
>  
> -        p.dct[DCT_8x8] = x265_dct8_avx2;
> -        p.dct[DCT_16x16] = x265_dct16_avx2;
> -        p.dct[DCT_32x32] = x265_dct32_avx2;
> -        p.idct[IDCT_4x4] = x265_idct4_avx2;
> -        p.idct[IDCT_8x8] = x265_idct8_avx2;
> -        p.idct[IDCT_16x16] = x265_idct16_avx2;
> -        p.idct[IDCT_32x32] = x265_idct32_avx2;
> +        p.cu[BLOCK_8x8].dct    = x265_dct8_avx2;
> +        p.cu[BLOCK_16x16].dct  = x265_dct16_avx2;
> +        p.cu[BLOCK_32x32].dct  = x265_dct32_avx2;
> +        p.cu[BLOCK_4x4].idct   = x265_idct4_avx2;
> +        p.cu[BLOCK_8x8].idct   = x265_idct8_avx2;
> +        p.cu[BLOCK_16x16].idct = x265_idct16_avx2;
> +        p.cu[BLOCK_32x32].idct = x265_idct32_avx2;
>  
> -        p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
> -        p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
> -        p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
> -        p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
> +        p.cu[BLOCK_8x8].transpose   = x265_transpose8_avx2;
> +        p.cu[BLOCK_16x16].transpose = x265_transpose16_avx2;
> +        p.cu[BLOCK_32x32].transpose = x265_transpose32_avx2;
> +        p.cu[BLOCK_64x64].transpose = x265_transpose64_avx2;
>  
> -        p.luma_vpp[LUMA_12x16] = x265_interp_8tap_vert_pp_12x16_avx2;
> +        p.pu[LUMA_12x16].luma_vpp = x265_interp_8tap_vert_pp_12x16_avx2;
>  
> -        p.luma_vpp[LUMA_16x4] = x265_interp_8tap_vert_pp_16x4_avx2;
> -        p.luma_vpp[LUMA_16x8] = x265_interp_8tap_vert_pp_16x8_avx2;
> -        p.luma_vpp[LUMA_16x12] = x265_interp_8tap_vert_pp_16x12_avx2;
> -        p.luma_vpp[LUMA_16x16] = x265_interp_8tap_vert_pp_16x16_avx2;
> -        p.luma_vpp[LUMA_16x32] = x265_interp_8tap_vert_pp_16x32_avx2;
> -        p.luma_vpp[LUMA_16x64] = x265_interp_8tap_vert_pp_16x64_avx2;
> +        p.pu[LUMA_16x4].luma_vpp  = x265_interp_8tap_vert_pp_16x4_avx2;
> +        p.pu[LUMA_16x8].luma_vpp  = x265_interp_8tap_vert_pp_16x8_avx2;
> +        p.pu[LUMA_16x12].luma_vpp = x265_interp_8tap_vert_pp_16x12_avx2;
> +        p.pu[LUMA_16x16].luma_vpp = x265_interp_8tap_vert_pp_16x16_avx2;
> +        p.pu[LUMA_16x32].luma_vpp = x265_interp_8tap_vert_pp_16x32_avx2;
> +        p.pu[LUMA_16x64].luma_vpp = x265_interp_8tap_vert_pp_16x64_avx2;
>  
> -        p.luma_vpp[LUMA_24x32] = x265_interp_8tap_vert_pp_24x32_avx2;
> +        p.pu[LUMA_24x32].luma_vpp = x265_interp_8tap_vert_pp_24x32_avx2;
>  
> -        p.luma_vpp[LUMA_32x8] = x265_interp_8tap_vert_pp_32x8_avx2;
> -        p.luma_vpp[LUMA_32x16] = x265_interp_8tap_vert_pp_32x16_avx2;
> -        p.luma_vpp[LUMA_32x24] = x265_interp_8tap_vert_pp_32x24_avx2;
> -        p.luma_vpp[LUMA_32x32] = x265_interp_8tap_vert_pp_32x32_avx2;
> -        p.luma_vpp[LUMA_32x64] = x265_interp_8tap_vert_pp_32x64_avx2;
> +        p.pu[LUMA_32x8].luma_vpp  = x265_interp_8tap_vert_pp_32x8_avx2;
> +        p.pu[LUMA_32x16].luma_vpp = x265_interp_8tap_vert_pp_32x16_avx2;
> +        p.pu[LUMA_32x24].luma_vpp = x265_interp_8tap_vert_pp_32x24_avx2;
> +        p.pu[LUMA_32x32].luma_vpp = x265_interp_8tap_vert_pp_32x32_avx2;
> +        p.pu[LUMA_32x64].luma_vpp = x265_interp_8tap_vert_pp_32x64_avx2;
>  
> -        p.luma_vpp[LUMA_48x64] = x265_interp_8tap_vert_pp_48x64_avx2;
> +        p.pu[LUMA_48x64].luma_vpp = x265_interp_8tap_vert_pp_48x64_avx2;
>  
> -        p.luma_vpp[LUMA_64x16] = x265_interp_8tap_vert_pp_64x16_avx2;
> -        p.luma_vpp[LUMA_64x32] = x265_interp_8tap_vert_pp_64x32_avx2;
> -        p.luma_vpp[LUMA_64x48] = x265_interp_8tap_vert_pp_64x48_avx2;
> -        p.luma_vpp[LUMA_64x64] = x265_interp_8tap_vert_pp_64x64_avx2;
> +        p.pu[LUMA_64x16].luma_vpp = x265_interp_8tap_vert_pp_64x16_avx2;
> +        p.pu[LUMA_64x32].luma_vpp = x265_interp_8tap_vert_pp_64x32_avx2;
> +        p.pu[LUMA_64x48].luma_vpp = x265_interp_8tap_vert_pp_64x48_avx2;
> +        p.pu[LUMA_64x64].luma_vpp = x265_interp_8tap_vert_pp_64x64_avx2;
>  #endif
> -        p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
> +        p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_avx2;
>  
> -        p.luma_hpp[LUMA_8x4] = x265_interp_8tap_horiz_pp_8x4_avx2;
> -        p.luma_hpp[LUMA_8x8] = x265_interp_8tap_horiz_pp_8x8_avx2;
> -        p.luma_hpp[LUMA_8x16] = x265_interp_8tap_horiz_pp_8x16_avx2;
> -        p.luma_hpp[LUMA_8x32] = x265_interp_8tap_horiz_pp_8x32_avx2;
> +        p.pu[LUMA_8x4].luma_hpp = x265_interp_8tap_horiz_pp_8x4_avx2;
> +        p.pu[LUMA_8x8].luma_hpp = x265_interp_8tap_horiz_pp_8x8_avx2;
> +        p.pu[LUMA_8x16].luma_hpp = x265_interp_8tap_horiz_pp_8x16_avx2;
> +        p.pu[LUMA_8x32].luma_hpp = x265_interp_8tap_horiz_pp_8x32_avx2;
>  
> -        p.luma_hpp[LUMA_16x4] = x265_interp_8tap_horiz_pp_16x4_avx2;
> -        p.luma_hpp[LUMA_16x8] = x265_interp_8tap_horiz_pp_16x8_avx2;
> -        p.luma_hpp[LUMA_16x12] = x265_interp_8tap_horiz_pp_16x12_avx2;
> -        p.luma_hpp[LUMA_16x16] = x265_interp_8tap_horiz_pp_16x16_avx2;
> -        p.luma_hpp[LUMA_16x32] = x265_interp_8tap_horiz_pp_16x32_avx2;
> -        p.luma_hpp[LUMA_16x64] = x265_interp_8tap_horiz_pp_16x64_avx2;
> +        p.pu[LUMA_16x4].luma_hpp = x265_interp_8tap_horiz_pp_16x4_avx2;
> +        p.pu[LUMA_16x8].luma_hpp = x265_interp_8tap_horiz_pp_16x8_avx2;
> +        p.pu[LUMA_16x12].luma_hpp = x265_interp_8tap_horiz_pp_16x12_avx2;
> +        p.pu[LUMA_16x16].luma_hpp = x265_interp_8tap_horiz_pp_16x16_avx2;
> +        p.pu[LUMA_16x32].luma_hpp = x265_interp_8tap_horiz_pp_16x32_avx2;
> +        p.pu[LUMA_16x64].luma_hpp = x265_interp_8tap_horiz_pp_16x64_avx2;
>  
> -        p.luma_hpp[LUMA_32x8] = x265_interp_8tap_horiz_pp_32x8_avx2;
> -        p.luma_hpp[LUMA_32x16] = x265_interp_8tap_horiz_pp_32x16_avx2;
> -        p.luma_hpp[LUMA_32x24] = x265_interp_8tap_horiz_pp_32x24_avx2;
> -        p.luma_hpp[LUMA_32x32] = x265_interp_8tap_horiz_pp_32x32_avx2;
> -        p.luma_hpp[LUMA_32x64] = x265_interp_8tap_horiz_pp_32x64_avx2;
> +        p.pu[LUMA_32x8].luma_hpp  = x265_interp_8tap_horiz_pp_32x8_avx2;
> +        p.pu[LUMA_32x16].luma_hpp = x265_interp_8tap_horiz_pp_32x16_avx2;
> +        p.pu[LUMA_32x24].luma_hpp = x265_interp_8tap_horiz_pp_32x24_avx2;
> +        p.pu[LUMA_32x32].luma_hpp = x265_interp_8tap_horiz_pp_32x32_avx2;
> +        p.pu[LUMA_32x64].luma_hpp = x265_interp_8tap_horiz_pp_32x64_avx2;
>  
> -        p.luma_hpp[LUMA_64x64] = x265_interp_8tap_horiz_pp_64x64_avx2;
> -        p.luma_hpp[LUMA_64x48] = x265_interp_8tap_horiz_pp_64x48_avx2;
> -        p.luma_hpp[LUMA_64x32] = x265_interp_8tap_horiz_pp_64x32_avx2;
> -        p.luma_hpp[LUMA_64x16] = x265_interp_8tap_horiz_pp_64x16_avx2;
> +        p.pu[LUMA_64x64].luma_hpp = x265_interp_8tap_horiz_pp_64x64_avx2;
> +        p.pu[LUMA_64x48].luma_hpp = x265_interp_8tap_horiz_pp_64x48_avx2;
> +        p.pu[LUMA_64x32].luma_hpp = x265_interp_8tap_horiz_pp_64x32_avx2;
> +        p.pu[LUMA_64x16].luma_hpp = x265_interp_8tap_horiz_pp_64x16_avx2;
>  
> -        p.luma_hpp[LUMA_48x64] = x265_interp_8tap_horiz_pp_48x64_avx2;
> +        p.pu[LUMA_48x64].luma_hpp = x265_interp_8tap_horiz_pp_48x64_avx2;
>  
> -        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_8x8] = x265_interp_4tap_horiz_pp_8x8_avx2;
> -        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_4x4] = x265_interp_4tap_horiz_pp_4x4_avx2;
> -        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_32x32] = x265_interp_4tap_horiz_pp_32x32_avx2;
> -        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_16x16] = x265_interp_4tap_horiz_pp_16x16_avx2;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_8x8].filter_hpp = x265_interp_4tap_horiz_pp_8x8_avx2;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_avx2;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_32x32].filter_hpp = x265_interp_4tap_horiz_pp_32x32_avx2;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_16x16].filter_hpp = x265_interp_4tap_horiz_pp_16x16_avx2;
>  
> -        p.luma_vpp[LUMA_4x4] = x265_interp_8tap_vert_pp_4x4_avx2;
> +        p.pu[LUMA_4x4].luma_vpp = x265_interp_8tap_vert_pp_4x4_avx2;
>  
> -        p.luma_vpp[LUMA_8x4] = x265_interp_8tap_vert_pp_8x4_avx2;
> -        p.luma_vpp[LUMA_8x8] = x265_interp_8tap_vert_pp_8x8_avx2;
> -        p.luma_vpp[LUMA_8x16] = x265_interp_8tap_vert_pp_8x16_avx2;
> -        p.luma_vpp[LUMA_8x32] = x265_interp_8tap_vert_pp_8x32_avx2;
> +        p.pu[LUMA_8x4].luma_vpp = x265_interp_8tap_vert_pp_8x4_avx2;
> +        p.pu[LUMA_8x8].luma_vpp = x265_interp_8tap_vert_pp_8x8_avx2;
> +        p.pu[LUMA_8x16].luma_vpp = x265_interp_8tap_vert_pp_8x16_avx2;
> +        p.pu[LUMA_8x32].luma_vpp = x265_interp_8tap_vert_pp_8x32_avx2;
>  
>          // color space i420
> -        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_4x4] = x265_interp_4tap_vert_pp_4x4_avx2;
> -        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_8x8] = x265_interp_4tap_vert_pp_8x8_avx2;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_8x8].filter_vpp = x265_interp_4tap_vert_pp_8x8_avx2;
>  
>          // color space i422
> -        p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_4x4] = x265_interp_4tap_vert_pp_4x4_avx2;
> +        p.chroma[X265_CSP_I422].pu[CHROMA422_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
>  
> -        p.luma_vps[LUMA_4x4] = x265_interp_8tap_vert_ps_4x4_avx2;
> +        p.pu[LUMA_4x4].luma_vps = x265_interp_8tap_vert_ps_4x4_avx2;
>  
>  #if X86_64
> -        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_16x16] = x265_interp_4tap_vert_pp_16x16_avx2;
> -        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_32x32] = x265_interp_4tap_vert_pp_32x32_avx2;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_avx2;
> +        p.chroma[X265_CSP_I420].pu[CHROMA_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_avx2;
>  #endif
>      }
>  #endif // if HIGH_BIT_DEPTH
> diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/yuv.cpp
> --- a/source/common/yuv.cpp	Fri Jan 09 11:35:26 2015 +0530
> +++ b/source/common/yuv.cpp	Thu Jan 08 15:23:38 2015 -0600
> @@ -81,32 +81,32 @@
>  void Yuv::copyToPicYuv(PicYuv& dstPic, uint32_t cuAddr, uint32_t absPartIdx) const
>  {
>      pixel* dstY = dstPic.getLumaAddr(cuAddr, absPartIdx);
> -    primitives.luma_copy_pp[m_part](dstY, dstPic.m_stride, m_buf[0], m_size);
> +    primitives.pu[m_part].luma_copy_pp(dstY, dstPic.m_stride, m_buf[0], m_size);
>  
>      pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
>      pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx);
> -    primitives.chroma[m_csp].copy_pp[m_part](dstU, dstPic.m_strideC, m_buf[1], m_csize);
> -    primitives.chroma[m_csp].copy_pp[m_part](dstV, dstPic.m_strideC, m_buf[2], m_csize);
> +    primitives.chroma[m_csp].pu[m_part].copy_pp(dstU, dstPic.m_strideC, m_buf[1], m_csize);
> +    primitives.chroma[m_csp].pu[m_part].copy_pp(dstV, dstPic.m_strideC, m_buf[2], m_csize);
>  }
>  
>  void Yuv::copyFromPicYuv(const PicYuv& srcPic, uint32_t cuAddr, uint32_t absPartIdx)
>  {
>      const pixel* srcY = srcPic.getLumaAddr(cuAddr, absPartIdx);
> -    primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcY, srcPic.m_stride);
> +    primitives.pu[m_part].luma_copy_pp(m_buf[0], m_size, srcY, srcPic.m_stride);
>  
>      const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
>      const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
> -    primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcU, srcPic.m_strideC);
> -    primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcV, srcPic.m_strideC);
> +    primitives.chroma[m_csp].pu[m_part].copy_pp(m_buf[1], m_csize, srcU, srcPic.m_strideC);
> +    primitives.chroma[m_csp].pu[m_part].copy_pp(m_buf[2], m_csize, srcV, srcPic.m_strideC);
>  }
>  
>  void Yuv::copyFromYuv(const Yuv& srcYuv)
>  {
>      X265_CHECK(m_size >= srcYuv.m_size, "invalid size\n");
>  
> -    primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
> -    primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
> -    primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
> +    primitives.pu[m_part].luma_copy_pp(m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
> +    primitives.chroma[m_csp].pu[m_part].copy_pp(m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
> +    primitives.chroma[m_csp].pu[m_part].copy_pp(m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
>  }
>  
>  /* This version is intended for use by ME, which required FENC_STRIDE for luma fenc pixels */
> @@ -115,47 +115,47 @@
>      X265_CHECK(m_size == FENC_STRIDE && m_size >= srcYuv.m_size, "PU buffer size mismatch\n");
>  
>      const pixel* srcY = srcYuv.m_buf[0] + getAddrOffset(absPartIdx, srcYuv.m_size);
> -    primitives.luma_copy_pp[partEnum](m_buf[0], m_size, srcY, srcYuv.m_size);
> +    primitives.pu[partEnum].luma_copy_pp(m_buf[0], m_size, srcY, srcYuv.m_size);
>  
>      if (bChroma)
>      {
>          const pixel* srcU = srcYuv.m_buf[1] + srcYuv.getChromaAddrOffset(absPartIdx);
>          const pixel* srcV = srcYuv.m_buf[2] + srcYuv.getChromaAddrOffset(absPartIdx);
> -        primitives.chroma[m_csp].copy_pp[partEnum](m_buf[1], m_csize, srcU, srcYuv.m_csize);
> -        primitives.chroma[m_csp].copy_pp[partEnum](m_buf[2], m_csize, srcV, srcYuv.m_csize);
> +        primitives.chroma[m_csp].pu[partEnum].copy_pp(m_buf[1], m_csize, srcU, srcYuv.m_csize);
> +        primitives.chroma[m_csp].pu[partEnum].copy_pp(m_buf[2], m_csize, srcV, srcYuv.m_csize);
>      }
>  }
>  
>  void Yuv::copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const
>  {
>      pixel* dstY = dstYuv.getLumaAddr(absPartIdx);
> -    primitives.luma_copy_pp[m_part](dstY, dstYuv.m_size, m_buf[0], m_size);
> +    primitives.pu[m_part].luma_copy_pp(dstY, dstYuv.m_size, m_buf[0], m_size);
>  
>      pixel* dstU = dstYuv.getCbAddr(absPartIdx);
>      pixel* dstV = dstYuv.getCrAddr(absPartIdx);
> -    primitives.chroma[m_csp].copy_pp[m_part](dstU, dstYuv.m_csize, m_buf[1], m_csize);
> -    primitives.chroma[m_csp].copy_pp[m_part](dstV, dstYuv.m_csize, m_buf[2], m_csize);
> +    primitives.chroma[m_csp].pu[m_part].copy_pp(dstU, dstYuv.m_csize, m_buf[1], m_csize);
> +    primitives.chroma[m_csp].pu[m_part].copy_pp(dstV, dstYuv.m_csize, m_buf[2], m_csize);
>  }
>  
>  void Yuv::copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const
>  {
>      pixel* srcY = m_buf[0] + getAddrOffset(absPartIdx, m_size);
>      pixel* dstY = dstYuv.m_buf[0];
> -    primitives.luma_copy_pp[dstYuv.m_part](dstY, dstYuv.m_size, srcY, m_size);
> +    primitives.pu[dstYuv.m_part].luma_copy_pp(dstY, dstYuv.m_size, srcY, m_size);
>  
>      pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
>      pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx);
>      pixel* dstU = dstYuv.m_buf[1];
>      pixel* dstV = dstYuv.m_buf[2];
> -    primitives.chroma[m_csp].copy_pp[dstYuv.m_part](dstU, dstYuv.m_csize, srcU, m_csize);
> -    primitives.chroma[m_csp].copy_pp[dstYuv.m_part](dstV, dstYuv.m_csize, srcV, m_csize);
> +    primitives.chroma[m_csp].pu[dstYuv.m_part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
> +    primitives.chroma[m_csp].pu[dstYuv.m_part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
>  }
>  
>  void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL)
>  {
> -    primitives.luma_add_ps[log2SizeL - 2](m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
> -    primitives.chroma[m_csp].add_ps[log2SizeL - 2](m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
> -    primitives.chroma[m_csp].add_ps[log2SizeL - 2](m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
> +    primitives.pu[log2SizeL - 2].luma_add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
> +    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
> +    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
>  }
>  
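one spot to double-check in addClip(): the luma call goes through the
pu[] table while the chroma calls go through the per-csp cu[] table,
both indexed by log2SizeL - 2:

    primitives.pu[log2SizeL - 2].luma_add_ps(...);
    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(...);

presumably intentional, since add_ps operates on whole CU-sized blocks,
but the mixed pu/cu indexing on the same log2 size deserves a comment in
the code.
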
>  void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
> @@ -167,7 +167,7 @@
>          const int16_t* srcY0 = srcYuv0.getLumaAddr(absPartIdx);
>          const int16_t* srcY1 = srcYuv1.getLumaAddr(absPartIdx);
>          pixel* dstY = getLumaAddr(absPartIdx);
> -        primitives.luma_addAvg[part](srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
> +        primitives.pu[part].luma_addAvg(srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
>      }
>      if (bChroma)
>      {
> @@ -177,8 +177,8 @@
>          const int16_t* srcV1 = srcYuv1.getCrAddr(absPartIdx);
>          pixel* dstU = getCbAddr(absPartIdx);
>          pixel* dstV = getCrAddr(absPartIdx);
> -        primitives.chroma[m_csp].addAvg[part](srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
> -        primitives.chroma[m_csp].addAvg[part](srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
> +        primitives.chroma[m_csp].pu[part].addAvg(srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
> +        primitives.chroma[m_csp].pu[part].addAvg(srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
>      }
>  }
>  
> @@ -186,7 +186,7 @@
>  {
>      const pixel* src = getLumaAddr(absPartIdx);
>      pixel* dst = dstYuv.getLumaAddr(absPartIdx);
> -    primitives.luma_copy_pp[log2Size - 2](dst, dstYuv.m_size, src, m_size);
> +    primitives.pu[log2Size - 2].luma_copy_pp(dst, dstYuv.m_size, src, m_size);
>  }
>  
>  void Yuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
> @@ -196,6 +196,6 @@
>      const pixel* srcV = getCrAddr(absPartIdx);
>      pixel* dstU = dstYuv.getCbAddr(absPartIdx);
>      pixel* dstV = dstYuv.getCrAddr(absPartIdx);
> -    primitives.chroma[m_csp].copy_pp[part](dstU, dstYuv.m_csize, srcU, m_csize);
> -    primitives.chroma[m_csp].copy_pp[part](dstV, dstYuv.m_csize, srcV, m_csize);
> +    primitives.chroma[m_csp].pu[part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
> +    primitives.chroma[m_csp].pu[part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
>  }

-- 
Steve Borho

