[x265] [PATCH 1 of 5] Refactor EncoderPrimitives under common

Kevin Wu kevin at multicorewareinc.com
Fri Jan 9 07:53:48 CET 2015


# HG changeset patch
# User Kevin Wu <kevin at multicorewareinc.com>
# Date 1420752218 21600
#      Thu Jan 08 15:23:38 2015 -0600
# Node ID c6ca0fd54aa7c50119c9e5bdbbd02d49abb45559
# Parent  1924c460d1304d9ce775f35864712dd98f758f9f
Refactor EncoderPrimitives under common.

Group the per-partition primitive pointers into pu[NUM_LUMA_PARTITIONS]
structs and the per-block-size pointers into cu[NUM_SQUARE_BLOCKS]
structs, for both the luma tables and the per-csp chroma tables. Call
sites now index the partition once and select the primitive by member,
e.g. primitives.pu[part].satd instead of primitives.satd[part]. The 4x4
DST/IDST, which have no slot in the square-block table, become the
standalone dst4x4/idst4x4 pointers.
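
To make the shape of the change concrete, here is a minimal,
self-contained C++ sketch of the before/after layout (toy typedef,
names, and array size; not x265's actual declarations): the old code
kept one parallel array per primitive indexed by partition, the new
code keeps one array of per-partition structs.

    // Illustrative sketch only; pixelcmp_t and the count 25 are stand-ins.
    typedef int (*pixelcmp_t)(const unsigned char*, const unsigned char*);

    struct OldPrimitives            // before: one array per primitive
    {
        pixelcmp_t sad[25];         // 25 plays the role of NUM_LUMA_PARTITIONS
        pixelcmp_t satd[25];
    };

    struct NewPrimitives            // after: one struct per partition
    {
        struct PU
        {
            pixelcmp_t sad;
            pixelcmp_t satd;
        } pu[25];
    };

    static int sad_c(const unsigned char*, const unsigned char*) { return 0; }

    int main()
    {
        NewPrimitives p = {};
        p.pu[0].sad = sad_c;        // setup, cf. p.pu[LUMA_4x4].sad = ...
        unsigned char a[16] = {}, b[16] = {};
        return p.pu[0].sad(a, b);   // call site, cf. primitives.pu[part].sad(...)
    }

A likely side benefit (not stated in the patch itself) is that the
pointers a call site uses for one partition now sit contiguously in
memory rather than strided across separate arrays.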

diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/dct.cpp
--- a/source/common/dct.cpp	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/dct.cpp	Thu Jan 08 15:23:38 2015 -0600
@@ -765,22 +765,22 @@
     p.dequant_normal = dequant_normal_c;
     p.quant = quant_c;
     p.nquant = nquant_c;
-    p.dct[DST_4x4] = dst4_c;
-    p.dct[DCT_4x4] = dct4_c;
-    p.dct[DCT_8x8] = dct8_c;
-    p.dct[DCT_16x16] = dct16_c;
-    p.dct[DCT_32x32] = dct32_c;
-    p.idct[IDST_4x4] = idst4_c;
-    p.idct[IDCT_4x4] = idct4_c;
-    p.idct[IDCT_8x8] = idct8_c;
-    p.idct[IDCT_16x16] = idct16_c;
-    p.idct[IDCT_32x32] = idct32_c;
+    p.dst4x4 = dst4_c;
+    p.cu[BLOCK_4x4].dct   = dct4_c;
+    p.cu[BLOCK_8x8].dct   = dct8_c;
+    p.cu[BLOCK_16x16].dct = dct16_c;
+    p.cu[BLOCK_32x32].dct = dct32_c;
+    p.idst4x4 = idst4_c;
+    p.cu[BLOCK_4x4].idct   = idct4_c;
+    p.cu[BLOCK_8x8].idct   = idct8_c;
+    p.cu[BLOCK_16x16].idct = idct16_c;
+    p.cu[BLOCK_32x32].idct = idct32_c;
     p.count_nonzero = count_nonzero_c;
     p.denoiseDct = denoiseDct_c;
 
-    p.copy_cnt[BLOCK_4x4] = copy_count<4>;
-    p.copy_cnt[BLOCK_8x8] = copy_count<8>;
-    p.copy_cnt[BLOCK_16x16] = copy_count<16>;
-    p.copy_cnt[BLOCK_32x32] = copy_count<32>;
+    p.cu[BLOCK_4x4].copy_cnt   = copy_count<4>;
+    p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
+    p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
+    p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
 }
 }
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/ipfilter.cpp	Thu Jan 08 15:23:38 2015 -0600
@@ -373,37 +373,37 @@
 // x265 private namespace
 
 #define CHROMA_420(W, H) \
-    p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \
-    p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \
-    p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>;  \
-    p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>;  \
-    p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>;  \
-    p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>;  \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
 
 #define CHROMA_422(W, H) \
-    p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \
-    p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \
-    p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>;  \
-    p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>;  \
-    p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>;  \
-    p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>;  \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
 
 #define CHROMA_444(W, H) \
-    p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \
-    p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \
-    p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>;  \
-    p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>;  \
-    p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>;  \
-    p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>;
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>;  \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
 
 #define LUMA(W, H) \
-    p.luma_hpp[LUMA_ ## W ## x ## H]     = interp_horiz_pp_c<8, W, H>; \
-    p.luma_hps[LUMA_ ## W ## x ## H]     = interp_horiz_ps_c<8, W, H>; \
-    p.luma_vpp[LUMA_ ## W ## x ## H]     = interp_vert_pp_c<8, W, H>;  \
-    p.luma_vps[LUMA_ ## W ## x ## H]     = interp_vert_ps_c<8, W, H>;  \
-    p.luma_vsp[LUMA_ ## W ## x ## H]     = interp_vert_sp_c<8, W, H>;  \
-    p.luma_vss[LUMA_ ## W ## x ## H]     = interp_vert_ss_c<8, W, H>;  \
-    p.luma_hvpp[LUMA_ ## W ## x ## H]    = interp_hv_pp_c<8, W, H>;
+    p.pu[LUMA_ ## W ## x ## H].luma_hpp     = interp_horiz_pp_c<8, W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].luma_hps     = interp_horiz_ps_c<8, W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].luma_vpp     = interp_vert_pp_c<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].luma_vps     = interp_vert_ps_c<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].luma_vsp     = interp_vert_sp_c<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].luma_vss     = interp_vert_ss_c<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].luma_hvpp    = interp_hv_pp_c<8, W, H>;
 
 void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
 {
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/lowres.h
--- a/source/common/lowres.h	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/lowres.h	Thu Jan 08 15:23:38 2015 -0600
@@ -69,7 +69,7 @@
             int qmvy = qmv.y + (qmv.y & 1);
             int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
             pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
-            primitives.pixelavg_pp[LUMA_8x8](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
+            primitives.pu[LUMA_8x8].pixelavg_pp(buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
             return buf;
         }
         else
@@ -91,7 +91,7 @@
             int qmvy = qmv.y + (qmv.y & 1);
             int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
             pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
-            primitives.pixelavg_pp[LUMA_8x8](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
+            primitives.pu[LUMA_8x8].pixelavg_pp(subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
             return comp(fenc, FENC_STRIDE, subpelbuf, 8);
         }
         else
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/pixel.cpp	Thu Jan 08 15:23:38 2015 -0600
@@ -33,58 +33,58 @@
 using namespace x265;
 
 #define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
-    p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x8]   = FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x4]   = FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_4x8]   = FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x8]  = FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x16]  = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x4]  = FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_4x16]  = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x8]  = FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x32]  = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
+    p.pu[LUMA_4x4].FUNC_PREFIX   = FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_8x8].FUNC_PREFIX   = FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_8x4].FUNC_PREFIX   = FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_4x8].FUNC_PREFIX   = FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_16x16].FUNC_PREFIX = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_16x8].FUNC_PREFIX  = FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_8x16].FUNC_PREFIX  = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_16x12].FUNC_PREFIX = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_12x16].FUNC_PREFIX = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_16x4].FUNC_PREFIX  = FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_4x16].FUNC_PREFIX  = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_32x32].FUNC_PREFIX = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_32x16].FUNC_PREFIX = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_16x32].FUNC_PREFIX = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_32x24].FUNC_PREFIX = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_24x32].FUNC_PREFIX = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_32x8].FUNC_PREFIX  = FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_8x32].FUNC_PREFIX  = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_64x64].FUNC_PREFIX = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_64x32].FUNC_PREFIX = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_32x64].FUNC_PREFIX = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_64x48].FUNC_PREFIX = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_48x64].FUNC_PREFIX = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_64x16].FUNC_PREFIX = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.pu[LUMA_16x64].FUNC_PREFIX = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
 
 #define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
-    p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX<4,  4>; \
-    p.FUNC_PREFIX[LUMA_8x8]   = FUNC_PREFIX<8,  8>; \
-    p.FUNC_PREFIX[LUMA_8x4]   = FUNC_PREFIX<8,  4>; \
-    p.FUNC_PREFIX[LUMA_4x8]   = FUNC_PREFIX<4,  8>; \
-    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \
-    p.FUNC_PREFIX[LUMA_16x8]  = FUNC_PREFIX<16,  8>; \
-    p.FUNC_PREFIX[LUMA_8x16]  = FUNC_PREFIX<8, 16>; \
-    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \
-    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \
-    p.FUNC_PREFIX[LUMA_16x4]  = FUNC_PREFIX<16,  4>; \
-    p.FUNC_PREFIX[LUMA_4x16]  = FUNC_PREFIX<4, 16>; \
-    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \
-    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \
-    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \
-    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \
-    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 32>; \
-    p.FUNC_PREFIX[LUMA_32x8]  = FUNC_PREFIX<32,  8>; \
-    p.FUNC_PREFIX[LUMA_8x32]  = FUNC_PREFIX<8, 32>; \
-    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \
-    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \
-    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \
-    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \
-    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \
-    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \
-    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>;
+    p.pu[LUMA_4x4].FUNC_PREFIX   = FUNC_PREFIX<4,  4>; \
+    p.pu[LUMA_8x8].FUNC_PREFIX   = FUNC_PREFIX<8,  8>; \
+    p.pu[LUMA_8x4].FUNC_PREFIX   = FUNC_PREFIX<8,  4>; \
+    p.pu[LUMA_4x8].FUNC_PREFIX   = FUNC_PREFIX<4,  8>; \
+    p.pu[LUMA_16x16].FUNC_PREFIX = FUNC_PREFIX<16, 16>; \
+    p.pu[LUMA_16x8].FUNC_PREFIX  = FUNC_PREFIX<16,  8>; \
+    p.pu[LUMA_8x16].FUNC_PREFIX  = FUNC_PREFIX<8, 16>; \
+    p.pu[LUMA_16x12].FUNC_PREFIX = FUNC_PREFIX<16, 12>; \
+    p.pu[LUMA_12x16].FUNC_PREFIX = FUNC_PREFIX<12, 16>; \
+    p.pu[LUMA_16x4].FUNC_PREFIX  = FUNC_PREFIX<16,  4>; \
+    p.pu[LUMA_4x16].FUNC_PREFIX  = FUNC_PREFIX<4, 16>; \
+    p.pu[LUMA_32x32].FUNC_PREFIX = FUNC_PREFIX<32, 32>; \
+    p.pu[LUMA_32x16].FUNC_PREFIX = FUNC_PREFIX<32, 16>; \
+    p.pu[LUMA_16x32].FUNC_PREFIX = FUNC_PREFIX<16, 32>; \
+    p.pu[LUMA_32x24].FUNC_PREFIX = FUNC_PREFIX<32, 24>; \
+    p.pu[LUMA_24x32].FUNC_PREFIX = FUNC_PREFIX<24, 32>; \
+    p.pu[LUMA_32x8].FUNC_PREFIX  = FUNC_PREFIX<32,  8>; \
+    p.pu[LUMA_8x32].FUNC_PREFIX  = FUNC_PREFIX<8, 32>; \
+    p.pu[LUMA_64x64].FUNC_PREFIX = FUNC_PREFIX<64, 64>; \
+    p.pu[LUMA_64x32].FUNC_PREFIX = FUNC_PREFIX<64, 32>; \
+    p.pu[LUMA_32x64].FUNC_PREFIX = FUNC_PREFIX<32, 64>; \
+    p.pu[LUMA_64x48].FUNC_PREFIX = FUNC_PREFIX<64, 48>; \
+    p.pu[LUMA_48x64].FUNC_PREFIX = FUNC_PREFIX<48, 64>; \
+    p.pu[LUMA_64x16].FUNC_PREFIX = FUNC_PREFIX<64, 16>; \
+    p.pu[LUMA_16x64].FUNC_PREFIX = FUNC_PREFIX<16, 64>;
 
 namespace {
 // place functions in anonymous namespace (file static)
@@ -1019,132 +1019,132 @@
     SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp)
 
     // satd
-    p.satd[LUMA_4x4]   = satd_4x4;
-    p.satd[LUMA_8x8]   = satd8<8, 8>;
-    p.satd[LUMA_8x4]   = satd_8x4;
-    p.satd[LUMA_4x8]   = satd4<4, 8>;
-    p.satd[LUMA_16x16] = satd8<16, 16>;
-    p.satd[LUMA_16x8]  = satd8<16, 8>;
-    p.satd[LUMA_8x16]  = satd8<8, 16>;
-    p.satd[LUMA_16x12] = satd8<16, 12>;
-    p.satd[LUMA_12x16] = satd4<12, 16>;
-    p.satd[LUMA_16x4]  = satd8<16, 4>;
-    p.satd[LUMA_4x16]  = satd4<4, 16>;
-    p.satd[LUMA_32x32] = satd8<32, 32>;
-    p.satd[LUMA_32x16] = satd8<32, 16>;
-    p.satd[LUMA_16x32] = satd8<16, 32>;
-    p.satd[LUMA_32x24] = satd8<32, 24>;
-    p.satd[LUMA_24x32] = satd8<24, 32>;
-    p.satd[LUMA_32x8]  = satd8<32, 8>;
-    p.satd[LUMA_8x32]  = satd8<8, 32>;
-    p.satd[LUMA_64x64] = satd8<64, 64>;
-    p.satd[LUMA_64x32] = satd8<64, 32>;
-    p.satd[LUMA_32x64] = satd8<32, 64>;
-    p.satd[LUMA_64x48] = satd8<64, 48>;
-    p.satd[LUMA_48x64] = satd8<48, 64>;
-    p.satd[LUMA_64x16] = satd8<64, 16>;
-    p.satd[LUMA_16x64] = satd8<16, 64>;
+    p.pu[LUMA_4x4].satd   = satd_4x4;
+    p.pu[LUMA_8x8].satd   = satd8<8, 8>;
+    p.pu[LUMA_8x4].satd   = satd_8x4;
+    p.pu[LUMA_4x8].satd   = satd4<4, 8>;
+    p.pu[LUMA_16x16].satd = satd8<16, 16>;
+    p.pu[LUMA_16x8].satd  = satd8<16, 8>;
+    p.pu[LUMA_8x16].satd  = satd8<8, 16>;
+    p.pu[LUMA_16x12].satd = satd8<16, 12>;
+    p.pu[LUMA_12x16].satd = satd4<12, 16>;
+    p.pu[LUMA_16x4].satd  = satd8<16, 4>;
+    p.pu[LUMA_4x16].satd  = satd4<4, 16>;
+    p.pu[LUMA_32x32].satd = satd8<32, 32>;
+    p.pu[LUMA_32x16].satd = satd8<32, 16>;
+    p.pu[LUMA_16x32].satd = satd8<16, 32>;
+    p.pu[LUMA_32x24].satd = satd8<32, 24>;
+    p.pu[LUMA_24x32].satd = satd8<24, 32>;
+    p.pu[LUMA_32x8].satd  = satd8<32, 8>;
+    p.pu[LUMA_8x32].satd  = satd8<8, 32>;
+    p.pu[LUMA_64x64].satd = satd8<64, 64>;
+    p.pu[LUMA_64x32].satd = satd8<64, 32>;
+    p.pu[LUMA_32x64].satd = satd8<32, 64>;
+    p.pu[LUMA_64x48].satd = satd8<64, 48>;
+    p.pu[LUMA_48x64].satd = satd8<48, 64>;
+    p.pu[LUMA_64x16].satd = satd8<64, 16>;
+    p.pu[LUMA_16x64].satd = satd8<16, 64>;
 
-    p.chroma[X265_CSP_I420].satd[CHROMA_2x2]   = NULL;
-    p.chroma[X265_CSP_I420].satd[CHROMA_4x4]   = satd_4x4;
-    p.chroma[X265_CSP_I420].satd[CHROMA_8x8]   = satd8<8, 8>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_2x2].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_4x4].satd   = satd_4x4;
+    p.chroma[X265_CSP_I420].pu[CHROMA_8x8].satd   = satd8<8, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_16x16].satd = satd8<16, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_32x32].satd = satd8<32, 32>;
 
-    p.chroma[X265_CSP_I420].satd[CHROMA_4x2]   = NULL;
-    p.chroma[X265_CSP_I420].satd[CHROMA_2x4]   = NULL;
-    p.chroma[X265_CSP_I420].satd[CHROMA_8x4]   = satd_8x4;
-    p.chroma[X265_CSP_I420].satd[CHROMA_4x8]   = satd4<4, 8>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_16x8]  = satd8<16, 8>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_8x16]  = satd8<8, 16>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_4x2].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_2x4].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_8x4].satd   = satd_8x4;
+    p.chroma[X265_CSP_I420].pu[CHROMA_4x8].satd   = satd4<4, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_16x8].satd  = satd8<16, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_8x16].satd  = satd8<8, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_32x16].satd = satd8<32, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_16x32].satd = satd8<16, 32>;
 
-    p.chroma[X265_CSP_I420].satd[CHROMA_8x6]   = NULL;
-    p.chroma[X265_CSP_I420].satd[CHROMA_6x8]   = NULL;
-    p.chroma[X265_CSP_I420].satd[CHROMA_8x2]   = NULL;
-    p.chroma[X265_CSP_I420].satd[CHROMA_2x8]   = NULL;
-    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_16x4]  = satd4<16, 4>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_4x16]  = satd4<4, 16>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_32x8]  = satd8<32, 8>;
-    p.chroma[X265_CSP_I420].satd[CHROMA_8x32]  = satd8<8, 32>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_8x6].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_6x8].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_8x2].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_2x8].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_16x12].satd = satd4<16, 12>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_12x16].satd = satd4<12, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_16x4].satd  = satd4<16, 4>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_4x16].satd  = satd4<4, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_32x24].satd = satd8<32, 24>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_24x32].satd = satd8<24, 32>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_32x8].satd  = satd8<32, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_8x32].satd  = satd8<8, 32>;
 
-    p.chroma[X265_CSP_I422].satd[CHROMA422_2x4]   = NULL;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8]   = satd4<4, 8>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16]  = satd8<8, 16>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_2x4].satd   = NULL;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_4x8].satd   = satd4<4, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_8x16].satd  = satd8<8, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_16x32].satd = satd8<16, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_32x64].satd = satd8<32, 64>;
 
-    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4]   = satd_4x4;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_2x8]   = NULL;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8]   = satd8<8, 8>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16]  = satd4<4, 16>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32]  = satd8<8, 32>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_4x4].satd   = satd_4x4;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_2x8].satd   = NULL;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_8x8].satd   = satd8<8, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_4x16].satd  = satd4<4, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_16x16].satd = satd8<16, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_8x32].satd  = satd8<8, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_32x32].satd = satd8<32, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_16x64].satd = satd8<16, 64>;
 
-    p.chroma[X265_CSP_I422].satd[CHROMA422_8x12]  = satd4<8, 12>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_6x16]  = NULL;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4]   = satd4<8, 4>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_2x16]  = NULL;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8]  = satd8<16, 8>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_4x32]  = satd4<4, 32>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_8x64]  = satd8<8, 64>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_8x12].satd  = satd4<8, 12>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_6x16].satd  = NULL;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_8x4].satd   = satd4<8, 4>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_2x16].satd  = NULL;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_16x24].satd = satd8<16, 24>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_12x32].satd = satd4<12, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_16x8].satd  = satd8<16, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_4x32].satd  = satd4<4, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_32x48].satd = satd8<32, 48>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_24x64].satd = satd8<24, 64>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_32x16].satd = satd8<32, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_8x64].satd  = satd8<8, 64>;
 
 #define CHROMA_420(W, H) \
-    p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H]  = addAvg<W, H>;         \
-    p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
-    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
-    p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
-    p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].addAvg  = addAvg<W, H>;         \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>;
 
 #define CHROMA_422(W, H) \
-    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H]  = addAvg<W, H>;         \
-    p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
-    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
-    p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
-    p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].addAvg  = addAvg<W, H>;         \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>;
 
 #define CHROMA_444(W, H) \
-    p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H]    = p.satd[LUMA_ ## W ## x ## H]; \
-    p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H]  = addAvg<W, H>; \
-    p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
-    p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
-    p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
-    p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].satd    = p.pu[LUMA_ ## W ## x ## H].satd; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].addAvg  = addAvg<W, H>; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>;
 
 #define LUMA(W, H) \
-    p.luma_addAvg[LUMA_ ## W ## x ## H]  = addAvg<W, H>; \
-    p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
-    p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
-    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
-    p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
+    p.pu[LUMA_ ## W ## x ## H].luma_addAvg  = addAvg<W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].luma_copy_pp = blockcopy_pp_c<W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].luma_copy_sp = blockcopy_sp_c<W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].luma_copy_ps = blockcopy_ps_c<W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].luma_copy_ss = blockcopy_ss_c<W, H>;
 
 #define LUMA_PIXELSUB(W, H) \
-    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
-    p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
+    p.pu[LUMA_ ## W ## x ## H].luma_sub_ps = pixel_sub_ps_c<W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].luma_add_ps = pixel_add_ps_c<W, H>;
 
 #define CHROMA_PIXELSUB_420(W, H) \
-    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;  \
-    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
+    p.chroma[X265_CSP_I420].cu[CHROMA_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>;  \
+    p.chroma[X265_CSP_I420].cu[CHROMA_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
 
 #define CHROMA_PIXELSUB_422(W, H) \
-    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
-    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
+    p.chroma[X265_CSP_I422].cu[CHROMA422_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
+    p.chroma[X265_CSP_I422].cu[CHROMA422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
 
 #define CHROMA_PIXELSUB_444(W, H) \
-    p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
-    p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
+    p.chroma[X265_CSP_I444].cu[LUMA_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
+    p.chroma[X265_CSP_I444].cu[LUMA_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
 
     LUMA(4, 4);
     LUMA(8, 8);
@@ -1269,89 +1269,89 @@
     SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
     SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
 
-    p.blockfill_s[BLOCK_4x4]   = blockfil_s_c<4>;
-    p.blockfill_s[BLOCK_8x8]   = blockfil_s_c<8>;
-    p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>;
-    p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
-    p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
+    p.cu[BLOCK_4x4].blockfill_s   = blockfil_s_c<4>;
+    p.cu[BLOCK_8x8].blockfill_s   = blockfil_s_c<8>;
+    p.cu[BLOCK_16x16].blockfill_s = blockfil_s_c<16>;
+    p.cu[BLOCK_32x32].blockfill_s = blockfil_s_c<32>;
+    p.cu[BLOCK_64x64].blockfill_s = blockfil_s_c<64>;
 
-    p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
-    p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
-    p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
-    p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
-    p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
-    p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
-    p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
-    p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
-    p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
-    p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
-    p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
-    p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
-    p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
-    p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
-    p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
-    p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
+    p.cu[BLOCK_4x4].cpy2Dto1D_shl   = cpy2Dto1D_shl<4>;
+    p.cu[BLOCK_8x8].cpy2Dto1D_shl   = cpy2Dto1D_shl<8>;
+    p.cu[BLOCK_16x16].cpy2Dto1D_shl = cpy2Dto1D_shl<16>;
+    p.cu[BLOCK_32x32].cpy2Dto1D_shl = cpy2Dto1D_shl<32>;
+    p.cu[BLOCK_4x4].cpy2Dto1D_shr   = cpy2Dto1D_shr<4>;
+    p.cu[BLOCK_8x8].cpy2Dto1D_shr   = cpy2Dto1D_shr<8>;
+    p.cu[BLOCK_16x16].cpy2Dto1D_shr = cpy2Dto1D_shr<16>;
+    p.cu[BLOCK_32x32].cpy2Dto1D_shr = cpy2Dto1D_shr<32>;
+    p.cu[BLOCK_4x4].cpy1Dto2D_shl   = cpy1Dto2D_shl<4>;
+    p.cu[BLOCK_8x8].cpy1Dto2D_shl   = cpy1Dto2D_shl<8>;
+    p.cu[BLOCK_16x16].cpy1Dto2D_shl = cpy1Dto2D_shl<16>;
+    p.cu[BLOCK_32x32].cpy1Dto2D_shl = cpy1Dto2D_shl<32>;
+    p.cu[BLOCK_4x4].cpy1Dto2D_shr   = cpy1Dto2D_shr<4>;
+    p.cu[BLOCK_8x8].cpy1Dto2D_shr   = cpy1Dto2D_shr<8>;
+    p.cu[BLOCK_16x16].cpy1Dto2D_shr = cpy1Dto2D_shr<16>;
+    p.cu[BLOCK_32x32].cpy1Dto2D_shr = cpy1Dto2D_shr<32>;
 
-    p.sa8d[BLOCK_4x4]   = satd_4x4;
-    p.sa8d[BLOCK_8x8]   = sa8d_8x8;
-    p.sa8d[BLOCK_16x16] = sa8d_16x16;
-    p.sa8d[BLOCK_32x32] = sa8d16<32, 32>;
-    p.sa8d[BLOCK_64x64] = sa8d16<64, 64>;
+    p.cu[BLOCK_4x4].sa8d   = satd_4x4;
+    p.cu[BLOCK_8x8].sa8d   = sa8d_8x8;
+    p.cu[BLOCK_16x16].sa8d = sa8d_16x16;
+    p.cu[BLOCK_32x32].sa8d = sa8d16<32, 32>;
+    p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
 
-    p.psy_cost_pp[BLOCK_4x4] = psyCost_pp<BLOCK_4x4>;
-    p.psy_cost_pp[BLOCK_8x8] = psyCost_pp<BLOCK_8x8>;
-    p.psy_cost_pp[BLOCK_16x16] = psyCost_pp<BLOCK_16x16>;
-    p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
-    p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;
+    p.cu[BLOCK_4x4].psy_cost_pp   = psyCost_pp<BLOCK_4x4>;
+    p.cu[BLOCK_8x8].psy_cost_pp   = psyCost_pp<BLOCK_8x8>;
+    p.cu[BLOCK_16x16].psy_cost_pp = psyCost_pp<BLOCK_16x16>;
+    p.cu[BLOCK_32x32].psy_cost_pp = psyCost_pp<BLOCK_32x32>;
+    p.cu[BLOCK_64x64].psy_cost_pp = psyCost_pp<BLOCK_64x64>;
 
-    p.psy_cost_ss[BLOCK_4x4] = psyCost_ss<BLOCK_4x4>;
-    p.psy_cost_ss[BLOCK_8x8] = psyCost_ss<BLOCK_8x8>;
-    p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
-    p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
-    p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;
+    p.cu[BLOCK_4x4].psy_cost_ss   = psyCost_ss<BLOCK_4x4>;
+    p.cu[BLOCK_8x8].psy_cost_ss   = psyCost_ss<BLOCK_8x8>;
+    p.cu[BLOCK_16x16].psy_cost_ss = psyCost_ss<BLOCK_16x16>;
+    p.cu[BLOCK_32x32].psy_cost_ss = psyCost_ss<BLOCK_32x32>;
+    p.cu[BLOCK_64x64].psy_cost_ss = psyCost_ss<BLOCK_64x64>;
 
-    p.sa8d_inter[LUMA_4x4]   = satd_4x4;
-    p.sa8d_inter[LUMA_8x8]   = sa8d_8x8;
-    p.sa8d_inter[LUMA_8x4]   = satd_8x4;
-    p.sa8d_inter[LUMA_4x8]   = satd4<4, 8>;
-    p.sa8d_inter[LUMA_16x16] = sa8d_16x16;
-    p.sa8d_inter[LUMA_16x8]  = sa8d8<16, 8>;
-    p.sa8d_inter[LUMA_8x16]  = sa8d8<8, 16>;
-    p.sa8d_inter[LUMA_16x12] = satd8<16, 12>;
-    p.sa8d_inter[LUMA_12x16] = satd4<12, 16>;
-    p.sa8d_inter[LUMA_4x16]  = satd4<4, 16>;
-    p.sa8d_inter[LUMA_16x4]  = satd8<16, 4>;
-    p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>;
-    p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>;
-    p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>;
-    p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>;
-    p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>;
-    p.sa8d_inter[LUMA_32x8]  = sa8d8<32, 8>;
-    p.sa8d_inter[LUMA_8x32]  = sa8d8<8, 32>;
-    p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>;
-    p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>;
-    p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>;
-    p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>;
-    p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>;
-    p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>;
-    p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>;
+    p.pu[LUMA_4x4].sa8d_inter   = satd_4x4;
+    p.pu[LUMA_8x8].sa8d_inter   = sa8d_8x8;
+    p.pu[LUMA_8x4].sa8d_inter   = satd_8x4;
+    p.pu[LUMA_4x8].sa8d_inter   = satd4<4, 8>;
+    p.pu[LUMA_16x16].sa8d_inter = sa8d_16x16;
+    p.pu[LUMA_16x8].sa8d_inter  = sa8d8<16, 8>;
+    p.pu[LUMA_8x16].sa8d_inter  = sa8d8<8, 16>;
+    p.pu[LUMA_16x12].sa8d_inter = satd8<16, 12>;
+    p.pu[LUMA_12x16].sa8d_inter = satd4<12, 16>;
+    p.pu[LUMA_4x16].sa8d_inter  = satd4<4, 16>;
+    p.pu[LUMA_16x4].sa8d_inter  = satd8<16, 4>;
+    p.pu[LUMA_32x32].sa8d_inter = sa8d16<32, 32>;
+    p.pu[LUMA_32x16].sa8d_inter = sa8d16<32, 16>;
+    p.pu[LUMA_16x32].sa8d_inter = sa8d16<16, 32>;
+    p.pu[LUMA_32x24].sa8d_inter = sa8d8<32, 24>;
+    p.pu[LUMA_24x32].sa8d_inter = sa8d8<24, 32>;
+    p.pu[LUMA_32x8].sa8d_inter  = sa8d8<32, 8>;
+    p.pu[LUMA_8x32].sa8d_inter  = sa8d8<8, 32>;
+    p.pu[LUMA_64x64].sa8d_inter = sa8d16<64, 64>;
+    p.pu[LUMA_64x32].sa8d_inter = sa8d16<64, 32>;
+    p.pu[LUMA_32x64].sa8d_inter = sa8d16<32, 64>;
+    p.pu[LUMA_64x48].sa8d_inter = sa8d16<64, 48>;
+    p.pu[LUMA_48x64].sa8d_inter = sa8d16<48, 64>;
+    p.pu[LUMA_64x16].sa8d_inter = sa8d16<64, 16>;
+    p.pu[LUMA_16x64].sa8d_inter = sa8d16<16, 64>;
 
-    p.calcresidual[BLOCK_4x4] = getResidual<4>;
-    p.calcresidual[BLOCK_8x8] = getResidual<8>;
-    p.calcresidual[BLOCK_16x16] = getResidual<16>;
-    p.calcresidual[BLOCK_32x32] = getResidual<32>;
-    p.calcresidual[BLOCK_64x64] = NULL;
+    p.cu[BLOCK_4x4].calcresidual   = getResidual<4>;
+    p.cu[BLOCK_8x8].calcresidual   = getResidual<8>;
+    p.cu[BLOCK_16x16].calcresidual = getResidual<16>;
+    p.cu[BLOCK_32x32].calcresidual = getResidual<32>;
+    p.cu[BLOCK_64x64].calcresidual = NULL;
 
-    p.transpose[BLOCK_4x4] = transpose<4>;
-    p.transpose[BLOCK_8x8] = transpose<8>;
-    p.transpose[BLOCK_16x16] = transpose<16>;
-    p.transpose[BLOCK_32x32] = transpose<32>;
-    p.transpose[BLOCK_64x64] = transpose<64>;
+    p.cu[BLOCK_4x4].transpose   = transpose<4>;
+    p.cu[BLOCK_8x8].transpose   = transpose<8>;
+    p.cu[BLOCK_16x16].transpose = transpose<16>;
+    p.cu[BLOCK_32x32].transpose = transpose<32>;
+    p.cu[BLOCK_64x64].transpose = transpose<64>;
 
-    p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
-    p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
-    p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
-    p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;
+    p.cu[BLOCK_4x4].ssd_s   = pixel_ssd_s_c<4>;
+    p.cu[BLOCK_8x8].ssd_s   = pixel_ssd_s_c<8>;
+    p.cu[BLOCK_16x16].ssd_s = pixel_ssd_s_c<16>;
+    p.cu[BLOCK_32x32].ssd_s = pixel_ssd_s_c<32>;
 
     p.weight_pp = weight_pp_c;
     p.weight_sp = weight_sp_c;
@@ -1362,10 +1362,10 @@
     p.ssim_4x4x2_core = ssim_4x4x2_core;
     p.ssim_end_4 = ssim_end_4;
 
-    p.var[BLOCK_8x8] = pixel_var<8>;
-    p.var[BLOCK_16x16] = pixel_var<16>;
-    p.var[BLOCK_32x32] = pixel_var<32>;
-    p.var[BLOCK_64x64] = pixel_var<64>;
+    p.cu[BLOCK_8x8].var   = pixel_var<8>;
+    p.cu[BLOCK_16x16].var = pixel_var<16>;
+    p.cu[BLOCK_32x32].var = pixel_var<32>;
+    p.cu[BLOCK_64x64].var = pixel_var<64>;
     p.planecopy_cp = planecopy_cp_c;
     p.planecopy_sp = planecopy_sp_c;
     p.propagateCost = estimateCUPropagateCost;
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/predict.cpp
--- a/source/common/predict.cpp	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/predict.cpp	Thu Jan 08 15:23:38 2015 -0600
@@ -334,13 +334,13 @@
     int yFrac = mv.y & 0x3;
 
     if (!(yFrac | xFrac))
-        primitives.luma_copy_pp[partEnum](dst, dstStride, src, srcStride);
+        primitives.pu[partEnum].luma_copy_pp(dst, dstStride, src, srcStride);
     else if (!yFrac)
-        primitives.luma_hpp[partEnum](src, srcStride, dst, dstStride, xFrac);
+        primitives.pu[partEnum].luma_hpp(src, srcStride, dst, dstStride, xFrac);
     else if (!xFrac)
-        primitives.luma_vpp[partEnum](src, srcStride, dst, dstStride, yFrac);
+        primitives.pu[partEnum].luma_vpp(src, srcStride, dst, dstStride, yFrac);
     else
-        primitives.luma_hvpp[partEnum](src, srcStride, dst, dstStride, xFrac, yFrac);
+        primitives.pu[partEnum].luma_hvpp(src, srcStride, dst, dstStride, xFrac, yFrac);
 }
 
 void Predict::predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
@@ -363,16 +363,16 @@
     if (!(yFrac | xFrac))
         primitives.luma_p2s(src, srcStride, dst, m_puWidth, m_puHeight);
     else if (!yFrac)
-        primitives.luma_hps[partEnum](src, srcStride, dst, dstStride, xFrac, 0);
+        primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
     else if (!xFrac)
-        primitives.luma_vps[partEnum](src, srcStride, dst, dstStride, yFrac);
+        primitives.pu[partEnum].luma_vps(src, srcStride, dst, dstStride, yFrac);
     else
     {
         int tmpStride = m_puWidth;
         int filterSize = NTAPS_LUMA;
         int halfFilterSize = (filterSize >> 1);
-        primitives.luma_hps[partEnum](src, srcStride, m_immedVals, tmpStride, xFrac, 1);
-        primitives.luma_vss[partEnum](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
+        primitives.pu[partEnum].luma_hps(src, srcStride, m_immedVals, tmpStride, xFrac, 1);
+        primitives.pu[partEnum].luma_vss(m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
     }
 }
 
@@ -399,18 +399,18 @@
     
     if (!(yFrac | xFrac))
     {
-        primitives.chroma[m_csp].copy_pp[partEnum](dstCb, dstStride, refCb, refStride);
-        primitives.chroma[m_csp].copy_pp[partEnum](dstCr, dstStride, refCr, refStride);
+        primitives.chroma[m_csp].pu[partEnum].copy_pp(dstCb, dstStride, refCb, refStride);
+        primitives.chroma[m_csp].pu[partEnum].copy_pp(dstCr, dstStride, refCr, refStride);
     }
     else if (!yFrac)
     {
-        primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift));
-        primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift));
     }
     else if (!xFrac)
     {
-        primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
-        primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
     }
     else
     {
@@ -418,11 +418,11 @@
         int filterSize = NTAPS_CHROMA;
         int halfFilterSize = (filterSize >> 1);
 
-        primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
-        primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
+        primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
 
-        primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
-        primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
+        primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
     }
 }
 
@@ -459,23 +459,23 @@
     }
     else if (!yFrac)
     {
-        primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift), 0);
-        primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift), 0);
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift), 0);
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift), 0);
     }
     else if (!xFrac)
     {
-        primitives.chroma[m_csp].filter_vps[partEnum](refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
-        primitives.chroma[m_csp].filter_vps[partEnum](refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
     }
     else
     {
         int extStride = cxWidth;
         int filterSize = NTAPS_CHROMA;
         int halfFilterSize = (filterSize >> 1);
-        primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
-        primitives.chroma[m_csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
-        primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
-        primitives.chroma[m_csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
+        primitives.chroma[m_csp].pu[partEnum].filter_vss(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
+        primitives.chroma[m_csp].pu[partEnum].filter_vss(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
     }
 }
 
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/primitives.cpp
--- a/source/common/primitives.cpp	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/primitives.cpp	Thu Jan 08 15:23:38 2015 -0600
@@ -71,79 +71,79 @@
     /* copy reusable luma primitives to chroma 4:4:4 */
     for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
     {
-        p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];
-        p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
-        p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
-        p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
-        p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
-        p.chroma[X265_CSP_I444].satd[i] = p.satd[i];
+        p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].luma_copy_pp;
+        p.chroma[X265_CSP_I444].pu[i].copy_ps = p.pu[i].luma_copy_ps;
+        p.chroma[X265_CSP_I444].pu[i].copy_sp = p.pu[i].luma_copy_sp;
+        p.chroma[X265_CSP_I444].pu[i].copy_ss = p.pu[i].luma_copy_ss;
+        p.chroma[X265_CSP_I444].pu[i].addAvg  = p.pu[i].luma_addAvg;
+        p.chroma[X265_CSP_I444].pu[i].satd    = p.pu[i].satd;
     }
 
     for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
     {
-        p.chroma[X265_CSP_I444].add_ps[i]  = p.luma_add_ps[i];
-        p.chroma[X265_CSP_I444].sub_ps[i]  = p.luma_sub_ps[i];
+        p.chroma[X265_CSP_I444].cu[i].add_ps  = p.pu[i].luma_add_ps;
+        p.chroma[X265_CSP_I444].cu[i].sub_ps  = p.pu[i].luma_sub_ps;
     }
 
-    primitives.sa8d[BLOCK_4x4]   = primitives.satd[LUMA_4x4];
-    primitives.sa8d[BLOCK_8x8]   = primitives.sa8d_inter[LUMA_8x8];
-    primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16];
-    primitives.sa8d[BLOCK_32x32] = primitives.sa8d_inter[LUMA_32x32];
-    primitives.sa8d[BLOCK_64x64] = primitives.sa8d_inter[LUMA_64x64];
+    primitives.cu[BLOCK_4x4].sa8d   = primitives.pu[LUMA_4x4].satd;
+    primitives.cu[BLOCK_8x8].sa8d   = primitives.pu[LUMA_8x8].sa8d_inter;
+    primitives.cu[BLOCK_16x16].sa8d = primitives.pu[LUMA_16x16].sa8d_inter;
+    primitives.cu[BLOCK_32x32].sa8d = primitives.pu[LUMA_32x32].sa8d_inter;
+    primitives.cu[BLOCK_64x64].sa8d = primitives.pu[LUMA_64x64].sa8d_inter;
 
     // SA8D devolves to SATD for blocks not even multiples of 8x8
-    primitives.sa8d_inter[LUMA_4x4]   = primitives.satd[LUMA_4x4];
-    primitives.sa8d_inter[LUMA_4x8]   = primitives.satd[LUMA_4x8];
-    primitives.sa8d_inter[LUMA_4x16]  = primitives.satd[LUMA_4x16];
-    primitives.sa8d_inter[LUMA_8x4]   = primitives.satd[LUMA_8x4];
-    primitives.sa8d_inter[LUMA_16x4]  = primitives.satd[LUMA_16x4];
-    primitives.sa8d_inter[LUMA_16x12] = primitives.satd[LUMA_16x12];
-    primitives.sa8d_inter[LUMA_12x16] = primitives.satd[LUMA_12x16];
+    primitives.pu[LUMA_4x4].sa8d_inter   = primitives.pu[LUMA_4x4].satd;
+    primitives.pu[LUMA_4x8].sa8d_inter   = primitives.pu[LUMA_4x8].satd;
+    primitives.pu[LUMA_4x16].sa8d_inter  = primitives.pu[LUMA_4x16].satd;
+    primitives.pu[LUMA_8x4].sa8d_inter   = primitives.pu[LUMA_8x4].satd;
+    primitives.pu[LUMA_16x4].sa8d_inter  = primitives.pu[LUMA_16x4].satd;
+    primitives.pu[LUMA_16x12].sa8d_inter = primitives.pu[LUMA_16x12].satd;
+    primitives.pu[LUMA_12x16].sa8d_inter = primitives.pu[LUMA_12x16].satd;
 
     // Chroma SATD can often reuse luma primitives
-    p.chroma[X265_CSP_I420].satd[CHROMA_4x4]   = primitives.satd[LUMA_4x4];
-    p.chroma[X265_CSP_I420].satd[CHROMA_8x8]   = primitives.satd[LUMA_8x8];
-    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = primitives.satd[LUMA_16x16];
-    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = primitives.satd[LUMA_32x32];
+    p.chroma[X265_CSP_I420].pu[CHROMA_4x4].satd   = primitives.pu[LUMA_4x4].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_8x8].satd   = primitives.pu[LUMA_8x8].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_16x16].satd = primitives.pu[LUMA_16x16].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_32x32].satd = primitives.pu[LUMA_32x32].satd;
 
-    p.chroma[X265_CSP_I420].satd[CHROMA_8x4]   = primitives.satd[LUMA_8x4];
-    p.chroma[X265_CSP_I420].satd[CHROMA_4x8]   = primitives.satd[LUMA_4x8];
-    p.chroma[X265_CSP_I420].satd[CHROMA_16x8]  = primitives.satd[LUMA_16x8];
-    p.chroma[X265_CSP_I420].satd[CHROMA_8x16]  = primitives.satd[LUMA_8x16];
-    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = primitives.satd[LUMA_32x16];
-    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = primitives.satd[LUMA_16x32];
+    p.chroma[X265_CSP_I420].pu[CHROMA_8x4].satd   = primitives.pu[LUMA_8x4].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_4x8].satd   = primitives.pu[LUMA_4x8].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_16x8].satd  = primitives.pu[LUMA_16x8].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_8x16].satd  = primitives.pu[LUMA_8x16].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_32x16].satd = primitives.pu[LUMA_32x16].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_16x32].satd = primitives.pu[LUMA_16x32].satd;
 
-    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = primitives.satd[LUMA_16x12];
-    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = primitives.satd[LUMA_12x16];
-    p.chroma[X265_CSP_I420].satd[CHROMA_16x4]  = primitives.satd[LUMA_16x4];
-    p.chroma[X265_CSP_I420].satd[CHROMA_4x16]  = primitives.satd[LUMA_4x16];
-    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = primitives.satd[LUMA_32x24];
-    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = primitives.satd[LUMA_24x32];
-    p.chroma[X265_CSP_I420].satd[CHROMA_32x8]  = primitives.satd[LUMA_32x8];
-    p.chroma[X265_CSP_I420].satd[CHROMA_8x32]  = primitives.satd[LUMA_8x32];
+    p.chroma[X265_CSP_I420].pu[CHROMA_16x12].satd = primitives.pu[LUMA_16x12].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_12x16].satd = primitives.pu[LUMA_12x16].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_16x4].satd  = primitives.pu[LUMA_16x4].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_4x16].satd  = primitives.pu[LUMA_4x16].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_32x24].satd = primitives.pu[LUMA_32x24].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_24x32].satd = primitives.pu[LUMA_24x32].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_32x8].satd  = primitives.pu[LUMA_32x8].satd;
+    p.chroma[X265_CSP_I420].pu[CHROMA_8x32].satd  = primitives.pu[LUMA_8x32].satd;
 
-    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8]   = primitives.satd[LUMA_4x8];
-    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16]  = primitives.satd[LUMA_8x16];
-    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = primitives.satd[LUMA_16x32];
-    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = primitives.satd[LUMA_32x64];
+    p.chroma[X265_CSP_I422].pu[CHROMA422_4x8].satd   = primitives.pu[LUMA_4x8].satd;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_8x16].satd  = primitives.pu[LUMA_8x16].satd;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_16x32].satd = primitives.pu[LUMA_16x32].satd;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_32x64].satd = primitives.pu[LUMA_32x64].satd;
 
-    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4]   = primitives.satd[LUMA_4x4];
-    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8]   = primitives.satd[LUMA_8x8];
-    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16]  = primitives.satd[LUMA_4x16];
-    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = primitives.satd[LUMA_16x16];
-    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32]  = primitives.satd[LUMA_8x32];
-    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = primitives.satd[LUMA_32x32];
-    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = primitives.satd[LUMA_16x64];
+    p.chroma[X265_CSP_I422].pu[CHROMA422_4x4].satd   = primitives.pu[LUMA_4x4].satd;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_8x8].satd   = primitives.pu[LUMA_8x8].satd;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_4x16].satd  = primitives.pu[LUMA_4x16].satd;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_16x16].satd = primitives.pu[LUMA_16x16].satd;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_8x32].satd  = primitives.pu[LUMA_8x32].satd;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_32x32].satd = primitives.pu[LUMA_32x32].satd;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_16x64].satd = primitives.pu[LUMA_16x64].satd;
 
     //p.chroma[X265_CSP_I422].satd[CHROMA422_8x12]  = satd4<8, 12>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4]   = primitives.satd[LUMA_8x4];
+    p.chroma[X265_CSP_I422].pu[CHROMA422_8x4].satd  = primitives.pu[LUMA_8x4].satd;
     //p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
     //p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8]  = primitives.satd[LUMA_16x8];
+    p.chroma[X265_CSP_I422].pu[CHROMA422_16x8].satd = primitives.pu[LUMA_16x8].satd;
     //p.chroma[X265_CSP_I422].satd[CHROMA422_4x32]  = satd4<4, 32>;
     //p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
     //p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
-    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = primitives.satd[LUMA_32x16];
+    p.chroma[X265_CSP_I422].pu[CHROMA422_32x16].satd = primitives.pu[LUMA_32x16].satd;
     //p.chroma[X265_CSP_I422].satd[CHROMA422_8x64]  = satd8<8, 64>;
 }
 }
@@ -158,7 +158,7 @@
         cpuid = x265::cpu_detect();
 
     // initialize global variables
-    if (!primitives.sad[0])
+    if (!primitives.pu[0].sad)
     {
         Setup_C_Primitives(primitives);
 
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/primitives.h
--- a/source/common/primitives.h	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/primitives.h	Thu Jan 08 15:23:38 2015 -0600
@@ -42,7 +42,7 @@
     LUMA_4x4,   LUMA_8x8,   LUMA_16x16, LUMA_32x32, LUMA_64x64,
     // Rectangular
     LUMA_8x4,   LUMA_4x8,
-    LUMA_16x8,  LUMA_8x16,  
+    LUMA_16x8,  LUMA_8x16,
     LUMA_32x16, LUMA_16x32,
     LUMA_64x32, LUMA_32x64,
     // Asymmetrical (0.75, 0.25)
@@ -206,42 +206,76 @@
  * a vectorized primitive, or a C function. */
 struct EncoderPrimitives
 {
-    pixelcmp_t            sad[NUM_LUMA_PARTITIONS];        // Sum of Differences for each size
-    pixelcmp_x3_t         sad_x3[NUM_LUMA_PARTITIONS];     // Sum of Differences 3x for each size
-    pixelcmp_x4_t         sad_x4[NUM_LUMA_PARTITIONS];     // Sum of Differences 4x for each size
-    pixelcmp_t            sse_pp[NUM_LUMA_PARTITIONS];     // Sum of Square Error (pixel, pixel) fenc alignment not assumed
-    pixelcmp_ss_t         sse_ss[NUM_LUMA_PARTITIONS];     // Sum of Square Error (short, short) fenc alignment not assumed
-    pixelcmp_sp_t         sse_sp[NUM_LUMA_PARTITIONS];     // Sum of Square Error (short, pixel) fenc alignment not assumed
-    pixel_ssd_s_t         ssd_s[NUM_SQUARE_BLOCKS - 1];    // Sum of Square Error (short) fenc alignment not assumed
-    pixelcmp_t            satd[NUM_LUMA_PARTITIONS];       // Sum of Transformed differences (HADAMARD)
-    pixelcmp_t            sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions
-    pixelcmp_t            sa8d[NUM_SQUARE_BLOCKS];         // sa8d primitives for square intra blocks
-    pixelcmp_t            psy_cost_pp[NUM_SQUARE_BLOCKS];  // difference in AC energy between two blocks
-    pixelcmp_ss_t         psy_cost_ss[NUM_SQUARE_BLOCKS];
+  struct PU
+  {
+    pixelcmp_t            sad;        // Sum of Absolute Differences
+    pixelcmp_x3_t         sad_x3;     // Sum of Absolute Differences, 3 references at once
+    pixelcmp_x4_t         sad_x4;     // Sum of Absolute Differences, 4 references at once
+    pixelcmp_t            sse_pp;     // Sum of Square Error (pixel, pixel) fenc alignment not assumed
+    pixelcmp_ss_t         sse_ss;     // Sum of Square Error (short, short) fenc alignment not assumed
+    pixelcmp_sp_t         sse_sp;     // Sum of Square Error (short, pixel) fenc alignment not assumed
+    pixelcmp_t            satd;       // Sum of Transformed differences (HADAMARD)
+    pixelcmp_t            sa8d_inter; // sa8d primitives for motion search partitions
 
-    dct_t                 dct[NUM_DCTS];
-    idct_t                idct[NUM_IDCTS];
+    pixelavg_pp_t         pixelavg_pp;
+    addAvg_t              luma_addAvg;
+
+    filter_pp_t           luma_hpp;
+    filter_hps_t          luma_hps;
+    filter_pp_t           luma_vpp;
+    filter_ps_t           luma_vps;
+    filter_sp_t           luma_vsp;
+    filter_ss_t           luma_vss;
+    filter_hv_pp_t        luma_hvpp;
+
+    copy_pp_t             luma_copy_pp;
+    copy_sp_t             luma_copy_sp;
+    copy_ps_t             luma_copy_ps;
+    copy_ss_t             luma_copy_ss;
+
+    pixel_sub_ps_t        luma_sub_ps;
+    pixel_add_ps_t        luma_add_ps;
+
+  } pu[NUM_LUMA_PARTITIONS];
+
+  struct CU
+  {
+    dct_t                 dct;
+    idct_t                idct;
+    calcresidual_t        calcresidual;
+    blockfill_s_t         blockfill_s;  // block fill with value
+    cpy2Dto1D_shl_t       cpy2Dto1D_shl;
+    cpy2Dto1D_shr_t       cpy2Dto1D_shr;
+    cpy1Dto2D_shl_t       cpy1Dto2D_shl;
+    cpy1Dto2D_shr_t       cpy1Dto2D_shr;
+    copy_cnt_t            copy_cnt;
+
+    transpose_t           transpose;
+
+    var_t                 var;
+
+    pixelcmp_t            sa8d;         // sa8d primitives for square intra blocks
+    pixel_ssd_s_t         ssd_s;    // Sum of Square Error (short) fenc alignment not assumed
+    pixelcmp_t            psy_cost_pp;  // difference in AC energy between two blocks
+    pixelcmp_ss_t         psy_cost_ss;
+
+  } cu[NUM_SQUARE_BLOCKS];
+
+    dct_t                 dst4x4;
+    idct_t                idst4x4;
+
     quant_t               quant;
     nquant_t              nquant;
     dequant_scaling_t     dequant_scaling;
     dequant_normal_t      dequant_normal;
     count_nonzero_t       count_nonzero;
     denoiseDct_t          denoiseDct;
-    calcresidual_t        calcresidual[NUM_SQUARE_BLOCKS];
-    blockfill_s_t         blockfill_s[NUM_SQUARE_BLOCKS];  // block fill with value
-    cpy2Dto1D_shl_t       cpy2Dto1D_shl[NUM_SQUARE_BLOCKS - 1];
-    cpy2Dto1D_shr_t       cpy2Dto1D_shr[NUM_SQUARE_BLOCKS - 1];
-    cpy1Dto2D_shl_t       cpy1Dto2D_shl[NUM_SQUARE_BLOCKS - 1];
-    cpy1Dto2D_shr_t       cpy1Dto2D_shr[NUM_SQUARE_BLOCKS - 1];
-    copy_cnt_t            copy_cnt[NUM_SQUARE_BLOCKS - 1];
 
     intra_pred_t          intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE];
     intra_allangs_t       intra_pred_allangs[NUM_TR_SIZE];
-    transpose_t           transpose[NUM_SQUARE_BLOCKS];
     scale_t               scale1D_128to64;
     scale_t               scale2D_64to32;
 
-    var_t                 var[NUM_SQUARE_BLOCKS];
     ssim_4x4x2_core_t     ssim_4x4x2_core;
     ssim_end4_t           ssim_end_4;
 
@@ -261,42 +295,36 @@
 
     weightp_sp_t          weight_sp;
     weightp_pp_t          weight_pp;
-    pixelavg_pp_t         pixelavg_pp[NUM_LUMA_PARTITIONS];
-    addAvg_t              luma_addAvg[NUM_LUMA_PARTITIONS];
 
-    filter_pp_t           luma_hpp[NUM_LUMA_PARTITIONS];
-    filter_hps_t          luma_hps[NUM_LUMA_PARTITIONS];
-    filter_pp_t           luma_vpp[NUM_LUMA_PARTITIONS];
-    filter_ps_t           luma_vps[NUM_LUMA_PARTITIONS];
-    filter_sp_t           luma_vsp[NUM_LUMA_PARTITIONS];
-    filter_ss_t           luma_vss[NUM_LUMA_PARTITIONS];
-    filter_hv_pp_t        luma_hvpp[NUM_LUMA_PARTITIONS];
     filter_p2s_t          luma_p2s;
 
-    copy_pp_t             luma_copy_pp[NUM_LUMA_PARTITIONS];
-    copy_sp_t             luma_copy_sp[NUM_LUMA_PARTITIONS];
-    copy_ps_t             luma_copy_ps[NUM_LUMA_PARTITIONS];
-    copy_ss_t             luma_copy_ss[NUM_LUMA_PARTITIONS];
-    pixel_sub_ps_t        luma_sub_ps[NUM_SQUARE_BLOCKS];
-    pixel_add_ps_t        luma_add_ps[NUM_SQUARE_BLOCKS];
+    struct Chroma
+    {
+      struct PUChroma
+      {
+        // ME and MC
+        pixelcmp_t      satd;
+        filter_pp_t     filter_vpp;
+        filter_ps_t     filter_vps;
+        filter_sp_t     filter_vsp;
+        filter_ss_t     filter_vss;
+        filter_pp_t     filter_hpp;
+        filter_hps_t    filter_hps;
+        addAvg_t        addAvg;
+        copy_pp_t       copy_pp;
+        copy_sp_t       copy_sp;
+        copy_ps_t       copy_ps;
+        copy_ss_t       copy_ss;
+      } pu[NUM_LUMA_PARTITIONS];
 
-    struct
-    {
-        pixelcmp_t      satd[NUM_LUMA_PARTITIONS];
-        filter_pp_t     filter_vpp[NUM_LUMA_PARTITIONS];
-        filter_ps_t     filter_vps[NUM_LUMA_PARTITIONS];
-        filter_sp_t     filter_vsp[NUM_LUMA_PARTITIONS];
-        filter_ss_t     filter_vss[NUM_LUMA_PARTITIONS];
-        filter_pp_t     filter_hpp[NUM_LUMA_PARTITIONS];
-        filter_hps_t    filter_hps[NUM_LUMA_PARTITIONS];
-        addAvg_t        addAvg[NUM_LUMA_PARTITIONS];
-        copy_pp_t       copy_pp[NUM_LUMA_PARTITIONS];
-        copy_sp_t       copy_sp[NUM_LUMA_PARTITIONS];
-        copy_ps_t       copy_ps[NUM_LUMA_PARTITIONS];
-        copy_ss_t       copy_ss[NUM_LUMA_PARTITIONS];
-        pixel_sub_ps_t  sub_ps[NUM_SQUARE_BLOCKS];
-        pixel_add_ps_t  add_ps[NUM_SQUARE_BLOCKS];
-        filter_p2s_t    p2s;
+      struct CUChroma
+      {
+        pixelcmp_t sa8d;
+        pixel_sub_ps_t  sub_ps;
+        pixel_add_ps_t  add_ps;
+      } cu[NUM_SQUARE_BLOCKS];
+
+      filter_p2s_t    p2s;
     } chroma[X265_CSP_COUNT];
 };
 
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/quant.cpp
--- a/source/common/quant.cpp	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/quant.cpp	Thu Jan 08 15:23:38 2015 -0600
@@ -329,7 +329,7 @@
     if (cu.m_tqBypass[absPartIdx])
     {
         X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
-        return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride);
+        return primitives.cu[sizeIdx].copy_cnt(coeff, residual, resiStride);
     }
 
     bool isLuma  = ttype == TEXT_LUMA;
@@ -341,21 +341,21 @@
     {
 #if X265_DEPTH <= 10
         X265_CHECK(transformShift >= 0, "invalid transformShift\n");
-        primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
+        primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift);
 #else
         if (transformShift >= 0)
-            primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
+            primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift);
         else
-            primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift);
+            primitives.cu[sizeIdx].cpy2Dto1D_shr(m_resiDctCoeff, residual, resiStride, -transformShift);
 #endif
     }
     else
     {
         bool isIntra = cu.isIntra(absPartIdx);
         int useDST = !sizeIdx && isLuma && isIntra;
-        int index = DCT_4x4 + sizeIdx - useDST;
+        int index = BLOCK_4x4 + sizeIdx - useDST;
 
-        primitives.dct[index](residual, m_resiDctCoeff, resiStride);
+        primitives.cu[index].dct(residual, m_resiDctCoeff, resiStride);
 
         /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
          * there is no risk of performing this DCT unnecessarily */
@@ -363,8 +363,8 @@
         {
             int trSize = 1 << log2TrSize;
             /* perform DCT on source pixels for psy-rdoq */
-            primitives.luma_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
-            primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
+            primitives.pu[sizeIdx].luma_copy_ps(m_fencShortBuf, trSize, fenc, fencStride);
+            primitives.cu[index].dct(m_fencShortBuf, m_fencDctCoeff, trSize);
         }
 
         if (m_nr)
@@ -411,7 +411,7 @@
     const uint32_t sizeIdx = log2TrSize - 2;
     if (transQuantBypass)
     {
-        primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0);
+        primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, coeff, resiStride, 0);
         return;
     }
 
@@ -438,12 +438,12 @@
     {
 #if X265_DEPTH <= 10
         X265_CHECK(transformShift > 0, "invalid transformShift\n");
-        primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
+        primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
 #else
         if (transformShift > 0)
-            primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
+            primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
         else
-            primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift);
+            primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, m_resiDctCoeff, resiStride, -transformShift);
 #endif
     }
     else
@@ -461,11 +461,11 @@
             const int add_2nd = 1 << (shift_2nd - 1);
 
             int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
-            primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val);
+            primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
             return;
         }
 
-        primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride);
+        primitives.cu[BLOCK_4x4 + sizeIdx - useDST].idct(m_resiDctCoeff, residual, resiStride);
     }
 }
 
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/shortyuv.cpp
--- a/source/common/shortyuv.cpp	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/shortyuv.cpp	Thu Jan 08 15:23:38 2015 -0600
@@ -74,9 +74,9 @@
 void ShortYuv::subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size)
 {
     const int sizeIdx = log2Size - 2;
-    primitives.luma_sub_ps[sizeIdx](m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
-    primitives.chroma[m_csp].sub_ps[sizeIdx](m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
-    primitives.chroma[m_csp].sub_ps[sizeIdx](m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    primitives.pu[sizeIdx].luma_sub_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
+    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
+    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
 }
 
 void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const
@@ -84,7 +84,7 @@
     const int16_t* src = getLumaAddr(absPartIdx);
     int16_t* dst = dstYuv.getLumaAddr(absPartIdx);
 
-    primitives.luma_copy_ss[log2Size - 2](dst, dstYuv.m_size, src, m_size);
+    primitives.pu[log2Size - 2].luma_copy_ss(dst, dstYuv.m_size, src, m_size);
 }
 
 void ShortYuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const
@@ -92,7 +92,7 @@
     const int16_t* src = getLumaAddr(absPartIdx);
     pixel* dst = dstYuv.getLumaAddr(absPartIdx);
 
-    primitives.luma_copy_sp[log2Size - 2](dst, dstYuv.m_size, src, m_size);
+    primitives.pu[log2Size - 2].luma_copy_sp(dst, dstYuv.m_size, src, m_size);
 }
 
 void ShortYuv::copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
@@ -103,8 +103,8 @@
     int16_t* dstU = dstYuv.getCbAddr(absPartIdx);
     int16_t* dstV = dstYuv.getCrAddr(absPartIdx);
 
-    primitives.chroma[m_csp].copy_ss[part](dstU, dstYuv.m_csize, srcU, m_csize);
-    primitives.chroma[m_csp].copy_ss[part](dstV, dstYuv.m_csize, srcV, m_csize);
+    primitives.chroma[m_csp].pu[part].copy_ss(dstU, dstYuv.m_csize, srcU, m_csize);
+    primitives.chroma[m_csp].pu[part].copy_ss(dstV, dstYuv.m_csize, srcV, m_csize);
 }
 
 void ShortYuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
@@ -115,6 +115,6 @@
     pixel* dstU = dstYuv.getCbAddr(absPartIdx);
     pixel* dstV = dstYuv.getCrAddr(absPartIdx);
 
-    primitives.chroma[m_csp].copy_sp[part](dstU, dstYuv.m_csize, srcU, m_csize);
-    primitives.chroma[m_csp].copy_sp[part](dstV, dstYuv.m_csize, srcV, m_csize);
+    primitives.chroma[m_csp].pu[part].copy_sp(dstU, dstYuv.m_csize, srcU, m_csize);
+    primitives.chroma[m_csp].pu[part].copy_sp(dstV, dstYuv.m_csize, srcV, m_csize);
 }
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/vec/dct-sse3.cpp	Thu Jan 08 15:23:38 2015 -0600
@@ -1402,9 +1402,9 @@
      * still somewhat rare on end-user PCs we still compile and link these SSE3
      * intrinsic SIMD functions */
 #if !HIGH_BIT_DEPTH
-    p.idct[IDCT_8x8] = idct8;
-    p.idct[IDCT_16x16] = idct16;
-    p.idct[IDCT_32x32] = idct32;
+    p.cu[BLOCK_8x8].idct   = idct8;
+    p.cu[BLOCK_16x16].idct = idct16;
+    p.cu[BLOCK_32x32].idct = idct32;
 #endif
 }
 }
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/vec/dct-ssse3.cpp
--- a/source/common/vec/dct-ssse3.cpp	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/vec/dct-ssse3.cpp	Thu Jan 08 15:23:38 2015 -0600
@@ -1111,8 +1111,8 @@
      * still somewhat rare on end-user PCs we still compile and link these SSSE3
      * intrinsic SIMD functions */
 #if !HIGH_BIT_DEPTH
-    p.dct[DCT_16x16] = dct16;
-    p.dct[DCT_32x32] = dct32;
+    p.cu[BLOCK_16x16].dct = dct16;
+    p.cu[BLOCK_32x32].dct = dct32;
 #endif
 }
 }
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jan 08 15:23:38 2015 -0600
@@ -46,29 +46,29 @@
     const int filterSize = NTAPS_LUMA;
     const int halfFilterSize = filterSize >> 1;
 
-    x265::primitives.luma_hps[size](src, srcStride, immed, MAX_CU_SIZE, idxX, 1);
-    x265::primitives.luma_vsp[size](immed + (halfFilterSize - 1) * MAX_CU_SIZE, MAX_CU_SIZE, dst, dstStride, idxY);
+    x265::primitives.pu[size].luma_hps(src, srcStride, immed, MAX_CU_SIZE, idxX, 1);
+    x265::primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * MAX_CU_SIZE, MAX_CU_SIZE, dst, dstStride, idxY);
 }
 
 #define INIT2_NAME(name1, name2, cpu) \
-    p.name1[LUMA_16x16] = x265_pixel_ ## name2 ## _16x16 ## cpu; \
-    p.name1[LUMA_16x8]  = x265_pixel_ ## name2 ## _16x8 ## cpu;
+    p.pu[LUMA_16x16].name1 = x265_pixel_ ## name2 ## _16x16 ## cpu; \
+    p.pu[LUMA_16x8].name1  = x265_pixel_ ## name2 ## _16x8 ## cpu;
 #define INIT4_NAME(name1, name2, cpu) \
     INIT2_NAME(name1, name2, cpu) \
-    p.name1[LUMA_8x16]  = x265_pixel_ ## name2 ## _8x16 ## cpu; \
-    p.name1[LUMA_8x8]   = x265_pixel_ ## name2 ## _8x8 ## cpu;
+    p.pu[LUMA_8x16].name1  = x265_pixel_ ## name2 ## _8x16 ## cpu; \
+    p.pu[LUMA_8x8].name1   = x265_pixel_ ## name2 ## _8x8 ## cpu;
 #define INIT5_NAME(name1, name2, cpu) \
     INIT4_NAME(name1, name2, cpu) \
-    p.name1[LUMA_8x4]   = x265_pixel_ ## name2 ## _8x4 ## cpu;
+    p.pu[LUMA_8x4].name1   = x265_pixel_ ## name2 ## _8x4 ## cpu;
 #define INIT6_NAME(name1, name2, cpu) \
     INIT5_NAME(name1, name2, cpu) \
-    p.name1[LUMA_4x8]   = x265_pixel_ ## name2 ## _4x8 ## cpu;
+    p.pu[LUMA_4x8].name1   = x265_pixel_ ## name2 ## _4x8 ## cpu;
 #define INIT7_NAME(name1, name2, cpu) \
     INIT6_NAME(name1, name2, cpu) \
-    p.name1[LUMA_4x4]   = x265_pixel_ ## name2 ## _4x4 ## cpu;
+    p.pu[LUMA_4x4].name1   = x265_pixel_ ## name2 ## _4x4 ## cpu;
 #define INIT8_NAME(name1, name2, cpu) \
     INIT7_NAME(name1, name2, cpu) \
-    p.name1[LUMA_4x16]  = x265_pixel_ ## name2 ## _4x16 ## cpu;
+    p.pu[LUMA_4x16].name1  = x265_pixel_ ## name2 ## _4x16 ## cpu;
 #define INIT2(name, cpu) INIT2_NAME(name, name, cpu)
 #define INIT4(name, cpu) INIT4_NAME(name, name, cpu)
 #define INIT5(name, cpu) INIT5_NAME(name, name, cpu)
@@ -77,220 +77,220 @@
 #define INIT8(name, cpu) INIT8_NAME(name, name, cpu)
 
 #define HEVC_SATD(cpu) \
-    p.satd[LUMA_4x8]   = x265_pixel_satd_4x8_ ## cpu; \
-    p.satd[LUMA_4x16]   = x265_pixel_satd_4x16_ ## cpu; \
-    p.satd[LUMA_8x4]   = x265_pixel_satd_8x4_ ## cpu; \
-    p.satd[LUMA_8x8]   = x265_pixel_satd_8x8_ ## cpu; \
-    p.satd[LUMA_8x16]   = x265_pixel_satd_8x16_ ## cpu; \
-    p.satd[LUMA_8x32]   = x265_pixel_satd_8x32_ ## cpu; \
-    p.satd[LUMA_12x16]   = x265_pixel_satd_12x16_ ## cpu; \
-    p.satd[LUMA_16x4]   = x265_pixel_satd_16x4_ ## cpu; \
-    p.satd[LUMA_16x8]   = x265_pixel_satd_16x8_ ## cpu; \
-    p.satd[LUMA_16x12]   = x265_pixel_satd_16x12_ ## cpu; \
-    p.satd[LUMA_16x16]   = x265_pixel_satd_16x16_ ## cpu; \
-    p.satd[LUMA_16x32]   = x265_pixel_satd_16x32_ ## cpu; \
-    p.satd[LUMA_16x64]   = x265_pixel_satd_16x64_ ## cpu; \
-    p.satd[LUMA_24x32]   = x265_pixel_satd_24x32_ ## cpu; \
-    p.satd[LUMA_32x8]   = x265_pixel_satd_32x8_ ## cpu; \
-    p.satd[LUMA_32x16]   = x265_pixel_satd_32x16_ ## cpu; \
-    p.satd[LUMA_32x24]   = x265_pixel_satd_32x24_ ## cpu; \
-    p.satd[LUMA_32x32]   = x265_pixel_satd_32x32_ ## cpu; \
-    p.satd[LUMA_32x64]   = x265_pixel_satd_32x64_ ## cpu; \
-    p.satd[LUMA_48x64]   = x265_pixel_satd_48x64_ ## cpu; \
-    p.satd[LUMA_64x16]   = x265_pixel_satd_64x16_ ## cpu; \
-    p.satd[LUMA_64x32]   = x265_pixel_satd_64x32_ ## cpu; \
-    p.satd[LUMA_64x48]   = x265_pixel_satd_64x48_ ## cpu; \
-    p.satd[LUMA_64x64]   = x265_pixel_satd_64x64_ ## cpu;
+    p.pu[LUMA_4x8].satd   = x265_pixel_satd_4x8_ ## cpu; \
+    p.pu[LUMA_4x16].satd  = x265_pixel_satd_4x16_ ## cpu; \
+    p.pu[LUMA_8x4].satd   = x265_pixel_satd_8x4_ ## cpu; \
+    p.pu[LUMA_8x8].satd   = x265_pixel_satd_8x8_ ## cpu; \
+    p.pu[LUMA_8x16].satd  = x265_pixel_satd_8x16_ ## cpu; \
+    p.pu[LUMA_8x32].satd  = x265_pixel_satd_8x32_ ## cpu; \
+    p.pu[LUMA_12x16].satd = x265_pixel_satd_12x16_ ## cpu; \
+    p.pu[LUMA_16x4].satd  = x265_pixel_satd_16x4_ ## cpu; \
+    p.pu[LUMA_16x8].satd  = x265_pixel_satd_16x8_ ## cpu; \
+    p.pu[LUMA_16x12].satd = x265_pixel_satd_16x12_ ## cpu; \
+    p.pu[LUMA_16x16].satd = x265_pixel_satd_16x16_ ## cpu; \
+    p.pu[LUMA_16x32].satd = x265_pixel_satd_16x32_ ## cpu; \
+    p.pu[LUMA_16x64].satd = x265_pixel_satd_16x64_ ## cpu; \
+    p.pu[LUMA_24x32].satd = x265_pixel_satd_24x32_ ## cpu; \
+    p.pu[LUMA_32x8].satd  = x265_pixel_satd_32x8_ ## cpu; \
+    p.pu[LUMA_32x16].satd = x265_pixel_satd_32x16_ ## cpu; \
+    p.pu[LUMA_32x24].satd = x265_pixel_satd_32x24_ ## cpu; \
+    p.pu[LUMA_32x32].satd = x265_pixel_satd_32x32_ ## cpu; \
+    p.pu[LUMA_32x64].satd = x265_pixel_satd_32x64_ ## cpu; \
+    p.pu[LUMA_48x64].satd = x265_pixel_satd_48x64_ ## cpu; \
+    p.pu[LUMA_64x16].satd = x265_pixel_satd_64x16_ ## cpu; \
+    p.pu[LUMA_64x32].satd = x265_pixel_satd_64x32_ ## cpu; \
+    p.pu[LUMA_64x48].satd = x265_pixel_satd_64x48_ ## cpu; \
+    p.pu[LUMA_64x64].satd = x265_pixel_satd_64x64_ ## cpu;
 
 #define SAD_X3(cpu) \
-    p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \
-    p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ ## cpu; \
-    p.sad_x3[LUMA_16x16] = x265_pixel_sad_x3_16x16_ ## cpu; \
-    p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ ## cpu; \
-    p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ ## cpu; \
-    p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ ## cpu; \
-    p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ ## cpu; \
-    p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ ## cpu; \
-    p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ ## cpu; \
-    p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ ## cpu; \
-    p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ ## cpu; \
-    p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_ ## cpu; \
-    p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_ ## cpu; \
-    p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_ ## cpu; \
-    p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ ## cpu; \
-    p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ ## cpu
+    p.pu[LUMA_16x8].sad_x3  = x265_pixel_sad_x3_16x8_ ## cpu; \
+    p.pu[LUMA_16x12].sad_x3 = x265_pixel_sad_x3_16x12_ ## cpu; \
+    p.pu[LUMA_16x16].sad_x3 = x265_pixel_sad_x3_16x16_ ## cpu; \
+    p.pu[LUMA_16x32].sad_x3 = x265_pixel_sad_x3_16x32_ ## cpu; \
+    p.pu[LUMA_16x64].sad_x3 = x265_pixel_sad_x3_16x64_ ## cpu; \
+    p.pu[LUMA_32x8].sad_x3  = x265_pixel_sad_x3_32x8_ ## cpu; \
+    p.pu[LUMA_32x16].sad_x3 = x265_pixel_sad_x3_32x16_ ## cpu; \
+    p.pu[LUMA_32x24].sad_x3 = x265_pixel_sad_x3_32x24_ ## cpu; \
+    p.pu[LUMA_32x32].sad_x3 = x265_pixel_sad_x3_32x32_ ## cpu; \
+    p.pu[LUMA_32x64].sad_x3 = x265_pixel_sad_x3_32x64_ ## cpu; \
+    p.pu[LUMA_24x32].sad_x3 = x265_pixel_sad_x3_24x32_ ## cpu; \
+    p.pu[LUMA_48x64].sad_x3 = x265_pixel_sad_x3_48x64_ ## cpu; \
+    p.pu[LUMA_64x16].sad_x3 = x265_pixel_sad_x3_64x16_ ## cpu; \
+    p.pu[LUMA_64x32].sad_x3 = x265_pixel_sad_x3_64x32_ ## cpu; \
+    p.pu[LUMA_64x48].sad_x3 = x265_pixel_sad_x3_64x48_ ## cpu; \
+    p.pu[LUMA_64x64].sad_x3 = x265_pixel_sad_x3_64x64_ ## cpu
 
 #define SAD_X4(cpu) \
-    p.sad_x4[LUMA_16x8] = x265_pixel_sad_x4_16x8_ ## cpu; \
-    p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ ## cpu; \
-    p.sad_x4[LUMA_16x16] = x265_pixel_sad_x4_16x16_ ## cpu; \
-    p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ ## cpu; \
-    p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ ## cpu; \
-    p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ ## cpu; \
-    p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ ## cpu; \
-    p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ ## cpu; \
-    p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ ## cpu; \
-    p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ ## cpu; \
-    p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ ## cpu; \
-    p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ ## cpu; \
-    p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ ## cpu; \
-    p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ ## cpu; \
-    p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ ## cpu; \
-    p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_ ## cpu
+    p.pu[LUMA_16x8].sad_x4  = x265_pixel_sad_x4_16x8_ ## cpu; \
+    p.pu[LUMA_16x12].sad_x4 = x265_pixel_sad_x4_16x12_ ## cpu; \
+    p.pu[LUMA_16x16].sad_x4 = x265_pixel_sad_x4_16x16_ ## cpu; \
+    p.pu[LUMA_16x32].sad_x4 = x265_pixel_sad_x4_16x32_ ## cpu; \
+    p.pu[LUMA_16x64].sad_x4 = x265_pixel_sad_x4_16x64_ ## cpu; \
+    p.pu[LUMA_32x8].sad_x4  = x265_pixel_sad_x4_32x8_ ## cpu; \
+    p.pu[LUMA_32x16].sad_x4 = x265_pixel_sad_x4_32x16_ ## cpu; \
+    p.pu[LUMA_32x24].sad_x4 = x265_pixel_sad_x4_32x24_ ## cpu; \
+    p.pu[LUMA_32x32].sad_x4 = x265_pixel_sad_x4_32x32_ ## cpu; \
+    p.pu[LUMA_32x64].sad_x4 = x265_pixel_sad_x4_32x64_ ## cpu; \
+    p.pu[LUMA_24x32].sad_x4 = x265_pixel_sad_x4_24x32_ ## cpu; \
+    p.pu[LUMA_48x64].sad_x4 = x265_pixel_sad_x4_48x64_ ## cpu; \
+    p.pu[LUMA_64x16].sad_x4 = x265_pixel_sad_x4_64x16_ ## cpu; \
+    p.pu[LUMA_64x32].sad_x4 = x265_pixel_sad_x4_64x32_ ## cpu; \
+    p.pu[LUMA_64x48].sad_x4 = x265_pixel_sad_x4_64x48_ ## cpu; \
+    p.pu[LUMA_64x64].sad_x4 = x265_pixel_sad_x4_64x64_ ## cpu
 
 #define SAD(cpu) \
-    p.sad[LUMA_8x32]  = x265_pixel_sad_8x32_ ## cpu; \
-    p.sad[LUMA_16x4]  = x265_pixel_sad_16x4_ ## cpu; \
-    p.sad[LUMA_16x12] = x265_pixel_sad_16x12_ ## cpu; \
-    p.sad[LUMA_16x32] = x265_pixel_sad_16x32_ ## cpu; \
-    p.sad[LUMA_16x64] = x265_pixel_sad_16x64_ ## cpu; \
-    p.sad[LUMA_32x8]  = x265_pixel_sad_32x8_ ## cpu; \
-    p.sad[LUMA_32x16] = x265_pixel_sad_32x16_ ## cpu; \
-    p.sad[LUMA_32x24] = x265_pixel_sad_32x24_ ## cpu; \
-    p.sad[LUMA_32x32] = x265_pixel_sad_32x32_ ## cpu; \
-    p.sad[LUMA_32x64] = x265_pixel_sad_32x64_ ## cpu; \
-    p.sad[LUMA_64x16] = x265_pixel_sad_64x16_ ## cpu; \
-    p.sad[LUMA_64x32] = x265_pixel_sad_64x32_ ## cpu; \
-    p.sad[LUMA_64x48] = x265_pixel_sad_64x48_ ## cpu; \
-    p.sad[LUMA_64x64] = x265_pixel_sad_64x64_ ## cpu; \
-    p.sad[LUMA_48x64] = x265_pixel_sad_48x64_ ## cpu; \
-    p.sad[LUMA_24x32] = x265_pixel_sad_24x32_ ## cpu; \
-    p.sad[LUMA_12x16] = x265_pixel_sad_12x16_ ## cpu
+    p.pu[LUMA_8x32].sad  = x265_pixel_sad_8x32_ ## cpu; \
+    p.pu[LUMA_16x4].sad  = x265_pixel_sad_16x4_ ## cpu; \
+    p.pu[LUMA_16x12].sad = x265_pixel_sad_16x12_ ## cpu; \
+    p.pu[LUMA_16x32].sad = x265_pixel_sad_16x32_ ## cpu; \
+    p.pu[LUMA_16x64].sad = x265_pixel_sad_16x64_ ## cpu; \
+    p.pu[LUMA_32x8].sad  = x265_pixel_sad_32x8_ ## cpu; \
+    p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_ ## cpu; \
+    p.pu[LUMA_32x24].sad = x265_pixel_sad_32x24_ ## cpu; \
+    p.pu[LUMA_32x32].sad = x265_pixel_sad_32x32_ ## cpu; \
+    p.pu[LUMA_32x64].sad = x265_pixel_sad_32x64_ ## cpu; \
+    p.pu[LUMA_64x16].sad = x265_pixel_sad_64x16_ ## cpu; \
+    p.pu[LUMA_64x32].sad = x265_pixel_sad_64x32_ ## cpu; \
+    p.pu[LUMA_64x48].sad = x265_pixel_sad_64x48_ ## cpu; \
+    p.pu[LUMA_64x64].sad = x265_pixel_sad_64x64_ ## cpu; \
+    p.pu[LUMA_48x64].sad = x265_pixel_sad_48x64_ ## cpu; \
+    p.pu[LUMA_24x32].sad = x265_pixel_sad_24x32_ ## cpu; \
+    p.pu[LUMA_12x16].sad = x265_pixel_sad_12x16_ ## cpu
 
 #define ASSGN_SSE(cpu) \
-    p.sse_pp[LUMA_8x8]   = x265_pixel_ssd_8x8_ ## cpu; \
-    p.sse_pp[LUMA_8x4]   = x265_pixel_ssd_8x4_ ## cpu; \
-    p.sse_pp[LUMA_16x16] = x265_pixel_ssd_16x16_ ## cpu; \
-    p.sse_pp[LUMA_16x4]  = x265_pixel_ssd_16x4_ ## cpu; \
-    p.sse_pp[LUMA_16x8]  = x265_pixel_ssd_16x8_ ## cpu; \
-    p.sse_pp[LUMA_8x16]  = x265_pixel_ssd_8x16_ ## cpu; \
-    p.sse_pp[LUMA_16x12] = x265_pixel_ssd_16x12_ ## cpu; \
-    p.sse_pp[LUMA_32x32] = x265_pixel_ssd_32x32_ ## cpu; \
-    p.sse_pp[LUMA_32x16] = x265_pixel_ssd_32x16_ ## cpu; \
-    p.sse_pp[LUMA_16x32] = x265_pixel_ssd_16x32_ ## cpu; \
-    p.sse_pp[LUMA_8x32]  = x265_pixel_ssd_8x32_ ## cpu; \
-    p.sse_pp[LUMA_32x8]  = x265_pixel_ssd_32x8_ ## cpu; \
-    p.sse_pp[LUMA_32x24] = x265_pixel_ssd_32x24_ ## cpu; \
-    p.sse_pp[LUMA_32x64] = x265_pixel_ssd_32x64_ ## cpu; \
-    p.sse_pp[LUMA_16x64] = x265_pixel_ssd_16x64_ ## cpu
+    p.pu[LUMA_8x8].sse_pp   = x265_pixel_ssd_8x8_ ## cpu; \
+    p.pu[LUMA_8x4].sse_pp   = x265_pixel_ssd_8x4_ ## cpu; \
+    p.pu[LUMA_16x16].sse_pp = x265_pixel_ssd_16x16_ ## cpu; \
+    p.pu[LUMA_16x4].sse_pp  = x265_pixel_ssd_16x4_ ## cpu; \
+    p.pu[LUMA_16x8].sse_pp  = x265_pixel_ssd_16x8_ ## cpu; \
+    p.pu[LUMA_8x16].sse_pp  = x265_pixel_ssd_8x16_ ## cpu; \
+    p.pu[LUMA_16x12].sse_pp = x265_pixel_ssd_16x12_ ## cpu; \
+    p.pu[LUMA_32x32].sse_pp = x265_pixel_ssd_32x32_ ## cpu; \
+    p.pu[LUMA_32x16].sse_pp = x265_pixel_ssd_32x16_ ## cpu; \
+    p.pu[LUMA_16x32].sse_pp = x265_pixel_ssd_16x32_ ## cpu; \
+    p.pu[LUMA_8x32].sse_pp  = x265_pixel_ssd_8x32_ ## cpu; \
+    p.pu[LUMA_32x8].sse_pp  = x265_pixel_ssd_32x8_ ## cpu; \
+    p.pu[LUMA_32x24].sse_pp = x265_pixel_ssd_32x24_ ## cpu; \
+    p.pu[LUMA_32x64].sse_pp = x265_pixel_ssd_32x64_ ## cpu; \
+    p.pu[LUMA_16x64].sse_pp = x265_pixel_ssd_16x64_ ## cpu
 
 #define ASSGN_SSE_SS(cpu) \
-    p.sse_ss[LUMA_4x4]   = x265_pixel_ssd_ss_4x4_ ## cpu; \
-    p.sse_ss[LUMA_4x8]   = x265_pixel_ssd_ss_4x8_ ## cpu; \
-    p.sse_ss[LUMA_4x16]   = x265_pixel_ssd_ss_4x16_ ## cpu; \
-    p.sse_ss[LUMA_8x4]   = x265_pixel_ssd_ss_8x4_ ## cpu; \
-    p.sse_ss[LUMA_8x8]   = x265_pixel_ssd_ss_8x8_ ## cpu; \
-    p.sse_ss[LUMA_8x16]   = x265_pixel_ssd_ss_8x16_ ## cpu; \
-    p.sse_ss[LUMA_8x32]   = x265_pixel_ssd_ss_8x32_ ## cpu; \
-    p.sse_ss[LUMA_12x16]   = x265_pixel_ssd_ss_12x16_ ## cpu; \
-    p.sse_ss[LUMA_16x4]   = x265_pixel_ssd_ss_16x4_ ## cpu; \
-    p.sse_ss[LUMA_16x8]   = x265_pixel_ssd_ss_16x8_ ## cpu; \
-    p.sse_ss[LUMA_16x12]   = x265_pixel_ssd_ss_16x12_ ## cpu; \
-    p.sse_ss[LUMA_16x16]   = x265_pixel_ssd_ss_16x16_ ## cpu; \
-    p.sse_ss[LUMA_16x32]   = x265_pixel_ssd_ss_16x32_ ## cpu; \
-    p.sse_ss[LUMA_16x64]   = x265_pixel_ssd_ss_16x64_ ## cpu; \
-    p.sse_ss[LUMA_24x32]   = x265_pixel_ssd_ss_24x32_ ## cpu; \
-    p.sse_ss[LUMA_32x8]   = x265_pixel_ssd_ss_32x8_ ## cpu; \
-    p.sse_ss[LUMA_32x16]   = x265_pixel_ssd_ss_32x16_ ## cpu; \
-    p.sse_ss[LUMA_32x24]   = x265_pixel_ssd_ss_32x24_ ## cpu; \
-    p.sse_ss[LUMA_32x32]   = x265_pixel_ssd_ss_32x32_ ## cpu; \
-    p.sse_ss[LUMA_32x64]   = x265_pixel_ssd_ss_32x64_ ## cpu; \
-    p.sse_ss[LUMA_48x64]   = x265_pixel_ssd_ss_48x64_ ## cpu; \
-    p.sse_ss[LUMA_64x16]   = x265_pixel_ssd_ss_64x16_ ## cpu; \
-    p.sse_ss[LUMA_64x32]   = x265_pixel_ssd_ss_64x32_ ## cpu; \
-    p.sse_ss[LUMA_64x48]   = x265_pixel_ssd_ss_64x48_ ## cpu; \
-    p.sse_ss[LUMA_64x64]   = x265_pixel_ssd_ss_64x64_ ## cpu;
+    p.pu[LUMA_4x4].sse_ss   = x265_pixel_ssd_ss_4x4_ ## cpu; \
+    p.pu[LUMA_4x8].sse_ss   = x265_pixel_ssd_ss_4x8_ ## cpu; \
+    p.pu[LUMA_4x16].sse_ss  = x265_pixel_ssd_ss_4x16_ ## cpu; \
+    p.pu[LUMA_8x4].sse_ss   = x265_pixel_ssd_ss_8x4_ ## cpu; \
+    p.pu[LUMA_8x8].sse_ss   = x265_pixel_ssd_ss_8x8_ ## cpu; \
+    p.pu[LUMA_8x16].sse_ss  = x265_pixel_ssd_ss_8x16_ ## cpu; \
+    p.pu[LUMA_8x32].sse_ss  = x265_pixel_ssd_ss_8x32_ ## cpu; \
+    p.pu[LUMA_12x16].sse_ss = x265_pixel_ssd_ss_12x16_ ## cpu; \
+    p.pu[LUMA_16x4].sse_ss  = x265_pixel_ssd_ss_16x4_ ## cpu; \
+    p.pu[LUMA_16x8].sse_ss  = x265_pixel_ssd_ss_16x8_ ## cpu; \
+    p.pu[LUMA_16x12].sse_ss = x265_pixel_ssd_ss_16x12_ ## cpu; \
+    p.pu[LUMA_16x16].sse_ss = x265_pixel_ssd_ss_16x16_ ## cpu; \
+    p.pu[LUMA_16x32].sse_ss = x265_pixel_ssd_ss_16x32_ ## cpu; \
+    p.pu[LUMA_16x64].sse_ss = x265_pixel_ssd_ss_16x64_ ## cpu; \
+    p.pu[LUMA_24x32].sse_ss = x265_pixel_ssd_ss_24x32_ ## cpu; \
+    p.pu[LUMA_32x8].sse_ss  = x265_pixel_ssd_ss_32x8_ ## cpu; \
+    p.pu[LUMA_32x16].sse_ss = x265_pixel_ssd_ss_32x16_ ## cpu; \
+    p.pu[LUMA_32x24].sse_ss = x265_pixel_ssd_ss_32x24_ ## cpu; \
+    p.pu[LUMA_32x32].sse_ss = x265_pixel_ssd_ss_32x32_ ## cpu; \
+    p.pu[LUMA_32x64].sse_ss = x265_pixel_ssd_ss_32x64_ ## cpu; \
+    p.pu[LUMA_48x64].sse_ss = x265_pixel_ssd_ss_48x64_ ## cpu; \
+    p.pu[LUMA_64x16].sse_ss = x265_pixel_ssd_ss_64x16_ ## cpu; \
+    p.pu[LUMA_64x32].sse_ss = x265_pixel_ssd_ss_64x32_ ## cpu; \
+    p.pu[LUMA_64x48].sse_ss = x265_pixel_ssd_ss_64x48_ ## cpu; \
+    p.pu[LUMA_64x64].sse_ss = x265_pixel_ssd_ss_64x64_ ## cpu;
 
 #define SA8D_INTER_FROM_BLOCK(cpu) \
-    p.sa8d_inter[LUMA_4x8]   = x265_pixel_satd_4x8_ ## cpu; \
-    p.sa8d_inter[LUMA_8x4]   = x265_pixel_satd_8x4_ ## cpu; \
-    p.sa8d_inter[LUMA_4x16]  = x265_pixel_satd_4x16_ ## cpu; \
-    p.sa8d_inter[LUMA_16x4]  = x265_pixel_satd_16x4_ ## cpu; \
-    p.sa8d_inter[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \
-    p.sa8d_inter[LUMA_8x8]   = x265_pixel_sa8d_8x8_ ## cpu; \
-    p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_ ## cpu; \
-    p.sa8d_inter[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \
-    p.sa8d_inter[LUMA_16x8]  = x265_pixel_sa8d_16x8_ ## cpu; \
-    p.sa8d_inter[LUMA_8x16]  = x265_pixel_sa8d_8x16_ ## cpu; \
-    p.sa8d_inter[LUMA_32x24] = x265_pixel_sa8d_32x24_ ## cpu; \
-    p.sa8d_inter[LUMA_24x32] = x265_pixel_sa8d_24x32_ ## cpu; \
-    p.sa8d_inter[LUMA_32x8]  = x265_pixel_sa8d_32x8_ ## cpu; \
-    p.sa8d_inter[LUMA_8x32]  = x265_pixel_sa8d_8x32_ ## cpu; \
-    p.sa8d_inter[LUMA_32x32] = x265_pixel_sa8d_32x32_ ## cpu; \
-    p.sa8d_inter[LUMA_32x16] = x265_pixel_sa8d_32x16_ ## cpu; \
-    p.sa8d_inter[LUMA_16x32] = x265_pixel_sa8d_16x32_ ## cpu; \
-    p.sa8d_inter[LUMA_64x64] = x265_pixel_sa8d_64x64_ ## cpu; \
-    p.sa8d_inter[LUMA_64x32] = x265_pixel_sa8d_64x32_ ## cpu; \
-    p.sa8d_inter[LUMA_32x64] = x265_pixel_sa8d_32x64_ ## cpu; \
-    p.sa8d_inter[LUMA_64x48] = x265_pixel_sa8d_64x48_ ## cpu; \
-    p.sa8d_inter[LUMA_48x64] = x265_pixel_sa8d_48x64_ ## cpu; \
-    p.sa8d_inter[LUMA_64x16] = x265_pixel_sa8d_64x16_ ## cpu; \
-    p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_ ## cpu;
+    p.pu[LUMA_4x8].sa8d_inter   = x265_pixel_satd_4x8_ ## cpu; \
+    p.pu[LUMA_8x4].sa8d_inter   = x265_pixel_satd_8x4_ ## cpu; \
+    p.pu[LUMA_4x16].sa8d_inter  = x265_pixel_satd_4x16_ ## cpu; \
+    p.pu[LUMA_16x4].sa8d_inter  = x265_pixel_satd_16x4_ ## cpu; \
+    p.pu[LUMA_12x16].sa8d_inter = x265_pixel_satd_12x16_ ## cpu; \
+    p.pu[LUMA_8x8].sa8d_inter   = x265_pixel_sa8d_8x8_ ## cpu; \
+    p.pu[LUMA_16x16].sa8d_inter = x265_pixel_sa8d_16x16_ ## cpu; \
+    p.pu[LUMA_16x12].sa8d_inter = x265_pixel_satd_16x12_ ## cpu; \
+    p.pu[LUMA_16x8].sa8d_inter  = x265_pixel_sa8d_16x8_ ## cpu; \
+    p.pu[LUMA_8x16].sa8d_inter  = x265_pixel_sa8d_8x16_ ## cpu; \
+    p.pu[LUMA_32x24].sa8d_inter = x265_pixel_sa8d_32x24_ ## cpu; \
+    p.pu[LUMA_24x32].sa8d_inter = x265_pixel_sa8d_24x32_ ## cpu; \
+    p.pu[LUMA_32x8].sa8d_inter  = x265_pixel_sa8d_32x8_ ## cpu; \
+    p.pu[LUMA_8x32].sa8d_inter  = x265_pixel_sa8d_8x32_ ## cpu; \
+    p.pu[LUMA_32x32].sa8d_inter = x265_pixel_sa8d_32x32_ ## cpu; \
+    p.pu[LUMA_32x16].sa8d_inter = x265_pixel_sa8d_32x16_ ## cpu; \
+    p.pu[LUMA_16x32].sa8d_inter = x265_pixel_sa8d_16x32_ ## cpu; \
+    p.pu[LUMA_64x64].sa8d_inter = x265_pixel_sa8d_64x64_ ## cpu; \
+    p.pu[LUMA_64x32].sa8d_inter = x265_pixel_sa8d_64x32_ ## cpu; \
+    p.pu[LUMA_32x64].sa8d_inter = x265_pixel_sa8d_32x64_ ## cpu; \
+    p.pu[LUMA_64x48].sa8d_inter = x265_pixel_sa8d_64x48_ ## cpu; \
+    p.pu[LUMA_48x64].sa8d_inter = x265_pixel_sa8d_48x64_ ## cpu; \
+    p.pu[LUMA_64x16].sa8d_inter = x265_pixel_sa8d_64x16_ ## cpu; \
+    p.pu[LUMA_16x64].sa8d_inter = x265_pixel_sa8d_16x64_ ## cpu;
 
 #define PIXEL_AVG(cpu) \
-    p.pixelavg_pp[LUMA_64x64] = x265_pixel_avg_64x64_ ## cpu; \
-    p.pixelavg_pp[LUMA_64x48] = x265_pixel_avg_64x48_ ## cpu; \
-    p.pixelavg_pp[LUMA_64x32] = x265_pixel_avg_64x32_ ## cpu; \
-    p.pixelavg_pp[LUMA_64x16] = x265_pixel_avg_64x16_ ## cpu; \
-    p.pixelavg_pp[LUMA_48x64] = x265_pixel_avg_48x64_ ## cpu; \
-    p.pixelavg_pp[LUMA_32x64] = x265_pixel_avg_32x64_ ## cpu; \
-    p.pixelavg_pp[LUMA_32x32] = x265_pixel_avg_32x32_ ## cpu; \
-    p.pixelavg_pp[LUMA_32x24] = x265_pixel_avg_32x24_ ## cpu; \
-    p.pixelavg_pp[LUMA_32x16] = x265_pixel_avg_32x16_ ## cpu; \
-    p.pixelavg_pp[LUMA_32x8] = x265_pixel_avg_32x8_ ## cpu; \
-    p.pixelavg_pp[LUMA_24x32] = x265_pixel_avg_24x32_ ## cpu; \
-    p.pixelavg_pp[LUMA_16x64] = x265_pixel_avg_16x64_ ## cpu; \
-    p.pixelavg_pp[LUMA_16x32] = x265_pixel_avg_16x32_ ## cpu; \
-    p.pixelavg_pp[LUMA_16x16] = x265_pixel_avg_16x16_ ## cpu; \
-    p.pixelavg_pp[LUMA_16x12]  = x265_pixel_avg_16x12_ ## cpu; \
-    p.pixelavg_pp[LUMA_16x8]  = x265_pixel_avg_16x8_ ## cpu; \
-    p.pixelavg_pp[LUMA_16x4]  = x265_pixel_avg_16x4_ ## cpu; \
-    p.pixelavg_pp[LUMA_12x16] = x265_pixel_avg_12x16_ ## cpu; \
-    p.pixelavg_pp[LUMA_8x32]  = x265_pixel_avg_8x32_ ## cpu; \
-    p.pixelavg_pp[LUMA_8x16]  = x265_pixel_avg_8x16_ ## cpu; \
-    p.pixelavg_pp[LUMA_8x8]   = x265_pixel_avg_8x8_ ## cpu; \
-    p.pixelavg_pp[LUMA_8x4]   = x265_pixel_avg_8x4_ ## cpu;
+    p.pu[LUMA_64x64].pixelavg_pp = x265_pixel_avg_64x64_ ## cpu; \
+    p.pu[LUMA_64x48].pixelavg_pp = x265_pixel_avg_64x48_ ## cpu; \
+    p.pu[LUMA_64x32].pixelavg_pp = x265_pixel_avg_64x32_ ## cpu; \
+    p.pu[LUMA_64x16].pixelavg_pp = x265_pixel_avg_64x16_ ## cpu; \
+    p.pu[LUMA_48x64].pixelavg_pp = x265_pixel_avg_48x64_ ## cpu; \
+    p.pu[LUMA_32x64].pixelavg_pp = x265_pixel_avg_32x64_ ## cpu; \
+    p.pu[LUMA_32x32].pixelavg_pp = x265_pixel_avg_32x32_ ## cpu; \
+    p.pu[LUMA_32x24].pixelavg_pp = x265_pixel_avg_32x24_ ## cpu; \
+    p.pu[LUMA_32x16].pixelavg_pp = x265_pixel_avg_32x16_ ## cpu; \
+    p.pu[LUMA_32x8].pixelavg_pp  = x265_pixel_avg_32x8_ ## cpu; \
+    p.pu[LUMA_24x32].pixelavg_pp = x265_pixel_avg_24x32_ ## cpu; \
+    p.pu[LUMA_16x64].pixelavg_pp = x265_pixel_avg_16x64_ ## cpu; \
+    p.pu[LUMA_16x32].pixelavg_pp = x265_pixel_avg_16x32_ ## cpu; \
+    p.pu[LUMA_16x16].pixelavg_pp = x265_pixel_avg_16x16_ ## cpu; \
+    p.pu[LUMA_16x12].pixelavg_pp = x265_pixel_avg_16x12_ ## cpu; \
+    p.pu[LUMA_16x8].pixelavg_pp  = x265_pixel_avg_16x8_ ## cpu; \
+    p.pu[LUMA_16x4].pixelavg_pp  = x265_pixel_avg_16x4_ ## cpu; \
+    p.pu[LUMA_12x16].pixelavg_pp = x265_pixel_avg_12x16_ ## cpu; \
+    p.pu[LUMA_8x32].pixelavg_pp  = x265_pixel_avg_8x32_ ## cpu; \
+    p.pu[LUMA_8x16].pixelavg_pp  = x265_pixel_avg_8x16_ ## cpu; \
+    p.pu[LUMA_8x8].pixelavg_pp   = x265_pixel_avg_8x8_ ## cpu; \
+    p.pu[LUMA_8x4].pixelavg_pp   = x265_pixel_avg_8x4_ ## cpu;
 
 #define PIXEL_AVG_W4(cpu) \
-    p.pixelavg_pp[LUMA_4x4]  = x265_pixel_avg_4x4_ ## cpu; \
-    p.pixelavg_pp[LUMA_4x8]  = x265_pixel_avg_4x8_ ## cpu; \
-    p.pixelavg_pp[LUMA_4x16] = x265_pixel_avg_4x16_ ## cpu;
+    p.pu[LUMA_4x4].pixelavg_pp  = x265_pixel_avg_4x4_ ## cpu; \
+    p.pu[LUMA_4x8].pixelavg_pp  = x265_pixel_avg_4x8_ ## cpu; \
+    p.pu[LUMA_4x16].pixelavg_pp = x265_pixel_avg_4x16_ ## cpu;
 
 #define SETUP_CHROMA_FUNC_DEF_420(W, H, cpu) \
-    p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
 
 #define SETUP_CHROMA_FUNC_DEF_422(W, H, cpu) \
-    p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
 
 #define SETUP_CHROMA_FUNC_DEF_444(W, H, cpu) \
-    p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
 
 #define SETUP_CHROMA_SP_FUNC_DEF_420(W, H, cpu) \
-    p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
 
 #define SETUP_CHROMA_SP_FUNC_DEF_422(W, H, cpu) \
-    p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
 
 #define SETUP_CHROMA_SP_FUNC_DEF_444(W, H, cpu) \
-    p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
 
 #define SETUP_CHROMA_SS_FUNC_DEF_420(W, H, cpu) \
-    p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
 
 #define SETUP_CHROMA_SS_FUNC_DEF_422(W, H, cpu) \
-    p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
 
 #define SETUP_CHROMA_SS_FUNC_DEF_444(W, H, cpu) \
-    p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
 
 #define CHROMA_FILTERS_420(cpu) \
     SETUP_CHROMA_FUNC_DEF_420(4, 4, cpu); \
@@ -538,37 +538,37 @@
 
 #if HIGH_BIT_DEPTH    // temporary, until all 10bit functions are completed
 #define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
-    p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
-    p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
-    p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
-    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
-    p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu; \
-    p.luma_hvpp[LUMA_ ## W ## x ## H] = interp_8tap_hv_pp_cpu<LUMA_ ## W ## x ## H>;
+    p.pu[LUMA_ ## W ## x ## H].luma_hpp = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.pu[LUMA_ ## W ## x ## H].luma_hps = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
+    p.pu[LUMA_ ## W ## x ## H].luma_vpp = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
+    p.pu[LUMA_ ## W ## x ## H].luma_vps = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
+    p.pu[LUMA_ ## W ## x ## H].luma_vsp = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu; \
+    p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_ ## W ## x ## H>;
 #else
 #define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
-    p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
-    p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
-    p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
-    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
-    p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu; \
-    p.luma_hvpp[LUMA_ ## W ## x ## H] = interp_8tap_hv_pp_cpu<LUMA_ ## W ## x ## H>;
+    p.pu[LUMA_ ## W ## x ## H].luma_hpp = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.pu[LUMA_ ## W ## x ## H].luma_hps = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
+    p.pu[LUMA_ ## W ## x ## H].luma_vpp = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
+    p.pu[LUMA_ ## W ## x ## H].luma_vps = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
+    p.pu[LUMA_ ## W ## x ## H].luma_vsp = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu; \
+    p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_ ## W ## x ## H>;
 #endif // if HIGH_BIT_DEPTH
 
 #define SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu) \
-    p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
-    p.luma_add_ps[LUMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
+    p.pu[LUMA_ ## W ## x ## H].luma_sub_ps = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
+    p.pu[LUMA_ ## W ## x ## H].luma_add_ps = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
 
 #define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
-    p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
+    p.pu[LUMA_ ## W ## x ## H].luma_vsp = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
 
 #define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
-    p.luma_vss[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu;
+    p.pu[LUMA_ ## W ## x ## H].luma_vss = x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu;
 
 #define SETUP_LUMA_BLOCKCOPY(type, W, H, cpu) \
-    p.luma_copy_ ## type[LUMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
+    p.pu[LUMA_ ## W ## x ## H].luma_copy_ ## type = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
 
 #define SETUP_CHROMA_BLOCKCOPY(type, W, H, cpu) \
-    p.chroma[X265_CSP_I420].copy_ ## type[CHROMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_ ## type = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
 
 #define CHROMA_BLOCKCOPY(type, cpu) \
     SETUP_CHROMA_BLOCKCOPY(type, 2,  4,  cpu); \
@@ -597,7 +597,7 @@
     SETUP_CHROMA_BLOCKCOPY(type, 32, 32, cpu);
 
 #define SETUP_CHROMA_BLOCKCOPY_422(type, W, H, cpu) \
-    p.chroma[X265_CSP_I422].copy_ ## type[CHROMA422_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_ ## type = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
 
 #define CHROMA_BLOCKCOPY_422(type, cpu) \
     SETUP_CHROMA_BLOCKCOPY_422(type, 2,  8,  cpu); \
@@ -653,7 +653,7 @@
     SETUP_LUMA_BLOCKCOPY(type, 16, 64, cpu);
 
 #define SETUP_CHROMA_BLOCKCOPY_SP(W, H, cpu) \
-    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].copy_sp = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
 
 #define CHROMA_BLOCKCOPY_SP(cpu) \
     SETUP_CHROMA_BLOCKCOPY_SP(2,  4,  cpu); \
@@ -682,7 +682,7 @@
     SETUP_CHROMA_BLOCKCOPY_SP(32, 32, cpu);
 
 #define SETUP_CHROMA_BLOCKCOPY_SP_422(W, H, cpu) \
-    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].copy_sp = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
 
 #define CHROMA_BLOCKCOPY_SP_422(cpu) \
     SETUP_CHROMA_BLOCKCOPY_SP_422(2,  8,  cpu); \
@@ -711,8 +711,8 @@
     SETUP_CHROMA_BLOCKCOPY_SP_422(32, 64, cpu);
 
 #define SETUP_CHROMA_PIXELSUB(W, H, cpu) \
-    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I420].cu[CHROMA_ ## W ## x ## H].sub_ps = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I420].cu[CHROMA_ ## W ## x ## H].add_ps = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
 
 #define CHROMA_PIXELSUB_PS(cpu) \
     SETUP_CHROMA_PIXELSUB(4,  4,  cpu); \
@@ -721,8 +721,8 @@
     SETUP_CHROMA_PIXELSUB(32, 32, cpu);
 
 #define SETUP_CHROMA_PIXELSUB_422(W, H, cpu) \
-    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I422].cu[CHROMA422_ ## W ## x ## H].sub_ps = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].cu[CHROMA422_ ## W ## x ## H].add_ps = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
 
 #define CHROMA_PIXELSUB_PS_422(cpu) \
     SETUP_CHROMA_PIXELSUB_422(4,  8,  cpu); \
@@ -819,7 +819,7 @@
     SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu);
 
 #define SETUP_PIXEL_VAR_DEF(W, H, cpu) \
-    p.var[BLOCK_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
+    p.cu[BLOCK_ ## W ## x ## H].var = x265_pixel_var_ ## W ## x ## H ## cpu;
 
 #define LUMA_VAR(cpu) \
     SETUP_PIXEL_VAR_DEF(8,   8, cpu); \
@@ -828,7 +828,7 @@
     SETUP_PIXEL_VAR_DEF(64, 64, cpu);
 
 #define SETUP_PIXEL_SSE_SP_DEF(W, H, cpu) \
-    p.sse_sp[LUMA_ ## W ## x ## H] = x265_pixel_ssd_sp_ ## W ## x ## H ## cpu;
+    p.pu[LUMA_ ## W ## x ## H].sse_sp = x265_pixel_ssd_sp_ ## W ## x ## H ## cpu;
 
 #define LUMA_SSE_SP(cpu) \
     SETUP_PIXEL_SSE_SP_DEF(4,   4, cpu); \
@@ -858,7 +858,7 @@
     SETUP_PIXEL_SSE_SP_DEF(16, 64, cpu);
 
 #define SETUP_LUMA_ADDAVG_FUNC_DEF(W, H, cpu) \
-    p.luma_addAvg[LUMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
+    p.pu[LUMA_ ## W ## x ## H].luma_addAvg = x265_addAvg_ ## W ## x ## H ## cpu;
 
 #define LUMA_ADDAVG(cpu) \
     SETUP_LUMA_ADDAVG_FUNC_DEF(4,  4,  cpu); \
@@ -888,7 +888,7 @@
     SETUP_LUMA_ADDAVG_FUNC_DEF(64, 64, cpu); \
 
 #define SETUP_CHROMA_ADDAVG_FUNC_DEF(W, H, cpu) \
-    p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].addAvg = x265_addAvg_ ## W ## x ## H ## cpu;
 
 #define CHROMA_ADDAVG(cpu) \
     SETUP_CHROMA_ADDAVG_FUNC_DEF(2,  4,  cpu); \
@@ -917,7 +917,7 @@
     SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
 
 #define SETUP_CHROMA_ADDAVG_FUNC_DEF_422(W, H, cpu) \
-    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].addAvg = x265_addAvg_ ## W ## x ## H ## cpu;
 
 #define CHROMA_ADDAVG_422(cpu) \
     SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2,  8,  cpu); \
@@ -1054,10 +1054,10 @@
     SETUP_INTRA_ANG16_32(33, 33, cpu);
 
 #define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \
-    p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
 
 #define CHROMA_VERT_FILTERS(cpu) \
     SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
@@ -1088,10 +1088,10 @@
     SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu);
 
 #define SETUP_CHROMA_VERT_FUNC_DEF_422(W, H, cpu) \
-    p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
 
 #define CHROMA_VERT_FILTERS_422(cpu) \
     SETUP_CHROMA_VERT_FUNC_DEF_422(4, 8, cpu); \
@@ -1122,10 +1122,10 @@
     SETUP_CHROMA_VERT_FUNC_DEF_422(6, 16, cpu);
 
 #define SETUP_CHROMA_VERT_FUNC_DEF_444(W, H, cpu) \
-    p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
 
 #define CHROMA_VERT_FILTERS_444(cpu) \
     SETUP_CHROMA_VERT_FUNC_DEF_444(8, 8, cpu); \
@@ -1154,8 +1154,8 @@
     SETUP_CHROMA_VERT_FUNC_DEF_444(16, 64, cpu);
 
 #define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
-    p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
 
 #define CHROMA_HORIZ_FILTERS(cpu) \
     SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
@@ -1184,8 +1184,8 @@
     SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu);
 
 #define SETUP_CHROMA_HORIZ_FUNC_DEF_422(W, H, cpu) \
-    p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].pu[CHROMA422_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
 
 #define CHROMA_HORIZ_FILTERS_422(cpu) \
     SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 8, cpu); \
@@ -1214,8 +1214,8 @@
     SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 64, cpu);
 
 #define SETUP_CHROMA_HORIZ_FUNC_DEF_444(W, H, cpu) \
-    p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
-    p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
 
 #define CHROMA_HORIZ_FILTERS_444(cpu) \
     SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 8, cpu); \
@@ -1257,44 +1257,44 @@
 
         INIT6(satd, _sse2);
         HEVC_SATD(sse2);
-        p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
+        p.pu[LUMA_4x4].satd = x265_pixel_satd_4x4_mmx2;
 
-        p.sa8d_inter[LUMA_4x4]  = x265_pixel_satd_4x4_mmx2;
+        p.pu[LUMA_4x4].sa8d_inter  = x265_pixel_satd_4x4_mmx2;
         SA8D_INTER_FROM_BLOCK(sse2);
-        p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
-        p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
+        p.pu[LUMA_8x8].sa8d_inter = x265_pixel_sa8d_8x8_sse2;
+        p.pu[LUMA_16x16].sa8d_inter = x265_pixel_sa8d_16x16_sse2;
 
-        p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_mmx2;
-        p.sse_ss[LUMA_4x8] = x265_pixel_ssd_ss_4x8_mmx2;
-        p.sse_ss[LUMA_4x16] = x265_pixel_ssd_ss_4x16_mmx2;
-        p.sse_ss[LUMA_8x4] = x265_pixel_ssd_ss_8x4_sse2;
-        p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_sse2;
-        p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_sse2;
-        p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_sse2;
-        p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_sse2;
-        p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_sse2;
-        p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_sse2;
-        p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_sse2;
-        p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_sse2;
-        p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_sse2;
-        p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_sse2;
-        p.sse_ss[LUMA_24x32] = x265_pixel_ssd_ss_24x32_sse2;
-        p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_sse2;
-        p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_sse2;
-        p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_sse2;
-        p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2;
-        p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2;
-        p.sse_ss[LUMA_48x64] = x265_pixel_ssd_ss_48x64_sse2;
-        p.sse_ss[LUMA_64x16] = x265_pixel_ssd_ss_64x16_sse2;
-        p.sse_ss[LUMA_64x32] = x265_pixel_ssd_ss_64x32_sse2;
-        p.sse_ss[LUMA_64x48] = x265_pixel_ssd_ss_64x48_sse2;
-        p.sse_ss[LUMA_64x64] = x265_pixel_ssd_ss_64x64_sse2;
+        p.pu[LUMA_4x4].sse_ss   = x265_pixel_ssd_ss_4x4_mmx2;
+        p.pu[LUMA_4x8].sse_ss   = x265_pixel_ssd_ss_4x8_mmx2;
+        p.pu[LUMA_4x16].sse_ss  = x265_pixel_ssd_ss_4x16_mmx2;
+        p.pu[LUMA_8x4].sse_ss   = x265_pixel_ssd_ss_8x4_sse2;
+        p.pu[LUMA_8x8].sse_ss   = x265_pixel_ssd_ss_8x8_sse2;
+        p.pu[LUMA_8x16].sse_ss  = x265_pixel_ssd_ss_8x16_sse2;
+        p.pu[LUMA_8x32].sse_ss  = x265_pixel_ssd_ss_8x32_sse2;
+        p.pu[LUMA_12x16].sse_ss = x265_pixel_ssd_ss_12x16_sse2;
+        p.pu[LUMA_16x4].sse_ss  = x265_pixel_ssd_ss_16x4_sse2;
+        p.pu[LUMA_16x8].sse_ss  = x265_pixel_ssd_ss_16x8_sse2;
+        p.pu[LUMA_16x12].sse_ss = x265_pixel_ssd_ss_16x12_sse2;
+        p.pu[LUMA_16x16].sse_ss = x265_pixel_ssd_ss_16x16_sse2;
+        p.pu[LUMA_16x32].sse_ss = x265_pixel_ssd_ss_16x32_sse2;
+        p.pu[LUMA_16x64].sse_ss = x265_pixel_ssd_ss_16x64_sse2;
+        p.pu[LUMA_24x32].sse_ss = x265_pixel_ssd_ss_24x32_sse2;
+        p.pu[LUMA_32x8].sse_ss  = x265_pixel_ssd_ss_32x8_sse2;
+        p.pu[LUMA_32x16].sse_ss = x265_pixel_ssd_ss_32x16_sse2;
+        p.pu[LUMA_32x24].sse_ss = x265_pixel_ssd_ss_32x24_sse2;
+        p.pu[LUMA_32x32].sse_ss = x265_pixel_ssd_ss_32x32_sse2;
+        p.pu[LUMA_32x64].sse_ss = x265_pixel_ssd_ss_32x64_sse2;
+        p.pu[LUMA_48x64].sse_ss = x265_pixel_ssd_ss_48x64_sse2;
+        p.pu[LUMA_64x16].sse_ss = x265_pixel_ssd_ss_64x16_sse2;
+        p.pu[LUMA_64x32].sse_ss = x265_pixel_ssd_ss_64x32_sse2;
+        p.pu[LUMA_64x48].sse_ss = x265_pixel_ssd_ss_64x48_sse2;
+        p.pu[LUMA_64x64].sse_ss = x265_pixel_ssd_ss_64x64_sse2;
 
-        p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
-        p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
-        p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
-        p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
-        p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
+        p.cu[BLOCK_4x4].transpose   = x265_transpose4_sse2;
+        p.cu[BLOCK_8x8].transpose   = x265_transpose8_sse2;
+        p.cu[BLOCK_16x16].transpose = x265_transpose16_sse2;
+        p.cu[BLOCK_32x32].transpose = x265_transpose32_sse2;
+        p.cu[BLOCK_64x64].transpose = x265_transpose64_sse2;
 
         p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
         p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
@@ -1303,43 +1303,43 @@
         LUMA_VAR(_sse2);
 
         SAD_X3(sse2);
-        p.sad_x3[LUMA_4x4] = x265_pixel_sad_x3_4x4_mmx2;
-        p.sad_x3[LUMA_4x8] = x265_pixel_sad_x3_4x8_mmx2;
-        p.sad_x3[LUMA_4x16] = x265_pixel_sad_x3_4x16_mmx2;
-        p.sad_x3[LUMA_8x4] = x265_pixel_sad_x3_8x4_sse2;
-        p.sad_x3[LUMA_8x8] = x265_pixel_sad_x3_8x8_sse2;
-        p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_sse2;
-        p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_sse2;
-        p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_sse2;
-        p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_mmx2;
+        p.pu[LUMA_4x4].sad_x3   = x265_pixel_sad_x3_4x4_mmx2;
+        p.pu[LUMA_4x8].sad_x3   = x265_pixel_sad_x3_4x8_mmx2;
+        p.pu[LUMA_4x16].sad_x3  = x265_pixel_sad_x3_4x16_mmx2;
+        p.pu[LUMA_8x4].sad_x3   = x265_pixel_sad_x3_8x4_sse2;
+        p.pu[LUMA_8x8].sad_x3   = x265_pixel_sad_x3_8x8_sse2;
+        p.pu[LUMA_8x16].sad_x3  = x265_pixel_sad_x3_8x16_sse2;
+        p.pu[LUMA_8x32].sad_x3  = x265_pixel_sad_x3_8x32_sse2;
+        p.pu[LUMA_16x4].sad_x3  = x265_pixel_sad_x3_16x4_sse2;
+        p.pu[LUMA_12x16].sad_x3 = x265_pixel_sad_x3_12x16_mmx2;
 
         SAD_X4(sse2);
-        p.sad_x4[LUMA_4x4] = x265_pixel_sad_x4_4x4_mmx2;
-        p.sad_x4[LUMA_4x8] = x265_pixel_sad_x4_4x8_mmx2;
-        p.sad_x4[LUMA_4x16] = x265_pixel_sad_x4_4x16_mmx2;
-        p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_sse2;
-        p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_sse2;
-        p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_sse2;
-        p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_sse2;
-        p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
-        p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
+        p.pu[LUMA_4x4].sad_x4   = x265_pixel_sad_x4_4x4_mmx2;
+        p.pu[LUMA_4x8].sad_x4   = x265_pixel_sad_x4_4x8_mmx2;
+        p.pu[LUMA_4x16].sad_x4  = x265_pixel_sad_x4_4x16_mmx2;
+        p.pu[LUMA_8x4].sad_x4   = x265_pixel_sad_x4_8x4_sse2;
+        p.pu[LUMA_8x8].sad_x4   = x265_pixel_sad_x4_8x8_sse2;
+        p.pu[LUMA_8x16].sad_x4  = x265_pixel_sad_x4_8x16_sse2;
+        p.pu[LUMA_8x32].sad_x4  = x265_pixel_sad_x4_8x32_sse2;
+        p.pu[LUMA_16x4].sad_x4  = x265_pixel_sad_x4_16x4_sse2;
+        p.pu[LUMA_12x16].sad_x4 = x265_pixel_sad_x4_12x16_mmx2;
 
-        p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
-        p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
-        p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
-        p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
-        p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
-        p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
-        p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
-        p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
-        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
-        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
-        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
-        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
-        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
-        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
-        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
-        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
+        p.cu[BLOCK_4x4].cpy2Dto1D_shl   = x265_cpy2Dto1D_shl_4_sse2;
+        p.cu[BLOCK_8x8].cpy2Dto1D_shl   = x265_cpy2Dto1D_shl_8_sse2;
+        p.cu[BLOCK_16x16].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_16_sse2;
+        p.cu[BLOCK_32x32].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_32_sse2;
+        p.cu[BLOCK_4x4].cpy2Dto1D_shr   = x265_cpy2Dto1D_shr_4_sse2;
+        p.cu[BLOCK_8x8].cpy2Dto1D_shr   = x265_cpy2Dto1D_shr_8_sse2;
+        p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_sse2;
+        p.cu[BLOCK_32x32].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_32_sse2;
+        p.cu[BLOCK_4x4].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_4_sse2;
+        p.cu[BLOCK_8x8].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_8_sse2;
+        p.cu[BLOCK_16x16].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_16_sse2;
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_32_sse2;
+        p.cu[BLOCK_4x4].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_4_sse2;
+        p.cu[BLOCK_8x8].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_8_sse2;
+        p.cu[BLOCK_16x16].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_16_sse2;
+        p.cu[BLOCK_32x32].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_32_sse2;
 
         CHROMA_PIXELSUB_PS(_sse2);
         CHROMA_PIXELSUB_PS_422(_sse2);
@@ -1357,28 +1357,28 @@
         p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2;
         p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s
 
-        p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
-        p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
-        p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
-        p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
+        p.cu[BLOCK_4x4].blockfill_s = x265_blockfill_s_4x4_sse2;
+        p.cu[BLOCK_8x8].blockfill_s = x265_blockfill_s_8x8_sse2;
+        p.cu[BLOCK_16x16].blockfill_s = x265_blockfill_s_16x16_sse2;
+        p.cu[BLOCK_32x32].blockfill_s = x265_blockfill_s_32x32_sse2;
 
         // TODO: overflow on 12-bits mode!
-        p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
-        p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
-        p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
-        p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
+        p.cu[BLOCK_4x4].ssd_s   = x265_pixel_ssd_s_4_sse2;
+        p.cu[BLOCK_8x8].ssd_s   = x265_pixel_ssd_s_8_sse2;
+        p.cu[BLOCK_16x16].ssd_s = x265_pixel_ssd_s_16_sse2;
+        p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_sse2;
 
-        p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
-        p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
-        p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2;
-        p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse2;
+        p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
+        p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
+        p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_sse2;
+        p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_sse2;
 
-        p.dct[DCT_4x4] = x265_dct4_sse2;
-        p.idct[IDCT_4x4] = x265_idct4_sse2;
+        p.cu[BLOCK_4x4].dct = x265_dct4_sse2;
+        p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
 #if X86_64
-        p.idct[IDCT_8x8] = x265_idct8_sse2;
+        p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
 #endif
-        p.idct[IDST_4x4] = x265_idst4_sse2;
+        p.idst4x4 = x265_idst4_sse2;
 
         LUMA_SS_FILTERS(_sse2);
     }
@@ -1389,8 +1389,8 @@
 
         INTRA_ANG_SSSE3(ssse3);
 
-        p.dct[DST_4x4] = x265_dst4_ssse3;
-        p.idct[IDCT_8x8] = x265_idct8_ssse3;
+        p.dst4x4 = x265_dst4_ssse3;
+        p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
         p.count_nonzero = x265_count_nonzero_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
@@ -1405,7 +1405,7 @@
         CHROMA_VERT_FILTERS_SSE4_422(_sse4);
         CHROMA_HORIZ_FILTERS_444(_sse4);
 
-        p.dct[DCT_8x8] = x265_dct8_sse4;
+        p.cu[BLOCK_8x8].dct = x265_dct8_sse4;
         p.quant = x265_quant_sse4;
         p.nquant = x265_nquant_sse4;
         p.dequant_normal = x265_dequant_normal_sse4;
@@ -1423,12 +1423,12 @@
         INTRA_ANG_SSE4_COMMON(sse4);
         INTRA_ANG_SSE4_HIGH(sse4);
 
-        p.psy_cost_pp[BLOCK_4x4] = x265_psyCost_pp_4x4_sse4;
+        p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
 #if X86_64
-        p.psy_cost_pp[BLOCK_8x8] = x265_psyCost_pp_8x8_sse4;
-        p.psy_cost_pp[BLOCK_16x16] = x265_psyCost_pp_16x16_sse4;
-        p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;
-        p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;
+        p.cu[BLOCK_8x8].psy_cost_pp = x265_psyCost_pp_8x8_sse4;
+        p.cu[BLOCK_16x16].psy_cost_pp = x265_psyCost_pp_16x16_sse4;
+        p.cu[BLOCK_32x32].psy_cost_pp = x265_psyCost_pp_32x32_sse4;
+        p.cu[BLOCK_64x64].psy_cost_pp = x265_psyCost_pp_64x64_sse4;
 #endif
     }
     if (cpuMask & X265_CPU_XOP)
@@ -1440,59 +1440,59 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
-        p.dct[DCT_4x4] = x265_dct4_avx2;
+        p.cu[BLOCK_4x4].dct = x265_dct4_avx2;
         p.quant = x265_quant_avx2;
         p.nquant = x265_nquant_avx2;
-        p.dequant_normal = x265_dequant_normal_avx2;
+        p.dequant_normal  = x265_dequant_normal_avx2;
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
-        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
-        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
-        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
-        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
-        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
-        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
-        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
-        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
+        p.cu[BLOCK_4x4].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_4_avx2;
+        p.cu[BLOCK_8x8].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_8_avx2;
+        p.cu[BLOCK_16x16].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_16_avx2;
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_32_avx2;
+        p.cu[BLOCK_4x4].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_4_avx2;
+        p.cu[BLOCK_8x8].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_8_avx2;
+        p.cu[BLOCK_16x16].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_16_avx2;
+        p.cu[BLOCK_32x32].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_32_avx2;
 #if X86_64
-        p.dct[DCT_8x8] = x265_dct8_avx2;
-        p.dct[DCT_16x16] = x265_dct16_avx2;
-        p.dct[DCT_32x32] = x265_dct32_avx2;
-        p.idct[IDCT_4x4] = x265_idct4_avx2;
-        p.idct[IDCT_8x8] = x265_idct8_avx2;
-        p.idct[IDCT_16x16] = x265_idct16_avx2;
-        p.idct[IDCT_32x32] = x265_idct32_avx2;
-        p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
-        p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
-        p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
-        p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
+        p.cu[BLOCK_8x8].dct   = x265_dct8_avx2;
+        p.cu[BLOCK_16x16].dct = x265_dct16_avx2;
+        p.cu[BLOCK_32x32].dct = x265_dct32_avx2;
+        p.cu[BLOCK_4x4].idct  = x265_idct4_avx2;
+        p.cu[BLOCK_8x8].idct  = x265_idct8_avx2;
+        p.cu[BLOCK_16x16].idct = x265_idct16_avx2;
+        p.cu[BLOCK_32x32].idct = x265_idct32_avx2;
+        p.cu[BLOCK_8x8].transpose = x265_transpose8_avx2;
+        p.cu[BLOCK_16x16].transpose = x265_transpose16_avx2;
+        p.cu[BLOCK_32x32].transpose = x265_transpose32_avx2;
+        p.cu[BLOCK_64x64].transpose = x265_transpose64_avx2;
 #endif
     }
     /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
     for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
     {
-        p.sse_pp[i] = (pixelcmp_t)p.sse_ss[i];
-        p.sse_sp[i] = (pixelcmp_sp_t)p.sse_ss[i];
+        p.pu[i].sse_pp = (pixelcmp_t)p.pu[i].sse_ss;
+        p.pu[i].sse_sp = (pixelcmp_sp_t)p.pu[i].sse_ss;
     }
 
     for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
     {
-        p.luma_copy_ps[i] = (copy_ps_t)p.luma_copy_ss[i];
-        p.luma_copy_sp[i] = (copy_sp_t)p.luma_copy_ss[i];
-        p.luma_copy_pp[i] = (copy_pp_t)p.luma_copy_ss[i];
+        p.pu[i].luma_copy_ps = (copy_ps_t)p.pu[i].luma_copy_ss;
+        p.pu[i].luma_copy_sp = (copy_sp_t)p.pu[i].luma_copy_ss;
+        p.pu[i].luma_copy_pp = (copy_pp_t)p.pu[i].luma_copy_ss;
     }
 
     for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
     {
-        p.chroma[X265_CSP_I420].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I420].copy_ss[i];
-        p.chroma[X265_CSP_I420].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I420].copy_ss[i];
-        p.chroma[X265_CSP_I420].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I420].copy_ss[i];
+        p.chroma[X265_CSP_I420].pu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I420].pu[i].copy_ss;
+        p.chroma[X265_CSP_I420].pu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I420].pu[i].copy_ss;
+        p.chroma[X265_CSP_I420].pu[i].copy_pp = (copy_pp_t)p.chroma[X265_CSP_I420].pu[i].copy_ss;
     }
 
     for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
     {
-        p.chroma[X265_CSP_I422].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I422].copy_ss[i];
-        p.chroma[X265_CSP_I422].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I422].copy_ss[i];
-        p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i];
+        p.chroma[X265_CSP_I422].pu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I422].pu[i].copy_ss;
+        p.chroma[X265_CSP_I422].pu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I422].pu[i].copy_ss;
+        p.chroma[X265_CSP_I422].pu[i].copy_pp = (copy_pp_t)p.chroma[X265_CSP_I422].pu[i].copy_ss;
     }
 
 #else // if HIGH_BIT_DEPTH
@@ -1502,7 +1502,7 @@
         INIT8(sad, _mmx2);
         INIT8(sad_x3, _mmx2);
         INIT8(sad_x4, _mmx2);
-        p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
+        p.pu[LUMA_4x4].satd = x265_pixel_satd_4x4_mmx2;
         p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
 
         PIXEL_AVG(sse2);
@@ -1541,52 +1541,52 @@
         // until all partitions are coded and commit smaller patches, easier to
         // review.
 
-        p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
-        p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
-        p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
-        p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
+        p.cu[BLOCK_4x4].blockfill_s = x265_blockfill_s_4x4_sse2;
+        p.cu[BLOCK_8x8].blockfill_s = x265_blockfill_s_8x8_sse2;
+        p.cu[BLOCK_16x16].blockfill_s = x265_blockfill_s_16x16_sse2;
+        p.cu[BLOCK_32x32].blockfill_s = x265_blockfill_s_32x32_sse2;
 
-        p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
-        p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
-        p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
-        p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
+        p.cu[BLOCK_4x4].ssd_s = x265_pixel_ssd_s_4_sse2;
+        p.cu[BLOCK_8x8].ssd_s = x265_pixel_ssd_s_8_sse2;
+        p.cu[BLOCK_16x16].ssd_s = x265_pixel_ssd_s_16_sse2;
+        p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_sse2;
 
         p.frameInitLowres = x265_frame_init_lowres_core_sse2;
         SA8D_INTER_FROM_BLOCK(sse2);
 
-        p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
-        p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
-        p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
-        p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
-        p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
-        p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
-        p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
-        p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
-        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
-        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
-        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
-        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
-        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
-        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
-        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
-        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
+        p.cu[BLOCK_4x4].cpy2Dto1D_shl   = x265_cpy2Dto1D_shl_4_sse2;
+        p.cu[BLOCK_8x8].cpy2Dto1D_shl   = x265_cpy2Dto1D_shl_8_sse2;
+        p.cu[BLOCK_16x16].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_16_sse2;
+        p.cu[BLOCK_32x32].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_32_sse2;
+        p.cu[BLOCK_4x4].cpy2Dto1D_shr   = x265_cpy2Dto1D_shr_4_sse2;
+        p.cu[BLOCK_8x8].cpy2Dto1D_shr   = x265_cpy2Dto1D_shr_8_sse2;
+        p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_sse2;
+        p.cu[BLOCK_32x32].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_32_sse2;
+        p.cu[BLOCK_4x4].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_4_sse2;
+        p.cu[BLOCK_8x8].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_8_sse2;
+        p.cu[BLOCK_16x16].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_16_sse2;
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_32_sse2;
+        p.cu[BLOCK_4x4].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_4_sse2;
+        p.cu[BLOCK_8x8].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_8_sse2;
+        p.cu[BLOCK_16x16].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_16_sse2;
+        p.cu[BLOCK_32x32].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_32_sse2;
 
-        p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
-        p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
-        p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
-        p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
-        p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
-        p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
-        p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
+        p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
+        p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
+        p.cu[BLOCK_4x4].transpose = x265_transpose4_sse2;
+        p.cu[BLOCK_8x8].transpose = x265_transpose8_sse2;
+        p.cu[BLOCK_16x16].transpose = x265_transpose16_sse2;
+        p.cu[BLOCK_32x32].transpose = x265_transpose32_sse2;
+        p.cu[BLOCK_64x64].transpose = x265_transpose64_sse2;
         p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
         p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
 
-        p.dct[DCT_4x4] = x265_dct4_sse2;
-        p.idct[IDCT_4x4] = x265_idct4_sse2;
+        p.cu[BLOCK_4x4].dct = x265_dct4_sse2;
+        p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
 #if X86_64
-        p.idct[IDCT_8x8] = x265_idct8_sse2;
+        p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
 #endif
-        p.idct[IDST_4x4] = x265_idst4_sse2;
+        p.idst4x4 = x265_idst4_sse2;
 
         p.planecopy_sp = x265_downShift_16_sse2;
     }
@@ -1594,7 +1594,7 @@
     {
         p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
         SA8D_INTER_FROM_BLOCK(ssse3);
-        p.sse_pp[LUMA_4x4] = x265_pixel_ssd_4x4_ssse3;
+        p.pu[LUMA_4x4].sse_pp = x265_pixel_ssd_4x4_ssse3;
         ASSGN_SSE(ssse3);
         PIXEL_AVG(ssse3);
         PIXEL_AVG_W4(ssse3);
@@ -1605,23 +1605,23 @@
         p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
         SAD_X3(ssse3);
         SAD_X4(ssse3);
-        p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
-        p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
-        p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3;
-        p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_ssse3;
-        p.sad_x3[LUMA_8x32]  = x265_pixel_sad_x3_8x32_ssse3;
-        p.sad_x4[LUMA_8x32]  = x265_pixel_sad_x4_8x32_ssse3;
+        p.pu[LUMA_8x4].sad_x4  = x265_pixel_sad_x4_8x4_ssse3;
+        p.pu[LUMA_8x8].sad_x4  = x265_pixel_sad_x4_8x8_ssse3;
+        p.pu[LUMA_8x16].sad_x3 = x265_pixel_sad_x3_8x16_ssse3;
+        p.pu[LUMA_8x16].sad_x4 = x265_pixel_sad_x4_8x16_ssse3;
+        p.pu[LUMA_8x32].sad_x3 = x265_pixel_sad_x3_8x32_ssse3;
+        p.pu[LUMA_8x32].sad_x4 = x265_pixel_sad_x4_8x32_ssse3;
 
-        p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
-        p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
+        p.pu[LUMA_12x16].sad_x3 = x265_pixel_sad_x3_12x16_ssse3;
+        p.pu[LUMA_12x16].sad_x4 = x265_pixel_sad_x4_12x16_ssse3;
 
         p.luma_p2s = x265_luma_p2s_ssse3;
         p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
         p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
         p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_ssse3; // for i444, chroma_p2s can use luma_p2s
 
-        p.dct[DST_4x4] = x265_dst4_ssse3;
-        p.idct[IDCT_8x8] = x265_idct8_ssse3;
+        p.dst4x4 = x265_dst4_ssse3;
+        p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
         p.count_nonzero = x265_count_nonzero_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
@@ -1638,21 +1638,21 @@
         CHROMA_ADDAVG_422(_sse4);
 
         // TODO: check POPCNT flag!
-        p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4;
-        p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_sse4;
-        p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_sse4;
-        p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_sse4;
+        p.cu[BLOCK_4x4].copy_cnt = x265_copy_cnt_4_sse4;
+        p.cu[BLOCK_8x8].copy_cnt = x265_copy_cnt_8_sse4;
+        p.cu[BLOCK_16x16].copy_cnt = x265_copy_cnt_16_sse4;
+        p.cu[BLOCK_32x32].copy_cnt = x265_copy_cnt_32_sse4;
 
         HEVC_SATD(sse4);
         SA8D_INTER_FROM_BLOCK(sse4);
 
-        p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_sse4;
-        p.sse_pp[LUMA_24x32] = x265_pixel_ssd_24x32_sse4;
-        p.sse_pp[LUMA_48x64] = x265_pixel_ssd_48x64_sse4;
-        p.sse_pp[LUMA_64x16] = x265_pixel_ssd_64x16_sse4;
-        p.sse_pp[LUMA_64x32] = x265_pixel_ssd_64x32_sse4;
-        p.sse_pp[LUMA_64x48] = x265_pixel_ssd_64x48_sse4;
-        p.sse_pp[LUMA_64x64] = x265_pixel_ssd_64x64_sse4;
+        p.pu[LUMA_12x16].sse_pp = x265_pixel_ssd_12x16_sse4;
+        p.pu[LUMA_24x32].sse_pp = x265_pixel_ssd_24x32_sse4;
+        p.pu[LUMA_48x64].sse_pp = x265_pixel_ssd_48x64_sse4;
+        p.pu[LUMA_64x16].sse_pp = x265_pixel_ssd_64x16_sse4;
+        p.pu[LUMA_64x32].sse_pp = x265_pixel_ssd_64x32_sse4;
+        p.pu[LUMA_64x48].sse_pp = x265_pixel_ssd_64x48_sse4;
+        p.pu[LUMA_64x64].sse_pp = x265_pixel_ssd_64x64_sse4;
 
         LUMA_SSE_SP(_sse4);
 
@@ -1673,17 +1673,17 @@
         ASSGN_SSE_SS(sse4);
 
         // MUST be done after LUMA_FILTERS() to overwrite default version
-        p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_sse4;
+        p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_sse4;
 
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
+        p.chroma[X265_CSP_I420].pu[CHROMA_2x4].copy_sp = x265_blockcopy_sp_2x4_sse4;
+        p.chroma[X265_CSP_I420].pu[CHROMA_2x8].copy_sp = x265_blockcopy_sp_2x8_sse4;
+        p.chroma[X265_CSP_I420].pu[CHROMA_6x8].copy_sp = x265_blockcopy_sp_6x8_sse4;
         CHROMA_BLOCKCOPY(ps, _sse4);
         CHROMA_BLOCKCOPY_422(ps, _sse4);
         LUMA_BLOCKCOPY(ps, _sse4);
 
-        p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
-        p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
+        p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_sse4;
+        p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_sse4;
         p.quant = x265_quant_sse4;
         p.nquant = x265_nquant_sse4;
         p.dequant_normal = x265_dequant_normal_sse4;
@@ -1707,14 +1707,14 @@
         INTRA_ANG_SSE4_COMMON(sse4);
         INTRA_ANG_SSE4(sse4);
 
-        p.dct[DCT_8x8] = x265_dct8_sse4;
+        p.cu[BLOCK_8x8].dct = x265_dct8_sse4;
         p.denoiseDct = x265_denoise_dct_sse4;
-        p.psy_cost_pp[BLOCK_4x4] = x265_psyCost_pp_4x4_sse4;
+        p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
 #if X86_64
-        p.psy_cost_pp[BLOCK_8x8] = x265_psyCost_pp_8x8_sse4;
-        p.psy_cost_pp[BLOCK_16x16] = x265_psyCost_pp_16x16_sse4;
-        p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;
-        p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;
+        p.cu[BLOCK_8x8].psy_cost_pp = x265_psyCost_pp_8x8_sse4;
+        p.cu[BLOCK_16x16].psy_cost_pp = x265_psyCost_pp_16x16_sse4;
+        p.cu[BLOCK_32x32].psy_cost_pp = x265_psyCost_pp_32x32_sse4;
+        p.cu[BLOCK_64x64].psy_cost_pp = x265_psyCost_pp_64x64_sse4;
 #endif
     }
     if (cpuMask & X265_CPU_AVX)
@@ -1727,36 +1727,36 @@
         ASSGN_SSE_SS(avx);
         SAD_X3(avx);
         SAD_X4(avx);
-        p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
-        p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
-        p.sad_x3[LUMA_16x4]  = x265_pixel_sad_x3_16x4_avx;
-        p.sad_x4[LUMA_16x4]  = x265_pixel_sad_x4_16x4_avx;
+        p.pu[LUMA_12x16].sad_x3 = x265_pixel_sad_x3_12x16_avx;
+        p.pu[LUMA_12x16].sad_x4 = x265_pixel_sad_x4_12x16_avx;
+        p.pu[LUMA_16x4].sad_x3  = x265_pixel_sad_x3_16x4_avx;
+        p.pu[LUMA_16x4].sad_x4  = x265_pixel_sad_x4_16x4_avx;
 
         p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
         p.ssim_end_4 = x265_pixel_ssim_end4_avx;
-        p.luma_copy_ss[LUMA_64x16] = x265_blockcopy_ss_64x16_avx;
-        p.luma_copy_ss[LUMA_64x32] = x265_blockcopy_ss_64x32_avx;
-        p.luma_copy_ss[LUMA_64x48] = x265_blockcopy_ss_64x48_avx;
-        p.luma_copy_ss[LUMA_64x64] = x265_blockcopy_ss_64x64_avx;
+        p.pu[LUMA_64x16].luma_copy_ss = x265_blockcopy_ss_64x16_avx;
+        p.pu[LUMA_64x32].luma_copy_ss = x265_blockcopy_ss_64x32_avx;
+        p.pu[LUMA_64x48].luma_copy_ss = x265_blockcopy_ss_64x48_avx;
+        p.pu[LUMA_64x64].luma_copy_ss = x265_blockcopy_ss_64x64_avx;
 
-        p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x8] = x265_blockcopy_pp_32x8_avx;
-        p.luma_copy_pp[LUMA_32x8] = x265_blockcopy_pp_32x8_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_32x8].copy_pp = x265_blockcopy_pp_32x8_avx;
+        p.pu[LUMA_32x8].luma_copy_pp = x265_blockcopy_pp_32x8_avx;
 
-        p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x16] = x265_blockcopy_pp_32x16_avx;
-        p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x16] = x265_blockcopy_pp_32x16_avx;
-        p.luma_copy_pp[LUMA_32x16] = x265_blockcopy_pp_32x16_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_32x16].copy_pp = x265_blockcopy_pp_32x16_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA422_32x16].copy_pp = x265_blockcopy_pp_32x16_avx;
+        p.pu[LUMA_32x16].luma_copy_pp = x265_blockcopy_pp_32x16_avx;
 
-        p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x24] = x265_blockcopy_pp_32x24_avx;
-        p.luma_copy_pp[LUMA_32x24] = x265_blockcopy_pp_32x24_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_32x24].copy_pp = x265_blockcopy_pp_32x24_avx;
+        p.pu[LUMA_32x24].luma_copy_pp = x265_blockcopy_pp_32x24_avx;
 
-        p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x32] = x265_blockcopy_pp_32x32_avx;
-        p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x32] = x265_blockcopy_pp_32x32_avx;
-        p.luma_copy_pp[LUMA_32x32]  = x265_blockcopy_pp_32x32_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_32x32].copy_pp = x265_blockcopy_pp_32x32_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA422_32x32].copy_pp = x265_blockcopy_pp_32x32_avx;
+        p.pu[LUMA_32x32].luma_copy_pp  = x265_blockcopy_pp_32x32_avx;
 
-        p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x48] = x265_blockcopy_pp_32x48_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA422_32x48].copy_pp = x265_blockcopy_pp_32x48_avx;
 
-        p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x64] = x265_blockcopy_pp_32x64_avx;
-        p.luma_copy_pp[LUMA_32x64]  = x265_blockcopy_pp_32x64_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA422_32x64].copy_pp = x265_blockcopy_pp_32x64_avx;
+        p.pu[LUMA_32x64].luma_copy_pp = x265_blockcopy_pp_32x64_avx;
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -1771,139 +1771,139 @@
         INIT2(sad_x4, _avx2);
         INIT4(satd, _avx2);
         INIT2_NAME(sse_pp, ssd, _avx2);
-        p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx2;
-        p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
-        p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2;
+        p.pu[LUMA_16x12].sad_x4 = x265_pixel_sad_x4_16x12_avx2;
+        p.pu[LUMA_16x32].sad_x4 = x265_pixel_sad_x4_16x32_avx2;
+        p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2;
 
         /* Need to update assembly code as per changed interface of the copy_cnt primitive, once
          * code is updated, avx2 version will be enabled */
 
-        p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2;
-        p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
-        p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
+        p.cu[BLOCK_8x8].copy_cnt = x265_copy_cnt_8_avx2;
+        p.cu[BLOCK_16x16].copy_cnt = x265_copy_cnt_16_avx2;
+        p.cu[BLOCK_32x32].copy_cnt = x265_copy_cnt_32_avx2;
 
-        p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
-        p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
+        p.cu[BLOCK_16x16].blockfill_s = x265_blockfill_s_16x16_avx2;
+        p.cu[BLOCK_32x32].blockfill_s = x265_blockfill_s_32x32_avx2;
 
-        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
-        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
-        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
-        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
-        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
-        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
-        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
-        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
+        p.cu[BLOCK_4x4].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_4_avx2;
+        p.cu[BLOCK_8x8].cpy1Dto2D_shl   = x265_cpy1Dto2D_shl_8_avx2;
+        p.cu[BLOCK_16x16].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_16_avx2;
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_32_avx2;
+        p.cu[BLOCK_4x4].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_4_avx2;
+        p.cu[BLOCK_8x8].cpy1Dto2D_shr   = x265_cpy1Dto2D_shr_8_avx2;
+        p.cu[BLOCK_16x16].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_16_avx2;
+        p.cu[BLOCK_32x32].cpy1Dto2D_shr = x265_cpy1Dto2D_shr_32_avx2;
 
         p.denoiseDct = x265_denoise_dct_avx2;
-        p.dct[DCT_4x4] = x265_dct4_avx2;
+        p.cu[BLOCK_4x4].dct = x265_dct4_avx2;
         p.quant = x265_quant_avx2;
         p.nquant = x265_nquant_avx2;
         p.dequant_normal = x265_dequant_normal_avx2;
 
-        p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x4] = x265_blockcopy_ss_16x4_avx;
-        p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x12] = x265_blockcopy_ss_16x12_avx;
-        p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x8] = x265_blockcopy_ss_16x8_avx;
-        p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x16] = x265_blockcopy_ss_16x16_avx;
-        p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x32] = x265_blockcopy_ss_16x32_avx;
-        p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x8] = x265_blockcopy_ss_16x8_avx;
-        p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x16] = x265_blockcopy_ss_16x16_avx;
-        p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x24] = x265_blockcopy_ss_16x24_avx;
-        p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x32] = x265_blockcopy_ss_16x32_avx;
-        p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x64] = x265_blockcopy_ss_16x64_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_16x4].copy_ss  = x265_blockcopy_ss_16x4_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_16x12].copy_ss = x265_blockcopy_ss_16x12_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_16x8].copy_ss  = x265_blockcopy_ss_16x8_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA422_16x8] .copy_ss = x265_blockcopy_ss_16x8_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA422_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA422_16x24].copy_ss = x265_blockcopy_ss_16x24_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA422_16x64].copy_ss = x265_blockcopy_ss_16x64_avx;
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
 
         p.weight_pp = x265_weight_pp_avx2;
 
 #if X86_64
 
-        p.dct[DCT_8x8] = x265_dct8_avx2;
-        p.dct[DCT_16x16] = x265_dct16_avx2;
-        p.dct[DCT_32x32] = x265_dct32_avx2;
-        p.idct[IDCT_4x4] = x265_idct4_avx2;
-        p.idct[IDCT_8x8] = x265_idct8_avx2;
-        p.idct[IDCT_16x16] = x265_idct16_avx2;
-        p.idct[IDCT_32x32] = x265_idct32_avx2;
+        p.cu[BLOCK_8x8].dct    = x265_dct8_avx2;
+        p.cu[BLOCK_16x16].dct  = x265_dct16_avx2;
+        p.cu[BLOCK_32x32].dct  = x265_dct32_avx2;
+        p.cu[BLOCK_4x4].idct   = x265_idct4_avx2;
+        p.cu[BLOCK_8x8].idct   = x265_idct8_avx2;
+        p.cu[BLOCK_16x16].idct = x265_idct16_avx2;
+        p.cu[BLOCK_32x32].idct = x265_idct32_avx2;
 
-        p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
-        p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
-        p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
-        p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
+        p.cu[BLOCK_8x8].transpose   = x265_transpose8_avx2;
+        p.cu[BLOCK_16x16].transpose = x265_transpose16_avx2;
+        p.cu[BLOCK_32x32].transpose = x265_transpose32_avx2;
+        p.cu[BLOCK_64x64].transpose = x265_transpose64_avx2;
 
-        p.luma_vpp[LUMA_12x16] = x265_interp_8tap_vert_pp_12x16_avx2;
+        p.pu[LUMA_12x16].luma_vpp = x265_interp_8tap_vert_pp_12x16_avx2;
 
-        p.luma_vpp[LUMA_16x4] = x265_interp_8tap_vert_pp_16x4_avx2;
-        p.luma_vpp[LUMA_16x8] = x265_interp_8tap_vert_pp_16x8_avx2;
-        p.luma_vpp[LUMA_16x12] = x265_interp_8tap_vert_pp_16x12_avx2;
-        p.luma_vpp[LUMA_16x16] = x265_interp_8tap_vert_pp_16x16_avx2;
-        p.luma_vpp[LUMA_16x32] = x265_interp_8tap_vert_pp_16x32_avx2;
-        p.luma_vpp[LUMA_16x64] = x265_interp_8tap_vert_pp_16x64_avx2;
+        p.pu[LUMA_16x4].luma_vpp  = x265_interp_8tap_vert_pp_16x4_avx2;
+        p.pu[LUMA_16x8].luma_vpp  = x265_interp_8tap_vert_pp_16x8_avx2;
+        p.pu[LUMA_16x12].luma_vpp = x265_interp_8tap_vert_pp_16x12_avx2;
+        p.pu[LUMA_16x16].luma_vpp = x265_interp_8tap_vert_pp_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vpp = x265_interp_8tap_vert_pp_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vpp = x265_interp_8tap_vert_pp_16x64_avx2;
 
-        p.luma_vpp[LUMA_24x32] = x265_interp_8tap_vert_pp_24x32_avx2;
+        p.pu[LUMA_24x32].luma_vpp = x265_interp_8tap_vert_pp_24x32_avx2;
 
-        p.luma_vpp[LUMA_32x8] = x265_interp_8tap_vert_pp_32x8_avx2;
-        p.luma_vpp[LUMA_32x16] = x265_interp_8tap_vert_pp_32x16_avx2;
-        p.luma_vpp[LUMA_32x24] = x265_interp_8tap_vert_pp_32x24_avx2;
-        p.luma_vpp[LUMA_32x32] = x265_interp_8tap_vert_pp_32x32_avx2;
-        p.luma_vpp[LUMA_32x64] = x265_interp_8tap_vert_pp_32x64_avx2;
+        p.pu[LUMA_32x8].luma_vpp  = x265_interp_8tap_vert_pp_32x8_avx2;
+        p.pu[LUMA_32x16].luma_vpp = x265_interp_8tap_vert_pp_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vpp = x265_interp_8tap_vert_pp_32x24_avx2;
+        p.pu[LUMA_32x32].luma_vpp = x265_interp_8tap_vert_pp_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vpp = x265_interp_8tap_vert_pp_32x64_avx2;
 
-        p.luma_vpp[LUMA_48x64] = x265_interp_8tap_vert_pp_48x64_avx2;
+        p.pu[LUMA_48x64].luma_vpp = x265_interp_8tap_vert_pp_48x64_avx2;
 
-        p.luma_vpp[LUMA_64x16] = x265_interp_8tap_vert_pp_64x16_avx2;
-        p.luma_vpp[LUMA_64x32] = x265_interp_8tap_vert_pp_64x32_avx2;
-        p.luma_vpp[LUMA_64x48] = x265_interp_8tap_vert_pp_64x48_avx2;
-        p.luma_vpp[LUMA_64x64] = x265_interp_8tap_vert_pp_64x64_avx2;
+        p.pu[LUMA_64x16].luma_vpp = x265_interp_8tap_vert_pp_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vpp = x265_interp_8tap_vert_pp_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vpp = x265_interp_8tap_vert_pp_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vpp = x265_interp_8tap_vert_pp_64x64_avx2;
 #endif
-        p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
+        p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_avx2;
 
-        p.luma_hpp[LUMA_8x4] = x265_interp_8tap_horiz_pp_8x4_avx2;
-        p.luma_hpp[LUMA_8x8] = x265_interp_8tap_horiz_pp_8x8_avx2;
-        p.luma_hpp[LUMA_8x16] = x265_interp_8tap_horiz_pp_8x16_avx2;
-        p.luma_hpp[LUMA_8x32] = x265_interp_8tap_horiz_pp_8x32_avx2;
+        p.pu[LUMA_8x4].luma_hpp = x265_interp_8tap_horiz_pp_8x4_avx2;
+        p.pu[LUMA_8x8].luma_hpp = x265_interp_8tap_horiz_pp_8x8_avx2;
+        p.pu[LUMA_8x16].luma_hpp = x265_interp_8tap_horiz_pp_8x16_avx2;
+        p.pu[LUMA_8x32].luma_hpp = x265_interp_8tap_horiz_pp_8x32_avx2;
 
-        p.luma_hpp[LUMA_16x4] = x265_interp_8tap_horiz_pp_16x4_avx2;
-        p.luma_hpp[LUMA_16x8] = x265_interp_8tap_horiz_pp_16x8_avx2;
-        p.luma_hpp[LUMA_16x12] = x265_interp_8tap_horiz_pp_16x12_avx2;
-        p.luma_hpp[LUMA_16x16] = x265_interp_8tap_horiz_pp_16x16_avx2;
-        p.luma_hpp[LUMA_16x32] = x265_interp_8tap_horiz_pp_16x32_avx2;
-        p.luma_hpp[LUMA_16x64] = x265_interp_8tap_horiz_pp_16x64_avx2;
+        p.pu[LUMA_16x4].luma_hpp = x265_interp_8tap_horiz_pp_16x4_avx2;
+        p.pu[LUMA_16x8].luma_hpp = x265_interp_8tap_horiz_pp_16x8_avx2;
+        p.pu[LUMA_16x12].luma_hpp = x265_interp_8tap_horiz_pp_16x12_avx2;
+        p.pu[LUMA_16x16].luma_hpp = x265_interp_8tap_horiz_pp_16x16_avx2;
+        p.pu[LUMA_16x32].luma_hpp = x265_interp_8tap_horiz_pp_16x32_avx2;
+        p.pu[LUMA_16x64].luma_hpp = x265_interp_8tap_horiz_pp_16x64_avx2;
 
-        p.luma_hpp[LUMA_32x8] = x265_interp_8tap_horiz_pp_32x8_avx2;
-        p.luma_hpp[LUMA_32x16] = x265_interp_8tap_horiz_pp_32x16_avx2;
-        p.luma_hpp[LUMA_32x24] = x265_interp_8tap_horiz_pp_32x24_avx2;
-        p.luma_hpp[LUMA_32x32] = x265_interp_8tap_horiz_pp_32x32_avx2;
-        p.luma_hpp[LUMA_32x64] = x265_interp_8tap_horiz_pp_32x64_avx2;
+        p.pu[LUMA_32x8].luma_hpp  = x265_interp_8tap_horiz_pp_32x8_avx2;
+        p.pu[LUMA_32x16].luma_hpp = x265_interp_8tap_horiz_pp_32x16_avx2;
+        p.pu[LUMA_32x24].luma_hpp = x265_interp_8tap_horiz_pp_32x24_avx2;
+        p.pu[LUMA_32x32].luma_hpp = x265_interp_8tap_horiz_pp_32x32_avx2;
+        p.pu[LUMA_32x64].luma_hpp = x265_interp_8tap_horiz_pp_32x64_avx2;
 
-        p.luma_hpp[LUMA_64x64] = x265_interp_8tap_horiz_pp_64x64_avx2;
-        p.luma_hpp[LUMA_64x48] = x265_interp_8tap_horiz_pp_64x48_avx2;
-        p.luma_hpp[LUMA_64x32] = x265_interp_8tap_horiz_pp_64x32_avx2;
-        p.luma_hpp[LUMA_64x16] = x265_interp_8tap_horiz_pp_64x16_avx2;
+        p.pu[LUMA_64x64].luma_hpp = x265_interp_8tap_horiz_pp_64x64_avx2;
+        p.pu[LUMA_64x48].luma_hpp = x265_interp_8tap_horiz_pp_64x48_avx2;
+        p.pu[LUMA_64x32].luma_hpp = x265_interp_8tap_horiz_pp_64x32_avx2;
+        p.pu[LUMA_64x16].luma_hpp = x265_interp_8tap_horiz_pp_64x16_avx2;
 
-        p.luma_hpp[LUMA_48x64] = x265_interp_8tap_horiz_pp_48x64_avx2;
+        p.pu[LUMA_48x64].luma_hpp = x265_interp_8tap_horiz_pp_48x64_avx2;
 
-        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_8x8] = x265_interp_4tap_horiz_pp_8x8_avx2;
-        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_4x4] = x265_interp_4tap_horiz_pp_4x4_avx2;
-        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_32x32] = x265_interp_4tap_horiz_pp_32x32_avx2;
-        p.chroma[X265_CSP_I420].filter_hpp[CHROMA_16x16] = x265_interp_4tap_horiz_pp_16x16_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_8x8].filter_hpp = x265_interp_4tap_horiz_pp_8x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_32x32].filter_hpp = x265_interp_4tap_horiz_pp_32x32_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_16x16].filter_hpp = x265_interp_4tap_horiz_pp_16x16_avx2;
 
-        p.luma_vpp[LUMA_4x4] = x265_interp_8tap_vert_pp_4x4_avx2;
+        p.pu[LUMA_4x4].luma_vpp = x265_interp_8tap_vert_pp_4x4_avx2;
 
-        p.luma_vpp[LUMA_8x4] = x265_interp_8tap_vert_pp_8x4_avx2;
-        p.luma_vpp[LUMA_8x8] = x265_interp_8tap_vert_pp_8x8_avx2;
-        p.luma_vpp[LUMA_8x16] = x265_interp_8tap_vert_pp_8x16_avx2;
-        p.luma_vpp[LUMA_8x32] = x265_interp_8tap_vert_pp_8x32_avx2;
+        p.pu[LUMA_8x4].luma_vpp = x265_interp_8tap_vert_pp_8x4_avx2;
+        p.pu[LUMA_8x8].luma_vpp = x265_interp_8tap_vert_pp_8x8_avx2;
+        p.pu[LUMA_8x16].luma_vpp = x265_interp_8tap_vert_pp_8x16_avx2;
+        p.pu[LUMA_8x32].luma_vpp = x265_interp_8tap_vert_pp_8x32_avx2;
 
         // color space i420
-        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_4x4] = x265_interp_4tap_vert_pp_4x4_avx2;
-        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_8x8] = x265_interp_4tap_vert_pp_8x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_8x8].filter_vpp = x265_interp_4tap_vert_pp_8x8_avx2;
 
         // color space i422
-        p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_4x4] = x265_interp_4tap_vert_pp_4x4_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA422_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
 
-        p.luma_vps[LUMA_4x4] = x265_interp_8tap_vert_ps_4x4_avx2;
+        p.pu[LUMA_4x4].luma_vps = x265_interp_8tap_vert_ps_4x4_avx2;
 
 #if X86_64
-        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_16x16] = x265_interp_4tap_vert_pp_16x16_avx2;
-        p.chroma[X265_CSP_I420].filter_vpp[CHROMA_32x32] = x265_interp_4tap_vert_pp_32x32_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_avx2;
 #endif
     }
 #endif // if HIGH_BIT_DEPTH
diff -r 1924c460d130 -r c6ca0fd54aa7 source/common/yuv.cpp
--- a/source/common/yuv.cpp	Fri Jan 09 11:35:26 2015 +0530
+++ b/source/common/yuv.cpp	Thu Jan 08 15:23:38 2015 -0600
@@ -81,32 +81,32 @@
 void Yuv::copyToPicYuv(PicYuv& dstPic, uint32_t cuAddr, uint32_t absPartIdx) const
 {
     pixel* dstY = dstPic.getLumaAddr(cuAddr, absPartIdx);
-    primitives.luma_copy_pp[m_part](dstY, dstPic.m_stride, m_buf[0], m_size);
+    primitives.pu[m_part].luma_copy_pp(dstY, dstPic.m_stride, m_buf[0], m_size);
 
     pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
     pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx);
-    primitives.chroma[m_csp].copy_pp[m_part](dstU, dstPic.m_strideC, m_buf[1], m_csize);
-    primitives.chroma[m_csp].copy_pp[m_part](dstV, dstPic.m_strideC, m_buf[2], m_csize);
+    primitives.chroma[m_csp].pu[m_part].copy_pp(dstU, dstPic.m_strideC, m_buf[1], m_csize);
+    primitives.chroma[m_csp].pu[m_part].copy_pp(dstV, dstPic.m_strideC, m_buf[2], m_csize);
 }
 
 void Yuv::copyFromPicYuv(const PicYuv& srcPic, uint32_t cuAddr, uint32_t absPartIdx)
 {
     const pixel* srcY = srcPic.getLumaAddr(cuAddr, absPartIdx);
-    primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcY, srcPic.m_stride);
+    primitives.pu[m_part].luma_copy_pp(m_buf[0], m_size, srcY, srcPic.m_stride);
 
     const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
     const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
-    primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcU, srcPic.m_strideC);
-    primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcV, srcPic.m_strideC);
+    primitives.chroma[m_csp].pu[m_part].copy_pp(m_buf[1], m_csize, srcU, srcPic.m_strideC);
+    primitives.chroma[m_csp].pu[m_part].copy_pp(m_buf[2], m_csize, srcV, srcPic.m_strideC);
 }
 
 void Yuv::copyFromYuv(const Yuv& srcYuv)
 {
     X265_CHECK(m_size >= srcYuv.m_size, "invalid size\n");
 
-    primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
-    primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
-    primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
+    primitives.pu[m_part].luma_copy_pp(m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
+    primitives.chroma[m_csp].pu[m_part].copy_pp(m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
+    primitives.chroma[m_csp].pu[m_part].copy_pp(m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
 }
 
 /* This version is intended for use by ME, which required FENC_STRIDE for luma fenc pixels */
@@ -115,47 +115,47 @@
     X265_CHECK(m_size == FENC_STRIDE && m_size >= srcYuv.m_size, "PU buffer size mismatch\n");
 
     const pixel* srcY = srcYuv.m_buf[0] + getAddrOffset(absPartIdx, srcYuv.m_size);
-    primitives.luma_copy_pp[partEnum](m_buf[0], m_size, srcY, srcYuv.m_size);
+    primitives.pu[partEnum].luma_copy_pp(m_buf[0], m_size, srcY, srcYuv.m_size);
 
     if (bChroma)
     {
         const pixel* srcU = srcYuv.m_buf[1] + srcYuv.getChromaAddrOffset(absPartIdx);
         const pixel* srcV = srcYuv.m_buf[2] + srcYuv.getChromaAddrOffset(absPartIdx);
-        primitives.chroma[m_csp].copy_pp[partEnum](m_buf[1], m_csize, srcU, srcYuv.m_csize);
-        primitives.chroma[m_csp].copy_pp[partEnum](m_buf[2], m_csize, srcV, srcYuv.m_csize);
+        primitives.chroma[m_csp].pu[partEnum].copy_pp(m_buf[1], m_csize, srcU, srcYuv.m_csize);
+        primitives.chroma[m_csp].pu[partEnum].copy_pp(m_buf[2], m_csize, srcV, srcYuv.m_csize);
     }
 }
 
 void Yuv::copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const
 {
     pixel* dstY = dstYuv.getLumaAddr(absPartIdx);
-    primitives.luma_copy_pp[m_part](dstY, dstYuv.m_size, m_buf[0], m_size);
+    primitives.pu[m_part].luma_copy_pp(dstY, dstYuv.m_size, m_buf[0], m_size);
 
     pixel* dstU = dstYuv.getCbAddr(absPartIdx);
     pixel* dstV = dstYuv.getCrAddr(absPartIdx);
-    primitives.chroma[m_csp].copy_pp[m_part](dstU, dstYuv.m_csize, m_buf[1], m_csize);
-    primitives.chroma[m_csp].copy_pp[m_part](dstV, dstYuv.m_csize, m_buf[2], m_csize);
+    primitives.chroma[m_csp].pu[m_part].copy_pp(dstU, dstYuv.m_csize, m_buf[1], m_csize);
+    primitives.chroma[m_csp].pu[m_part].copy_pp(dstV, dstYuv.m_csize, m_buf[2], m_csize);
 }
 
 void Yuv::copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const
 {
     pixel* srcY = m_buf[0] + getAddrOffset(absPartIdx, m_size);
     pixel* dstY = dstYuv.m_buf[0];
-    primitives.luma_copy_pp[dstYuv.m_part](dstY, dstYuv.m_size, srcY, m_size);
+    primitives.pu[dstYuv.m_part].luma_copy_pp(dstY, dstYuv.m_size, srcY, m_size);
 
     pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
     pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx);
     pixel* dstU = dstYuv.m_buf[1];
     pixel* dstV = dstYuv.m_buf[2];
-    primitives.chroma[m_csp].copy_pp[dstYuv.m_part](dstU, dstYuv.m_csize, srcU, m_csize);
-    primitives.chroma[m_csp].copy_pp[dstYuv.m_part](dstV, dstYuv.m_csize, srcV, m_csize);
+    primitives.chroma[m_csp].pu[dstYuv.m_part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
+    primitives.chroma[m_csp].pu[dstYuv.m_part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
 }
 
 void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL)
 {
-    primitives.luma_add_ps[log2SizeL - 2](m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
-    primitives.chroma[m_csp].add_ps[log2SizeL - 2](m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
-    primitives.chroma[m_csp].add_ps[log2SizeL - 2](m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    primitives.pu[log2SizeL - 2].luma_add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
+    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
+    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
 }
 
 void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
@@ -167,7 +167,7 @@
         const int16_t* srcY0 = srcYuv0.getLumaAddr(absPartIdx);
         const int16_t* srcY1 = srcYuv1.getLumaAddr(absPartIdx);
         pixel* dstY = getLumaAddr(absPartIdx);
-        primitives.luma_addAvg[part](srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
+        primitives.pu[part].luma_addAvg(srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
     }
     if (bChroma)
     {
@@ -177,8 +177,8 @@
         const int16_t* srcV1 = srcYuv1.getCrAddr(absPartIdx);
         pixel* dstU = getCbAddr(absPartIdx);
         pixel* dstV = getCrAddr(absPartIdx);
-        primitives.chroma[m_csp].addAvg[part](srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
-        primitives.chroma[m_csp].addAvg[part](srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
+        primitives.chroma[m_csp].pu[part].addAvg(srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
+        primitives.chroma[m_csp].pu[part].addAvg(srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
     }
 }
 
@@ -186,7 +186,7 @@
 {
     const pixel* src = getLumaAddr(absPartIdx);
     pixel* dst = dstYuv.getLumaAddr(absPartIdx);
-    primitives.luma_copy_pp[log2Size - 2](dst, dstYuv.m_size, src, m_size);
+    primitives.pu[log2Size - 2].luma_copy_pp(dst, dstYuv.m_size, src, m_size);
 }
 
 void Yuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const
@@ -196,6 +196,6 @@
     const pixel* srcV = getCrAddr(absPartIdx);
     pixel* dstU = dstYuv.getCbAddr(absPartIdx);
     pixel* dstV = dstYuv.getCrAddr(absPartIdx);
-    primitives.chroma[m_csp].copy_pp[part](dstU, dstYuv.m_csize, srcU, m_csize);
-    primitives.chroma[m_csp].copy_pp[part](dstV, dstYuv.m_csize, srcV, m_csize);
+    primitives.chroma[m_csp].pu[part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
+    primitives.chroma[m_csp].pu[part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
 }


More information about the x265-devel mailing list