[x265] primitives for RExt

Satoshi Nakagawa nakagawa424 at oki.com
Tue Aug 5 14:48:50 CEST 2014


# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1407242513 -32400
#      Tue Aug 05 21:41:53 2014 +0900
# Node ID 770c40d768d55e68e76c485d5dc61d014257e789
# Parent  0d4723a0080cff763ff20ab9c516c6e082496a0b
primitives for RExt

diff -r 0d4723a0080c -r 770c40d768d5 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/pixel.cpp	Tue Aug 05 21:41:53 2014 +0900
@@ -1015,13 +1015,6 @@
     p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
     p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
 
-#define CHROMA_422_X(W, H) \
-    p.chroma[X265_CSP_I422].addAvg[CHROMA422X_ ## W ## x ## H]  = addAvg<W, H>;        \
-    p.chroma[X265_CSP_I422].copy_pp[CHROMA422X_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
-    p.chroma[X265_CSP_I422].copy_sp[CHROMA422X_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
-    p.chroma[X265_CSP_I422].copy_ps[CHROMA422X_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
-    p.chroma[X265_CSP_I422].copy_ss[CHROMA422X_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
-
 #define CHROMA_444(W, H) \
     p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H]  = addAvg<W, H>; \
     p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
@@ -1090,7 +1083,6 @@
     LUMA(16, 64);
     CHROMA_420(8,  32);
 
-    CHROMA_422_X(4, 8);
     CHROMA_422(4, 8);
     CHROMA_422(4, 4);
     CHROMA_422(2, 8);
diff -r 0d4723a0080c -r 770c40d768d5 source/common/primitives.h
--- a/source/common/primitives.h	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/primitives.h	Tue Aug 05 21:41:53 2014 +0900
@@ -69,7 +69,7 @@
 
 enum Chroma422Partitions
 {
-    CHROMA422X_4x8,  CHROMA422_4x8,   CHROMA422_8x16,  CHROMA422_16x32, CHROMA422_32x64,
+    CHROMA422_2x4,   CHROMA422_4x8,   CHROMA422_8x16,  CHROMA422_16x32, CHROMA422_32x64,
     CHROMA422_4x4,   CHROMA422_2x8,
     CHROMA422_8x8,   CHROMA422_4x16,
     CHROMA422_16x16, CHROMA422_8x32,
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 05 21:41:53 2014 +0900
@@ -250,6 +250,12 @@
     p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
     p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
 
+#define SETUP_CHROMA_FUNC_DEF_422(W, H, cpu) /* I422: point filter_hpp/hps/vpp/vps[CHROMA422_WxH] at x265_interp_4tap_{horiz,vert}_{pp,ps}_WxH<cpu> */ \
+    p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
+
 #define SETUP_CHROMA_FUNC_DEF_444(W, H, cpu) \
     p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
     p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
@@ -259,12 +265,18 @@
 #define SETUP_CHROMA_SP_FUNC_DEF_420(W, H, cpu) \
     p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
 
+#define SETUP_CHROMA_SP_FUNC_DEF_422(W, H, cpu) /* I422: point filter_vsp[CHROMA422_WxH] at x265_interp_4tap_vert_sp_WxH<cpu> */ \
+    p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
 #define SETUP_CHROMA_SP_FUNC_DEF_444(W, H, cpu) \
     p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
 
 #define SETUP_CHROMA_SS_FUNC_DEF_420(W, H, cpu) \
     p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
 
+#define SETUP_CHROMA_SS_FUNC_DEF_422(W, H, cpu) /* I422: point filter_vss[CHROMA422_WxH] at x265_interp_4tap_vert_ss_WxH<cpu> */ \
+    p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
+
 #define SETUP_CHROMA_SS_FUNC_DEF_444(W, H, cpu) \
     p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
 
@@ -294,8 +306,33 @@
     SETUP_CHROMA_FUNC_DEF_420(32, 8, cpu); \
     SETUP_CHROMA_FUNC_DEF_420(8, 32, cpu);
 
+#define CHROMA_FILTERS_422(cpu) /* expands SETUP_CHROMA_FUNC_DEF_422 once per CHROMA422 block size listed below, all with the same cpu suffix */ \
+    SETUP_CHROMA_FUNC_DEF_422(4, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(4, 4, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(2, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(8, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(8, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(4, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(8, 12, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(6, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(8, 4, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(2, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(16, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(16, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(8, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(16, 24, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(12, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(16, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(4, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(32, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(32, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(16, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(32, 48, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(24, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(32, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF_422(8, 64, cpu);
+
 #define CHROMA_FILTERS_444(cpu) \
-    SETUP_CHROMA_FUNC_DEF_444(4, 4, cpu); \
     SETUP_CHROMA_FUNC_DEF_444(8, 8, cpu); \
     SETUP_CHROMA_FUNC_DEF_444(8, 4, cpu); \
     SETUP_CHROMA_FUNC_DEF_444(4, 8, cpu); \
@@ -312,12 +349,22 @@
     SETUP_CHROMA_FUNC_DEF_444(32, 24, cpu); \
     SETUP_CHROMA_FUNC_DEF_444(24, 32, cpu); \
     SETUP_CHROMA_FUNC_DEF_444(32, 8, cpu); \
-    SETUP_CHROMA_FUNC_DEF_444(8, 32, cpu);
+    SETUP_CHROMA_FUNC_DEF_444(8, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF_444(64, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF_444(64, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF_444(32, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF_444(64, 48, cpu); \
+    SETUP_CHROMA_FUNC_DEF_444(48, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF_444(64, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF_444(16, 64, cpu);
 
 #define CHROMA_SP_FILTERS_SSE4_420(cpu) \
     SETUP_CHROMA_SP_FUNC_DEF_420(4, 4, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF_420(4, 2, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_420(2, 4, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF_420(4, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_420(6, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_420(2, 8, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF_420(16, 16, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF_420(16, 8, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF_420(16, 12, cpu); \
@@ -339,8 +386,35 @@
     SETUP_CHROMA_SP_FUNC_DEF_420(8, 16, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF_420(8, 32, cpu);
 
+#define CHROMA_SP_FILTERS_SSE4_422(cpu) /* I422 filter_vsp entries for the sizes listed below; the 8-wide sizes are set by CHROMA_SP_FILTERS_422 instead */ \
+    SETUP_CHROMA_SP_FUNC_DEF_422(4, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(4, 4, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(2, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(4, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(6, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(2, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(16, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(16, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(16, 24, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(12, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(16, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(4, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(32, 64, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(32, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(16, 64, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(32, 48, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(24, 64, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(32, 16, cpu);
+
+#define CHROMA_SP_FILTERS_422(cpu) /* I422 filter_vsp entries for the 8-wide sizes only (8x4 through 8x64) */ \
+    SETUP_CHROMA_SP_FUNC_DEF_422(8, 4, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(8, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(8, 12, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(8, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(8, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_422(8, 64, cpu);
+
 #define CHROMA_SP_FILTERS_SSE4_444(cpu) \
-    SETUP_CHROMA_SP_FUNC_DEF_444(4, 4, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF_444(4, 8, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF_444(16, 16, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF_444(16, 8, cpu); \
@@ -353,7 +427,14 @@
     SETUP_CHROMA_SP_FUNC_DEF_444(16, 32, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF_444(32, 24, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF_444(24, 32, cpu); \
-    SETUP_CHROMA_SP_FUNC_DEF_444(32, 8, cpu);
+    SETUP_CHROMA_SP_FUNC_DEF_444(32, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_444(64, 64, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_444(64, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_444(32, 64, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_444(64, 48, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_444(48, 64, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_444(64, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF_444(16, 64, cpu);
 
 #define CHROMA_SP_FILTERS_444(cpu) \
     SETUP_CHROMA_SP_FUNC_DEF_444(8, 8, cpu); \
@@ -389,8 +470,35 @@
     SETUP_CHROMA_SS_FUNC_DEF_420(2, 8, cpu); \
     SETUP_CHROMA_SS_FUNC_DEF_420(6, 8, cpu);
 
+#define CHROMA_SS_FILTERS_422(cpu) /* I422 filter_vss entries for the sizes listed below; 2x8, 2x16 and 6x16 are set by CHROMA_SS_FILTERS_SSE4_422 instead */ \
+    SETUP_CHROMA_SS_FUNC_DEF_422(4, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(4, 4, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(8, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(8, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(4, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(8, 12, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(8, 4, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(16, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(16, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(8, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(16, 24, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(12, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(16, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(4, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(32, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(32, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(16, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(32, 48, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(24, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(32, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(8, 64, cpu);
+
+#define CHROMA_SS_FILTERS_SSE4_422(cpu) /* I422 filter_vss entries for 2x8, 2x16 and 6x16, the sizes not listed in CHROMA_SS_FILTERS_422 */ \
+    SETUP_CHROMA_SS_FUNC_DEF_422(2, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(2, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_422(6, 16, cpu);
+
 #define CHROMA_SS_FILTERS_444(cpu) \
-    SETUP_CHROMA_SS_FUNC_DEF_444(4, 4, cpu); \
     SETUP_CHROMA_SS_FUNC_DEF_444(8, 8, cpu); \
     SETUP_CHROMA_SS_FUNC_DEF_444(8, 4, cpu); \
     SETUP_CHROMA_SS_FUNC_DEF_444(4, 8, cpu); \
@@ -407,7 +515,14 @@
     SETUP_CHROMA_SS_FUNC_DEF_444(32, 24, cpu); \
     SETUP_CHROMA_SS_FUNC_DEF_444(24, 32, cpu); \
     SETUP_CHROMA_SS_FUNC_DEF_444(32, 8, cpu); \
-    SETUP_CHROMA_SS_FUNC_DEF_444(8, 32, cpu);
+    SETUP_CHROMA_SS_FUNC_DEF_444(8, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_444(64, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_444(64, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_444(32, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_444(64, 48, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_444(48, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_444(64, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF_444(16, 64, cpu)
 
 #if HIGH_BIT_DEPTH    // temporary, until all 10bit functions are completed
 #define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
@@ -466,6 +581,35 @@
     SETUP_CHROMA_BLOCKCOPY(type, 32, 24, cpu); \
     SETUP_CHROMA_BLOCKCOPY(type, 32, 32, cpu);
 
+#define SETUP_CHROMA_BLOCKCOPY_422(type, W, H, cpu) /* I422: point copy_<type>[CHROMA422_WxH] at x265_blockcopy_<type>_WxH<cpu>; type is pasted into both names (ss, pp, ps in this file) */ \
+    p.chroma[X265_CSP_I422].copy_ ## type[CHROMA422_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
+
+#define CHROMA_BLOCKCOPY_422(type, cpu) /* expands SETUP_CHROMA_BLOCKCOPY_422 once per CHROMA422 block size listed below, for one copy type and cpu suffix */ \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 2,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 2, 16,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 4,  4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 4,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 4, 16,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 4, 32,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 6, 16,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 8,  4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 8,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 8, 12,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 8, 16,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 8, 32,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 8, 64,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 12, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 16,  8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 16, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 16, 24, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 16, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 16, 64, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 24, 64, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 32, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 32, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 32, 48, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_422(type, 32, 64, cpu);
+
 #define LUMA_BLOCKCOPY(type, cpu) \
     SETUP_LUMA_BLOCKCOPY(type, 4,   4, cpu); \
     SETUP_LUMA_BLOCKCOPY(type, 8,   8, cpu); \
@@ -497,10 +641,13 @@
     p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
 
 #define CHROMA_BLOCKCOPY_SP(cpu) \
+    SETUP_CHROMA_BLOCKCOPY_SP(2,  4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(2,  8,  cpu); \
     SETUP_CHROMA_BLOCKCOPY_SP(4,  2,  cpu); \
     SETUP_CHROMA_BLOCKCOPY_SP(4,  4,  cpu); \
     SETUP_CHROMA_BLOCKCOPY_SP(4,  8,  cpu); \
     SETUP_CHROMA_BLOCKCOPY_SP(4,  16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(6,  8,  cpu); \
     SETUP_CHROMA_BLOCKCOPY_SP(8,  2,  cpu); \
     SETUP_CHROMA_BLOCKCOPY_SP(8,  4,  cpu); \
     SETUP_CHROMA_BLOCKCOPY_SP(8,  6,  cpu); \
@@ -519,35 +666,94 @@
     SETUP_CHROMA_BLOCKCOPY_SP(32, 24, cpu); \
     SETUP_CHROMA_BLOCKCOPY_SP(32, 32, cpu);
 
-#define SETUP_CHROMA_LUMA(W1, H1, W2, H2, cpu) \
-    p.chroma[X265_CSP_I420].sub_ps[LUMA_ ## W1 ## x ## H1] = x265_pixel_sub_ps_ ## W2 ## x ## H2 ## cpu; \
-    p.chroma[X265_CSP_I420].add_ps[LUMA_ ## W1 ## x ## H1] = x265_pixel_add_ps_ ## W2 ## x ## H2 ## cpu;
+#define SETUP_CHROMA_BLOCKCOPY_SP_422(W, H, cpu) /* I422: point copy_sp[CHROMA422_WxH] at x265_blockcopy_sp_WxH<cpu> */ \
+    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_BLOCKCOPY_SP_422(cpu) /* expands SETUP_CHROMA_BLOCKCOPY_SP_422 once per CHROMA422 block size listed below */ \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(2,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(2, 16,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(4,  4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(4,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(4, 16,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(4,  32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(6, 16,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(8,  4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(8,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(8, 12,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(8, 16,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(8,  32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(8,  64, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(12, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(16, 8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(16, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(16, 24, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(16, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(16, 64, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(24, 64, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(32, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(32, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(32, 48, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP_422(32, 64, cpu);
+
+#define SETUP_CHROMA_PIXELSUB(W, H, cpu) /* I420: point sub_ps and add_ps[CHROMA_WxH] at x265_pixel_{sub,add}_ps_WxH<cpu>; index and routine use the same WxH */ \
+    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
 
 #define CHROMA_PIXELSUB_PS(cpu) \
-    SETUP_CHROMA_LUMA(8,   8, 4,  4,  cpu); \
-    SETUP_CHROMA_LUMA(8,   4, 4,  2,  cpu); \
-    SETUP_CHROMA_LUMA(4,   8, 2,  4,  cpu); \
-    SETUP_CHROMA_LUMA(16, 16, 8,  8,  cpu); \
-    SETUP_CHROMA_LUMA(16,  8, 8,  4,  cpu); \
-    SETUP_CHROMA_LUMA(8,  16, 4,  8,  cpu); \
-    SETUP_CHROMA_LUMA(16, 12, 8,  6,  cpu); \
-    SETUP_CHROMA_LUMA(12, 16, 6,  8,  cpu); \
-    SETUP_CHROMA_LUMA(16,  4, 8,  2,  cpu); \
-    SETUP_CHROMA_LUMA(4,  16, 2,  8,  cpu); \
-    SETUP_CHROMA_LUMA(32, 32, 16, 16, cpu); \
-    SETUP_CHROMA_LUMA(32, 16, 16, 8,  cpu); \
-    SETUP_CHROMA_LUMA(16, 32, 8,  16, cpu); \
-    SETUP_CHROMA_LUMA(32, 24, 16, 12, cpu); \
-    SETUP_CHROMA_LUMA(24, 32, 12, 16, cpu); \
-    SETUP_CHROMA_LUMA(32,  8, 16, 4,  cpu); \
-    SETUP_CHROMA_LUMA(8,  32, 4,  16, cpu); \
-    SETUP_CHROMA_LUMA(64, 64, 32, 32, cpu); \
-    SETUP_CHROMA_LUMA(64, 32, 32, 16, cpu); \
-    SETUP_CHROMA_LUMA(32, 64, 16, 32, cpu); \
-    SETUP_CHROMA_LUMA(64, 48, 32, 24, cpu); \
-    SETUP_CHROMA_LUMA(48, 64, 24, 32, cpu); \
-    SETUP_CHROMA_LUMA(64, 16, 32, 8,  cpu); \
-    SETUP_CHROMA_LUMA(16, 64, 8,  32, cpu);
+    SETUP_CHROMA_PIXELSUB(4,  4,  cpu); \
+    SETUP_CHROMA_PIXELSUB(4,  2,  cpu); \
+    SETUP_CHROMA_PIXELSUB(2,  4,  cpu); \
+    SETUP_CHROMA_PIXELSUB(8,  8,  cpu); \
+    SETUP_CHROMA_PIXELSUB(8,  4,  cpu); \
+    SETUP_CHROMA_PIXELSUB(4,  8,  cpu); \
+    SETUP_CHROMA_PIXELSUB(8,  6,  cpu); \
+    SETUP_CHROMA_PIXELSUB(6,  8,  cpu); \
+    SETUP_CHROMA_PIXELSUB(8,  2,  cpu); \
+    SETUP_CHROMA_PIXELSUB(2,  8,  cpu); \
+    SETUP_CHROMA_PIXELSUB(16, 16, cpu); \
+    SETUP_CHROMA_PIXELSUB(16, 8,  cpu); \
+    SETUP_CHROMA_PIXELSUB(8,  16, cpu); \
+    SETUP_CHROMA_PIXELSUB(16, 12, cpu); \
+    SETUP_CHROMA_PIXELSUB(12, 16, cpu); \
+    SETUP_CHROMA_PIXELSUB(16, 4,  cpu); \
+    SETUP_CHROMA_PIXELSUB(4,  16, cpu); \
+    SETUP_CHROMA_PIXELSUB(32, 32, cpu); \
+    SETUP_CHROMA_PIXELSUB(32, 16, cpu); \
+    SETUP_CHROMA_PIXELSUB(16, 32, cpu); \
+    SETUP_CHROMA_PIXELSUB(32, 24, cpu); \
+    SETUP_CHROMA_PIXELSUB(24, 32, cpu); \
+    SETUP_CHROMA_PIXELSUB(32, 8,  cpu); \
+    SETUP_CHROMA_PIXELSUB(8,  32, cpu);
+
+#define SETUP_CHROMA_PIXELSUB_422(W, H, cpu) /* I422: point sub_ps and add_ps[CHROMA422_WxH] at x265_pixel_{sub,add}_ps_WxH<cpu> */ \
+    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
+
+#define CHROMA_PIXELSUB_PS_422(cpu) /* expands SETUP_CHROMA_PIXELSUB_422 once per CHROMA422 block size listed below */ \
+    SETUP_CHROMA_PIXELSUB_422(4,  8,  cpu); \
+    SETUP_CHROMA_PIXELSUB_422(4,  4,  cpu); \
+    SETUP_CHROMA_PIXELSUB_422(2,  8,  cpu); \
+    SETUP_CHROMA_PIXELSUB_422(8, 16,  cpu); \
+    SETUP_CHROMA_PIXELSUB_422(8,  8,  cpu); \
+    SETUP_CHROMA_PIXELSUB_422(4, 16,  cpu); \
+    SETUP_CHROMA_PIXELSUB_422(8, 12,  cpu); \
+    SETUP_CHROMA_PIXELSUB_422(6, 16,  cpu); \
+    SETUP_CHROMA_PIXELSUB_422(8,  4,  cpu); \
+    SETUP_CHROMA_PIXELSUB_422(2, 16,  cpu); \
+    SETUP_CHROMA_PIXELSUB_422(16, 32, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(16, 16, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(8,  32, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(16, 24, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(12, 32, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(16, 8,  cpu); \
+    SETUP_CHROMA_PIXELSUB_422(4,  32, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(32, 64, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(32, 32, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(16, 64, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(32, 48, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(24, 64, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(32, 16, cpu); \
+    SETUP_CHROMA_PIXELSUB_422(8,  64, cpu);
 
 #define LUMA_FILTERS(cpu) \
     SETUP_LUMA_FUNC_DEF(4,   4, cpu); \
@@ -753,7 +959,36 @@
     SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 8,  cpu); \
     SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
     SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
-    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
+
+#define SETUP_CHROMA_ADDAVG_FUNC_DEF_422(W, H, cpu) /* I422: point addAvg[CHROMA422_WxH] at x265_addAvg_WxH<cpu> */ \
+    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
+
+#define CHROMA_ADDAVG_422(cpu) /* expands SETUP_CHROMA_ADDAVG_FUNC_DEF_422 once per CHROMA422 block size listed below */ \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2,  8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2, 16,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4,  4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4,  8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 16,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 32,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(6, 16,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8,  4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8,  8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 12,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 16,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 32,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 64,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(12, 32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16,  8, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 24, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 64, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(24, 64, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 48, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 64, cpu);
 
 #define SETUP_INTRA_ANG_COMMON(mode, fno, cpu) \
     p.intra_pred[BLOCK_4x4][mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
@@ -897,6 +1132,72 @@
     SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \
     SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu);
 
+#define SETUP_CHROMA_VERT_FUNC_DEF_422(W, H, cpu) /* I422: point all four vertical filter tables (vss, vpp, vps, vsp) for CHROMA422_WxH at x265_interp_4tap_vert_*_WxH<cpu> */ \
+    p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_VERT_FILTERS_422(cpu) /* I422 vertical filters for the sizes listed below; 2x8, 2x16, 4x4 and 6x16 are set by CHROMA_VERT_FILTERS_SSE4_422 instead */ \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(4, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(8, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(8, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(4, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(8, 12, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(8, 4, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(16, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(16, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(8, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(16, 24, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(12, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(16, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(4, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(32, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(32, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(16, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(32, 48, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(24, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(32, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(8, 64, cpu);
+
+#define CHROMA_VERT_FILTERS_SSE4_422(cpu) /* I422 vertical filters for 2x8, 2x16, 4x4 and 6x16, the sizes not listed in CHROMA_VERT_FILTERS_422 */ \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(2, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(2, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(4, 4, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_422(6, 16, cpu);
+
+#define SETUP_CHROMA_VERT_FUNC_DEF_444(W, H, cpu) /* I444: point all four vertical filter tables (vss, vpp, vps, vsp) at x265_interp_4tap_vert_*_WxH<cpu>; the tables are indexed by LUMA_WxH */ \
+    p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_VERT_FILTERS_444(cpu) /* I444 vertical filters for the sizes listed below (8x8 up to 64x64; no 4x4 entry); last line has no ';' so the invocation must supply it */ \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(8, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(8, 4, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(4, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(16, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(16, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(8, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(16, 12, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(12, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(16, 4, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(4, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(32, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(32, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(16, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(32, 24, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(24, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(32, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(8, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(64, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(64, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(32, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(64, 48, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(48, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(64, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF_444(16, 64, cpu)
+
 #define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
     p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
     p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
@@ -927,6 +1228,66 @@
     SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
     SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu)
 
+#define SETUP_CHROMA_HORIZ_FUNC_DEF_422(W, H, cpu) /* I422: point filter_hpp and filter_hps[CHROMA422_WxH] at x265_interp_4tap_horiz_{pp,ps}_WxH<cpu> */ \
+    p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
+
+#define CHROMA_HORIZ_FILTERS_422(cpu) /* expands SETUP_CHROMA_HORIZ_FUNC_DEF_422 once per CHROMA422 block size listed below; last line has no ';' so the invocation must supply it */ \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 4, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(2, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 12, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(6, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 4, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(2, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 24, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(12, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 48, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(24, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 64, cpu)
+
+#define SETUP_CHROMA_HORIZ_FUNC_DEF_444(W, H, cpu) /* I444: point filter_hpp and filter_hps at x265_interp_4tap_horiz_{pp,ps}_WxH<cpu>; the tables are indexed by LUMA_WxH */ \
+    p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
+
+#define CHROMA_HORIZ_FILTERS_444(cpu) /* I444 horizontal filters for the sizes listed below (8x8 up to 64x64; no 4x4 entry); last line has no ';' so the invocation must supply it */ \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 4, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(4, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 12, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(12, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 4, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(4, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 24, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(24, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 48, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(48, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 64, cpu)
+
 namespace x265 {
 // private x265 namespace
 void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
@@ -1010,14 +1371,20 @@
         p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
 
         CHROMA_PIXELSUB_PS(_sse2);
+        CHROMA_PIXELSUB_PS_422(_sse2);
         LUMA_PIXELSUB(_sse2);
 
         CHROMA_BLOCKCOPY(ss, _sse2);
+        CHROMA_BLOCKCOPY_422(ss, _sse2);
         LUMA_BLOCKCOPY(ss, _sse2);
 
         CHROMA_VERT_FILTERS(_sse2);
+        CHROMA_VERT_FILTERS_422(_sse2);
+        CHROMA_VERT_FILTERS_444(_sse2);
+        p.luma_p2s = x265_luma_p2s_sse2;
         p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2;
-        p.luma_p2s = x265_luma_p2s_sse2;
+        p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_sse2;
+        p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s
 
         p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
         p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
@@ -1061,9 +1428,13 @@
     {
         LUMA_ADDAVG(_sse4);
         CHROMA_ADDAVG(_sse4);
+        CHROMA_ADDAVG_422(_sse4);
         LUMA_FILTERS(_sse4);
         CHROMA_HORIZ_FILTERS(_sse4);
         CHROMA_VERT_FILTERS_SSE4(_sse4);
+        CHROMA_HORIZ_FILTERS_422(_sse4);
+        CHROMA_VERT_FILTERS_SSE4_422(_sse4);
+        CHROMA_HORIZ_FILTERS_444(_sse4);
 
         p.dct[DCT_8x8] = x265_dct8_sse4;
         p.quant = x265_quant_sse4;
@@ -1116,6 +1487,13 @@
         p.chroma[X265_CSP_I420].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I420].copy_ss[i];
     }
 
+    for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
+    {
+        p.chroma[X265_CSP_I422].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I422].copy_ss[i];
+        p.chroma[X265_CSP_I422].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I422].copy_ss[i];
+        p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i];
+    }
+
 #else // if HIGH_BIT_DEPTH
     if (cpuMask & X265_CPU_SSE2)
     {
@@ -1141,14 +1519,19 @@
 
         CHROMA_BLOCKCOPY(ss, _sse2);
         CHROMA_BLOCKCOPY(pp, _sse2);
+        CHROMA_BLOCKCOPY_422(ss, _sse2);
+        CHROMA_BLOCKCOPY_422(pp, _sse2);
         LUMA_BLOCKCOPY(ss, _sse2);
         LUMA_BLOCKCOPY(pp, _sse2);
         LUMA_BLOCKCOPY(sp, _sse2);
         CHROMA_BLOCKCOPY_SP(_sse2);
+        CHROMA_BLOCKCOPY_SP_422(_sse2);
 
         CHROMA_SS_FILTERS_420(_sse2);
+        CHROMA_SS_FILTERS_422(_sse2);
         CHROMA_SS_FILTERS_444(_sse2);
         CHROMA_SP_FILTERS_420(_sse2);
+        CHROMA_SP_FILTERS_422(_sse2);
         CHROMA_SP_FILTERS_444(_sse2);
         LUMA_SS_FILTERS(_sse2);
 
@@ -1215,6 +1598,7 @@
         p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
         p.luma_p2s = x265_luma_p2s_ssse3;
         p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_ssse3;
+        p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_ssse3;
         p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s
 
         p.dct[DST_4x4] = x265_dst4_ssse3;
@@ -1227,6 +1611,7 @@
 
         LUMA_ADDAVG(_sse4);
         CHROMA_ADDAVG(_sse4);
+        CHROMA_ADDAVG_422(_sse4);
         p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
 
         // TODO: check POPCNT flag!
@@ -1248,13 +1633,17 @@
         LUMA_SSE_SP(_sse4);
 
         CHROMA_PIXELSUB_PS(_sse4);
+        CHROMA_PIXELSUB_PS_422(_sse4);
         LUMA_PIXELSUB(_sse4);
 
         CHROMA_FILTERS_420(_sse4);
+        CHROMA_FILTERS_422(_sse4);
         CHROMA_FILTERS_444(_sse4);
         CHROMA_SS_FILTERS_SSE4_420(_sse4);
+        CHROMA_SS_FILTERS_SSE4_422(_sse4);
+        CHROMA_SP_FILTERS_SSE4_420(_sse4);
+        CHROMA_SP_FILTERS_SSE4_422(_sse4);
         CHROMA_SP_FILTERS_SSE4_444(_sse4);
-        CHROMA_SP_FILTERS_SSE4_420(_sse4);
         LUMA_SP_FILTERS(_sse4);
         LUMA_FILTERS(_sse4);
         ASSGN_SSE_SS(sse4);
@@ -1263,12 +1652,9 @@
         p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
         p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
         CHROMA_BLOCKCOPY(ps, _sse4);
+        CHROMA_BLOCKCOPY_422(ps, _sse4);
         LUMA_BLOCKCOPY(ps, _sse4);
 
-        p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x4] = x265_interp_4tap_vert_sp_2x4_sse4;
-        p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x8] = x265_interp_4tap_vert_sp_2x8_sse4;
-        p.chroma[X265_CSP_I420].filter_vsp[CHROMA_6x8] = x265_interp_4tap_vert_sp_6x8_sse4;
-
         p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse4;
         p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;
         p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/blockcopy8.asm	Tue Aug 05 21:41:53 2014 +0900
@@ -92,6 +92,24 @@
     RET
 
 ;-----------------------------------------------------------------------------
+; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_2x16, 4, 7, 0         ; 4 args (r0=dest, r1=deststride, r2=src, r3=srcstride), 7 GPRs, no XMM registers
+    mov     r6d,    16/2                    ; r6d = iteration count: 16 rows, two rows per pass
+.loop:
+    mov     r4w,    [r2]                    ; row n:   one 16-bit word (2 bytes)
+    mov     r5w,    [r2 + r3]               ; row n+1: same, one source stride further
+    dec     r6d                             ; sets ZF for the jnz below; the lea/mov in between do not modify flags
+    lea     r2,     [r2 + r3 * 2]           ; src  += 2 * srcstride
+    mov     [r0],       r4w
+    mov     [r0 + r1],  r5w
+    lea     r0,     [r0 + r1 * 2]           ; dest += 2 * deststride
+    jnz     .loop
+    RET
+
+
+;-----------------------------------------------------------------------------
 ; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
@@ -166,6 +184,8 @@
 BLOCKCOPY_PP_W4_H8 4, 8
 BLOCKCOPY_PP_W4_H8 4, 16
 
+BLOCKCOPY_PP_W4_H8 4, 32
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
@@ -232,6 +252,28 @@
     RET
 
 ;-----------------------------------------------------------------------------
+; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_6x16, 4, 7, 2         ; 4 args (r0=dest, r1=deststride, r2=src, r3=srcstride), 7 GPRs, 2 XMM registers
+    mov     r6d,    16/2                    ; r6d = iteration count: 16 rows, two rows per pass
+.loop:
+    movd    m0,     [r2]                    ; row n:   bytes 0-3
+    mov     r4w,    [r2 + 4]                ; row n:   bytes 4-5 (6 bytes per row in total)
+    movd    m1,     [r2 + r3]               ; row n+1: bytes 0-3
+    mov     r5w,    [r2 + r3 + 4]           ; row n+1: bytes 4-5
+    lea     r2,     [r2 + r3 * 2]           ; src  += 2 * srcstride
+    movd    [r0],           m0
+    mov     [r0 + 4],       r4w
+    movd    [r0 + r1],      m1
+    mov     [r0 + r1 + 4],  r5w
+    lea     r0,     [r0 + r1 * 2]           ; dest += 2 * deststride (lea leaves flags alone)
+    dec     r6d
+    jnz     .loop
+    RET
+
+
+;-----------------------------------------------------------------------------
 ; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
@@ -286,6 +328,23 @@
     RET
 
 ;-----------------------------------------------------------------------------
+; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_8x12, 4, 5, 2         ; 4 args (r0=dest, r1=deststride, r2=src, r3=srcstride), 5 GPRs, 2 XMM registers
+    mov      r4d,       12/2                ; r4d = iteration count: 12 rows, two rows per pass
+.loop:
+    movh     m0,        [r2]                ; row n:   8 bytes
+    movh     m1,        [r2 + r3]           ; row n+1: 8 bytes
+    movh     [r0],      m0
+    movh     [r0 + r1], m1
+    dec      r4d                            ; sets ZF for the jnz below; the two lea do not modify flags
+    lea      r0,        [r0 + 2 * r1]       ; dest += 2 * deststride
+    lea      r2,        [r2 + 2 * r3]       ; src  += 2 * srcstride
+    jnz      .loop
+    RET
+
+;-----------------------------------------------------------------------------
 ; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W8_H8 2
@@ -330,6 +389,8 @@
 BLOCKCOPY_PP_W8_H8 8, 16
 BLOCKCOPY_PP_W8_H8 8, 32
 
+BLOCKCOPY_PP_W8_H8 8, 64
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
@@ -370,6 +431,8 @@
 
 BLOCKCOPY_PP_W12_H4 12, 16
 
+BLOCKCOPY_PP_W12_H4 12, 32
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
@@ -448,6 +511,8 @@
 BLOCKCOPY_PP_W16_H8 16, 32
 BLOCKCOPY_PP_W16_H8 16, 64
 
+BLOCKCOPY_PP_W16_H8 16, 24
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
@@ -487,6 +552,8 @@
 
 BLOCKCOPY_PP_W24_H4 24, 32
 
+BLOCKCOPY_PP_W24_H4 24, 64
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
@@ -531,6 +598,8 @@
 BLOCKCOPY_PP_W32_H4 32, 32
 BLOCKCOPY_PP_W32_H4 32, 64
 
+BLOCKCOPY_PP_W32_H4 32, 48
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
@@ -718,6 +787,35 @@
 RET
 
 ;-----------------------------------------------------------------------------
+; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SP_W2_H2 2                ; args: block width, block height (height is consumed two rows at a time)
+INIT_XMM sse2
+cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride
+    add         r3,     r3                  ; srcStride *= 2 (src is int16_t; presumably the stride arrives in elements - confirm)
+    mov         r6d,    %2/2                ; r6d = iteration count, two rows per pass
+.loop:
+    movd        m0,     [r2]                ; row n:   4 bytes = two 16-bit samples
+    movd        m1,     [r2 + r3]           ; row n+1: two 16-bit samples
+    dec         r6d                         ; sets ZF for the jnz below; nothing in between modifies flags
+    lea         r2,     [r2 + r3 * 2]       ; src += 2 rows
+    packuswb    m0,     m0                  ; words -> bytes with unsigned saturation
+    packuswb    m1,     m1
+    movd        r4d,        m0              ; move to a GPR so that only the low 2 bytes can be stored
+    movd        r5d,        m1
+    mov         [r0],       r4w             ; store exactly 2 output bytes per row
+    mov         [r0 + r1],  r5w
+    lea         r0,         [r0 + r1 * 2]   ; dest += 2 rows
+    jnz         .loop
+    RET
+%endmacro
+
+BLOCKCOPY_SP_W2_H2 2,  4
+BLOCKCOPY_SP_W2_H2 2,  8
+
+BLOCKCOPY_SP_W2_H2 2, 16
+
+;-----------------------------------------------------------------------------
 ; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
@@ -862,6 +960,8 @@
 
 BLOCKCOPY_SP_W4_H8 4, 16
 
+BLOCKCOPY_SP_W4_H8 4, 32
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
@@ -926,6 +1026,40 @@
     RET
 
 ;-----------------------------------------------------------------------------
+; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SP_W6_H2 2                ; args: block width, block height (height is consumed two rows at a time)
+INIT_XMM sse2
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride
+    add         r3,     r3                  ; srcStride *= 2 (src is int16_t; presumably the stride arrives in elements - confirm)
+    mov         r6d,    %2/2                ; r6d = iteration count, two rows per pass
+.loop:
+    movh        m0, [r2]                    ; row n:   samples 0-3 (8 bytes)
+    movd        m2, [r2 + 8]                ; row n:   samples 4-5 (4 bytes)
+    movh        m1, [r2 + r3]               ; row n+1: samples 0-3
+    movd        m3, [r2 + r3 + 8]           ; row n+1: samples 4-5
+    dec         r6d                         ; sets ZF for the jnz below; nothing in between modifies flags
+    lea         r2, [r2 + r3 * 2]           ; src += 2 rows
+    packuswb    m0, m0                      ; words -> bytes with unsigned saturation
+    packuswb    m2, m2
+    packuswb    m1, m1
+    packuswb    m3, m3
+    movd        r4d,            m2          ; tail samples go through a GPR so only 2 bytes are stored
+    movd        r5d,            m3
+    movd        [r0],           m0          ; output bytes 0-3
+    mov         [r0 + 4],       r4w         ; output bytes 4-5
+    movd        [r0 + r1],      m1
+    mov         [r0 + r1 + 4],  r5w
+    lea         r0, [r0 + r1 * 2]           ; dest += 2 rows
+    jnz         .loop
+    RET
+%endmacro
+
+BLOCKCOPY_SP_W6_H2 6,  8
+
+BLOCKCOPY_SP_W6_H2 6, 16
+
+;-----------------------------------------------------------------------------
 ; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
@@ -1042,6 +1176,36 @@
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SP_W8_H4 2                ; args: block width, block height (height is consumed four rows at a time)
+INIT_XMM sse2
+cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
+    add         r3,     r3                  ; srcStride *= 2 (src is int16_t; presumably the stride arrives in elements - confirm)
+    mov         r4d,    %2/4                ; r4d = iteration count, four rows per pass
+.loop:
+    movu        m0,     [r2]                ; row n:   eight 16-bit samples
+    movu        m1,     [r2 + r3]           ; row n+1
+    lea         r2,     [r2 + r3 * 2]
+    movu        m2,     [r2]                ; row n+2
+    movu        m3,     [r2 + r3]           ; row n+3
+    dec         r4d                         ; sets ZF for the jnz below; nothing in between modifies flags
+    lea         r2,     [r2 + r3 * 2]       ; src now points at row n+4
+    packuswb    m0,     m1                  ; low 8 bytes = row n, high 8 bytes = row n+1 (unsigned saturation)
+    packuswb    m2,     m3                  ; low 8 bytes = row n+2, high 8 bytes = row n+3
+    movlps      [r0],       m0
+    movhps      [r0 + r1],  m0
+    lea         r0,         [r0 + r1 * 2]
+    movlps      [r0],       m2
+    movhps      [r0 + r1],  m2
+    lea         r0,         [r0 + r1 * 2]   ; dest now points at row n+4
+    jnz         .loop
+    RET
+%endmacro
+
+BLOCKCOPY_SP_W8_H4 8, 12
+
+;-----------------------------------------------------------------------------
+; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W8_H8 2
 INIT_XMM sse2
 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
@@ -1092,6 +1256,8 @@
 BLOCKCOPY_SP_W8_H8 8, 16
 BLOCKCOPY_SP_W8_H8 8, 32
 
+BLOCKCOPY_SP_W8_H8 8, 64
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
@@ -1147,6 +1313,8 @@
 
 BLOCKCOPY_SP_W12_H4 12, 16
 
+BLOCKCOPY_SP_W12_H4 12, 32
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
@@ -1196,6 +1364,8 @@
 BLOCKCOPY_SP_W16_H4 16, 32
 BLOCKCOPY_SP_W16_H4 16, 64
 
+BLOCKCOPY_SP_W16_H4 16, 24
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
@@ -1235,6 +1405,8 @@
 
 BLOCKCOPY_SP_W24_H2 24, 32
 
+BLOCKCOPY_SP_W24_H2 24, 64
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
@@ -1281,6 +1453,8 @@
 BLOCKCOPY_SP_W32_H2 32, 32
 BLOCKCOPY_SP_W32_H2 32, 64
 
+BLOCKCOPY_SP_W32_H2 32, 48
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
@@ -1596,6 +1770,28 @@
 
 RET
 
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
+    add         r1,         r1              ; destStride *= 2 (dest is int16_t; presumably the stride arrives in elements - confirm)
+    mov         r4d,        16/2            ; r4d = iteration count: 16 rows, two rows per pass
+.loop:
+    movd        m0,         [r2]            ; NOTE(review): loads 4 source bytes although only the first 2 reach the output
+    movd        m1,         [r2 + r3]
+    dec         r4d                         ; sets ZF for the jnz below; nothing in between modifies flags
+    lea         r2,         [r2 + r3 * 2]   ; src += 2 rows
+    pmovzxbw    m0,         m0              ; zero-extend bytes to 16-bit words
+    pmovzxbw    m1,         m1
+    movd        [r0],       m0              ; store 4 bytes = the two widened samples of row n
+    movd        [r0 + r1],  m1
+    lea         r0,         [r0 + r1 * 2]   ; dest += 2 rows
+    jnz         .loop
+    RET
+
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
@@ -1687,6 +1883,9 @@
 BLOCKCOPY_PS_W4_H4 4, 8
 BLOCKCOPY_PS_W4_H4 4, 16
 
+BLOCKCOPY_PS_W4_H4 4, 32
+
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
@@ -1732,6 +1931,8 @@
 
 BLOCKCOPY_PS_W6_H4 6, 8
 
+BLOCKCOPY_PS_W6_H4 6, 16
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
@@ -1862,6 +2063,9 @@
 BLOCKCOPY_PS_W8_H4  8, 16
 BLOCKCOPY_PS_W8_H4  8, 32
 
+BLOCKCOPY_PS_W8_H4  8, 12
+BLOCKCOPY_PS_W8_H4  8, 64
+
 
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
@@ -1898,6 +2102,8 @@
 
 BLOCKCOPY_PS_W12_H2 12, 16
 
+BLOCKCOPY_PS_W12_H2 12, 32
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
@@ -1990,6 +2196,8 @@
 BLOCKCOPY_PS_W16_H4 16, 32
 BLOCKCOPY_PS_W16_H4 16, 64
 
+BLOCKCOPY_PS_W16_H4 16, 24
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
@@ -2033,6 +2241,8 @@
 
 BLOCKCOPY_PS_W24_H2 24, 32
 
+BLOCKCOPY_PS_W24_H2 24, 64
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
@@ -2084,6 +2294,8 @@
 BLOCKCOPY_PS_W32_H2 32, 32
 BLOCKCOPY_PS_W32_H2 32, 64
 
+BLOCKCOPY_PS_W32_H2 32, 48
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
@@ -2280,6 +2492,26 @@
     RET
 
 ;-----------------------------------------------------------------------------
+; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_2x16, 4, 7, 0         ; 4 args (r0=dest, r1=deststride, r2=src, r3=srcstride), 7 GPRs, no XMM registers
+    add     r1, r1                          ; both strides are doubled before use
+    add     r3, r3                          ; (presumably converting int16_t element strides to bytes - confirm)
+    mov     r6d,    16/2                    ; r6d = iteration count: 16 rows, two rows per pass
+.loop:
+    mov     r4d,    [r2]                    ; row n:   4 bytes = two 16-bit samples
+    mov     r5d,    [r2 + r3]               ; row n+1: two 16-bit samples
+    dec     r6d                             ; sets ZF for the jnz below; the lea/mov in between do not modify flags
+    lea     r2, [r2 + r3 * 2]               ; src  += 2 rows
+    mov     [r0],       r4d
+    mov     [r0 + r1],  r5d
+    lea     r0, [r0 + r1 * 2]               ; dest += 2 rows
+    jnz     .loop
+    RET
+
+
+;-----------------------------------------------------------------------------
 ; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
@@ -2361,6 +2593,8 @@
 BLOCKCOPY_SS_W4_H8 4, 8
 BLOCKCOPY_SS_W4_H8 4, 16
 
+BLOCKCOPY_SS_W4_H8 4, 32
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
@@ -2417,6 +2651,30 @@
     RET
 
 ;-----------------------------------------------------------------------------
+; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_6x16, 4, 5, 4         ; 4 args (r0=dest, r1=deststride, r2=src, r3=srcstride), 5 GPRs, 4 XMM registers
+    add     r1, r1                          ; both strides are doubled before use
+    add     r3, r3                          ; (presumably converting int16_t element strides to bytes - confirm)
+    mov     r4d,    16/2                    ; r4d = iteration count: 16 rows, two rows per pass
+.loop:
+    movh    m0, [r2]                        ; row n:   samples 0-3 (8 bytes)
+    movd    m2, [r2 + 8]                    ; row n:   samples 4-5 (4 bytes)
+    movh    m1, [r2 + r3]                   ; row n+1: samples 0-3
+    movd    m3, [r2 + r3 + 8]               ; row n+1: samples 4-5
+    dec     r4d                             ; sets ZF for the jnz below; nothing in between modifies flags
+    lea     r2, [r2 + r3 * 2]               ; src  += 2 rows
+    movh    [r0],           m0
+    movd    [r0 + 8],       m2
+    movh    [r0 + r1],      m1
+    movd    [r0 + r1 + 8],  m3
+    lea     r0, [r0 + r1 * 2]               ; dest += 2 rows
+    jnz     .loop
+    RET
+
+
+;-----------------------------------------------------------------------------
 ; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
@@ -2483,6 +2741,26 @@
     RET
 
 ;-----------------------------------------------------------------------------
+; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_8x12, 4, 5, 2         ; 4 args (r0=dest, r1=deststride, r2=src, r3=srcstride), 5 GPRs, 2 XMM registers
+    add     r1, r1                          ; both strides are doubled before use
+    add     r3, r3                          ; (presumably converting int16_t element strides to bytes - confirm)
+    mov     r4d, 12/2                       ; r4d = iteration count: 12 rows, two rows per pass
+.loop:
+    movu    m0, [r2]                        ; row n:   16 bytes = eight 16-bit samples
+    movu    m1, [r2 + r3]                   ; row n+1
+    lea     r2, [r2 + 2 * r3]               ; src  += 2 rows
+    dec     r4d                             ; sets ZF for the jnz below; the movu/lea in between do not modify flags
+    movu    [r0], m0
+    movu    [r0 + r1], m1
+    lea     r0, [r0 + 2 * r1]               ; dest += 2 rows
+    jnz     .loop
+    RET
+
+
+;-----------------------------------------------------------------------------
 ; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W8_H8 2
@@ -2531,6 +2809,8 @@
 BLOCKCOPY_SS_W8_H8 8, 16
 BLOCKCOPY_SS_W8_H8 8, 32
 
+BLOCKCOPY_SS_W8_H8 8, 64
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
@@ -2573,6 +2853,8 @@
 
 BLOCKCOPY_SS_W12_H4 12, 16
 
+BLOCKCOPY_SS_W12_H4 12, 32
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
@@ -2687,6 +2969,8 @@
 BLOCKCOPY_SS_W16_H8 16, 32
 BLOCKCOPY_SS_W16_H8 16, 64
 
+BLOCKCOPY_SS_W16_H8 16, 24
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
@@ -2737,6 +3021,8 @@
 
 BLOCKCOPY_SS_W24_H4 24, 32
 
+BLOCKCOPY_SS_W24_H4 24, 64
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
@@ -2803,6 +3089,8 @@
 BLOCKCOPY_SS_W32_H4 32, 32
 BLOCKCOPY_SS_W32_H4 32, 64
 
+BLOCKCOPY_SS_W32_H4 32, 48
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/blockcopy8.h	Tue Aug 05 21:41:53 2014 +0900
@@ -83,12 +83,33 @@
 #define BLOCKCOPY_SP(cpu) \
     SETUP_BLOCKCOPY_SP(2, 4, cpu); \
     SETUP_BLOCKCOPY_SP(2, 8, cpu); \
-    SETUP_BLOCKCOPY_SP(6, 8, cpu);
+    SETUP_BLOCKCOPY_SP(6, 8, cpu); \
+    \
+    SETUP_BLOCKCOPY_SP(2, 16, cpu); \
+    SETUP_BLOCKCOPY_SP(4, 32, cpu); \
+    SETUP_BLOCKCOPY_SP(6, 16, cpu); \
+    SETUP_BLOCKCOPY_SP(8, 12, cpu); \
+    SETUP_BLOCKCOPY_SP(8, 64, cpu); \
+    SETUP_BLOCKCOPY_SP(12, 32, cpu); \
+    SETUP_BLOCKCOPY_SP(16, 24, cpu); \
+    SETUP_BLOCKCOPY_SP(24, 64, cpu); \
+    SETUP_BLOCKCOPY_SP(32, 48, cpu);
 
 #define BLOCKCOPY_SS_PP(cpu) \
     SETUP_BLOCKCOPY_SS_PP(2, 4, cpu); \
     SETUP_BLOCKCOPY_SS_PP(2, 8, cpu); \
-    SETUP_BLOCKCOPY_SS_PP(6, 8, cpu);
+    SETUP_BLOCKCOPY_SS_PP(6, 8, cpu); \
+    \
+    SETUP_BLOCKCOPY_SS_PP(2, 16, cpu); \
+    SETUP_BLOCKCOPY_SS_PP(4, 32, cpu); \
+    SETUP_BLOCKCOPY_SS_PP(6, 16, cpu); \
+    SETUP_BLOCKCOPY_SS_PP(8, 12, cpu); \
+    SETUP_BLOCKCOPY_SS_PP(8, 64, cpu); \
+    SETUP_BLOCKCOPY_SS_PP(12, 32, cpu); \
+    SETUP_BLOCKCOPY_SS_PP(16, 24, cpu); \
+    SETUP_BLOCKCOPY_SS_PP(24, 64, cpu); \
+    SETUP_BLOCKCOPY_SS_PP(32, 48, cpu);
+    
 
 #define BLOCKCOPY_PS(cpu) \
     SETUP_BLOCKCOPY_PS(2, 4, cpu); \
@@ -121,13 +142,25 @@
     SETUP_BLOCKCOPY_PS(64, 16, cpu); \
     SETUP_BLOCKCOPY_PS(64, 32, cpu); \
     SETUP_BLOCKCOPY_PS(64, 48, cpu); \
-    SETUP_BLOCKCOPY_PS(64, 64, cpu);
+    SETUP_BLOCKCOPY_PS(64, 64, cpu); \
+    \
+    SETUP_BLOCKCOPY_PS(2, 16, cpu); \
+    SETUP_BLOCKCOPY_PS(4, 32, cpu); \
+    SETUP_BLOCKCOPY_PS(6, 16, cpu); \
+    SETUP_BLOCKCOPY_PS(8, 12, cpu); \
+    SETUP_BLOCKCOPY_PS(8, 64, cpu); \
+    SETUP_BLOCKCOPY_PS(12, 32, cpu); \
+    SETUP_BLOCKCOPY_PS(16, 24, cpu); \
+    SETUP_BLOCKCOPY_PS(24, 64, cpu); \
+    SETUP_BLOCKCOPY_PS(32, 48, cpu);
 
 BLOCKCOPY_COMMON(_sse2);
 BLOCKCOPY_SS_PP(_sse2);
 BLOCKCOPY_SP(_sse4);
 BLOCKCOPY_PS(_sse4);
 
+BLOCKCOPY_SP(_sse2);
+
 void x265_blockfill_s_4x4_sse2(int16_t *dst, intptr_t dstride, int16_t val);
 void x265_blockfill_s_8x8_sse2(int16_t *dst, intptr_t dstride, int16_t val);
 void x265_blockfill_s_16x16_sse2(int16_t *dst, intptr_t dstride, int16_t val);
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/ipfilter16.asm	Tue Aug 05 21:41:53 2014 +0900
@@ -926,6 +926,12 @@
 FILTER_CHROMA_H 4, 8, ps, 7, 6, 6
 FILTER_CHROMA_H 4, 16, ps, 7, 6, 6
 
+FILTER_CHROMA_H 2, 16, pp, 6, 8, 5
+FILTER_CHROMA_H 4, 32, pp, 6, 8, 5
+FILTER_CHROMA_H 2, 16, ps, 7, 5, 6
+FILTER_CHROMA_H 4, 32, ps, 7, 6, 6
+
+
 %macro FILTER_W6_1 1
     movu        m3,         [r0]
     pshufb      m3,         m3, m2
@@ -1362,6 +1368,75 @@
     FILTER_W32_1 ps
     ret
 
+%macro FILTER_W8o_1 2                       ; args: output type (pp or ps), byte offset added to both r0 (src) and r2 (dst)
+    movu        m3,         [r0 + %2]       ; m0/m1/m2/m6/m7 are set up by the caller, outside this macro
+    pshufb      m3,         m3, m2          ; presumably m2 = tap-gather shuffle mask - confirm against the caller
+    pmaddwd     m3,         m0              ; presumably m0 = filter coefficients - confirm against the caller
+    movu        m4,         [r0 + %2 + 4]   ; same again, 4 bytes (two 16-bit samples) further on
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4              ; horizontal add of the partial products -> four 32-bit sums
+    paddd       m3,         m1              ; add m1 to every sum (presumably the rounding/offset term - confirm)
+
+    movu        m5,         [r0 + %2 + 8]   ; second group of four outputs, 8 bytes further on
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + %2 + 12]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         6               ; pp: shift right by 6,
+    psrad       m5,         6
+    packusdw    m3,         m5              ; pack to words with unsigned saturation,
+    CLIPW       m3,         m6,    m7       ; then clamp between m6 and m7
+%else
+    psrad       m3,         2               ; otherwise: shift right by 2
+    psrad       m5,         2
+    packssdw    m3,         m5              ; and pack to words with signed saturation
+%endif
+    movh        [r2 + %2],       m3         ; store low 8 bytes (four 16-bit outputs)
+    movhps      [r2 + %2 + 8],   m3         ; store high 8 bytes (the other four)
+%endmacro
+
+%macro FILTER_W48_1 1                       ; arg: output type (pp or ps), forwarded to FILTER_W8o_1
+    FILTER_W8o_1 %1, 0                      ; six steps of 16 bytes each: 96 output bytes,
+    FILTER_W8o_1 %1, 16                     ; i.e. 48 16-bit samples of a single row
+    FILTER_W8o_1 %1, 32
+    FILTER_W8o_1 %1, 48
+    FILTER_W8o_1 %1, 64
+    FILTER_W8o_1 %1, 80
+%endmacro
+
+cglobal chroma_filter_pp_48x1_internal      ; no argument/register counts given: relies on registers prepared by its caller
+    FILTER_W48_1 pp
+    ret                                     ; plain ret rather than the RET macro
+
+cglobal chroma_filter_ps_48x1_internal      ; ps counterpart of the routine above
+    FILTER_W48_1 ps
+    ret
+
+%macro FILTER_W64_1 1                       ; arg: output type (pp or ps), forwarded to FILTER_W8o_1
+    FILTER_W8o_1 %1, 0                      ; eight steps of 16 bytes each: 128 output bytes,
+    FILTER_W8o_1 %1, 16                     ; i.e. 64 16-bit samples of a single row
+    FILTER_W8o_1 %1, 32
+    FILTER_W8o_1 %1, 48
+    FILTER_W8o_1 %1, 64
+    FILTER_W8o_1 %1, 80
+    FILTER_W8o_1 %1, 96
+    FILTER_W8o_1 %1, 112
+%endmacro
+
+cglobal chroma_filter_pp_64x1_internal      ; no argument/register counts given: relies on registers prepared by its caller
+    FILTER_W64_1 pp
+    ret                                     ; plain ret rather than the RET macro
+
+cglobal chroma_filter_ps_64x1_internal      ; ps counterpart of the routine above
+    FILTER_W64_1 ps
+    ret
+
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
@@ -1453,6 +1528,36 @@
 IPFILTER_CHROMA 32, 24, ps, 6, 7, 6
 IPFILTER_CHROMA 32, 32, ps, 6, 7, 6
 
+IPFILTER_CHROMA 6, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 12, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 12, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 24, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 24, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 48, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 6, 16, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 12, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 12, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 24, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 24, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 48, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 64, ps, 6, 7, 6
+
+IPFILTER_CHROMA 48, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 64, 48, pp, 5, 6, 8
+IPFILTER_CHROMA 64, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 64, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 64, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 48, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 48, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 16, ps, 6, 7, 6
+
 
 %macro PROCESS_CHROMA_SP_W4_4R 0
     movq       m0, [r0]
@@ -1494,7 +1599,7 @@
 ;-----------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_CHROMA_SS 4
 INIT_XMM sse2
-cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-1
+cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-gprsize
 
     add       r1d, r1d
     add       r3d, r3d
@@ -1508,7 +1613,7 @@
     lea       r6, [tab_ChromaCoeffV + r4]
 %endif
 
-    mov       byte [rsp], %2/4
+    mov       dword [rsp], %2/4
 
 %ifnidn %3, ss
     %ifnidn %3, ps
@@ -1587,7 +1692,7 @@
     lea       r0, [r0 + 4 * r1 - 2 * %1]
     lea       r2, [r2 + 4 * r3 - 2 * %1]
 
-    dec       byte [rsp]
+    dec       dword [rsp]
     jnz       .loopH
 
     RET
@@ -1653,6 +1758,65 @@
     FILTER_VER_CHROMA_SS 24, 32, pp, 8
     FILTER_VER_CHROMA_SS 32, 8, pp, 8
 
+
+    FILTER_VER_CHROMA_SS 16, 24, ss, 6
+    FILTER_VER_CHROMA_SS 12, 32, ss, 6
+    FILTER_VER_CHROMA_SS 4, 32, ss, 6
+    FILTER_VER_CHROMA_SS 32, 64, ss, 6
+    FILTER_VER_CHROMA_SS 16, 64, ss, 6
+    FILTER_VER_CHROMA_SS 32, 48, ss, 6
+    FILTER_VER_CHROMA_SS 24, 64, ss, 6
+
+    FILTER_VER_CHROMA_SS 16, 24, ps, 7
+    FILTER_VER_CHROMA_SS 12, 32, ps, 7
+    FILTER_VER_CHROMA_SS 4, 32, ps, 7
+    FILTER_VER_CHROMA_SS 32, 64, ps, 7
+    FILTER_VER_CHROMA_SS 16, 64, ps, 7
+    FILTER_VER_CHROMA_SS 32, 48, ps, 7
+    FILTER_VER_CHROMA_SS 24, 64, ps, 7
+
+    FILTER_VER_CHROMA_SS 16, 24, sp, 8
+    FILTER_VER_CHROMA_SS 12, 32, sp, 8
+    FILTER_VER_CHROMA_SS 4, 32, sp, 8
+    FILTER_VER_CHROMA_SS 32, 64, sp, 8
+    FILTER_VER_CHROMA_SS 16, 64, sp, 8
+    FILTER_VER_CHROMA_SS 32, 48, sp, 8
+    FILTER_VER_CHROMA_SS 24, 64, sp, 8
+
+    FILTER_VER_CHROMA_SS 16, 24, pp, 8
+    FILTER_VER_CHROMA_SS 12, 32, pp, 8
+    FILTER_VER_CHROMA_SS 4, 32, pp, 8
+    FILTER_VER_CHROMA_SS 32, 64, pp, 8
+    FILTER_VER_CHROMA_SS 16, 64, pp, 8
+    FILTER_VER_CHROMA_SS 32, 48, pp, 8
+    FILTER_VER_CHROMA_SS 24, 64, pp, 8
+
+
+    FILTER_VER_CHROMA_SS 48, 64, ss, 6
+    FILTER_VER_CHROMA_SS 64, 48, ss, 6
+    FILTER_VER_CHROMA_SS 64, 64, ss, 6
+    FILTER_VER_CHROMA_SS 64, 32, ss, 6
+    FILTER_VER_CHROMA_SS 64, 16, ss, 6
+
+    FILTER_VER_CHROMA_SS 48, 64, ps, 7
+    FILTER_VER_CHROMA_SS 64, 48, ps, 7
+    FILTER_VER_CHROMA_SS 64, 64, ps, 7
+    FILTER_VER_CHROMA_SS 64, 32, ps, 7
+    FILTER_VER_CHROMA_SS 64, 16, ps, 7
+
+    FILTER_VER_CHROMA_SS 48, 64, sp, 8
+    FILTER_VER_CHROMA_SS 64, 48, sp, 8
+    FILTER_VER_CHROMA_SS 64, 64, sp, 8
+    FILTER_VER_CHROMA_SS 64, 32, sp, 8
+    FILTER_VER_CHROMA_SS 64, 16, sp, 8
+
+    FILTER_VER_CHROMA_SS 48, 64, pp, 8
+    FILTER_VER_CHROMA_SS 64, 48, pp, 8
+    FILTER_VER_CHROMA_SS 64, 64, pp, 8
+    FILTER_VER_CHROMA_SS 64, 32, pp, 8
+    FILTER_VER_CHROMA_SS 64, 16, pp, 8
+
+
 %macro PROCESS_CHROMA_SP_W2_4R 1
     movd       m0, [r0]
     movd       m1, [r0 + r1]
@@ -1772,12 +1936,18 @@
 FILTER_VER_CHROMA_W2 4, sp, 8
 FILTER_VER_CHROMA_W2 8, sp, 8
 
+FILTER_VER_CHROMA_W2 16, ss, 5
+FILTER_VER_CHROMA_W2 16, pp, 8
+FILTER_VER_CHROMA_W2 16, ps, 6
+FILTER_VER_CHROMA_W2 16, sp, 8
+
+
 ;---------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert_%1_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;---------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_W4 2
+%macro FILTER_VER_CHROMA_W4 3                  ; args: block height, filter type (ss/pp/ps/sp), XMM register count
 INIT_XMM sse4
-cglobal interp_4tap_vert_%1_4x2, 5, 6, %2
+cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3
 
     add        r1d, r1d
     add        r3d, r3d
@@ -1791,11 +1961,15 @@
     lea        r5, [tab_ChromaCoeffV + r4]
 %endif
 
-%ifnidn %1, ss
-    %ifnidn %1, ps
+%ifnidn %1, 2                                  ; test the height argument: 4x2 is one straight-line pass, taller blocks loop
+    mov        r4d, %1/2                       ; r4d = row-pair counter (r4 is free once the coefficient pointer is in r5)
+%endif
+
+%ifnidn %2, ss
+    %ifnidn %2, ps
         pxor      m6, m6
         mova      m5, [pw_pixel_max]
-        %ifidn %1, pp
+        %ifidn %2, pp
             mova      m4, [tab_c_32]
         %else
             mova      m4, [tab_c_524800]
@@ -1805,6 +1979,10 @@
     %endif
 %endif
 
+%ifnidn %1, 2                                  ; must use the same height test as the counter setup above
+.loop:
+%endif
+
     movh       m0, [r0]
     movh       m1, [r0 + r1]
     punpcklwd  m0, m1                          ;m0=[0 1]
@@ -1825,11 +2003,11 @@
     pmaddwd    m3, [r5 + 1 * 16]
     paddd      m1, m3                          ;m1=[1+2+3+4]  Row2 done
 
-%ifidn %1, ss
+%ifidn %2, ss
     psrad     m0, 6
     psrad     m1, 6
     packssdw  m0, m1
-%elifidn %1, ps
+%elifidn %2, ps
     paddd     m0, m4
     paddd     m1, m4
     psrad     m0, 2
@@ -1838,7 +2016,7 @@
 %else
     paddd     m0, m4
     paddd     m1, m4
-    %ifidn %1, pp
+    %ifidn %2, pp
         psrad     m0, 6
         psrad     m1, 6
     %else
@@ -1852,20 +2030,31 @@
     movh       [r2], m0
     movhps     [r2 + r3], m0
 
+%ifnidn %1, 2                                  ; loop tail only exists for heights above 2
+    lea        r2, [r2 + r3 * 2]               ; dst += 2 rows (lea does not touch flags)
+    dec        r4d
+    jnz        .loop
+%endif
+
     RET
 %endmacro
 
-FILTER_VER_CHROMA_W4 ss, 4
-FILTER_VER_CHROMA_W4 pp, 7
-FILTER_VER_CHROMA_W4 ps, 5
-FILTER_VER_CHROMA_W4 sp, 7
+FILTER_VER_CHROMA_W4 2, ss, 4
+FILTER_VER_CHROMA_W4 2, pp, 7
+FILTER_VER_CHROMA_W4 2, ps, 5
+FILTER_VER_CHROMA_W4 2, sp, 7
+
+FILTER_VER_CHROMA_W4 4, ss, 4
+FILTER_VER_CHROMA_W4 4, pp, 7
+FILTER_VER_CHROMA_W4 4, ps, 5
+FILTER_VER_CHROMA_W4 4, sp, 7
 
 ;-------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vertical_%1_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_W6 2
+%macro FILTER_VER_CHROMA_W6 3
 INIT_XMM sse4
-cglobal interp_4tap_vert_%1_6x8, 5, 7, %2
+cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3
 
     add       r1d, r1d
     add       r3d, r3d
@@ -1879,12 +2068,12 @@
     lea       r6, [tab_ChromaCoeffV + r4]
 %endif
 
-    mov       r4d, 8/4
-
-%ifnidn %1, ss
-    %ifnidn %1, ps
+    mov       r4d, %1/4
+
+%ifnidn %2, ss
+    %ifnidn %2, ps
         mova      m7, [pw_pixel_max]
-        %ifidn %1, pp
+        %ifidn %2, pp
             mova      m6, [tab_c_32]
         %else
             mova      m6, [tab_c_524800]
@@ -1897,7 +2086,7 @@
 .loopH:
     PROCESS_CHROMA_SP_W4_4R
 
-%ifidn %1, ss
+%ifidn %2, ss
     psrad     m0, 6
     psrad     m1, 6
     psrad     m2, 6
@@ -1905,7 +2094,7 @@
 
     packssdw  m0, m1
     packssdw  m2, m3
-%elifidn %1, ps
+%elifidn %2, ps
     paddd     m0, m6
     paddd     m1, m6
     paddd     m2, m6
@@ -1922,7 +2111,7 @@
     paddd     m1, m6
     paddd     m2, m6
     paddd     m3, m6
-    %ifidn %1, pp
+    %ifidn %2, pp
         psrad     m0, 6
         psrad     m1, 6
         psrad     m2, 6
@@ -1952,11 +2141,11 @@
 
     PROCESS_CHROMA_SP_W2_4R r6
 
-%ifidn %1, ss
+%ifidn %2, ss
     psrad     m0, 6
     psrad     m2, 6
     packssdw  m0, m2
-%elifidn %1, ps
+%elifidn %2, ps
     paddd     m0, m6
     paddd     m2, m6
     psrad     m0, 2
@@ -1965,7 +2154,7 @@
 %else
     paddd     m0, m6
     paddd     m2, m6
-    %ifidn %1, pp
+    %ifidn %2, pp
         psrad     m0, 6
         psrad     m2, 6
     %else
@@ -1991,10 +2180,15 @@
     RET
 %endmacro
 
-FILTER_VER_CHROMA_W6 ss, 6
-FILTER_VER_CHROMA_W6 ps, 7
-FILTER_VER_CHROMA_W6 sp, 8
-FILTER_VER_CHROMA_W6 pp, 8
+FILTER_VER_CHROMA_W6 8, ss, 6
+FILTER_VER_CHROMA_W6 8, ps, 7
+FILTER_VER_CHROMA_W6 8, sp, 8
+FILTER_VER_CHROMA_W6 8, pp, 8
+
+FILTER_VER_CHROMA_W6 16, ss, 6
+FILTER_VER_CHROMA_W6 16, ps, 7
+FILTER_VER_CHROMA_W6 16, sp, 8
+FILTER_VER_CHROMA_W6 16, pp, 8
 
 %macro PROCESS_CHROMA_SP_W8_2R 0
     movu       m1, [r0]
@@ -2143,6 +2337,14 @@
 FILTER_VER_CHROMA_W8 8, 16, pp, 8
 FILTER_VER_CHROMA_W8 8, 32, pp, 8
 
+FILTER_VER_CHROMA_W8 8, 12, ss, 7
+FILTER_VER_CHROMA_W8 8, 64, ss, 7
+FILTER_VER_CHROMA_W8 8, 12, sp, 8
+FILTER_VER_CHROMA_W8 8, 64, sp, 8
+FILTER_VER_CHROMA_W8 8, 12, ps, 8
+FILTER_VER_CHROMA_W8 8, 64, ps, 8
+FILTER_VER_CHROMA_W8 8, 12, pp, 8
+FILTER_VER_CHROMA_W8 8, 64, pp, 8
 
 
 INIT_XMM sse2
@@ -2273,7 +2475,7 @@
 ;--------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_LUMA_PP 2
 INIT_XMM sse4
-cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-1
+cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-gprsize
 
     add       r1d, r1d
     add       r3d, r3d
@@ -2290,7 +2492,7 @@
 
     mova      m7, [pd_32]
 
-    mov       byte [rsp], %2/4
+    mov       dword [rsp], %2/4
 .loopH:
     mov       r4d, (%1/4)
 .loopW:
@@ -2329,7 +2531,7 @@
     lea       r0, [r0 + 4 * r1 - 2 * %1]
     lea       r2, [r2 + 4 * r3 - 2 * %1]
 
-    dec       byte [rsp]
+    dec       dword [rsp]
     jnz       .loopH
 
     RET
@@ -2369,7 +2571,7 @@
 ;---------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_LUMA_PS 2
 INIT_XMM sse4
-cglobal interp_8tap_vert_ps_%1x%2, 5, 7, 8 ,0-1
+cglobal interp_8tap_vert_ps_%1x%2, 5, 7, 8 ,0-gprsize
 
     add       r1d, r1d
     add       r3d, r3d
@@ -2386,7 +2588,7 @@
 
     mova      m7, [pd_n32768]
 
-    mov       byte [rsp], %2/4
+    mov       dword [rsp], %2/4
 .loopH:
     mov       r4d, (%1/4)
 .loopW:
@@ -2421,7 +2623,7 @@
     lea       r0, [r0 + 4 * r1 - 2 * %1]
     lea       r2, [r2 + 4 * r3 - 2 * %1]
 
-    dec       byte [rsp]
+    dec       dword [rsp]
     jnz       .loopH
 
     RET
@@ -2461,7 +2663,7 @@
 ;--------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_LUMA_SP 2
 INIT_XMM sse4
-cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-1
+cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
 
     add       r1d, r1d
     add       r3d, r3d
@@ -2478,7 +2680,7 @@
 
     mova      m7, [tab_c_524800]
 
-    mov       byte [rsp], %2/4
+    mov       dword [rsp], %2/4
 .loopH:
     mov       r4d, (%1/4)
 .loopW:
@@ -2517,7 +2719,7 @@
     lea       r0, [r0 + 4 * r1 - 2 * %1]
     lea       r2, [r2 + 4 * r3 - 2 * %1]
 
-    dec       byte [rsp]
+    dec       dword [rsp]
     jnz       .loopH
 
     RET
@@ -2557,7 +2759,7 @@
 ;-----------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_LUMA_SS 2
 INIT_XMM sse2
-cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize
 
     add        r1d, r1d
     add        r3d, r3d
@@ -2572,7 +2774,7 @@
     lea        r6, [tab_LumaCoeffV + r4]
 %endif
 
-    mov        byte [rsp], %2/4
+    mov        dword [rsp], %2/4
 .loopH:
     mov        r4d, (%1/4)
 .loopW:
@@ -2601,7 +2803,7 @@
     lea        r0, [r0 + 4 * r1 - 2 * %1]
     lea        r2, [r2 + 4 * r3 - 2 * %1]
 
-    dec        byte [rsp]
+    dec        dword [rsp]
     jnz        .loopH
 
     RET
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/ipfilter8.asm	Tue Aug 05 21:41:53 2014 +0900
@@ -211,6 +211,41 @@
 
 RET
 
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride
+%define coef2       m4
+%define Tm0         m3
+%define t2          m2
+%define t1          m1
+%define t0          m0
+
+mov         r4d,        r4m                    ; r4d = coeffIdx, the fifth argument (cglobal auto-loads only four)
+
+%ifdef PIC
+lea         r5,          [tab_ChromaCoeff]     ; PIC build: take the table address in a register first
+movd        coef2,       [r5 + r4 * 4]         ; 4-byte coefficient entry selected by coeffIdx
+%else
+movd        coef2,       [tab_ChromaCoeff + r4 * 4] ; same 4-byte entry, addressed directly
+%endif
+
+pshufd      coef2,       coef2,      0         ; replicate that dword into all four lanes
+mova        t2,          [tab_c_512]           ; constant passed to FILTER_H4_w2_2 as its third argument
+mova        Tm0,         [tab_Tm]              ; shuffle mask; presumably read by FILTER_H4_w2_2 through the Tm0 alias - confirm in the macro
+
+mov         r5d,        16/2                   ; 8 passes; src and dst each advance two rows per pass
+
+.loop:
+FILTER_H4_w2_2   t0, t1, t2
+lea         srcq,       [srcq + srcstrideq * 2] ; source down two rows
+lea         dstq,       [dstq + dststrideq * 2] ; destination down two rows
+dec         r5d                                ; one pass finished
+jnz         .loop
+
+RET
+
 %macro FILTER_H4_w4_2 3
     movh        %2, [srcq - 1]
     pshufb      %2, %2, Tm0
@@ -350,6 +385,42 @@
 
 RET
 
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride
+%define coef2       m4
+%define Tm0         m3
+%define t2          m2
+%define t1          m1
+%define t0          m0
+
+mov         r4d,        r4m                    ; r4d = coeffIdx, the fifth argument (cglobal auto-loads only four)
+
+%ifdef PIC
+lea         r5,          [tab_ChromaCoeff]     ; PIC build: take the table address in a register first
+movd        coef2,       [r5 + r4 * 4]         ; 4-byte coefficient entry selected by coeffIdx
+%else
+movd        coef2,       [tab_ChromaCoeff + r4 * 4] ; same 4-byte entry, addressed directly
+%endif
+
+pshufd      coef2,       coef2,      0         ; replicate that dword into all four lanes
+mova        t2,          [tab_c_512]           ; constant passed to FILTER_H4_w4_2 as its third argument
+mova        Tm0,         [tab_Tm]              ; shuffle mask used by FILTER_H4_w4_2 through the Tm0 alias
+
+mov         r5d,        32/2                   ; 16 passes; src and dst each advance two rows per pass
+
+.loop:
+FILTER_H4_w4_2   t0, t1, t2
+lea         srcq,       [srcq + srcstrideq * 2] ; source down two rows
+lea         dstq,       [dstq + dststrideq * 2] ; destination down two rows
+dec         r5d                                ; one pass finished
+jnz         .loop
+
+RET
+
+
 %macro FILTER_H4_w6 3
     movu        %1, [srcq - 1]
     pshufb      %2, %1, Tm0
@@ -475,6 +546,38 @@
     movu        [dstq + 16],      %2
 %endmacro
 
+%macro FILTER_H4_w16o 5         ; %1, %2, %4 = scratch regs, %3 = pmulhrsw multiplier, %5 = byte offset within the row
+    movu        %1, [srcq + %5 - 1]            ; 16 source bytes starting one byte left of offset %5
+    pshufb      %2, %1, Tm0                    ; gather bytes according to the Tm0 mask
+    pmaddubsw   %2, coef2                      ; multiply by coefficient bytes, summing adjacent pairs into words
+    pshufb      %1, %1, Tm1                    ; second gather, according to the Tm1 mask
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1                         ; horizontally add word pairs from both gathers into %2
+    movu        %1, [srcq + %5 - 1 + 8]        ; repeat for the source bytes 8 further along
+    pshufb      %4, %1, Tm0
+    pmaddubsw   %4, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %4, %1                         ; second set of word sums in %4
+    pmulhrsw    %2, %3                         ; rounded scaling of the sums by %3
+    pmulhrsw    %4, %3
+    packuswb    %2, %4                         ; saturate both word vectors to 16 unsigned bytes
+    movu        [dstq + %5],      %2           ; store 16 output bytes at offset %5
+%endmacro
+
+%macro FILTER_H4_w48 4          ; one 48-byte row as three FILTER_H4_w16o groups at offsets 0/16/32; %1-%4 are forwarded unchanged
+    FILTER_H4_w16o %1, %2, %3, %4, 0
+    FILTER_H4_w16o %1, %2, %3, %4, 16
+    FILTER_H4_w16o %1, %2, %3, %4, 32
+%endmacro
+
+%macro FILTER_H4_w64 4          ; one 64-byte row as four FILTER_H4_w16o groups at offsets 0/16/32/48; %1-%4 are forwarded unchanged
+    FILTER_H4_w16o %1, %2, %3, %4, 0
+    FILTER_H4_w16o %1, %2, %3, %4, 16
+    FILTER_H4_w16o %1, %2, %3, %4, 32
+    FILTER_H4_w16o %1, %2, %3, %4, 48
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
@@ -525,6 +628,11 @@
 IPFILTER_CHROMA 8,  32
 IPFILTER_CHROMA 12, 16
 
+IPFILTER_CHROMA 6,  16
+IPFILTER_CHROMA 8,  12
+IPFILTER_CHROMA 8,  64
+IPFILTER_CHROMA 12, 32
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
@@ -577,6 +685,18 @@
 IPFILTER_CHROMA_W 24, 32
 IPFILTER_CHROMA_W 32, 32
 
+IPFILTER_CHROMA_W 16, 24
+IPFILTER_CHROMA_W 16, 64
+IPFILTER_CHROMA_W 32, 48
+IPFILTER_CHROMA_W 24, 64
+IPFILTER_CHROMA_W 32, 64
+
+IPFILTER_CHROMA_W 64, 64
+IPFILTER_CHROMA_W 64, 32
+IPFILTER_CHROMA_W 64, 48
+IPFILTER_CHROMA_W 48, 64
+IPFILTER_CHROMA_W 64, 16
+
 
 %macro FILTER_H8_W8 7-8   ; t0, t1, t2, t3, coef, c512, src, dst
     movu        %1, %7
@@ -987,7 +1107,7 @@
 ;-----------------------------------------------------------------------------
 %macro FILTER_V4_W2_H4 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_2x8, 4, 6, 8
+cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8
 
 mov         r4d,       r4m
 sub         r0,        r1
@@ -1067,6 +1187,8 @@
 
 FILTER_V4_W2_H4 2, 8
 
+FILTER_V4_W2_H4 2, 16
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
@@ -1273,6 +1395,8 @@
 FILTER_V4_W4_H4 4,  8
 FILTER_V4_W4_H4 4, 16
 
+FILTER_V4_W4_H4 4, 32
+
 %macro FILTER_V4_W8_H2 0
 punpcklbw   m1,        m2
 punpcklbw   m7,        m3,        m0
@@ -1640,6 +1764,8 @@
 FILTER_V_PS_W4_H4 4, 8
 FILTER_V_PS_W4_H4 4, 16
 
+FILTER_V_PS_W4_H4 4, 32
+
 ;--------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;--------------------------------------------------------------------------------------------------------------
@@ -1708,6 +1834,9 @@
 FILTER_V_PS_W8_H8_H16_H2 8, 4
 FILTER_V_PS_W8_H8_H16_H2 8, 6
 
+FILTER_V_PS_W8_H8_H16_H2 8, 12
+FILTER_V_PS_W8_H8_H16_H2 8, 64
+
 ;--------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;--------------------------------------------------------------------------------------------------------------
@@ -1803,8 +1932,9 @@
 ;------------------------------------------------------------------------------------------------------------
 ;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W6 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_ps_6x8, 4, 6, 8
+cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8
 
     mov        r4d, r4m
     sub        r0, r1
@@ -1821,7 +1951,7 @@
     pshufb     m5, [tab_Vm + 16]
     mova       m4, [pw_2000]
     lea        r5, [3 * r1]
-    mov        r4d, 2
+    mov        r4d, %2/4
 
 .loop:
     movq       m0, [r0]
@@ -1889,12 +2019,17 @@
     dec        r4d
     jnz        .loop
     RET
+%endmacro
+
+FILTER_V_PS_W6 6, 8
+FILTER_V_PS_W6 6, 16
 
 ;---------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;---------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W12 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_ps_12x16, 4, 6, 8
+cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8
 
     mov        r4d, r4m
     sub        r0, r1
@@ -1910,7 +2045,7 @@
     pshufb     m1, m0, [tab_Vm]
     pshufb     m0, [tab_Vm + 16]
 
-    mov        r4d, 16/2
+    mov        r4d, %2/2
 
 .loop:
     movu       m2, [r0]
@@ -1970,6 +2105,10 @@
     dec        r4d
     jnz        .loop
     RET
+%endmacro
+
+FILTER_V_PS_W12 12, 16
+FILTER_V_PS_W12 12, 32
 
 ;---------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -2059,11 +2198,15 @@
 FILTER_V_PS_W16 16, 16
 FILTER_V_PS_W16 16, 32
 
+FILTER_V_PS_W16 16, 24
+FILTER_V_PS_W16 16, 64
+
 ;--------------------------------------------------------------------------------------------------------------
 ;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_V4_PS_W24 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_ps_24x32, 4, 6, 8
+cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8
 
     mov        r4d, r4m
     sub        r0, r1
@@ -2079,7 +2222,7 @@
     pshufb     m1, m0, [tab_Vm]
     pshufb     m0, [tab_Vm + 16]
 
-    mov        r4d, 32/2
+    mov        r4d, %2/2
 
 .loop:
     movu       m2, [r0]
@@ -2170,6 +2313,11 @@
     dec        r4d
     jnz        .loop
     RET
+%endmacro
+
+FILTER_V4_PS_W24 24, 32
+
+FILTER_V4_PS_W24 24, 64
 
 ;---------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -2265,6 +2413,9 @@
 FILTER_V_PS_W32 32, 24
 FILTER_V_PS_W32 32, 32
 
+FILTER_V_PS_W32 32, 48
+FILTER_V_PS_W32 32, 64
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
@@ -2359,12 +2510,16 @@
 FILTER_V4_W8_H8_H16_H32 8, 16
 FILTER_V4_W8_H8_H16_H32 8, 32
 
+FILTER_V4_W8_H8_H16_H32 8, 12
+FILTER_V4_W8_H8_H16_H32 8, 64
+
+
 ;-----------------------------------------------------------------------------
 ;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
 %macro FILTER_V4_W6_H4 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_6x8, 4, 6, 8
+cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8
 
 mov         r4d,       r4m
 sub         r0,        r1
@@ -2455,12 +2610,14 @@
 
 FILTER_V4_W6_H4 6, 8
 
+FILTER_V4_W6_H4 6, 16
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
 %macro FILTER_V4_W12_H2 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_12x16, 4, 6, 8
+cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8
 
 mov         r4d,       r4m
 sub         r0,        r1
@@ -2543,12 +2700,14 @@
 
 FILTER_V4_W12_H2 12, 16
 
+FILTER_V4_W12_H2 12, 32
+
 ;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
 %macro FILTER_V4_W16_H2 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
+cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8
 
 mov         r4d,       r4m
 sub         r0,        r1
@@ -2563,7 +2722,7 @@
 pshufb      m1,        m0,       [tab_Vm]
 pshufb      m0,        [tab_Vm + 16]
 
-mov         r4d,       %2
+mov         r4d,       %2/2
 
 .loop:
 movu        m2,        [r0]
@@ -2622,7 +2781,7 @@
 
 lea         r2,        [r2 + 2 * r3]
 
-sub         r4,        2
+dec         r4d
 jnz        .loop
 RET
 %endmacro
@@ -2633,12 +2792,15 @@
 FILTER_V4_W16_H2 16, 16
 FILTER_V4_W16_H2 16, 32
 
+FILTER_V4_W16_H2 16, 24
+FILTER_V4_W16_H2 16, 64
+
 ;-----------------------------------------------------------------------------
 ;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
 %macro FILTER_V4_W24 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_24x32, 4, 6, 8
+cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8
 
 mov         r4d,       r4m
 sub         r0,        r1
@@ -2754,8 +2916,10 @@
 
 FILTER_V4_W24 24, 32
 
+FILTER_V4_W24 24, 64
+
 ;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
 %macro FILTER_V4_W32 2
 INIT_XMM sse4
@@ -2849,6 +3013,111 @@
 FILTER_V4_W32 32, 24
 FILTER_V4_W32 32, 32
 
+FILTER_V4_W32 32, 48
+FILTER_V4_W32 32, 64
+
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W16n_H2 2      ; %1 = width, walked in 16-byte columns; %2 = height, walked two rows at a time
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+
+mov         r4d,       r4m                     ; r4d = coeffIdx, the fifth argument (cglobal auto-loads only four)
+sub         r0,        r1                      ; start reading one row above the block
+
+%ifdef PIC
+lea         r5,        [tab_ChromaCoeff]       ; PIC build: take the table address in a register first
+movd        m0,        [r5 + r4 * 4]           ; 4-byte coefficient entry selected by coeffIdx
+%else
+movd        m0,        [tab_ChromaCoeff + r4 * 4] ; same 4-byte entry, addressed directly
+%endif
+
+pshufb      m1,        m0,       [tab_Vm]      ; m1 = coefficient bytes applied to the first interleaved row pair
+pshufb      m0,        [tab_Vm + 16]           ; m0 = coefficient bytes applied to the second interleaved row pair
+
+mov         r4d,       %2/2                    ; r4d now counts row pairs; coeffIdx is no longer needed
+
+.loop:
+
+mov         r6d,       %1/16                   ; r6d = 16-byte columns left in this row pair
+
+.loopW:
+
+movu        m2,        [r0]                    ; input row 0
+movu        m3,        [r0 + r1]               ; input row 1
+
+punpcklbw   m4,        m2,        m3           ; interleave rows 0/1, low 8 bytes
+punpckhbw   m2,        m3                      ; interleave rows 0/1, high 8 bytes
+
+pmaddubsw   m4,        m1                      ; partial sums of the first two rows
+pmaddubsw   m2,        m1
+
+lea         r5,        [r0 + 2 * r1]           ; r5 -> input row 2 (r5 is free after the table lookup)
+movu        m5,        [r5]                    ; input row 2
+movu        m6,        [r5 + r1]               ; input row 3
+
+punpckhbw   m7,        m5,        m6           ; interleave rows 2/3, high 8 bytes
+pmaddubsw   m7,        m0
+paddw       m2,        m7                      ; high half: all four rows summed
+
+punpcklbw   m7,        m5,        m6           ; interleave rows 2/3, low 8 bytes
+pmaddubsw   m7,        m0
+paddw       m4,        m7                      ; low half: all four rows summed
+
+mova        m7,        [tab_c_512]             ; pmulhrsw multiplier; presumably words of 512 (a rounded >>6) - confirm against the table
+
+pmulhrsw    m4,        m7
+pmulhrsw    m2,        m7
+
+packuswb    m4,        m2                      ; saturate to 16 unsigned bytes
+
+movu        [r2],      m4                      ; first output row of the pair
+
+punpcklbw   m4,        m3,        m5           ; second output row: interleave rows 1/2, low 8 bytes
+punpckhbw   m3,        m5                      ; interleave rows 1/2, high 8 bytes
+
+pmaddubsw   m4,        m1
+pmaddubsw   m3,        m1
+
+movu        m5,        [r5 + 2 * r1]           ; input row 4
+
+punpcklbw   m2,        m6,        m5           ; interleave rows 3/4, low 8 bytes
+punpckhbw   m6,        m5                      ; interleave rows 3/4, high 8 bytes
+
+pmaddubsw   m2,        m0
+pmaddubsw   m6,        m0
+
+paddw       m4,        m2
+paddw       m3,        m6
+
+pmulhrsw    m4,        m7
+pmulhrsw    m3,        m7
+
+packuswb    m4,        m3
+
+movu        [r2 + r3],      m4                 ; second output row of the pair
+
+add         r0,        16                      ; next 16-byte column of the source
+add         r2,        16                      ; next 16-byte column of the destination
+dec         r6d
+jnz         .loopW
+
+lea         r0,        [r0 + r1 * 2 - %1]      ; undo the %1 bytes walked, then move down two rows
+lea         r2,        [r2 + r3 * 2 - %1]
+
+dec         r4d                                ; one row pair finished
+jnz        .loop
+RET
+%endmacro
+
+FILTER_V4_W16n_H2 64, 64
+FILTER_V4_W16n_H2 64, 32
+FILTER_V4_W16n_H2 64, 48
+FILTER_V4_W16n_H2 48, 64
+FILTER_V4_W16n_H2 64, 16
+
 
 ;-----------------------------------------------------------------------------
 ; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
@@ -3350,7 +3619,7 @@
 ;-------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_LUMA 3
 INIT_XMM sse4
-cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-1
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
     lea       r5, [3 * r1]
     sub       r0, r5
     shl       r4d, 6
@@ -3370,7 +3639,7 @@
 %else
     mova      m3, [pw_2000]
 %endif
-    mov       byte [rsp], %2/4
+    mov       dword [rsp], %2/4
 
 .loopH:
     mov       r4d, (%1/8)
@@ -3420,7 +3689,7 @@
     lea       r2, [r2 + 4 * r3 - 2 * %1]
 %endif
 
-    dec       byte [rsp]
+    dec       dword [rsp]
     jnz       .loopH
 
     RET
@@ -3532,7 +3801,7 @@
 ;--------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_LUMA_SP 2
 INIT_XMM sse4
-cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-1
+cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
 
     add       r1d, r1d
     lea       r5, [r1 + 2 * r1]
@@ -3548,7 +3817,7 @@
 
     mova      m7, [tab_c_526336]
 
-    mov       byte [rsp], %2/4
+    mov       dword [rsp], %2/4
 .loopH:
     mov       r4d, (%1/4)
 .loopW:
@@ -3585,7 +3854,7 @@
     lea       r0, [r0 + 4 * r1 - 2 * %1]
     lea       r2, [r2 + 4 * r3 - %1]
 
-    dec       byte [rsp]
+    dec       dword [rsp]
     jnz       .loopH
 
     RET
@@ -3720,7 +3989,7 @@
 ;--------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_CHROMA_SP 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-1
+cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize
 
     add       r1d, r1d
     sub       r0, r1
@@ -3735,7 +4004,7 @@
 
     mova      m6, [tab_c_526336]
 
-    mov       byte [rsp], %2/4
+    mov       dword [rsp], %2/4
 
 .loopH:
     mov       r4d, (%1/4)
@@ -3773,7 +4042,7 @@
     lea       r0, [r0 + 4 * r1 - 2 * %1]
     lea       r2, [r2 + 4 * r3 - %1]
 
-    dec       byte [rsp]
+    dec       dword [rsp]
     jnz       .loopH
 
     RET
@@ -3794,6 +4063,20 @@
     FILTER_VER_CHROMA_SP 24, 32
     FILTER_VER_CHROMA_SP 32, 8
 
+    FILTER_VER_CHROMA_SP 16, 24
+    FILTER_VER_CHROMA_SP 16, 64
+    FILTER_VER_CHROMA_SP 12, 32
+    FILTER_VER_CHROMA_SP 4, 32
+    FILTER_VER_CHROMA_SP 32, 64
+    FILTER_VER_CHROMA_SP 32, 48
+    FILTER_VER_CHROMA_SP 24, 64
+
+    FILTER_VER_CHROMA_SP 64, 64
+    FILTER_VER_CHROMA_SP 64, 32
+    FILTER_VER_CHROMA_SP 64, 48
+    FILTER_VER_CHROMA_SP 48, 64
+    FILTER_VER_CHROMA_SP 64, 16
+
 
 %macro PROCESS_CHROMA_SP_W2_4R 1
     movd       m0, [r0]
@@ -3879,6 +4162,8 @@
 FILTER_VER_CHROMA_SP_W2_4R 2, 4
 FILTER_VER_CHROMA_SP_W2_4R 2, 8
 
+FILTER_VER_CHROMA_SP_W2_4R 2, 16
+
 ;--------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;--------------------------------------------------------------------------------------------------------------
@@ -3931,10 +4216,11 @@
     RET
 
 ;-------------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vertical_sp_6x8(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_4tap_vertical_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SP_W6_H4 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_sp_6x8, 5, 7, 7
+cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7
 
     add       r1d, r1d
     sub       r0, r1
@@ -3949,7 +4235,7 @@
 
     mova      m6, [tab_c_526336]
 
-    mov       r4d, 8/4
+    mov       r4d, %2/4
 
 .loopH:
     PROCESS_CHROMA_SP_W4_4R
@@ -4003,6 +4289,11 @@
     jnz       .loopH
 
     RET
+%endmacro
+
+FILTER_VER_CHROMA_SP_W6_H4 6, 8
+
+FILTER_VER_CHROMA_SP_W6_H4 6, 16
 
 %macro PROCESS_CHROMA_SP_W8_2R 0
     movu       m1, [r0]
@@ -4093,6 +4384,10 @@
 FILTER_VER_CHROMA_SP_W8_H2 8, 16
 FILTER_VER_CHROMA_SP_W8_H2 8, 32
 
+FILTER_VER_CHROMA_SP_W8_H2 8, 12
+FILTER_VER_CHROMA_SP_W8_H2 8, 64
+
+
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------
@@ -4145,6 +4440,8 @@
 FILTER_HORIZ_CHROMA_2xN 2, 4
 FILTER_HORIZ_CHROMA_2xN 2, 8
 
+FILTER_HORIZ_CHROMA_2xN 2, 16
+
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------
@@ -4198,6 +4495,8 @@
 FILTER_HORIZ_CHROMA_4xN 4, 8
 FILTER_HORIZ_CHROMA_4xN 4, 16
 
+FILTER_HORIZ_CHROMA_4xN 4, 32
+
 %macro PROCESS_CHROMA_W6 3
     movu       %1, [srcq]
     pshufb     %2, %1, Tm0
@@ -4277,6 +4576,9 @@
 FILTER_HORIZ_CHROMA 6, 8
 FILTER_HORIZ_CHROMA 12, 16
 
+FILTER_HORIZ_CHROMA 6, 16
+FILTER_HORIZ_CHROMA 12, 32
+
 %macro PROCESS_CHROMA_W8 3
     movu        %1, [srcq]
     pshufb      %2, %1, Tm0
@@ -4341,6 +4643,9 @@
 FILTER_HORIZ_CHROMA_8xN 8, 16
 FILTER_HORIZ_CHROMA_8xN 8, 32
 
+FILTER_HORIZ_CHROMA_8xN 8, 12
+FILTER_HORIZ_CHROMA_8xN 8, 64
+
 %macro PROCESS_CHROMA_W16 4
     movu        %1, [srcq]
     pshufb      %2, %1, Tm0
@@ -4422,6 +4727,38 @@
     movu        [dstq + 48], %4
 %endmacro
 
+%macro PROCESS_CHROMA_W16o 5    ; %1, %2, %4 = scratch regs, %3 = word offset to subtract, %5 = pixel offset within the row
+    movu        %1, [srcq + %5]                ; 16 source bytes at offset %5
+    pshufb      %2, %1, Tm0                    ; gather bytes according to the Tm0 mask
+    pmaddubsw   %2, coef2                      ; multiply by coefficient bytes, summing adjacent pairs into words
+    pshufb      %1, %1, Tm1                    ; second gather, according to the Tm1 mask
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1                         ; horizontally add word pairs from both gathers into %2
+    movu        %1, [srcq + %5 + 8]            ; repeat for the source bytes 8 further along
+    pshufb      %4, %1, Tm0
+    pmaddubsw   %4, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %4, %1                         ; second set of word sums in %4
+    psubw       %2, %3                         ; subtract the offset in %3 from every word
+    psubw       %4, %3
+    movu        [dstq + %5 * 2], %2            ; results stay 16-bit, so the destination byte offset is doubled
+    movu        [dstq + %5 * 2 + 16], %4       ; second 8 words follow 16 bytes later
+%endmacro
+
+%macro PROCESS_CHROMA_W48 4     ; one 48-pixel row as three PROCESS_CHROMA_W16o groups at offsets 0/16/32; %1-%4 are forwarded unchanged
+    PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
+    PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
+    PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
+%endmacro
+
+%macro PROCESS_CHROMA_W64 4     ; one 64-pixel row as four PROCESS_CHROMA_W16o groups at offsets 0/16/32/48; %1-%4 are forwarded unchanged
+    PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
+    PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
+    PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
+    PROCESS_CHROMA_W16o %1, %2, %3, %4, 48
+%endmacro
+
 ;------------------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;------------------------------------------------------------------------------------------------------------------------------
@@ -4480,6 +4817,119 @@
 FILTER_HORIZ_CHROMA_WxN 32, 24
 FILTER_HORIZ_CHROMA_WxN 32, 32
 
+FILTER_HORIZ_CHROMA_WxN 16, 24
+FILTER_HORIZ_CHROMA_WxN 16, 64
+FILTER_HORIZ_CHROMA_WxN 24, 64
+FILTER_HORIZ_CHROMA_WxN 32, 48
+FILTER_HORIZ_CHROMA_WxN 32, 64
+
+FILTER_HORIZ_CHROMA_WxN 64, 64
+FILTER_HORIZ_CHROMA_WxN 64, 32
+FILTER_HORIZ_CHROMA_WxN 64, 48
+FILTER_HORIZ_CHROMA_WxN 48, 64
+FILTER_HORIZ_CHROMA_WxN 64, 16
+
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W16n 2       ; %1 = width, walked in 16-pixel columns; %2 = height, walked two rows at a time
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
+
+    mov        r4d, r4m                        ; r4d = coeffIdx, the fifth argument (cglobal auto-loads only four)
+    sub        r0, r1                          ; start reading one row above the block
+    add        r3d, r3d                        ; dst holds int16_t, so double dstStride to get a byte stride
+
+%ifdef PIC
+    lea        r5, [tab_ChromaCoeff]           ; PIC build: take the table address in a register first
+    movd       m0, [r5 + r4 * 4]               ; 4-byte coefficient entry selected by coeffIdx
+%else
+    movd       m0, [tab_ChromaCoeff + r4 * 4]  ; same 4-byte entry, addressed directly
+%endif
+
+    pshufb     m1, m0, [tab_Vm]                ; m1 = coefficient bytes applied to the first interleaved row pair
+    pshufb     m0, [tab_Vm + 16]               ; m0 = coefficient bytes applied to the second interleaved row pair
+    mov        r4d, %2/2                       ; r4d now counts row pairs; coeffIdx is no longer needed
+
+.loop:
+
+    mov         r6d,       %1/16               ; r6d = 16-pixel columns left in this row pair
+
+.loopW:
+
+    movu       m2, [r0]                        ; input row 0
+    movu       m3, [r0 + r1]                   ; input row 1
+
+    punpcklbw  m4, m2, m3                      ; interleave rows 0/1, low 8 bytes
+    punpckhbw  m2, m3                          ; interleave rows 0/1, high 8 bytes
+
+    pmaddubsw  m4, m1                          ; partial sums of the first two rows
+    pmaddubsw  m2, m1
+
+    lea        r5, [r0 + 2 * r1]               ; r5 -> input row 2 (r5 is free after the table lookup)
+    movu       m5, [r5]                        ; input row 2
+    movu       m7, [r5 + r1]                   ; input row 3
+
+    punpcklbw  m6, m5, m7                      ; interleave rows 2/3, low 8 bytes
+    pmaddubsw  m6, m0
+    paddw      m4, m6                          ; low half: all four rows summed
+
+    punpckhbw  m6, m5, m7                      ; interleave rows 2/3, high 8 bytes
+    pmaddubsw  m6, m0
+    paddw      m2, m6                          ; high half: all four rows summed
+
+    mova       m6, [pw_2000]                   ; offset subtracted from every 16-bit result (m6 was scratch above)
+
+    psubw      m4, m6
+    psubw      m2, m6
+
+    movu       [r2], m4                        ; first output row: low 8 words
+    movu       [r2 + 16], m2                   ; first output row: high 8 words
+
+    punpcklbw  m4, m3, m5                      ; second output row: interleave rows 1/2, low 8 bytes
+    punpckhbw  m3, m5                          ; interleave rows 1/2, high 8 bytes
+
+    pmaddubsw  m4, m1
+    pmaddubsw  m3, m1
+
+    movu       m5, [r5 + 2 * r1]               ; input row 4
+
+    punpcklbw  m2, m7, m5                      ; interleave rows 3/4, low 8 bytes
+    punpckhbw  m7, m5                          ; interleave rows 3/4, high 8 bytes
+
+    pmaddubsw  m2, m0
+    pmaddubsw  m7, m0
+
+    paddw      m4, m2
+    paddw      m3, m7
+
+    psubw      m4, m6
+    psubw      m3, m6
+
+    movu       [r2 + r3], m4                   ; second output row: low 8 words
+    movu       [r2 + r3 + 16], m3              ; second output row: high 8 words
+
+    add         r0,        16                  ; next 16 source pixels (1 byte each)
+    add         r2,        32                  ; next 16 outputs (2 bytes each)
+    dec         r6d
+    jnz         .loopW
+
+    lea         r0,        [r0 + r1 * 2 - %1]  ; undo the %1 bytes walked, then move down two rows
+    lea         r2,        [r2 + r3 * 2 - %1 * 2] ; destination walked 2 * %1 bytes across the row
+
+    dec        r4d                             ; one row pair finished
+    jnz        .loop
+    RET
+%endmacro
+
+FILTER_V_PS_W16n 64, 64
+FILTER_V_PS_W16n 64, 32
+FILTER_V_PS_W16n 64, 48
+FILTER_V_PS_W16n 48, 64
+FILTER_V_PS_W16n 64, 16
+
+
 ;------------------------------------------------------------------------------------------------------------
 ;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;------------------------------------------------------------------------------------------------------------
@@ -4556,8 +5006,9 @@
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W2 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_ps_2x8, 4, 6, 8
+cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8
 
     mov        r4d, r4m
     sub        r0, r1
@@ -4574,7 +5025,7 @@
 
     mova       m1, [pw_2000]
     lea        r5, [3 * r1]
-    mov        r4d, 2
+    mov        r4d, %2/4
 .loop:
     movd       m2, [r0]
     movd       m3, [r0 + r1]
@@ -4635,13 +5086,18 @@
     jnz        .loop
 
 RET
+%endmacro
+
+FILTER_V_PS_W2 2, 8
+
+FILTER_V_PS_W2 2, 16
 
 ;-----------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_CHROMA_SS 2
 INIT_XMM sse2
-cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-1
+cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize
 
     add       r1d, r1d
     add       r3d, r3d
@@ -4655,7 +5111,7 @@
     lea       r6, [tab_ChromaCoeffV + r4]
 %endif
 
-    mov       byte [rsp], %2/4
+    mov       dword [rsp], %2/4
 
 .loopH:
     mov       r4d, (%1/4)
@@ -4686,7 +5142,7 @@
     lea       r0, [r0 + 4 * r1 - 2 * %1]
     lea       r2, [r2 + 4 * r3 - 2 * %1]
 
-    dec       byte [rsp]
+    dec       dword [rsp]
     jnz       .loopH
 
     RET
@@ -4707,6 +5163,21 @@
     FILTER_VER_CHROMA_SS 24, 32
     FILTER_VER_CHROMA_SS 32, 8
 
+    FILTER_VER_CHROMA_SS 16, 24
+    FILTER_VER_CHROMA_SS 12, 32
+    FILTER_VER_CHROMA_SS 4, 32
+    FILTER_VER_CHROMA_SS 32, 64
+    FILTER_VER_CHROMA_SS 16, 64
+    FILTER_VER_CHROMA_SS 32, 48
+    FILTER_VER_CHROMA_SS 24, 64
+
+    FILTER_VER_CHROMA_SS 64, 64
+    FILTER_VER_CHROMA_SS 64, 32
+    FILTER_VER_CHROMA_SS 64, 48
+    FILTER_VER_CHROMA_SS 48, 64
+    FILTER_VER_CHROMA_SS 64, 16
+
+
 ;---------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;---------------------------------------------------------------------------------------------------------------------
@@ -4753,6 +5224,8 @@
 FILTER_VER_CHROMA_SS_W2_4R 2, 4
 FILTER_VER_CHROMA_SS_W2_4R 2, 8
 
+FILTER_VER_CHROMA_SS_W2_4R 2, 16
+
 ;---------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;---------------------------------------------------------------------------------------------------------------
@@ -4803,8 +5276,9 @@
 ;-------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SS_W6_H4 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_ss_6x8, 5, 7, 6
+cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6
 
     add       r1d, r1d
     add       r3d, r3d
@@ -4818,7 +5292,7 @@
     lea       r6, [tab_ChromaCoeffV + r4]
 %endif
 
-    mov       r4d, 8/4
+    mov       r4d, %2/4
 
 .loopH:
     PROCESS_CHROMA_SP_W4_4R
@@ -4861,6 +5335,12 @@
     jnz       .loopH
 
     RET
+%endmacro
+
+FILTER_VER_CHROMA_SS_W6_H4 6, 8
+
+FILTER_VER_CHROMA_SS_W6_H4 6, 16
+
 
 ;----------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -4911,12 +5391,15 @@
 FILTER_VER_CHROMA_SS_W8_H2 8, 16
 FILTER_VER_CHROMA_SS_W8_H2 8, 32
 
+FILTER_VER_CHROMA_SS_W8_H2 8, 12
+FILTER_VER_CHROMA_SS_W8_H2 8, 64
+
 ;-----------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_LUMA_SS 2
 INIT_XMM sse2
-cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize
 
     add        r1d, r1d
     add        r3d, r3d
@@ -4931,7 +5414,7 @@
     lea        r6, [tab_LumaCoeffV + r4]
 %endif
 
-    mov        byte [rsp], %2/4
+    mov        dword [rsp], %2/4
 .loopH:
     mov        r4d, (%1/4)
 .loopW:
@@ -5023,7 +5506,7 @@
     lea        r0, [r0 + 4 * r1 - 2 * %1]
     lea        r2, [r2 + 4 * r3 - 2 * %1]
 
-    dec        byte [rsp]
+    dec        dword [rsp]
     jnz        .loopH
 
     RET
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/ipfilter8.h	Tue Aug 05 21:41:53 2014 +0900
@@ -153,6 +153,60 @@
     SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \
     SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu);
 
+#define CHROMA_VERT_FILTERS_422(cpu) /* expands to one SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) per W x H size below; presumably the 4:2:2 chroma block sizes */ \
+    SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(8, 12, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(16, 24, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(12, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(4, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(32, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(16, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(32, 48, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(24, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(8, 64, cpu);
+
+#define CHROMA_VERT_FILTERS_SSE4_422(cpu) /* SETUP_CHROMA_VERT_FUNC_DEF for the sizes not listed in CHROMA_VERT_FILTERS_422; invoked below with _sse4 */ \
+    SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(2, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(6, 16, cpu);
+
+#define CHROMA_VERT_FILTERS_444(cpu) /* expands to one SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) per W x H size below; no trailing ';' -- the invocation supplies it */ \
+    SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(16, 4, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(32, 8, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(64, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(64, 32, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(32, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(64, 48, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(48, 64, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(64, 16, cpu); \
+    SETUP_CHROMA_VERT_FUNC_DEF(16, 64, cpu)
+
 #define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
     void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
     void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt);
@@ -183,6 +237,58 @@
     SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
     SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu)
 
+#define CHROMA_HORIZ_FILTERS_422(cpu) /* declares x265_interp_4tap_horiz_{pp,ps}_WxH<cpu> for each size; listed by width, then height */ \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(2, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(2, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(4, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(6, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(8, 12, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(8, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(12, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 24, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(24, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 48, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 64, cpu)
+
+#define CHROMA_HORIZ_FILTERS_444(cpu) /* declares x265_interp_4tap_horiz_{pp,ps}_WxH<cpu> for each size; listed by width, then height */ \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 4, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(48, 64, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(64, 16, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(64, 32, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(64, 48, cpu); \
+    SETUP_CHROMA_HORIZ_FUNC_DEF(64, 64, cpu)
+
 void x265_chroma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
 void x265_luma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
 
@@ -190,12 +296,26 @@
 CHROMA_HORIZ_FILTERS(_sse4);
 CHROMA_VERT_FILTERS_SSE4(_sse4);
 
+CHROMA_VERT_FILTERS_422(_sse2);
+CHROMA_HORIZ_FILTERS_422(_sse4);
+CHROMA_VERT_FILTERS_SSE4_422(_sse4);
+
+CHROMA_VERT_FILTERS_444(_sse2);
+CHROMA_HORIZ_FILTERS_444(_sse4);
+
 #undef CHROMA_VERT_FILTERS_SSE4
 #undef CHROMA_VERT_FILTERS
 #undef SETUP_CHROMA_VERT_FUNC_DEF
 #undef CHROMA_HORIZ_FILTERS
 #undef SETUP_CHROMA_HORIZ_FUNC_DEF
 
+#undef CHROMA_VERT_FILTERS_422
+#undef CHROMA_VERT_FILTERS_SSE4_422
+#undef CHROMA_HORIZ_FILTERS_422
+
+#undef CHROMA_VERT_FILTERS_444
+#undef CHROMA_HORIZ_FILTERS_444
+
 #else // if HIGH_BIT_DEPTH
 
 #define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
@@ -230,6 +350,58 @@
     SETUP_CHROMA_FUNC_DEF(32, 8, cpu); \
     SETUP_CHROMA_FUNC_DEF(8, 32, cpu)
 
+#define CHROMA_FILTERS_422(cpu) /* expands to one SETUP_CHROMA_FUNC_DEF(W, H, cpu) per W x H size below; presumably the 4:2:2 chroma block sizes */ \
+    SETUP_CHROMA_FUNC_DEF(4, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
+    SETUP_CHROMA_FUNC_DEF(2, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF(8, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(8, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF(4, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(8, 12, cpu); \
+    SETUP_CHROMA_FUNC_DEF(6, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(8, 4, cpu); \
+    SETUP_CHROMA_FUNC_DEF(2, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(8, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF(16, 24, cpu); \
+    SETUP_CHROMA_FUNC_DEF(12, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF(16, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF(4, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF(32, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF(16, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF(32, 48, cpu); \
+    SETUP_CHROMA_FUNC_DEF(24, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(8, 64, cpu);
+
+#define CHROMA_FILTERS_444(cpu) /* expands to one SETUP_CHROMA_FUNC_DEF(W, H, cpu) per W x H size below; presumably the 4:4:4 chroma block sizes */ \
+    SETUP_CHROMA_FUNC_DEF(8, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF(8, 4, cpu); \
+    SETUP_CHROMA_FUNC_DEF(4, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(16, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF(8, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(16, 12, cpu); \
+    SETUP_CHROMA_FUNC_DEF(12, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(16, 4, cpu); \
+    SETUP_CHROMA_FUNC_DEF(4, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF(32, 24, cpu); \
+    SETUP_CHROMA_FUNC_DEF(24, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF(32, 8, cpu); \
+    SETUP_CHROMA_FUNC_DEF(8, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF(64, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF(64, 32, cpu); \
+    SETUP_CHROMA_FUNC_DEF(32, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF(64, 48, cpu); \
+    SETUP_CHROMA_FUNC_DEF(48, 64, cpu); \
+    SETUP_CHROMA_FUNC_DEF(64, 16, cpu); \
+    SETUP_CHROMA_FUNC_DEF(16, 64, cpu);
+
 #define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
     void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
 
@@ -261,6 +433,60 @@
     SETUP_CHROMA_SP_FUNC_DEF(24, 32, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF(32, 8, cpu);
 
+#define CHROMA_SP_FILTERS_422(cpu) /* declares x265_interp_4tap_vert_sp_8xH<cpu> for the 8-wide sizes; invoked below with _sse2 */ \
+    SETUP_CHROMA_SP_FUNC_DEF(8, 4, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(8, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(8, 12, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(8, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(8, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(8, 64, cpu);
+
+#define CHROMA_SP_FILTERS_422_SSE4(cpu) /* declares x265_interp_4tap_vert_sp_WxH<cpu> for every non-8-wide size; listed by width, then height */ \
+    SETUP_CHROMA_SP_FUNC_DEF(2, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(2, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(4, 4, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(4, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(4, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(4, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(6, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(12, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(16, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(16, 24, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(16, 64, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(24, 64, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(32, 48, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(32, 64, cpu);
+
+#define CHROMA_SP_FILTERS_444(cpu) /* declares x265_interp_4tap_vert_sp_WxH<cpu> for each size; listed by width, then height */ \
+    SETUP_CHROMA_SP_FUNC_DEF(4, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(4, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(8, 4, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(8, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(8, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(8, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(12, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(16, 4, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(16, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(16, 12, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(16, 64, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(24, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(32, 8, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(32, 24, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(32, 64, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(48, 64, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(64, 16, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(64, 32, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(64, 48, cpu); \
+    SETUP_CHROMA_SP_FUNC_DEF(64, 64, cpu);
+
 #define SETUP_CHROMA_SS_FUNC_DEF(W, H, cpu) \
     void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
 
@@ -285,19 +511,83 @@
     SETUP_CHROMA_SS_FUNC_DEF(32, 24, cpu); \
     SETUP_CHROMA_SS_FUNC_DEF(24, 32, cpu); \
     SETUP_CHROMA_SS_FUNC_DEF(32, 8, cpu); \
-    SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu)
+    SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu);
 
 #define CHROMA_SS_FILTERS_SSE4(cpu) \
     SETUP_CHROMA_SS_FUNC_DEF(2, 4, cpu); \
     SETUP_CHROMA_SS_FUNC_DEF(2, 8, cpu); \
     SETUP_CHROMA_SS_FUNC_DEF(6, 8, cpu);
 
+#define CHROMA_SS_FILTERS_422(cpu) /* declares x265_interp_4tap_vert_ss_WxH<cpu> for each size; listed by width, then height */ \
+    SETUP_CHROMA_SS_FUNC_DEF(4, 4, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(4, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(4, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(4, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(8, 4, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(8, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(8, 12, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(8, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(8, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(12, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(16, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(16, 24, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(24, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(32, 48, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(32, 64, cpu);
+
+#define CHROMA_SS_FILTERS_422_SSE4(cpu) /* declares x265_interp_4tap_vert_ss_WxH<cpu> for the 2- and 6-wide sizes; invoked below with _sse4 */ \
+    SETUP_CHROMA_SS_FUNC_DEF(2, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(2, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(6, 16, cpu);
+
+#define CHROMA_SS_FILTERS_444(cpu) /* declares x265_interp_4tap_vert_ss_WxH<cpu> for each size; listed by width, then height */ \
+    SETUP_CHROMA_SS_FUNC_DEF(4, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(4, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(8, 4, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(8, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(8, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(12, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(16, 4, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(16, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(16, 12, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(24, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(32, 8, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(32, 24, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(32, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(48, 64, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(64, 16, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(64, 32, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(64, 48, cpu); \
+    SETUP_CHROMA_SS_FUNC_DEF(64, 64, cpu);
+
 CHROMA_FILTERS(_sse4);
 CHROMA_SP_FILTERS(_sse2);
 CHROMA_SP_FILTERS_SSE4(_sse4);
 CHROMA_SS_FILTERS(_sse2);
 CHROMA_SS_FILTERS_SSE4(_sse4);
 
+CHROMA_FILTERS_422(_sse4);
+CHROMA_SP_FILTERS_422(_sse2);
+CHROMA_SP_FILTERS_422_SSE4(_sse4);
+CHROMA_SS_FILTERS_422(_sse2);
+CHROMA_SS_FILTERS_422_SSE4(_sse4);
+
+CHROMA_FILTERS_444(_sse4);
+CHROMA_SP_FILTERS_444(_sse4);
+CHROMA_SS_FILTERS_444(_sse2);
+
 void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
 
 #undef SETUP_CHROMA_FUNC_DEF
@@ -308,6 +598,17 @@
 #undef CHROMA_SS_FILTERS
 #undef CHROMA_SS_FILTERS_SSE4
 #undef CHROMA_SP_FILTERS_SSE4
+
+#undef CHROMA_FILTERS_422
+#undef CHROMA_SP_FILTERS_422
+#undef CHROMA_SS_FILTERS_422
+#undef CHROMA_SS_FILTERS_422_SSE4
+#undef CHROMA_SP_FILTERS_422_SSE4
+
+#undef CHROMA_FILTERS_444
+#undef CHROMA_SP_FILTERS_444
+#undef CHROMA_SS_FILTERS_444
+
 #endif // if HIGH_BIT_DEPTH
 
 LUMA_FILTERS(_sse4);
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/mc-a.asm	Tue Aug 05 21:41:53 2014 +0900
@@ -154,6 +154,50 @@
 %endrep
     RET
 
+;-----------------------------------------------------------------------------
+INIT_XMM sse4                                  ; addAvg_2x16: average two 2-wide, 16-row blocks of 16-bit samples and clamp the result
+cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride    ; 6 args, 7 GPRs (r6 = loop counter), 8 XMM regs
+    mova        m6,         [pw_1023]          ; upper clamp bound (presumably 1023 in every word -- confirm constant)
+    mova        m7,         [pw_1024]          ; pmulhrsw multiplier (presumably 1024 per word, i.e. a rounded >> 5)
+    mov         r6d,        16/4               ; 4 rows are processed per iteration
+    add         r3,         r3                 ; double all three strides (presumably element -> byte units for 16-bit data)
+    add         r4,         r4
+    add         r5,         r5
+.loop:
+    movd        m1,         [r0]               ; src0 row 0 (one dword = 2 words)
+    movd        m2,         [r0 + r3]          ; src0 row 1
+    movd        m3,         [r1]               ; src1 row 0
+    movd        m4,         [r1 + r4]          ; src1 row 1
+    lea         r0,         [r0 + r3 * 2]      ; advance both sources by two rows
+    lea         r1,         [r1 + r4 * 2]
+    punpckldq   m1,         m2                 ; m1 = src0 rows 0,1
+    punpckldq   m3,         m4                 ; m3 = src1 rows 0,1
+    movd        m2,         [r0]               ; src0 row 2
+    movd        m4,         [r0 + r3]          ; src0 row 3
+    movd        m5,         [r1]               ; src1 row 2
+    movd        m0,         [r1 + r4]          ; src1 row 3 (m0 doubles as a temp here)
+    lea         r0,         [r0 + r3 * 2]
+    lea         r1,         [r1 + r4 * 2]
+    punpckldq   m2,         m4                 ; m2 = src0 rows 2,3
+    punpckldq   m5,         m0                 ; m5 = src1 rows 2,3
+    punpcklqdq  m1,         m2                 ; m1 = src0 rows 0..3
+    punpcklqdq  m3,         m5                 ; m3 = src1 rows 0..3
+    paddw       m1,         m3                 ; src0 + src1
+    pmulhrsw    m1,         m7                 ; rounded scale by m7
+    paddw       m1,         [pw_512]           ; add offset constant
+    pxor        m0,         m0                 ; re-zero m0 every iteration: it was clobbered as a load temp above
+    pmaxsw      m1,         m0                 ; clamp low at 0
+    pminsw      m1,         m6                 ; clamp high at m6
+    movd        [r2],       m1                 ; store row 0
+    pextrd      [r2 + r5],  m1, 1              ; store row 1
+    lea         r2,         [r2 + r5 * 2]
+    pextrd      [r2],       m1, 2              ; store row 2
+    pextrd      [r2 + r5],  m1, 3              ; store row 3
+    lea         r2,         [r2 + r5 * 2]
+    dec         r6d
+    jnz         .loop
+    RET
+;-----------------------------------------------------------------------------
 
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
@@ -181,7 +225,7 @@
     RET
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal addAvg_6x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
     mova        m4,             [pw_512]
     mova        m5,             [pw_1023]
     mova        m7,             [pw_1024]
@@ -220,6 +264,42 @@
     RET
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
+cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride    ; averages two 6-wide, 16-row blocks of 16-bit samples; r6 = loop counter
+    mova        m4,             [pw_512]       ; offset added after scaling
+    mova        m5,             [pw_1023]      ; upper clamp bound (presumably 1023 per word -- confirm constant)
+    mova        m7,             [pw_1024]      ; pmulhrsw multiplier (presumably 1024 per word, i.e. a rounded >> 5)
+    pxor        m6,             m6             ; zero = lower clamp bound
+    mov         r6d,            16/2           ; 2 rows are processed per iteration
+    add         r3,             r3             ; double all three strides (presumably element -> byte units for 16-bit data)
+    add         r4,             r4
+    add         r5,             r5
+.loop:
+    movu        m0,             [r0]           ; src0 row 0: loads 8 words, only the low 6 are stored
+    movu        m2,             [r1]           ; src1 row 0
+    movu        m1,             [r0 + r3]      ; src0 row 1
+    movu        m3,             [r1 + r4]      ; src1 row 1
+    dec         r6d                            ; sets ZF for the jnz below; lea and the SSE ops in between leave EFLAGS untouched
+    lea         r0,             [r0 + r3 * 2]  ; advance both sources by two rows
+    lea         r1,             [r1 + r4 * 2]
+    paddw       m0,             m2             ; src0 + src1
+    paddw       m1,             m3
+    pmulhrsw    m0,             m7             ; rounded scale by m7
+    pmulhrsw    m1,             m7
+    paddw       m0,             m4             ; add offset
+    paddw       m1,             m4
+    pmaxsw      m0,             m6             ; clamp low at 0
+    pmaxsw      m1,             m6
+    pminsw      m0,             m5             ; clamp high at m5
+    pminsw      m1,             m5
+    movh        [r2],           m0             ; row 0, words 0..3
+    pextrd      [r2 + 8],       m0, 2          ; row 0, words 4..5
+    movh        [r2 + r5],      m1             ; row 1, words 0..3
+    pextrd      [r2 + r5 + 8],  m1, 2          ; row 1, words 4..5
+    lea         r2,             [r2 + r5 * 2]
+    jnz         .loop                          ; consumes ZF from the dec above
+    RET
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
 cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
     mova        m4,          [pw_512]
     mova        m5,          [pw_1023]
@@ -335,6 +415,8 @@
 ADDAVG_W4_H4 8
 ADDAVG_W4_H4 16
 
+ADDAVG_W4_H4 32
+
 ;-----------------------------------------------------------------------------
 %macro ADDAVG_W8_H4 1
 INIT_XMM sse4
@@ -382,6 +464,9 @@
 ADDAVG_W8_H4 16
 ADDAVG_W8_H4 32
 
+ADDAVG_W8_H4 12
+ADDAVG_W8_H4 64
+
 ;-----------------------------------------------------------------------------
 %macro ADDAVG_W12_H4 1
 INIT_XMM sse4
@@ -442,6 +527,8 @@
 
 ADDAVG_W12_H4 16
 
+ADDAVG_W12_H4 32
+
 ;-----------------------------------------------------------------------------
 %macro ADDAVG_W16_H4 1
 INIT_XMM sse4
@@ -509,6 +596,8 @@
 ADDAVG_W16_H4 32
 ADDAVG_W16_H4 64
 
+ADDAVG_W16_H4 24
+
 ;-----------------------------------------------------------------------------
 %macro ADDAVG_W24_H2 2
 INIT_XMM sse4
@@ -589,6 +678,8 @@
 
 ADDAVG_W24_H2 24, 32
 
+ADDAVG_W24_H2 24, 64
+
 ;-----------------------------------------------------------------------------
 %macro ADDAVG_W32_H2 1
 INIT_XMM sse4
@@ -691,6 +782,8 @@
 ADDAVG_W32_H2 32
 ADDAVG_W32_H2 64
 
+ADDAVG_W32_H2 48
+
 ;-----------------------------------------------------------------------------
 %macro ADDAVG_W48_H2 1
 INIT_XMM sse4
@@ -1052,6 +1145,48 @@
 
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
+cglobal addAvg_2x16, 6,7,8, src0, src1, dst, src0Stride, src1Stride, dstStride    ; averages two 2-wide, 16-row int16 blocks into byte pixels; r6 = loop counter
+    mova        m0,         [pw_256]           ; pmulhrsw multiplier (presumably 256 per word, i.e. a rounded >> 7)
+    mova        m7,         [pw_128]           ; offset added after scaling
+    mov         r6d,        16/4               ; 4 rows are processed per iteration
+    add         r3,         r3                 ; only the two source strides are doubled (16-bit sources);
+    add         r4,         r4                 ; the destination stride r5 is used as-is (byte stores)
+.loop:
+    movd        m1,         [r0]               ; src0 row 0 (one dword = 2 words)
+    movd        m2,         [r0 + r3]          ; src0 row 1
+    movd        m3,         [r1]               ; src1 row 0
+    movd        m4,         [r1 + r4]          ; src1 row 1
+    lea         r0,         [r0 + r3 * 2]      ; advance both sources by two rows
+    lea         r1,         [r1 + r4 * 2]
+    punpckldq   m1,         m2                 ; m1 = src0 rows 0,1
+    punpckldq   m3,         m4                 ; m3 = src1 rows 0,1
+    movd        m2,         [r0]               ; src0 row 2
+    movd        m4,         [r0 + r3]          ; src0 row 3
+    movd        m5,         [r1]               ; src1 row 2
+    movd        m6,         [r1 + r4]          ; src1 row 3
+    lea         r0,         [r0 + r3 * 2]
+    lea         r1,         [r1 + r4 * 2]
+    punpckldq   m2,         m4                 ; m2 = src0 rows 2,3
+    punpckldq   m5,         m6                 ; m5 = src1 rows 2,3
+    punpcklqdq  m1,         m2                 ; m1 = src0 rows 0..3
+    punpcklqdq  m3,         m5                 ; m3 = src1 rows 0..3
+    paddw       m1,         m3                 ; src0 + src1
+    pmulhrsw    m1,         m0                 ; rounded scale by m0
+    paddw       m1,         m7                 ; add offset
+    packuswb    m1,         m1                 ; saturate words to unsigned bytes: word k now holds row k
+    pextrw      [r2],       m1, 0              ; store row 0 (2 bytes)
+    pextrw      [r2 + r5],  m1, 1              ; store row 1
+    lea         r2,         [r2 + r5 * 2]
+    pextrw      [r2],       m1, 2              ; store row 2
+    pextrw      [r2 + r5],  m1, 3              ; store row 3
+    lea         r2,         [r2 + r5 * 2]
+    dec         r6d
+    jnz         .loop
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
 cglobal addAvg_4x2, 6,6,4, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
 
     mova           m1,          [pw_256]
@@ -1132,6 +1267,9 @@
 ADDAVG_W4_H4 4
 ADDAVG_W4_H4 8
 ADDAVG_W4_H4 16
+
+ADDAVG_W4_H4 32
+
 ;-----------------------------------------------------------------------------
 
 ;-----------------------------------------------------------------------------
@@ -1232,6 +1370,39 @@
 
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
+cglobal addAvg_6x16, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride    ; averages two 6-wide, 16-row int16 blocks into byte pixels; r6 = loop counter
+    mova        m4,             [pw_256]       ; pmulhrsw multiplier (presumably 256 per word, i.e. a rounded >> 7)
+    mova        m5,             [pw_128]       ; offset added after scaling
+    mov         r6d,            16/2           ; 2 rows are processed per iteration
+    add         r3,             r3             ; only the two source strides are doubled (16-bit sources);
+    add         r4,             r4             ; the destination stride r5 is used as-is (byte stores)
+.loop:
+    movu        m0,             [r0]           ; src0 row 0: loads 8 words, only the low 6 are stored
+    movu        m2,             [r1]           ; src1 row 0
+    movu        m1,             [r0 + r3]      ; src0 row 1
+    movu        m3,             [r1 + r4]      ; src1 row 1
+    dec         r6d                            ; sets ZF for the jnz below; lea and the SSE ops in between leave EFLAGS untouched
+    lea         r0,             [r0 + r3 * 2]  ; advance both sources by two rows
+    lea         r1,             [r1 + r4 * 2]
+    paddw       m0,             m2             ; src0 + src1
+    paddw       m1,             m3
+    pmulhrsw    m0,             m4             ; rounded scale by m4
+    pmulhrsw    m1,             m4
+    paddw       m0,             m5             ; add offset
+    paddw       m1,             m5
+    packuswb    m0,             m0             ; saturate words to unsigned bytes
+    packuswb    m1,             m1
+    movd        [r2],           m0             ; row 0, pixels 0..3
+    pextrw      [r2 + 4],       m0, 2          ; row 0, pixels 4..5
+    movd        [r2 + r5],      m1             ; row 1, pixels 0..3
+    pextrw      [r2 + r5 + 4],  m1, 2          ; row 1, pixels 4..5
+    lea         r2,             [r2 + r5 * 2]
+    jnz         .loop                          ; consumes ZF from the dec above
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
 cglobal addAvg_8x2, 6,6,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
     mova        m4,          [pw_256]
     mova        m5,          [pw_128]
@@ -1392,6 +1563,9 @@
 ADDAVG_W8_H4 16
 ADDAVG_W8_H4 32
 
+ADDAVG_W8_H4 12
+ADDAVG_W8_H4 64
+
 ;-----------------------------------------------------------------------------
 
 
@@ -1485,6 +1659,8 @@
 
 ADDAVG_W12_H4 16
 
+ADDAVG_W12_H4 32
+
 ;-----------------------------------------------------------------------------
 
 
@@ -1580,6 +1756,8 @@
 ADDAVG_W16_H4 32
 ADDAVG_W16_H4 64
 
+ADDAVG_W16_H4 24
+
 ;-----------------------------------------------------------------------------
 
 
@@ -1654,6 +1832,8 @@
 
 ADDAVG_W24_H2 24, 32
 
+ADDAVG_W24_H2 24, 64
+
 ;-----------------------------------------------------------------------------
 
 ;-----------------------------------------------------------------------------
@@ -1743,6 +1923,8 @@
 ADDAVG_W32_H2 32
 ADDAVG_W32_H2 64
 
+ADDAVG_W32_H2 48
+
 ;-----------------------------------------------------------------------------
 
 
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/pixel-util.h	Tue Aug 05 21:41:53 2014 +0900
@@ -94,6 +94,32 @@
     SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 8, cpu); \
     SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 32, cpu);
 
+#define CHROMA_PIXELSUB_DEF_422(cpu) /* expands to one SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) per W x H size below; presumably the 4:2:2 chroma block sizes */ \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 8, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 8, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 16, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 8, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 16, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 12, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(6, 16, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 4, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 16, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 32, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 32, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 24, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(12, 32, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 8, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 32, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 64, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 48, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(24, 64, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 16, cpu); \
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 64, cpu);
+
 #define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \
     void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \
     void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1);
@@ -130,6 +156,9 @@
 CHROMA_PIXELSUB_DEF(_sse2);
 LUMA_PIXELSUB_DEF(_sse2);
 
+CHROMA_PIXELSUB_DEF_422(_sse4);
+CHROMA_PIXELSUB_DEF_422(_sse2);
+
 #define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \
     uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel * pix, intptr_t pixstride);
 
@@ -142,6 +171,7 @@
 LUMA_PIXELVAR_DEF(_sse2);
 
 #undef CHROMA_PIXELSUB_DEF
+#undef CHROMA_PIXELSUB_DEF_422
 #undef LUMA_PIXELSUB_DEF
 #undef LUMA_PIXELVAR_DEF
 #undef SETUP_CHROMA_PIXELSUB_PS_FUNC
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/pixel-util8.asm	Tue Aug 05 21:41:53 2014 +0900
@@ -2878,6 +2878,61 @@
 RET
 
 ;-----------------------------------------------------------------------------
+; void pixel_sub_ps_2x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1)
+;-----------------------------------------------------------------------------
+%macro PIXEL_SUB_PS_W2_H2 2
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1, srcStride0, srcStride1
+    add     r1,     r1                  ; destride *= 2 -> byte stride of the 16-bit dest (assumes strides are passed in elements)
+    add     r4,     r4                  ; srcStride0 *= 2: sources are read as 16-bit samples in this branch
+    add     r5,     r5                  ; srcStride1 *= 2
+    mov     r6d,    %2/2                ; loop count: two rows are handled per iteration
+.loop:
+    movd    m0,     [r2]                ; row 0 of src0: 4 bytes = two 16-bit samples
+    movd    m1,     [r3]                ; row 0 of the second source
+    movd    m2,     [r2 + r4]           ; row 1 of src0
+    movd    m3,     [r3 + r5]           ; row 1 of the second source
+    dec     r6d                         ; sets ZF for the jnz below; the lea/psubw/movd in between leave flags untouched
+    lea     r2,     [r2 + r4 * 2]       ; advance src0 by two rows
+    lea     r3,     [r3 + r5 * 2]       ; advance the second source by two rows
+    psubw   m0,     m1                  ; row 0: src0 - second source, per 16-bit lane
+    psubw   m2,     m3                  ; row 1: same subtraction
+    movd    [r0],       m0              ; store 4 bytes = two 16-bit differences (row 0)
+    movd    [r0 + r1],  m2              ; row 1
+    lea     r0,     [r0 + 2 * r1]       ; advance dest by two rows
+    jnz     .loop                       ; loop while the row-pair counter from dec is non-zero
+    RET
+%else ; !HIGH_BIT_DEPTH
+INIT_XMM sse4
+cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1, srcStride0, srcStride1
+    add         r1,     r1              ; destride *= 2 (16-bit dest); r4/r5 stay unscaled because sources are read as bytes here
+    mov         r6d,    %2/2            ; loop count: two rows are handled per iteration
+.loop:
+    pinsrw      m0,     [r2],       0   ; row 0 of src0: 2 bytes into word 0; the remaining lanes are never stored
+    pinsrw      m1,     [r3],       0   ; row 0 of the second source
+    pinsrw      m2,     [r2 + r4],  0   ; row 1 of src0
+    pinsrw      m3,     [r3 + r5],  0   ; row 1 of the second source
+    dec         r6d                     ; sets ZF for the jnz below; no later instruction in the loop body modifies flags
+    lea         r2,     [r2 + r4 * 2]   ; advance src0 by two rows
+    lea         r3,     [r3 + r5 * 2]   ; advance the second source by two rows
+    pmovzxbw    m0,     m0              ; zero-extend bytes to words (SSE4.1 instruction, hence INIT_XMM sse4)
+    pmovzxbw    m1,     m1
+    pmovzxbw    m2,     m2
+    pmovzxbw    m3,     m3
+    psubw       m0,     m1              ; row 0: src0 - second source, per 16-bit lane
+    psubw       m2,     m3              ; row 1: same subtraction
+    movd        [r0],       m0          ; store low 4 bytes = two 16-bit differences (row 0)
+    movd        [r0 + r1],  m2          ; row 1
+    lea         r0,     [r0 + r1 * 2]   ; advance dest by two rows
+    jnz         .loop                   ; loop while the row-pair counter from dec is non-zero
+    RET
+%endif
+%endmacro
+
+PIXEL_SUB_PS_W2_H2   2, 16
+
+;-----------------------------------------------------------------------------
 ; void pixel_sub_sp_c_4x2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
 %if HIGH_BIT_DEPTH
@@ -2991,11 +3046,17 @@
 PIXELSUB_PS_W4_H4 4, 4
 PIXELSUB_PS_W4_H4 4, 8
 PIXELSUB_PS_W4_H4 4, 16
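+; extra heights added with the 4:2:2 primitives; NOTE(review): 4x12 is not among the sizes listed in CHROMA_PIXELSUB_DEF_422 - confirm it is needed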
+PIXELSUB_PS_W4_H4 4, 12
+PIXELSUB_PS_W4_H4 4, 32
 %else
 INIT_XMM sse4
 PIXELSUB_PS_W4_H4 4, 4
 PIXELSUB_PS_W4_H4 4, 8
 PIXELSUB_PS_W4_H4 4, 16
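+; extra heights added with the 4:2:2 primitives; NOTE(review): 4x12 is not among the sizes listed in CHROMA_PIXELSUB_DEF_422 - confirm it is needed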
+PIXELSUB_PS_W4_H4 4, 12
+PIXELSUB_PS_W4_H4 4, 32
 %endif
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
@@ -3087,9 +3148,13 @@
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
 PIXELSUB_PS_W6_H4 6, 8
+;
+PIXELSUB_PS_W6_H4 6, 16
 %else
 INIT_XMM sse4
 PIXELSUB_PS_W6_H4 6, 8
+;
+PIXELSUB_PS_W6_H4 6, 16
 %endif
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_c_8x2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
@@ -3285,12 +3350,18 @@
 PIXELSUB_PS_W8_H4 8, 8
 PIXELSUB_PS_W8_H4 8, 16
 PIXELSUB_PS_W8_H4 8, 32
+;
+PIXELSUB_PS_W8_H4 8, 12
+PIXELSUB_PS_W8_H4 8, 64
 %else
 INIT_XMM sse4
 PIXELSUB_PS_W8_H4 8, 4
 PIXELSUB_PS_W8_H4 8, 8
 PIXELSUB_PS_W8_H4 8, 16
 PIXELSUB_PS_W8_H4 8, 32
+;
+PIXELSUB_PS_W8_H4 8, 12
+PIXELSUB_PS_W8_H4 8, 64
 %endif
 
 ;-----------------------------------------------------------------------------
@@ -3404,9 +3475,13 @@
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
 PIXELSUB_PS_W12_H4 12, 16
+;
+PIXELSUB_PS_W12_H4 12, 32
 %else
 INIT_XMM sse4
 PIXELSUB_PS_W12_H4 12, 16
+;
+PIXELSUB_PS_W12_H4 12, 32
 %endif
 
 ;-----------------------------------------------------------------------------
@@ -3529,6 +3604,8 @@
 PIXELSUB_PS_W16_H4 16, 16
 PIXELSUB_PS_W16_H4 16, 32
 PIXELSUB_PS_W16_H4 16, 64
+;
+PIXELSUB_PS_W16_H4 16, 24
 %else
 INIT_XMM sse4
 PIXELSUB_PS_W16_H4 16, 4
@@ -3537,6 +3614,8 @@
 PIXELSUB_PS_W16_H4 16, 16
 PIXELSUB_PS_W16_H4 16, 32
 PIXELSUB_PS_W16_H4 16, 64
+;
+PIXELSUB_PS_W16_H4 16, 24
 %endif
 
 ;-----------------------------------------------------------------------------
@@ -3632,9 +3711,13 @@
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
 PIXELSUB_PS_W24_H2 24, 32
+;
+PIXELSUB_PS_W24_H2 24, 64
 %else
 INIT_XMM sse4
 PIXELSUB_PS_W24_H2 24, 32
+;
+PIXELSUB_PS_W24_H2 24, 64
 %endif
 
 ;-----------------------------------------------------------------------------
@@ -3752,6 +3835,8 @@
 PIXELSUB_PS_W32_H2 32, 24
 PIXELSUB_PS_W32_H2 32, 32
 PIXELSUB_PS_W32_H2 32, 64
+;
+PIXELSUB_PS_W32_H2 32, 48
 %else
 INIT_XMM sse4
 PIXELSUB_PS_W32_H2 32, 8
@@ -3759,6 +3844,8 @@
 PIXELSUB_PS_W32_H2 32, 24
 PIXELSUB_PS_W32_H2 32, 32
 PIXELSUB_PS_W32_H2 32, 64
+;
+PIXELSUB_PS_W32_H2 32, 48
 %endif
 
 ;-----------------------------------------------------------------------------
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/pixel.h	Tue Aug 05 21:41:53 2014 +0900
@@ -206,6 +206,16 @@
 ADDAVG(addAvg_64x48)
 ADDAVG(addAvg_64x64)
 
+ADDAVG(addAvg_2x16)
+ADDAVG(addAvg_4x32)
+ADDAVG(addAvg_6x16)
+ADDAVG(addAvg_8x12)
+ADDAVG(addAvg_8x64)
+ADDAVG(addAvg_12x32)
+ADDAVG(addAvg_16x24)
+ADDAVG(addAvg_24x64)
+ADDAVG(addAvg_32x48)
+
 void x265_downShift_16_sse2(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 void x265_upShift_8_sse4(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift);
 
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/pixeladd8.asm	Tue Aug 05 21:41:53 2014 +0900
@@ -212,6 +212,8 @@
 
 PIXEL_ADD_PS_W2_H4   2, 8
 
+PIXEL_ADD_PS_W2_H4   2, 16
+
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_4x2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
@@ -359,6 +361,7 @@
 PIXEL_ADD_PS_W4_H4   4,  8
 PIXEL_ADD_PS_W4_H4   4, 16
 
+PIXEL_ADD_PS_W4_H4   4, 32
 
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
@@ -469,6 +472,8 @@
 
 PIXEL_ADD_PS_W6_H4 6,  8
 
+PIXEL_ADD_PS_W6_H4 6,  16
+
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_8x2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
@@ -736,6 +741,8 @@
 PIXEL_ADD_PS_W8_H4 8, 16
 PIXEL_ADD_PS_W8_H4 8, 32
 
+PIXEL_ADD_PS_W8_H4 8, 12
+PIXEL_ADD_PS_W8_H4 8, 64
 
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
@@ -886,6 +893,8 @@
 
 PIXEL_ADD_PS_W12_H4 12, 16
 
+PIXEL_ADD_PS_W12_H4 12, 32
+
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
@@ -1033,6 +1042,8 @@
 PIXEL_ADD_PS_W16_H4 16, 32
 PIXEL_ADD_PS_W16_H4 16, 64
 
+PIXEL_ADD_PS_W16_H4 16, 24
+
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
@@ -1138,6 +1149,8 @@
 
 PIXEL_ADD_PS_W24_H2 24, 32
 
+PIXEL_ADD_PS_W24_H2 24, 64
+
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
@@ -1265,6 +1278,8 @@
 PIXEL_ADD_PS_W32_H2 32, 32
 PIXEL_ADD_PS_W32_H2 32, 64
 
+PIXEL_ADD_PS_W32_H2 32, 48
+
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
diff -r 0d4723a0080c -r 770c40d768d5 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/test/ipfilterharness.cpp	Tue Aug 05 21:41:53 2014 +0900
@@ -171,7 +171,7 @@
         for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
         {
             rand_srcStride = rand() % 100 + 2;
-            rand_dstStride = rand() % 100 + 32;
+            rand_dstStride = rand() % 100 + 64;
 
             checked(opt, pixel_test_buff[index] + 3 * rand_srcStride,
                     rand_srcStride,
@@ -206,7 +206,7 @@
         for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
         {
             rand_srcStride = rand() % 100;
-            rand_dstStride = rand() % 100 + 32;
+            rand_dstStride = rand() % 100 + 64;
 
             ref(pixel_test_buff[index] + 3 * rand_srcStride,
                 rand_srcStride,
@@ -244,7 +244,7 @@
             for (int isRowExt = 0; isRowExt < 2; isRowExt++)
             {
                 rand_srcStride = rand() % 100 + 2;
-                rand_dstStride = rand() % 100;
+                rand_dstStride = rand() % 100 + 64;
 
                 ref(pixel_test_buff[index] + 3 * rand_srcStride,
                     rand_srcStride,
@@ -282,7 +282,7 @@
         for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
         {
             rand_srcStride = rand() % 100;
-            rand_dstStride = rand() % 100 + 32;
+            rand_dstStride = rand() % 100 + 64;
 
             ref(short_test_buff[index] + 3 * rand_srcStride,
                 rand_srcStride,
@@ -317,7 +317,7 @@
         for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
         {
             rand_srcStride = rand() % 100;
-            rand_dstStride = rand() % 100 + 32;
+            rand_dstStride = rand() % 100 + 64;
 
             ref(short_test_buff[index] + 3 * rand_srcStride,
                 rand_srcStride,
@@ -535,7 +535,7 @@
             for (int coeffIdxY = 0; coeffIdxY < 4; coeffIdxY++)
             {
                 rand_srcStride = rand() % 100;
-                rand_dstStride = rand() % 100;
+                rand_dstStride = rand() % 100 + 64;
 
                 ref(pixel_test_buff[index] + 3 * rand_srcStride,
                     rand_srcStride,
@@ -650,7 +650,7 @@
             {
                 if (!check_IPFilterChroma_primitive(ref.chroma[csp].filter_hpp[value], opt.chroma[csp].filter_hpp[value]))
                 {
-                    printf("chroma_hpp[%s]", chromaPartStr[value]);
+                    printf("chroma_hpp[%s]", chromaPartStr[csp][value]);
                     return false;
                 }
             }
@@ -658,7 +658,7 @@
             {
                 if (!check_IPFilterChroma_hps_primitive(ref.chroma[csp].filter_hps[value], opt.chroma[csp].filter_hps[value]))
                 {
-                    printf("chroma_hps[%s]", chromaPartStr[value]);
+                    printf("chroma_hps[%s]", chromaPartStr[csp][value]);
                     return false;
                 }
             }
@@ -666,7 +666,7 @@
             {
                 if (!check_IPFilterChroma_primitive(ref.chroma[csp].filter_vpp[value], opt.chroma[csp].filter_vpp[value]))
                 {
-                    printf("chroma_vpp[%s]", chromaPartStr[value]);
+                    printf("chroma_vpp[%s]", chromaPartStr[csp][value]);
                     return false;
                 }
             }
@@ -674,7 +674,7 @@
             {
                 if (!check_IPFilterChroma_ps_primitive(ref.chroma[csp].filter_vps[value], opt.chroma[csp].filter_vps[value]))
                 {
-                    printf("chroma_vps[%s]", chromaPartStr[value]);
+                    printf("chroma_vps[%s]", chromaPartStr[csp][value]);
                     return false;
                 }
             }
@@ -682,7 +682,7 @@
             {
                 if (!check_IPFilterChroma_sp_primitive(ref.chroma[csp].filter_vsp[value], opt.chroma[csp].filter_vsp[value]))
                 {
-                    printf("chroma_vsp[%s]", chromaPartStr[value]);
+                    printf("chroma_vsp[%s]", chromaPartStr[csp][value]);
                     return false;
                 }
             }
@@ -690,7 +690,7 @@
             {
                 if (!check_IPFilterChroma_ss_primitive(ref.chroma[csp].filter_vss[value], opt.chroma[csp].filter_vss[value]))
                 {
-                    printf("chroma_vss[%s]", chromaPartStr[value]);
+                    printf("chroma_vss[%s]", chromaPartStr[csp][value]);
                     return false;
                 }
             }
@@ -785,40 +785,40 @@
         {
             if (opt.chroma[csp].filter_hpp[value])
             {
-                printf("chroma_hpp[%s]", chromaPartStr[value]);
+                printf("chroma_hpp[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].filter_hpp[value], ref.chroma[csp].filter_hpp[value],
                                pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
             }
             if (opt.chroma[csp].filter_hps[value])
             {
-                printf("chroma_hps[%s]", chromaPartStr[value]);
+                printf("chroma_hps[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].filter_hps[value], ref.chroma[csp].filter_hps[value],
                                pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1);
             }
             if (opt.chroma[csp].filter_vpp[value])
             {
-                printf("chroma_vpp[%s]", chromaPartStr[value]);
+                printf("chroma_vpp[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].filter_vpp[value], ref.chroma[csp].filter_vpp[value],
                                pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
                                IPF_vec_output_p, dstStride, 1);
             }
             if (opt.chroma[csp].filter_vps[value])
             {
-                printf("chroma_vps[%s]", chromaPartStr[value]);
+                printf("chroma_vps[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].filter_vps[value], ref.chroma[csp].filter_vps[value],
                                pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
                                IPF_vec_output_s, dstStride, 1);
             }
             if (opt.chroma[csp].filter_vsp[value])
             {
-                printf("chroma_vsp[%s]", chromaPartStr[value]);
+                printf("chroma_vsp[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].filter_vsp[value], ref.chroma[csp].filter_vsp[value],
                                short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
                                IPF_vec_output_p, dstStride, 1);
             }
             if (opt.chroma[csp].filter_vss[value])
             {
-                printf("chroma_vss[%s]", chromaPartStr[value]);
+                printf("chroma_vss[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].filter_vss[value], ref.chroma[csp].filter_vss[value],
                                short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
                                IPF_vec_output_s, dstStride, 1);
diff -r 0d4723a0080c -r 770c40d768d5 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/test/pixelharness.cpp	Tue Aug 05 21:41:53 2014 +0900
@@ -1250,7 +1250,7 @@
         {
             if (!check_copy_pp(ref.chroma[i].copy_pp[part], opt.chroma[i].copy_pp[part]))
             {
-                printf("chroma_copy_pp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+                printf("chroma_copy_pp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
                 return false;
             }
         }
@@ -1258,7 +1258,7 @@
         {
             if (!check_copy_sp(ref.chroma[i].copy_sp[part], opt.chroma[i].copy_sp[part]))
             {
-                printf("chroma_copy_sp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+                printf("chroma_copy_sp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
                 return false;
             }
         }
@@ -1266,7 +1266,7 @@
         {
             if (!check_copy_ps(ref.chroma[i].copy_ps[part], opt.chroma[i].copy_ps[part]))
             {
-                printf("chroma_copy_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+                printf("chroma_copy_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
                 return false;
             }
         }
@@ -1274,7 +1274,7 @@
         {
             if (!check_copy_ss(ref.chroma[i].copy_ss[part], opt.chroma[i].copy_ss[part]))
             {
-                printf("chroma_copy_ss[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+                printf("chroma_copy_ss[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
                 return false;
             }
         }
@@ -1282,7 +1282,7 @@
         {
             if (!check_pixel_sub_ps(ref.chroma[i].sub_ps[part], opt.chroma[i].sub_ps[part]))
             {
-                printf("chroma_sub_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+                printf("chroma_sub_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
                 return false;
             }
         }
@@ -1290,7 +1290,7 @@
         {
             if (!check_pixel_add_ps(ref.chroma[i].add_ps[part], opt.chroma[i].add_ps[part]))
             {
-                printf("chroma_add_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+                printf("chroma_add_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
                 return false;
             }
         }
@@ -1298,7 +1298,7 @@
         {
             if (!check_addAvg(ref.chroma[i].addAvg[part], opt.chroma[i].addAvg[part]))
             {
-                printf("chroma_addAvg[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+                printf("chroma_addAvg[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
                 return false;
             }
         }
@@ -1651,37 +1651,37 @@
     {
         if (opt.chroma[i].copy_pp[part])
         {
-            HEADER("[%s] copy_pp[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+            HEADER("[%s] copy_pp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
             REPORT_SPEEDUP(opt.chroma[i].copy_pp[part], ref.chroma[i].copy_pp[part], pbuf1, 64, pbuf2, 128);
         }
         if (opt.chroma[i].copy_sp[part])
         {
-            HEADER("[%s] copy_sp[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+            HEADER("[%s] copy_sp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
             REPORT_SPEEDUP(opt.chroma[i].copy_sp[part], ref.chroma[i].copy_sp[part], pbuf1, 64, sbuf3, 128);
         }
         if (opt.chroma[i].copy_ps[part])
         {
-            HEADER("[%s] copy_ps[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+            HEADER("[%s] copy_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
             REPORT_SPEEDUP(opt.chroma[i].copy_ps[part], ref.chroma[i].copy_ps[part], sbuf1, 64, pbuf1, 128);
         }
         if (opt.chroma[i].copy_ss[part])
         {
-            HEADER("[%s] copy_ss[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+            HEADER("[%s] copy_ss[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
             REPORT_SPEEDUP(opt.chroma[i].copy_ss[part], ref.chroma[i].copy_ss[part], sbuf1, 64, sbuf2, 128);
         }
         if (opt.chroma[i].sub_ps[part])
         {
-            HEADER("[%s]  sub_ps[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+            HEADER("[%s]  sub_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
             REPORT_SPEEDUP(opt.chroma[i].sub_ps[part], ref.chroma[i].sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
         }
         if (opt.chroma[i].add_ps[part])
         {
-            HEADER("[%s]  add_ps[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+            HEADER("[%s]  add_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
             REPORT_SPEEDUP(opt.chroma[i].add_ps[part], ref.chroma[i].add_ps[part], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
         }
         if (opt.chroma[i].addAvg[part])
         {
-            HEADER("[%s]  addAvg[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+            HEADER("[%s]  addAvg[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
             REPORT_SPEEDUP(opt.chroma[i].addAvg[part], ref.chroma[i].addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
         }
     }
diff -r 0d4723a0080c -r 770c40d768d5 source/test/testbench.cpp
--- a/source/test/testbench.cpp	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/test/testbench.cpp	Tue Aug 05 21:41:53 2014 +0900
@@ -36,20 +36,46 @@
 
 const char* lumaPartStr[NUM_LUMA_PARTITIONS] =
 {
-    "  4x4",
-    "  8x8", "  8x4", "  4x8",
-    "16x16", " 16x8", " 8x16", "16x12", "12x16", " 16x4", " 4x16",
-    "32x32", "32x16", "16x32", "32x24", "24x32", " 32x8", " 8x32",
-    "64x64", "64x32", "32x64", "64x48", "48x64", "64x16", "16x64",
+    "  4x4", "  8x8", "16x16", "32x32", "64x64", // square sizes first (presumably mirrors the luma partition enum order - confirm against primitives.h)
+    "  8x4", "  4x8",                            // halves of 8x8 (WxH/2, W/2xH)
+    " 16x8", " 8x16",                            // halves of 16x16
+    "32x16", "16x32",                            // halves of 32x32
+    "64x32", "32x64",                            // halves of 64x64
+    "16x12", "12x16", " 16x4", " 4x16",          // 3/4 and 1/4 splits of 16x16
+    "32x24", "24x32", " 32x8", " 8x32",          // 3/4 and 1/4 splits of 32x32
+    "64x48", "48x64", "64x16", "16x64",          // 3/4 and 1/4 splits of 64x64
 };
 
-const char* chromaPartStr[NUM_CHROMA_PARTITIONS] =
+const char* chromaPartStr420[NUM_CHROMA_PARTITIONS] =
 {
-    "  2x2", // never used by HEVC
-    "  4x4", "  4x2", "  2x4",
-    "  8x8", "  8x4", "  4x8", "  8x6", "  6x8", "  8x2", "  2x8",
-    "16x16", " 16x8", " 8x16", "16x12", "12x16", " 16x4", " 4x16",
-    "32x32", "32x16", "16x32", "32x24", "24x32", " 32x8", " 8x32",
+    "  2x2", "  4x4", "  8x8", "16x16", "32x32", // each entry is the lumaPartStr entry at the same index with width and height halved; 2x2 is never used by HEVC
+    "  4x2", "  2x4",                            // halves of 4x4
+    "  8x4", "  4x8",                            // halves of 8x8
+    " 16x8", " 8x16",                            // halves of 16x16
+    "32x16", "16x32",                            // halves of 32x32
+    "  8x6", "  6x8", "  8x2", "  2x8",          // 3/4 and 1/4 splits of 8x8
+    "16x12", "12x16", " 16x4", " 4x16",          // 3/4 and 1/4 splits of 16x16
+    "32x24", "24x32", " 32x8", " 8x32",          // 3/4 and 1/4 splits of 32x32
+};
+
+const char* chromaPartStr422[NUM_CHROMA_PARTITIONS] =
+{
+    "  2x4", "  4x8", " 8x16", "16x32", "32x64", // each entry is the lumaPartStr entry at the same index with only the width halved; order begins like enum Chroma422Partitions (CHROMA422_2x4, CHROMA422_4x8, ...)
+    "  4x4", "  2x8",                            // from luma 8x4, 4x8
+    "  8x8", " 4x16",                            // from luma 16x8, 8x16
+    "16x16", " 8x32",                            // from luma 32x16, 16x32
+    "32x32", "16x64",                            // from luma 64x32, 32x64
+    " 8x12", " 6x16", "  8x4", " 2x16",          // from luma 16x12, 12x16, 16x4, 4x16
+    "16x24", "12x32", " 16x8", " 4x32",          // from luma 32x24, 24x32, 32x8, 8x32
+    "32x48", "24x64", "32x16", " 8x64",          // from luma 64x48, 48x64, 64x16, 16x64
+};
+
+const char* const* chromaPartStr[X265_CSP_COUNT] = // partition-name table per colour space; the harnesses index it as chromaPartStr[csp][partition]
+{
+    lumaPartStr,      // slot 0 - presumably X265_CSP_I400; reuses the luma names (TODO confirm enum value)
+    chromaPartStr420, // slot 1 - presumably X265_CSP_I420 (TODO confirm enum value)
+    chromaPartStr422, // slot 2 - presumably X265_CSP_I422 (TODO confirm enum value)
+    lumaPartStr       // slot 3 - presumably X265_CSP_I444, whose chroma primitives are registered under LUMA_WxH indices
 };
 
 void do_help()
diff -r 0d4723a0080c -r 770c40d768d5 source/test/testharness.h
--- a/source/test/testharness.h	Tue Aug 05 01:05:47 2014 -0500
+++ b/source/test/testharness.h	Tue Aug 05 21:41:53 2014 +0900
@@ -40,7 +40,7 @@
 using namespace x265;
 
 extern const char* lumaPartStr[NUM_LUMA_PARTITIONS];
-extern const char* chromaPartStr[NUM_CHROMA_PARTITIONS];
+extern const char* const* chromaPartStr[X265_CSP_COUNT];
 
 class TestHarness
 {


More information about the x265-devel mailing list