[x265] primitives for RExt
Satoshi Nakagawa
nakagawa424 at oki.com
Tue Aug 5 14:48:50 CEST 2014
# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1407242513 -32400
# Tue Aug 05 21:41:53 2014 +0900
# Node ID 770c40d768d55e68e76c485d5dc61d014257e789
# Parent 0d4723a0080cff763ff20ab9c516c6e082496a0b
primitives for RExt
diff -r 0d4723a0080c -r 770c40d768d5 source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/pixel.cpp Tue Aug 05 21:41:53 2014 +0900
@@ -1015,13 +1015,6 @@
p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
-#define CHROMA_422_X(W, H) \
- p.chroma[X265_CSP_I422].addAvg[CHROMA422X_ ## W ## x ## H] = addAvg<W, H>; \
- p.chroma[X265_CSP_I422].copy_pp[CHROMA422X_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
- p.chroma[X265_CSP_I422].copy_sp[CHROMA422X_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
- p.chroma[X265_CSP_I422].copy_ps[CHROMA422X_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
- p.chroma[X265_CSP_I422].copy_ss[CHROMA422X_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
-
#define CHROMA_444(W, H) \
p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
@@ -1090,7 +1083,6 @@
LUMA(16, 64);
CHROMA_420(8, 32);
- CHROMA_422_X(4, 8);
CHROMA_422(4, 8);
CHROMA_422(4, 4);
CHROMA_422(2, 8);
diff -r 0d4723a0080c -r 770c40d768d5 source/common/primitives.h
--- a/source/common/primitives.h Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/primitives.h Tue Aug 05 21:41:53 2014 +0900
@@ -69,7 +69,7 @@
enum Chroma422Partitions
{
- CHROMA422X_4x8, CHROMA422_4x8, CHROMA422_8x16, CHROMA422_16x32, CHROMA422_32x64,
+ CHROMA422_2x4, CHROMA422_4x8, CHROMA422_8x16, CHROMA422_16x32, CHROMA422_32x64,
CHROMA422_4x4, CHROMA422_2x8,
CHROMA422_8x8, CHROMA422_4x16,
CHROMA422_16x16, CHROMA422_8x32,
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp Tue Aug 05 21:41:53 2014 +0900
@@ -250,6 +250,12 @@
p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
+#define SETUP_CHROMA_FUNC_DEF_422(W, H, cpu) /* X265_CSP_I422: store the WxH 4-tap horiz/vert pp and ps routines (symbol suffix 'cpu') in slot CHROMA422_WxH */ \
+ p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; // routine symbols are selected by W, H and cpu only; they carry no colour-space tag
+
#define SETUP_CHROMA_FUNC_DEF_444(W, H, cpu) \
p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
@@ -259,12 +265,18 @@
#define SETUP_CHROMA_SP_FUNC_DEF_420(W, H, cpu) \
p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+#define SETUP_CHROMA_SP_FUNC_DEF_422(W, H, cpu) /* X265_CSP_I422: point filter_vsp[CHROMA422_WxH] at x265_interp_4tap_vert_sp_WxH<cpu> */ \
+ p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
#define SETUP_CHROMA_SP_FUNC_DEF_444(W, H, cpu) \
p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
#define SETUP_CHROMA_SS_FUNC_DEF_420(W, H, cpu) \
p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
+#define SETUP_CHROMA_SS_FUNC_DEF_422(W, H, cpu) /* X265_CSP_I422: point filter_vss[CHROMA422_WxH] at x265_interp_4tap_vert_ss_WxH<cpu> */ \
+ p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
+
#define SETUP_CHROMA_SS_FUNC_DEF_444(W, H, cpu) \
p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
@@ -294,8 +306,33 @@
SETUP_CHROMA_FUNC_DEF_420(32, 8, cpu); \
SETUP_CHROMA_FUNC_DEF_420(8, 32, cpu);
+#define CHROMA_FILTERS_422(cpu) /* expands SETUP_CHROMA_FUNC_DEF_422 (hpp/hps/vpp/vps) for the 24 block sizes listed below */ \
+ SETUP_CHROMA_FUNC_DEF_422(4, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(4, 4, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(2, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(8, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(8, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(4, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(8, 12, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(6, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(8, 4, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(2, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(16, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(16, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(8, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(16, 24, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(12, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(16, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(4, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(32, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(32, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(16, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(32, 48, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(24, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(32, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF_422(8, 64, cpu);
+
#define CHROMA_FILTERS_444(cpu) \
- SETUP_CHROMA_FUNC_DEF_444(4, 4, cpu); \
SETUP_CHROMA_FUNC_DEF_444(8, 8, cpu); \
SETUP_CHROMA_FUNC_DEF_444(8, 4, cpu); \
SETUP_CHROMA_FUNC_DEF_444(4, 8, cpu); \
@@ -312,12 +349,22 @@
SETUP_CHROMA_FUNC_DEF_444(32, 24, cpu); \
SETUP_CHROMA_FUNC_DEF_444(24, 32, cpu); \
SETUP_CHROMA_FUNC_DEF_444(32, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF_444(8, 32, cpu);
+ SETUP_CHROMA_FUNC_DEF_444(8, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF_444(64, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF_444(64, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF_444(32, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF_444(64, 48, cpu); \
+ SETUP_CHROMA_FUNC_DEF_444(48, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF_444(64, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF_444(16, 64, cpu);
#define CHROMA_SP_FILTERS_SSE4_420(cpu) \
SETUP_CHROMA_SP_FUNC_DEF_420(4, 4, cpu); \
SETUP_CHROMA_SP_FUNC_DEF_420(4, 2, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_420(2, 4, cpu); \
SETUP_CHROMA_SP_FUNC_DEF_420(4, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_420(6, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_420(2, 8, cpu); \
SETUP_CHROMA_SP_FUNC_DEF_420(16, 16, cpu); \
SETUP_CHROMA_SP_FUNC_DEF_420(16, 8, cpu); \
SETUP_CHROMA_SP_FUNC_DEF_420(16, 12, cpu); \
@@ -339,8 +386,35 @@
SETUP_CHROMA_SP_FUNC_DEF_420(8, 16, cpu); \
SETUP_CHROMA_SP_FUNC_DEF_420(8, 32, cpu);
+#define CHROMA_SP_FILTERS_SSE4_422(cpu) /* filter_vsp for the 18 I422 sizes whose width is not 8; the width-8 sizes are listed in CHROMA_SP_FILTERS_422 */ \
+ SETUP_CHROMA_SP_FUNC_DEF_422(4, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(4, 4, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(2, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(4, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(6, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(2, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(16, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(16, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(16, 24, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(12, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(16, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(4, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(32, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(32, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(16, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(32, 48, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(24, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(32, 16, cpu);
+
+#define CHROMA_SP_FILTERS_422(cpu) /* filter_vsp for the six width-8 I422 sizes (8x4 through 8x64) */ \
+ SETUP_CHROMA_SP_FUNC_DEF_422(8, 4, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(8, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(8, 12, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(8, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(8, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_422(8, 64, cpu);
+
#define CHROMA_SP_FILTERS_SSE4_444(cpu) \
- SETUP_CHROMA_SP_FUNC_DEF_444(4, 4, cpu); \
SETUP_CHROMA_SP_FUNC_DEF_444(4, 8, cpu); \
SETUP_CHROMA_SP_FUNC_DEF_444(16, 16, cpu); \
SETUP_CHROMA_SP_FUNC_DEF_444(16, 8, cpu); \
@@ -353,7 +427,14 @@
SETUP_CHROMA_SP_FUNC_DEF_444(16, 32, cpu); \
SETUP_CHROMA_SP_FUNC_DEF_444(32, 24, cpu); \
SETUP_CHROMA_SP_FUNC_DEF_444(24, 32, cpu); \
- SETUP_CHROMA_SP_FUNC_DEF_444(32, 8, cpu);
+ SETUP_CHROMA_SP_FUNC_DEF_444(32, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_444(64, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_444(64, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_444(32, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_444(64, 48, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_444(48, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_444(64, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF_444(16, 64, cpu);
#define CHROMA_SP_FILTERS_444(cpu) \
SETUP_CHROMA_SP_FUNC_DEF_444(8, 8, cpu); \
@@ -389,8 +470,35 @@
SETUP_CHROMA_SS_FUNC_DEF_420(2, 8, cpu); \
SETUP_CHROMA_SS_FUNC_DEF_420(6, 8, cpu);
+#define CHROMA_SS_FILTERS_422(cpu) /* filter_vss for 21 I422 sizes; 2x8, 2x16 and 6x16 are listed separately in CHROMA_SS_FILTERS_SSE4_422 */ \
+ SETUP_CHROMA_SS_FUNC_DEF_422(4, 8, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(4, 4, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(8, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(8, 8, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(4, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(8, 12, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(8, 4, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(16, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(16, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(8, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(16, 24, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(12, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(16, 8, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(4, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(32, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(32, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(16, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(32, 48, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(24, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(32, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(8, 64, cpu);
+
+#define CHROMA_SS_FILTERS_SSE4_422(cpu) /* filter_vss for the three I422 sizes absent from CHROMA_SS_FILTERS_422: 2x8, 2x16, 6x16 */ \
+ SETUP_CHROMA_SS_FUNC_DEF_422(2, 8, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(2, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_422(6, 16, cpu);
+
#define CHROMA_SS_FILTERS_444(cpu) \
- SETUP_CHROMA_SS_FUNC_DEF_444(4, 4, cpu); \
SETUP_CHROMA_SS_FUNC_DEF_444(8, 8, cpu); \
SETUP_CHROMA_SS_FUNC_DEF_444(8, 4, cpu); \
SETUP_CHROMA_SS_FUNC_DEF_444(4, 8, cpu); \
@@ -407,7 +515,14 @@
SETUP_CHROMA_SS_FUNC_DEF_444(32, 24, cpu); \
SETUP_CHROMA_SS_FUNC_DEF_444(24, 32, cpu); \
SETUP_CHROMA_SS_FUNC_DEF_444(32, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF_444(8, 32, cpu);
+ SETUP_CHROMA_SS_FUNC_DEF_444(8, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_444(64, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_444(64, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_444(32, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_444(64, 48, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_444(48, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_444(64, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF_444(16, 64, cpu)
#if HIGH_BIT_DEPTH // temporary, until all 10bit functions are completed
#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
@@ -466,6 +581,35 @@
SETUP_CHROMA_BLOCKCOPY(type, 32, 24, cpu); \
SETUP_CHROMA_BLOCKCOPY(type, 32, 32, cpu);
+#define SETUP_CHROMA_BLOCKCOPY_422(type, W, H, cpu) /* X265_CSP_I422: copy_<type>[CHROMA422_WxH] = x265_blockcopy_<type>_WxH<cpu>; 'type' is pasted into both the member name and the symbol */ \
+ p.chroma[X265_CSP_I422].copy_ ## type[CHROMA422_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
+
+#define CHROMA_BLOCKCOPY_422(type, cpu) /* expands SETUP_CHROMA_BLOCKCOPY_422 for the 24 I422 block sizes listed below, ordered by width then height */ \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 2, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 2, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 4, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 4, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 4, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 4, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 6, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 8, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 8, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 8, 12, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 8, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 8, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 8, 64, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 12, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 16, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 16, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 16, 24, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 16, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 16, 64, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 24, 64, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 32, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 32, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 32, 48, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_422(type, 32, 64, cpu);
+
#define LUMA_BLOCKCOPY(type, cpu) \
SETUP_LUMA_BLOCKCOPY(type, 4, 4, cpu); \
SETUP_LUMA_BLOCKCOPY(type, 8, 8, cpu); \
@@ -497,10 +641,13 @@
p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
#define CHROMA_BLOCKCOPY_SP(cpu) \
+ SETUP_CHROMA_BLOCKCOPY_SP(2, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(2, 8, cpu); \
SETUP_CHROMA_BLOCKCOPY_SP(4, 2, cpu); \
SETUP_CHROMA_BLOCKCOPY_SP(4, 4, cpu); \
SETUP_CHROMA_BLOCKCOPY_SP(4, 8, cpu); \
SETUP_CHROMA_BLOCKCOPY_SP(4, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(6, 8, cpu); \
SETUP_CHROMA_BLOCKCOPY_SP(8, 2, cpu); \
SETUP_CHROMA_BLOCKCOPY_SP(8, 4, cpu); \
SETUP_CHROMA_BLOCKCOPY_SP(8, 6, cpu); \
@@ -519,35 +666,94 @@
SETUP_CHROMA_BLOCKCOPY_SP(32, 24, cpu); \
SETUP_CHROMA_BLOCKCOPY_SP(32, 32, cpu);
-#define SETUP_CHROMA_LUMA(W1, H1, W2, H2, cpu) \
- p.chroma[X265_CSP_I420].sub_ps[LUMA_ ## W1 ## x ## H1] = x265_pixel_sub_ps_ ## W2 ## x ## H2 ## cpu; \
- p.chroma[X265_CSP_I420].add_ps[LUMA_ ## W1 ## x ## H1] = x265_pixel_add_ps_ ## W2 ## x ## H2 ## cpu;
+#define SETUP_CHROMA_BLOCKCOPY_SP_422(W, H, cpu) /* X265_CSP_I422: copy_sp[CHROMA422_WxH] = x265_blockcopy_sp_WxH<cpu> */ \
+ p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_BLOCKCOPY_SP_422(cpu) /* expands SETUP_CHROMA_BLOCKCOPY_SP_422 for the 24 I422 block sizes listed below, ordered by width then height */ \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(2, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(2, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(4, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(4, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(4, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(4, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(6, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(8, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(8, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(8, 12, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(8, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(8, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(8, 64, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(12, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(16, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(16, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(16, 24, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(16, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(16, 64, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(24, 64, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(32, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(32, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(32, 48, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP_422(32, 64, cpu);
+
+#define SETUP_CHROMA_PIXELSUB(W, H, cpu) /* X265_CSP_I420: despite the name, sets BOTH sub_ps and add_ps for slot CHROMA_WxH, using the routine of the same WxH */ \
+ p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
#define CHROMA_PIXELSUB_PS(cpu) \
- SETUP_CHROMA_LUMA(8, 8, 4, 4, cpu); \
- SETUP_CHROMA_LUMA(8, 4, 4, 2, cpu); \
- SETUP_CHROMA_LUMA(4, 8, 2, 4, cpu); \
- SETUP_CHROMA_LUMA(16, 16, 8, 8, cpu); \
- SETUP_CHROMA_LUMA(16, 8, 8, 4, cpu); \
- SETUP_CHROMA_LUMA(8, 16, 4, 8, cpu); \
- SETUP_CHROMA_LUMA(16, 12, 8, 6, cpu); \
- SETUP_CHROMA_LUMA(12, 16, 6, 8, cpu); \
- SETUP_CHROMA_LUMA(16, 4, 8, 2, cpu); \
- SETUP_CHROMA_LUMA(4, 16, 2, 8, cpu); \
- SETUP_CHROMA_LUMA(32, 32, 16, 16, cpu); \
- SETUP_CHROMA_LUMA(32, 16, 16, 8, cpu); \
- SETUP_CHROMA_LUMA(16, 32, 8, 16, cpu); \
- SETUP_CHROMA_LUMA(32, 24, 16, 12, cpu); \
- SETUP_CHROMA_LUMA(24, 32, 12, 16, cpu); \
- SETUP_CHROMA_LUMA(32, 8, 16, 4, cpu); \
- SETUP_CHROMA_LUMA(8, 32, 4, 16, cpu); \
- SETUP_CHROMA_LUMA(64, 64, 32, 32, cpu); \
- SETUP_CHROMA_LUMA(64, 32, 32, 16, cpu); \
- SETUP_CHROMA_LUMA(32, 64, 16, 32, cpu); \
- SETUP_CHROMA_LUMA(64, 48, 32, 24, cpu); \
- SETUP_CHROMA_LUMA(48, 64, 24, 32, cpu); \
- SETUP_CHROMA_LUMA(64, 16, 32, 8, cpu); \
- SETUP_CHROMA_LUMA(16, 64, 8, 32, cpu);
+ SETUP_CHROMA_PIXELSUB(4, 4, cpu); \
+ SETUP_CHROMA_PIXELSUB(4, 2, cpu); \
+ SETUP_CHROMA_PIXELSUB(2, 4, cpu); \
+ SETUP_CHROMA_PIXELSUB(8, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB(8, 4, cpu); \
+ SETUP_CHROMA_PIXELSUB(4, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB(8, 6, cpu); \
+ SETUP_CHROMA_PIXELSUB(6, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB(8, 2, cpu); \
+ SETUP_CHROMA_PIXELSUB(2, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB(16, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB(16, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB(8, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB(16, 12, cpu); \
+ SETUP_CHROMA_PIXELSUB(12, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB(16, 4, cpu); \
+ SETUP_CHROMA_PIXELSUB(4, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB(32, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB(32, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB(16, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB(32, 24, cpu); \
+ SETUP_CHROMA_PIXELSUB(24, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB(32, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB(8, 32, cpu);
+
+#define SETUP_CHROMA_PIXELSUB_422(W, H, cpu) /* X265_CSP_I422: sets both sub_ps and add_ps for slot CHROMA422_WxH to x265_pixel_{sub,add}_ps_WxH<cpu> */ \
+ p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
+
+#define CHROMA_PIXELSUB_PS_422(cpu) /* expands SETUP_CHROMA_PIXELSUB_422 (sub_ps and add_ps) for the 24 I422 block sizes listed below */ \
+ SETUP_CHROMA_PIXELSUB_422(4, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(4, 4, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(2, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(8, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(8, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(4, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(8, 12, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(6, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(8, 4, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(2, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(16, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(16, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(8, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(16, 24, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(12, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(16, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(4, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(32, 64, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(32, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(16, 64, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(32, 48, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(24, 64, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(32, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_422(8, 64, cpu);
#define LUMA_FILTERS(cpu) \
SETUP_LUMA_FUNC_DEF(4, 4, cpu); \
@@ -753,7 +959,36 @@
SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 8, cpu); \
SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
- SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
+
+#define SETUP_CHROMA_ADDAVG_FUNC_DEF_422(W, H, cpu) /* X265_CSP_I422: addAvg[CHROMA422_WxH] = x265_addAvg_WxH<cpu> */ \
+ p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
+
+#define CHROMA_ADDAVG_422(cpu) /* expands SETUP_CHROMA_ADDAVG_FUNC_DEF_422 for the 24 I422 block sizes listed below, ordered by width then height */ \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 4, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(6, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 4, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 12, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 64, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(12, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 24, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 64, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(24, 64, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 48, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 64, cpu);
#define SETUP_INTRA_ANG_COMMON(mode, fno, cpu) \
p.intra_pred[BLOCK_4x4][mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
@@ -897,6 +1132,72 @@
SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \
SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu);
+#define SETUP_CHROMA_VERT_FUNC_DEF_422(W, H, cpu) /* X265_CSP_I422: set all four vertical 4-tap slots (vss, vpp, vps, vsp) for CHROMA422_WxH */ \
+ p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_VERT_FILTERS_422(cpu) /* vertical filters for 20 I422 sizes; 2x8, 2x16, 4x4 and 6x16 are listed separately in CHROMA_VERT_FILTERS_SSE4_422 */ \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(4, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(8, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(8, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(4, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(8, 12, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(8, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(16, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(16, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(8, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(16, 24, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(12, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(16, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(4, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(32, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(32, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(16, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(32, 48, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(24, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(32, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(8, 64, cpu);
+
+#define CHROMA_VERT_FILTERS_SSE4_422(cpu) /* vertical filters for the four I422 sizes absent from CHROMA_VERT_FILTERS_422 */ \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(2, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(2, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(4, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_422(6, 16, cpu);
+
+#define SETUP_CHROMA_VERT_FUNC_DEF_444(W, H, cpu) /* X265_CSP_I444: set the four vertical 4-tap slots (vss, vpp, vps, vsp); note the table index is LUMA_WxH, not a chroma enum */ \
+ p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_VERT_FILTERS_444(cpu) /* vertical filters for 24 I444 sizes (8x8 .. 64x64; 4x4 is not listed); the last entry has no ';' so the invocation must supply it */ \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(8, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(8, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(4, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(16, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(16, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(8, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(16, 12, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(12, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(16, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(4, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(32, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(32, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(16, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(32, 24, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(24, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(32, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(8, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(64, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(64, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(32, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(64, 48, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(48, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(64, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF_444(16, 64, cpu)
+
#define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
@@ -927,6 +1228,66 @@
SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu)
+#define SETUP_CHROMA_HORIZ_FUNC_DEF_422(W, H, cpu) /* X265_CSP_I422: set the two horizontal 4-tap slots (hpp, hps) for CHROMA422_WxH */ \
+ p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
+
+#define CHROMA_HORIZ_FILTERS_422(cpu) /* horizontal filters for the 24 I422 sizes listed below; the last entry has no ';' so the invocation must supply it */ \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(2, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 12, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(6, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(2, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 24, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(12, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 48, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(24, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 64, cpu)
+
+#define SETUP_CHROMA_HORIZ_FUNC_DEF_444(W, H, cpu) /* X265_CSP_I444: set the two horizontal 4-tap slots (hpp, hps); note the table index is LUMA_WxH, not a chroma enum */ \
+ p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
+
+#define CHROMA_HORIZ_FILTERS_444(cpu) /* horizontal filters for 24 I444 sizes (8x8 .. 64x64; 4x4 is not listed); the last entry has no ';' so the invocation must supply it */ \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(4, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 12, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(12, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(4, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 24, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(24, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 48, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(48, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 64, cpu)
+
namespace x265 {
// private x265 namespace
void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
@@ -1010,14 +1371,20 @@
p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
CHROMA_PIXELSUB_PS(_sse2);
+ CHROMA_PIXELSUB_PS_422(_sse2);
LUMA_PIXELSUB(_sse2);
CHROMA_BLOCKCOPY(ss, _sse2);
+ CHROMA_BLOCKCOPY_422(ss, _sse2);
LUMA_BLOCKCOPY(ss, _sse2);
CHROMA_VERT_FILTERS(_sse2);
+ CHROMA_VERT_FILTERS_422(_sse2);
+ CHROMA_VERT_FILTERS_444(_sse2);
+ p.luma_p2s = x265_luma_p2s_sse2;
p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2;
- p.luma_p2s = x265_luma_p2s_sse2;
+ p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_sse2;
+ p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s
p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
@@ -1061,9 +1428,13 @@
{
LUMA_ADDAVG(_sse4);
CHROMA_ADDAVG(_sse4);
+ CHROMA_ADDAVG_422(_sse4);
LUMA_FILTERS(_sse4);
CHROMA_HORIZ_FILTERS(_sse4);
CHROMA_VERT_FILTERS_SSE4(_sse4);
+ CHROMA_HORIZ_FILTERS_422(_sse4);
+ CHROMA_VERT_FILTERS_SSE4_422(_sse4);
+ CHROMA_HORIZ_FILTERS_444(_sse4);
p.dct[DCT_8x8] = x265_dct8_sse4;
p.quant = x265_quant_sse4;
@@ -1116,6 +1487,13 @@
p.chroma[X265_CSP_I420].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I420].copy_ss[i];
}
+ for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
+ {
+ p.chroma[X265_CSP_I422].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I422].copy_ss[i];
+ p.chroma[X265_CSP_I422].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I422].copy_ss[i];
+ p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i];
+ }
+
#else // if HIGH_BIT_DEPTH
if (cpuMask & X265_CPU_SSE2)
{
@@ -1141,14 +1519,19 @@
CHROMA_BLOCKCOPY(ss, _sse2);
CHROMA_BLOCKCOPY(pp, _sse2);
+ CHROMA_BLOCKCOPY_422(ss, _sse2);
+ CHROMA_BLOCKCOPY_422(pp, _sse2);
LUMA_BLOCKCOPY(ss, _sse2);
LUMA_BLOCKCOPY(pp, _sse2);
LUMA_BLOCKCOPY(sp, _sse2);
CHROMA_BLOCKCOPY_SP(_sse2);
+ CHROMA_BLOCKCOPY_SP_422(_sse2);
CHROMA_SS_FILTERS_420(_sse2);
+ CHROMA_SS_FILTERS_422(_sse2);
CHROMA_SS_FILTERS_444(_sse2);
CHROMA_SP_FILTERS_420(_sse2);
+ CHROMA_SP_FILTERS_422(_sse2);
CHROMA_SP_FILTERS_444(_sse2);
LUMA_SS_FILTERS(_sse2);
@@ -1215,6 +1598,7 @@
p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
p.luma_p2s = x265_luma_p2s_ssse3;
p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_ssse3;
+ p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_ssse3;
p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s
p.dct[DST_4x4] = x265_dst4_ssse3;
@@ -1227,6 +1611,7 @@
LUMA_ADDAVG(_sse4);
CHROMA_ADDAVG(_sse4);
+ CHROMA_ADDAVG_422(_sse4);
p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
// TODO: check POPCNT flag!
@@ -1248,13 +1633,17 @@
LUMA_SSE_SP(_sse4);
CHROMA_PIXELSUB_PS(_sse4);
+ CHROMA_PIXELSUB_PS_422(_sse4);
LUMA_PIXELSUB(_sse4);
CHROMA_FILTERS_420(_sse4);
+ CHROMA_FILTERS_422(_sse4);
CHROMA_FILTERS_444(_sse4);
CHROMA_SS_FILTERS_SSE4_420(_sse4);
+ CHROMA_SS_FILTERS_SSE4_422(_sse4);
+ CHROMA_SP_FILTERS_SSE4_420(_sse4);
+ CHROMA_SP_FILTERS_SSE4_422(_sse4);
CHROMA_SP_FILTERS_SSE4_444(_sse4);
- CHROMA_SP_FILTERS_SSE4_420(_sse4);
LUMA_SP_FILTERS(_sse4);
LUMA_FILTERS(_sse4);
ASSGN_SSE_SS(sse4);
@@ -1263,12 +1652,9 @@
p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
CHROMA_BLOCKCOPY(ps, _sse4);
+ CHROMA_BLOCKCOPY_422(ps, _sse4);
LUMA_BLOCKCOPY(ps, _sse4);
- p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x4] = x265_interp_4tap_vert_sp_2x4_sse4;
- p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x8] = x265_interp_4tap_vert_sp_2x8_sse4;
- p.chroma[X265_CSP_I420].filter_vsp[CHROMA_6x8] = x265_interp_4tap_vert_sp_6x8_sse4;
-
p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse4;
p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;
p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/blockcopy8.asm Tue Aug 05 21:41:53 2014 +0900
@@ -92,6 +92,24 @@
RET
;-----------------------------------------------------------------------------
+; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_2x16, 4, 7, 0 ; 4 args in r0-r3 (dest, deststride, src, srcstride), 7 GPRs, no XMM registers
+ mov r6d, 16/2 ; loop counter: 8 iterations, two rows per iteration
+.loop:
+ mov r4w, [r2] ; 2-byte load of the first source row (one 2-pixel row, assuming 1-byte pixels)
+ mov r5w, [r2 + r3] ; 2-byte load of the next source row
+ dec r6d ; sets ZF for the jnz below; the lea/mov in between do not modify flags
+ lea r2, [r2 + r3 * 2] ; src += 2 * srcstride
+ mov [r0], r4w ; store first row
+ mov [r0 + r1], r5w ; store second row
+ lea r0, [r0 + r1 * 2] ; dest += 2 * deststride
+ jnz .loop ; branches on the ZF produced by dec r6d
+ RET
+
+
+;-----------------------------------------------------------------------------
; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
@@ -166,6 +184,8 @@
BLOCKCOPY_PP_W4_H8 4, 8
BLOCKCOPY_PP_W4_H8 4, 16
+BLOCKCOPY_PP_W4_H8 4, 32
+
;-----------------------------------------------------------------------------
; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
@@ -232,6 +252,28 @@
RET
;-----------------------------------------------------------------------------
+; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_6x16, 4, 7, 2
+ mov r6d, 16/2 ; pass counter: 16 rows, two rows copied per pass
+.loop:
+ movd m0, [r2] ; bytes 0-3 of the current source row
+ mov r4w, [r2 + 4] ; bytes 4-5 of the current source row (6 bytes per row in total)
+ movd m1, [r2 + r3] ; bytes 0-3 of the following source row
+ mov r5w, [r2 + r3 + 4] ; bytes 4-5 of the following source row
+ lea r2, [r2 + r3 * 2] ; advance source by two rows
+ movd [r0], m0 ; write the 4 + 2 byte pieces of the first row
+ mov [r0 + 4], r4w
+ movd [r0 + r1], m1 ; write the 4 + 2 byte pieces of the second row
+ mov [r0 + r1 + 4], r5w
+ lea r0, [r0 + r1 * 2] ; advance destination by two rows
+ dec r6d ; one pass done; ZF feeds the jnz directly below
+ jnz .loop
+ RET
+
+
+;-----------------------------------------------------------------------------
; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
@@ -286,6 +328,23 @@
RET
;-----------------------------------------------------------------------------
+; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_8x12, 4, 5, 2
+ mov r4d, 12/2 ; pass counter: 12 rows, two rows copied per pass
+.loop:
+ movh m0, [r2] ; low 8 bytes = one 8-byte source row
+ movh m1, [r2 + r3] ; the following source row
+ movh [r0], m0 ; store first row
+ movh [r0 + r1], m1 ; store second row
+ dec r4d ; sets ZF for the jnz below; the two lea do not modify flags
+ lea r0, [r0 + 2 * r1] ; advance destination by two rows
+ lea r2, [r2 + 2 * r3] ; advance source by two rows
+ jnz .loop
+ RET
+
+;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W8_H8 2
@@ -330,6 +389,8 @@
BLOCKCOPY_PP_W8_H8 8, 16
BLOCKCOPY_PP_W8_H8 8, 32
+BLOCKCOPY_PP_W8_H8 8, 64
+
;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
@@ -370,6 +431,8 @@
BLOCKCOPY_PP_W12_H4 12, 16
+BLOCKCOPY_PP_W12_H4 12, 32
+
;-----------------------------------------------------------------------------
; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
@@ -448,6 +511,8 @@
BLOCKCOPY_PP_W16_H8 16, 32
BLOCKCOPY_PP_W16_H8 16, 64
+BLOCKCOPY_PP_W16_H8 16, 24
+
;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
@@ -487,6 +552,8 @@
BLOCKCOPY_PP_W24_H4 24, 32
+BLOCKCOPY_PP_W24_H4 24, 64
+
;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
@@ -531,6 +598,8 @@
BLOCKCOPY_PP_W32_H4 32, 32
BLOCKCOPY_PP_W32_H4 32, 64
+BLOCKCOPY_PP_W32_H4 32, 48
+
;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
@@ -718,6 +787,35 @@
RET
;-----------------------------------------------------------------------------
+; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SP_W2_H2 2
+INIT_XMM sse2
+cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride
+ add r3, r3 ; double srcStride: src holds int16_t, presumably stride arrives in elements and is turned into bytes
+ mov r6d, %2/2 ; pass counter: two rows converted per pass
+.loop:
+ movd m0, [r2] ; 4 bytes = two int16 samples of the current row
+ movd m1, [r2 + r3] ; two int16 samples of the following row
+ dec r6d ; sets ZF for the jnz below; nothing in between modifies flags
+ lea r2, [r2 + r3 * 2] ; advance source by two rows
+ packuswb m0, m0 ; narrow words to bytes with unsigned saturation
+ packuswb m1, m1
+ movd r4d, m0 ; move packed bytes to a GPR so exactly 2 bytes can be stored
+ movd r5d, m1
+ mov [r0], r4w ; 2 output bytes, first row
+ mov [r0 + r1], r5w ; 2 output bytes, second row
+ lea r0, [r0 + r1 * 2] ; advance destination by two rows
+ jnz .loop
+ RET
+%endmacro
+
+BLOCKCOPY_SP_W2_H2 2, 4
+BLOCKCOPY_SP_W2_H2 2, 8
+
+BLOCKCOPY_SP_W2_H2 2, 16
+
+;-----------------------------------------------------------------------------
; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
@@ -862,6 +960,8 @@
BLOCKCOPY_SP_W4_H8 4, 16
+BLOCKCOPY_SP_W4_H8 4, 32
+
;-----------------------------------------------------------------------------
; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
@@ -926,6 +1026,40 @@
RET
;-----------------------------------------------------------------------------
+; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SP_W6_H2 2
+INIT_XMM sse2
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride
+ add r3, r3 ; double srcStride: src holds int16_t, presumably stride arrives in elements and is turned into bytes
+ mov r6d, %2/2 ; pass counter: two rows converted per pass
+.loop:
+ movh m0, [r2] ; 8 bytes = int16 samples 0-3 of the current row
+ movd m2, [r2 + 8] ; 4 bytes = int16 samples 4-5 of the current row
+ movh m1, [r2 + r3] ; samples 0-3 of the following row
+ movd m3, [r2 + r3 + 8] ; samples 4-5 of the following row
+ dec r6d ; sets ZF for the jnz below; nothing in between modifies flags
+ lea r2, [r2 + r3 * 2] ; advance source by two rows
+ packuswb m0, m0 ; narrow words to bytes with unsigned saturation
+ packuswb m2, m2
+ packuswb m1, m1
+ packuswb m3, m3
+ movd r4d, m2 ; the 2-byte tails go through GPRs so only 2 bytes are written
+ movd r5d, m3
+ movd [r0], m0 ; output bytes 0-3, first row
+ mov [r0 + 4], r4w ; output bytes 4-5, first row
+ movd [r0 + r1], m1 ; output bytes 0-3, second row
+ mov [r0 + r1 + 4], r5w ; output bytes 4-5, second row
+ lea r0, [r0 + r1 * 2] ; advance destination by two rows
+ jnz .loop
+ RET
+%endmacro
+
+BLOCKCOPY_SP_W6_H2 6, 8
+
+BLOCKCOPY_SP_W6_H2 6, 16
+
+;-----------------------------------------------------------------------------
; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
@@ -1042,6 +1176,36 @@
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SP_W8_H4 2
+INIT_XMM sse2
+cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
+ add r3, r3 ; double srcStride: src holds int16_t, presumably stride arrives in elements and is turned into bytes
+ mov r4d, %2/4 ; pass counter: four rows converted per pass
+.loop:
+ movu m0, [r2] ; 16 bytes = eight int16 samples, row 0 of this pass
+ movu m1, [r2 + r3] ; row 1
+ lea r2, [r2 + r3 * 2] ; step source to row 2
+ movu m2, [r2] ; row 2
+ movu m3, [r2 + r3] ; row 3
+ dec r4d ; sets ZF for the jnz below; nothing in between modifies flags
+ lea r2, [r2 + r3 * 2] ; step source to the next group of four rows
+ packuswb m0, m1 ; saturate to bytes: row 0 in the low half, row 1 in the high half
+ packuswb m2, m3 ; row 2 in the low half, row 3 in the high half
+ movlps [r0], m0 ; 8 output bytes, row 0
+ movhps [r0 + r1], m0 ; 8 output bytes, row 1
+ lea r0, [r0 + r1 * 2] ; step destination to row 2
+ movlps [r0], m2 ; row 2
+ movhps [r0 + r1], m2 ; row 3
+ lea r0, [r0 + r1 * 2] ; step destination to the next group of four rows
+ jnz .loop
+ RET
+%endmacro
+
+BLOCKCOPY_SP_W8_H4 8, 12
+
+;-----------------------------------------------------------------------------
+; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H8 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
@@ -1092,6 +1256,8 @@
BLOCKCOPY_SP_W8_H8 8, 16
BLOCKCOPY_SP_W8_H8 8, 32
+BLOCKCOPY_SP_W8_H8 8, 64
+
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
@@ -1147,6 +1313,8 @@
BLOCKCOPY_SP_W12_H4 12, 16
+BLOCKCOPY_SP_W12_H4 12, 32
+
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
@@ -1196,6 +1364,8 @@
BLOCKCOPY_SP_W16_H4 16, 32
BLOCKCOPY_SP_W16_H4 16, 64
+BLOCKCOPY_SP_W16_H4 16, 24
+
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
@@ -1235,6 +1405,8 @@
BLOCKCOPY_SP_W24_H2 24, 32
+BLOCKCOPY_SP_W24_H2 24, 64
+
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
@@ -1281,6 +1453,8 @@
BLOCKCOPY_SP_W32_H2 32, 32
BLOCKCOPY_SP_W32_H2 32, 64
+BLOCKCOPY_SP_W32_H2 32, 48
+
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
@@ -1596,6 +1770,28 @@
RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
+ add r1, r1 ; double destStride: dest holds int16_t, presumably stride arrives in elements and is turned into bytes
+ mov r4d, 16/2 ; pass counter: 16 rows, two rows widened per pass
+.loop:
+ movd m0, [r2] ; NOTE(review): loads 4 source bytes although only the low 2 reach the output
+ movd m1, [r2 + r3] ; same 4-byte load for the following row
+ dec r4d ; sets ZF for the jnz below; nothing in between modifies flags
+ lea r2, [r2 + r3 * 2] ; advance source by two rows
+ pmovzxbw m0, m0 ; zero-extend bytes to words
+ pmovzxbw m1, m1
+ movd [r0], m0 ; store 4 bytes = two int16 results, first row
+ movd [r0 + r1], m1 ; two int16 results, second row
+ lea r0, [r0 + r1 * 2] ; advance destination by two rows
+ jnz .loop
+ RET
+
+
;-----------------------------------------------------------------------------
; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
@@ -1687,6 +1883,9 @@
BLOCKCOPY_PS_W4_H4 4, 8
BLOCKCOPY_PS_W4_H4 4, 16
+BLOCKCOPY_PS_W4_H4 4, 32
+
+
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
@@ -1732,6 +1931,8 @@
BLOCKCOPY_PS_W6_H4 6, 8
+BLOCKCOPY_PS_W6_H4 6, 16
+
;-----------------------------------------------------------------------------
; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
@@ -1862,6 +2063,9 @@
BLOCKCOPY_PS_W8_H4 8, 16
BLOCKCOPY_PS_W8_H4 8, 32
+BLOCKCOPY_PS_W8_H4 8, 12
+BLOCKCOPY_PS_W8_H4 8, 64
+
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
@@ -1898,6 +2102,8 @@
BLOCKCOPY_PS_W12_H2 12, 16
+BLOCKCOPY_PS_W12_H2 12, 32
+
;-----------------------------------------------------------------------------
; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
@@ -1990,6 +2196,8 @@
BLOCKCOPY_PS_W16_H4 16, 32
BLOCKCOPY_PS_W16_H4 16, 64
+BLOCKCOPY_PS_W16_H4 16, 24
+
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
@@ -2033,6 +2241,8 @@
BLOCKCOPY_PS_W24_H2 24, 32
+BLOCKCOPY_PS_W24_H2 24, 64
+
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
@@ -2084,6 +2294,8 @@
BLOCKCOPY_PS_W32_H2 32, 32
BLOCKCOPY_PS_W32_H2 32, 64
+BLOCKCOPY_PS_W32_H2 32, 48
+
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
@@ -2280,6 +2492,26 @@
RET
;-----------------------------------------------------------------------------
+; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_2x16, 4, 7, 0
+ add r1, r1 ; double deststride: int16_t data, presumably stride arrives in elements and is turned into bytes
+ add r3, r3 ; same doubling for srcstride
+ mov r6d, 16/2 ; pass counter: 16 rows, two rows copied per pass
+.loop:
+ mov r4d, [r2] ; 4 bytes = two int16 samples of the current row
+ mov r5d, [r2 + r3] ; two int16 samples of the following row
+ dec r6d ; sets ZF for the jnz below; the lea/mov in between leave flags untouched
+ lea r2, [r2 + r3 * 2] ; advance source by two rows
+ mov [r0], r4d ; store first row
+ mov [r0 + r1], r5d ; store second row
+ lea r0, [r0 + r1 * 2] ; advance destination by two rows
+ jnz .loop
+ RET
+
+
+;-----------------------------------------------------------------------------
; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
@@ -2361,6 +2593,8 @@
BLOCKCOPY_SS_W4_H8 4, 8
BLOCKCOPY_SS_W4_H8 4, 16
+BLOCKCOPY_SS_W4_H8 4, 32
+
;-----------------------------------------------------------------------------
; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
@@ -2417,6 +2651,30 @@
RET
;-----------------------------------------------------------------------------
+; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_6x16, 4, 5, 4
+ add r1, r1 ; double deststride: int16_t data, presumably stride arrives in elements and is turned into bytes
+ add r3, r3 ; same doubling for srcstride
+ mov r4d, 16/2 ; pass counter: 16 rows, two rows copied per pass
+.loop:
+ movh m0, [r2] ; 8 bytes = int16 samples 0-3 of the current row
+ movd m2, [r2 + 8] ; 4 bytes = int16 samples 4-5 of the current row
+ movh m1, [r2 + r3] ; samples 0-3 of the following row
+ movd m3, [r2 + r3 + 8] ; samples 4-5 of the following row
+ dec r4d ; sets ZF for the jnz below; nothing in between modifies flags
+ lea r2, [r2 + r3 * 2] ; advance source by two rows
+ movh [r0], m0 ; first row: 8 + 4 bytes
+ movd [r0 + 8], m2
+ movh [r0 + r1], m1 ; second row: 8 + 4 bytes
+ movd [r0 + r1 + 8], m3
+ lea r0, [r0 + r1 * 2] ; advance destination by two rows
+ jnz .loop
+ RET
+
+
+;-----------------------------------------------------------------------------
; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
@@ -2483,6 +2741,26 @@
RET
;-----------------------------------------------------------------------------
+; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_8x12, 4, 5, 2
+ add r1, r1 ; double deststride: int16_t data, presumably stride arrives in elements and is turned into bytes
+ add r3, r3 ; same doubling for srcstride
+ mov r4d, 12/2 ; pass counter: 12 rows, two rows copied per pass
+.loop:
+ movu m0, [r2] ; 16 bytes = eight int16 samples of the current row (unaligned load)
+ movu m1, [r2 + r3] ; the following row
+ lea r2, [r2 + 2 * r3] ; advance source by two rows
+ dec r4d ; sets ZF for the jnz below; the stores and lea after it leave flags untouched
+ movu [r0], m0 ; store first row
+ movu [r0 + r1], m1 ; store second row
+ lea r0, [r0 + 2 * r1] ; advance destination by two rows
+ jnz .loop
+ RET
+
+
+;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W8_H8 2
@@ -2531,6 +2809,8 @@
BLOCKCOPY_SS_W8_H8 8, 16
BLOCKCOPY_SS_W8_H8 8, 32
+BLOCKCOPY_SS_W8_H8 8, 64
+
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
@@ -2573,6 +2853,8 @@
BLOCKCOPY_SS_W12_H4 12, 16
+BLOCKCOPY_SS_W12_H4 12, 32
+
;-----------------------------------------------------------------------------
; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
@@ -2687,6 +2969,8 @@
BLOCKCOPY_SS_W16_H8 16, 32
BLOCKCOPY_SS_W16_H8 16, 64
+BLOCKCOPY_SS_W16_H8 16, 24
+
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
@@ -2737,6 +3021,8 @@
BLOCKCOPY_SS_W24_H4 24, 32
+BLOCKCOPY_SS_W24_H4 24, 64
+
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
@@ -2803,6 +3089,8 @@
BLOCKCOPY_SS_W32_H4 32, 32
BLOCKCOPY_SS_W32_H4 32, 64
+BLOCKCOPY_SS_W32_H4 32, 48
+
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/blockcopy8.h Tue Aug 05 21:41:53 2014 +0900
@@ -83,12 +83,33 @@
#define BLOCKCOPY_SP(cpu) \
SETUP_BLOCKCOPY_SP(2, 4, cpu); \
SETUP_BLOCKCOPY_SP(2, 8, cpu); \
- SETUP_BLOCKCOPY_SP(6, 8, cpu);
+ SETUP_BLOCKCOPY_SP(6, 8, cpu); \
+ \
+ SETUP_BLOCKCOPY_SP(2, 16, cpu); \
+ SETUP_BLOCKCOPY_SP(4, 32, cpu); \
+ SETUP_BLOCKCOPY_SP(6, 16, cpu); \
+ SETUP_BLOCKCOPY_SP(8, 12, cpu); \
+ SETUP_BLOCKCOPY_SP(8, 64, cpu); \
+ SETUP_BLOCKCOPY_SP(12, 32, cpu); \
+ SETUP_BLOCKCOPY_SP(16, 24, cpu); \
+ SETUP_BLOCKCOPY_SP(24, 64, cpu); \
+ SETUP_BLOCKCOPY_SP(32, 48, cpu);
#define BLOCKCOPY_SS_PP(cpu) \
SETUP_BLOCKCOPY_SS_PP(2, 4, cpu); \
SETUP_BLOCKCOPY_SS_PP(2, 8, cpu); \
- SETUP_BLOCKCOPY_SS_PP(6, 8, cpu);
+ SETUP_BLOCKCOPY_SS_PP(6, 8, cpu); \
+ \
+ SETUP_BLOCKCOPY_SS_PP(2, 16, cpu); \
+ SETUP_BLOCKCOPY_SS_PP(4, 32, cpu); \
+ SETUP_BLOCKCOPY_SS_PP(6, 16, cpu); \
+ SETUP_BLOCKCOPY_SS_PP(8, 12, cpu); \
+ SETUP_BLOCKCOPY_SS_PP(8, 64, cpu); \
+ SETUP_BLOCKCOPY_SS_PP(12, 32, cpu); \
+ SETUP_BLOCKCOPY_SS_PP(16, 24, cpu); \
+ SETUP_BLOCKCOPY_SS_PP(24, 64, cpu); \
+ SETUP_BLOCKCOPY_SS_PP(32, 48, cpu);
+
#define BLOCKCOPY_PS(cpu) \
SETUP_BLOCKCOPY_PS(2, 4, cpu); \
@@ -121,13 +142,25 @@
SETUP_BLOCKCOPY_PS(64, 16, cpu); \
SETUP_BLOCKCOPY_PS(64, 32, cpu); \
SETUP_BLOCKCOPY_PS(64, 48, cpu); \
- SETUP_BLOCKCOPY_PS(64, 64, cpu);
+ SETUP_BLOCKCOPY_PS(64, 64, cpu); \
+ \
+ SETUP_BLOCKCOPY_PS(2, 16, cpu); \
+ SETUP_BLOCKCOPY_PS(4, 32, cpu); \
+ SETUP_BLOCKCOPY_PS(6, 16, cpu); \
+ SETUP_BLOCKCOPY_PS(8, 12, cpu); \
+ SETUP_BLOCKCOPY_PS(8, 64, cpu); \
+ SETUP_BLOCKCOPY_PS(12, 32, cpu); \
+ SETUP_BLOCKCOPY_PS(16, 24, cpu); \
+ SETUP_BLOCKCOPY_PS(24, 64, cpu); \
+ SETUP_BLOCKCOPY_PS(32, 48, cpu);
BLOCKCOPY_COMMON(_sse2);
BLOCKCOPY_SS_PP(_sse2);
BLOCKCOPY_SP(_sse4);
BLOCKCOPY_PS(_sse4);
+BLOCKCOPY_SP(_sse2);
+
void x265_blockfill_s_4x4_sse2(int16_t *dst, intptr_t dstride, int16_t val);
void x265_blockfill_s_8x8_sse2(int16_t *dst, intptr_t dstride, int16_t val);
void x265_blockfill_s_16x16_sse2(int16_t *dst, intptr_t dstride, int16_t val);
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/ipfilter16.asm Tue Aug 05 21:41:53 2014 +0900
@@ -926,6 +926,12 @@
FILTER_CHROMA_H 4, 8, ps, 7, 6, 6
FILTER_CHROMA_H 4, 16, ps, 7, 6, 6
+FILTER_CHROMA_H 2, 16, pp, 6, 8, 5
+FILTER_CHROMA_H 4, 32, pp, 6, 8, 5
+FILTER_CHROMA_H 2, 16, ps, 7, 5, 6
+FILTER_CHROMA_H 4, 32, ps, 7, 6, 6
+
+
%macro FILTER_W6_1 1
movu m3, [r0]
pshufb m3, m3, m2
@@ -1362,6 +1368,75 @@
FILTER_W32_1 ps
ret
+%macro FILTER_W8o_1 2
+ movu m3, [r0 + %2] ; source load at the byte offset passed as the second macro argument
+ pshufb m3, m3, m2 ; rearrange with the mask held in m2 (set up outside this macro)
+ pmaddwd m3, m0 ; multiply-add word pairs against m0 (presumably the filter taps; confirm at the call site)
+ movu m4, [r0 + %2 + 4] ; same steps for the data 4 bytes further on
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4 ; horizontal add: m3 now holds four dword sums
+ paddd m3, m1 ; add m1 (presumably the rounding/offset constant; confirm at the call site)
+
+ movu m5, [r0 + %2 + 8] ; second group of four sums, built the same way into m5
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + %2 + 12]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6 ; pp variant: arithmetic shift right by 6
+ psrad m5, 6
+ packusdw m3, m5 ; pack the eight dwords to words with unsigned saturation, then clip using m6/m7
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2 ; other variant (instantiated with ps in this file): shift right by 2
+ psrad m5, 2
+ packssdw m3, m5 ; pack the eight dwords to words with signed saturation
+%endif
+ movh [r2 + %2], m3 ; low 8 bytes of the result, same byte offset on the destination
+ movhps [r2 + %2 + 8], m3 ; high 8 bytes of the result
+%endmacro
+
+%macro FILTER_W48_1 1
+ FILTER_W8o_1 %1, 0
+ FILTER_W8o_1 %1, 16
+ FILTER_W8o_1 %1, 32
+ FILTER_W8o_1 %1, 48
+ FILTER_W8o_1 %1, 64
+ FILTER_W8o_1 %1, 80
+%endmacro
+
+cglobal chroma_filter_pp_48x1_internal
+ FILTER_W48_1 pp
+ ret
+
+cglobal chroma_filter_ps_48x1_internal
+ FILTER_W48_1 ps
+ ret
+
+%macro FILTER_W64_1 1
+ FILTER_W8o_1 %1, 0
+ FILTER_W8o_1 %1, 16
+ FILTER_W8o_1 %1, 32
+ FILTER_W8o_1 %1, 48
+ FILTER_W8o_1 %1, 64
+ FILTER_W8o_1 %1, 80
+ FILTER_W8o_1 %1, 96
+ FILTER_W8o_1 %1, 112
+%endmacro
+
+cglobal chroma_filter_pp_64x1_internal
+ FILTER_W64_1 pp
+ ret
+
+cglobal chroma_filter_ps_64x1_internal
+ FILTER_W64_1 ps
+ ret
+
+
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
@@ -1453,6 +1528,36 @@
IPFILTER_CHROMA 32, 24, ps, 6, 7, 6
IPFILTER_CHROMA 32, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 6, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 12, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 12, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 24, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 24, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 48, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 6, 16, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 12, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 12, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 24, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 24, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 48, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 64, ps, 6, 7, 6
+
+IPFILTER_CHROMA 48, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 64, 48, pp, 5, 6, 8
+IPFILTER_CHROMA 64, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 64, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 64, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 48, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 48, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 16, ps, 6, 7, 6
+
%macro PROCESS_CHROMA_SP_W4_4R 0
movq m0, [r0]
@@ -1494,7 +1599,7 @@
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SS 4
INIT_XMM sse2
-cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-1
+cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-gprsize
add r1d, r1d
add r3d, r3d
@@ -1508,7 +1613,7 @@
lea r6, [tab_ChromaCoeffV + r4]
%endif
- mov byte [rsp], %2/4
+ mov dword [rsp], %2/4
%ifnidn %3, ss
%ifnidn %3, ps
@@ -1587,7 +1692,7 @@
lea r0, [r0 + 4 * r1 - 2 * %1]
lea r2, [r2 + 4 * r3 - 2 * %1]
- dec byte [rsp]
+ dec dword [rsp]
jnz .loopH
RET
@@ -1653,6 +1758,65 @@
FILTER_VER_CHROMA_SS 24, 32, pp, 8
FILTER_VER_CHROMA_SS 32, 8, pp, 8
+
+ FILTER_VER_CHROMA_SS 16, 24, ss, 6
+ FILTER_VER_CHROMA_SS 12, 32, ss, 6
+ FILTER_VER_CHROMA_SS 4, 32, ss, 6
+ FILTER_VER_CHROMA_SS 32, 64, ss, 6
+ FILTER_VER_CHROMA_SS 16, 64, ss, 6
+ FILTER_VER_CHROMA_SS 32, 48, ss, 6
+ FILTER_VER_CHROMA_SS 24, 64, ss, 6
+
+ FILTER_VER_CHROMA_SS 16, 24, ps, 7
+ FILTER_VER_CHROMA_SS 12, 32, ps, 7
+ FILTER_VER_CHROMA_SS 4, 32, ps, 7
+ FILTER_VER_CHROMA_SS 32, 64, ps, 7
+ FILTER_VER_CHROMA_SS 16, 64, ps, 7
+ FILTER_VER_CHROMA_SS 32, 48, ps, 7
+ FILTER_VER_CHROMA_SS 24, 64, ps, 7
+
+ FILTER_VER_CHROMA_SS 16, 24, sp, 8
+ FILTER_VER_CHROMA_SS 12, 32, sp, 8
+ FILTER_VER_CHROMA_SS 4, 32, sp, 8
+ FILTER_VER_CHROMA_SS 32, 64, sp, 8
+ FILTER_VER_CHROMA_SS 16, 64, sp, 8
+ FILTER_VER_CHROMA_SS 32, 48, sp, 8
+ FILTER_VER_CHROMA_SS 24, 64, sp, 8
+
+ FILTER_VER_CHROMA_SS 16, 24, pp, 8
+ FILTER_VER_CHROMA_SS 12, 32, pp, 8
+ FILTER_VER_CHROMA_SS 4, 32, pp, 8
+ FILTER_VER_CHROMA_SS 32, 64, pp, 8
+ FILTER_VER_CHROMA_SS 16, 64, pp, 8
+ FILTER_VER_CHROMA_SS 32, 48, pp, 8
+ FILTER_VER_CHROMA_SS 24, 64, pp, 8
+
+
+ FILTER_VER_CHROMA_SS 48, 64, ss, 6
+ FILTER_VER_CHROMA_SS 64, 48, ss, 6
+ FILTER_VER_CHROMA_SS 64, 64, ss, 6
+ FILTER_VER_CHROMA_SS 64, 32, ss, 6
+ FILTER_VER_CHROMA_SS 64, 16, ss, 6
+
+ FILTER_VER_CHROMA_SS 48, 64, ps, 7
+ FILTER_VER_CHROMA_SS 64, 48, ps, 7
+ FILTER_VER_CHROMA_SS 64, 64, ps, 7
+ FILTER_VER_CHROMA_SS 64, 32, ps, 7
+ FILTER_VER_CHROMA_SS 64, 16, ps, 7
+
+ FILTER_VER_CHROMA_SS 48, 64, sp, 8
+ FILTER_VER_CHROMA_SS 64, 48, sp, 8
+ FILTER_VER_CHROMA_SS 64, 64, sp, 8
+ FILTER_VER_CHROMA_SS 64, 32, sp, 8
+ FILTER_VER_CHROMA_SS 64, 16, sp, 8
+
+ FILTER_VER_CHROMA_SS 48, 64, pp, 8
+ FILTER_VER_CHROMA_SS 64, 48, pp, 8
+ FILTER_VER_CHROMA_SS 64, 64, pp, 8
+ FILTER_VER_CHROMA_SS 64, 32, pp, 8
+ FILTER_VER_CHROMA_SS 64, 16, pp, 8
+
+
%macro PROCESS_CHROMA_SP_W2_4R 1
movd m0, [r0]
movd m1, [r0 + r1]
@@ -1772,12 +1936,18 @@
FILTER_VER_CHROMA_W2 4, sp, 8
FILTER_VER_CHROMA_W2 8, sp, 8
+FILTER_VER_CHROMA_W2 16, ss, 5
+FILTER_VER_CHROMA_W2 16, pp, 8
+FILTER_VER_CHROMA_W2 16, ps, 6
+FILTER_VER_CHROMA_W2 16, sp, 8
+
+
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_%1_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;---------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_W4 2
+%macro FILTER_VER_CHROMA_W4 3
INIT_XMM sse4
-cglobal interp_4tap_vert_%1_4x2, 5, 6, %2
+cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3
add r1d, r1d
add r3d, r3d
@@ -1791,11 +1961,15 @@
lea r5, [tab_ChromaCoeffV + r4]
%endif
-%ifnidn %1, ss
- %ifnidn %1, ps
+%ifnidn %1, 2
+ mov r4d, %1/2 ; row-pair counter, only needed when the block is taller than two rows (r4 is free once r5 is set)
+%endif
+
+%ifnidn %2, ss
+ %ifnidn %2, ps
pxor m6, m6
mova m5, [pw_pixel_max]
- %ifidn %1, pp
+ %ifidn %2, pp
mova m4, [tab_c_32]
%else
mova m4, [tab_c_524800]
@@ -1805,6 +1979,10 @@
%endif
%endif
+%ifnidn %1, 2
+.loop:
+%endif
+
movh m0, [r0]
movh m1, [r0 + r1]
punpcklwd m0, m1 ;m0=[0 1]
@@ -1825,11 +2003,11 @@
pmaddwd m3, [r5 + 1 * 16]
paddd m1, m3 ;m1=[1+2+3+4] Row2 done
-%ifidn %1, ss
+%ifidn %2, ss
psrad m0, 6
psrad m1, 6
packssdw m0, m1
-%elifidn %1, ps
+%elifidn %2, ps
paddd m0, m4
paddd m1, m4
psrad m0, 2
@@ -1838,7 +2016,7 @@
%else
paddd m0, m4
paddd m1, m4
- %ifidn %1, pp
+ %ifidn %2, pp
psrad m0, 6
psrad m1, 6
%else
@@ -1852,20 +2030,31 @@
movh [r2], m0
movhps [r2 + r3], m0
+%ifnidn %1, 2
+ lea r2, [r2 + r3 * 2] ; advance destination by the two rows just written
+ dec r4d ; one row pair done
+ jnz .loop
+%endif
+
RET
%endmacro
-FILTER_VER_CHROMA_W4 ss, 4
-FILTER_VER_CHROMA_W4 pp, 7
-FILTER_VER_CHROMA_W4 ps, 5
-FILTER_VER_CHROMA_W4 sp, 7
+FILTER_VER_CHROMA_W4 2, ss, 4
+FILTER_VER_CHROMA_W4 2, pp, 7
+FILTER_VER_CHROMA_W4 2, ps, 5
+FILTER_VER_CHROMA_W4 2, sp, 7
+
+FILTER_VER_CHROMA_W4 4, ss, 4
+FILTER_VER_CHROMA_W4 4, pp, 7
+FILTER_VER_CHROMA_W4 4, ps, 5
+FILTER_VER_CHROMA_W4 4, sp, 7
;-------------------------------------------------------------------------------------------------------------------
; void interp_4tap_vertical_%1_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_W6 2
+%macro FILTER_VER_CHROMA_W6 3
INIT_XMM sse4
-cglobal interp_4tap_vert_%1_6x8, 5, 7, %2
+cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3
add r1d, r1d
add r3d, r3d
@@ -1879,12 +2068,12 @@
lea r6, [tab_ChromaCoeffV + r4]
%endif
- mov r4d, 8/4
-
-%ifnidn %1, ss
- %ifnidn %1, ps
+ mov r4d, %1/4
+
+%ifnidn %2, ss
+ %ifnidn %2, ps
mova m7, [pw_pixel_max]
- %ifidn %1, pp
+ %ifidn %2, pp
mova m6, [tab_c_32]
%else
mova m6, [tab_c_524800]
@@ -1897,7 +2086,7 @@
.loopH:
PROCESS_CHROMA_SP_W4_4R
-%ifidn %1, ss
+%ifidn %2, ss
psrad m0, 6
psrad m1, 6
psrad m2, 6
@@ -1905,7 +2094,7 @@
packssdw m0, m1
packssdw m2, m3
-%elifidn %1, ps
+%elifidn %2, ps
paddd m0, m6
paddd m1, m6
paddd m2, m6
@@ -1922,7 +2111,7 @@
paddd m1, m6
paddd m2, m6
paddd m3, m6
- %ifidn %1, pp
+ %ifidn %2, pp
psrad m0, 6
psrad m1, 6
psrad m2, 6
@@ -1952,11 +2141,11 @@
PROCESS_CHROMA_SP_W2_4R r6
-%ifidn %1, ss
+%ifidn %2, ss
psrad m0, 6
psrad m2, 6
packssdw m0, m2
-%elifidn %1, ps
+%elifidn %2, ps
paddd m0, m6
paddd m2, m6
psrad m0, 2
@@ -1965,7 +2154,7 @@
%else
paddd m0, m6
paddd m2, m6
- %ifidn %1, pp
+ %ifidn %2, pp
psrad m0, 6
psrad m2, 6
%else
@@ -1991,10 +2180,15 @@
RET
%endmacro
-FILTER_VER_CHROMA_W6 ss, 6
-FILTER_VER_CHROMA_W6 ps, 7
-FILTER_VER_CHROMA_W6 sp, 8
-FILTER_VER_CHROMA_W6 pp, 8
+FILTER_VER_CHROMA_W6 8, ss, 6
+FILTER_VER_CHROMA_W6 8, ps, 7
+FILTER_VER_CHROMA_W6 8, sp, 8
+FILTER_VER_CHROMA_W6 8, pp, 8
+
+FILTER_VER_CHROMA_W6 16, ss, 6
+FILTER_VER_CHROMA_W6 16, ps, 7
+FILTER_VER_CHROMA_W6 16, sp, 8
+FILTER_VER_CHROMA_W6 16, pp, 8
%macro PROCESS_CHROMA_SP_W8_2R 0
movu m1, [r0]
@@ -2143,6 +2337,14 @@
FILTER_VER_CHROMA_W8 8, 16, pp, 8
FILTER_VER_CHROMA_W8 8, 32, pp, 8
+FILTER_VER_CHROMA_W8 8, 12, ss, 7
+FILTER_VER_CHROMA_W8 8, 64, ss, 7
+FILTER_VER_CHROMA_W8 8, 12, sp, 8
+FILTER_VER_CHROMA_W8 8, 64, sp, 8
+FILTER_VER_CHROMA_W8 8, 12, ps, 8
+FILTER_VER_CHROMA_W8 8, 64, ps, 8
+FILTER_VER_CHROMA_W8 8, 12, pp, 8
+FILTER_VER_CHROMA_W8 8, 64, pp, 8
INIT_XMM sse2
@@ -2273,7 +2475,7 @@
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_PP 2
INIT_XMM sse4
-cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-1
+cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-gprsize
add r1d, r1d
add r3d, r3d
@@ -2290,7 +2492,7 @@
mova m7, [pd_32]
- mov byte [rsp], %2/4
+ mov dword [rsp], %2/4
.loopH:
mov r4d, (%1/4)
.loopW:
@@ -2329,7 +2531,7 @@
lea r0, [r0 + 4 * r1 - 2 * %1]
lea r2, [r2 + 4 * r3 - 2 * %1]
- dec byte [rsp]
+ dec dword [rsp]
jnz .loopH
RET
@@ -2369,7 +2571,7 @@
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_PS 2
INIT_XMM sse4
-cglobal interp_8tap_vert_ps_%1x%2, 5, 7, 8 ,0-1
+cglobal interp_8tap_vert_ps_%1x%2, 5, 7, 8 ,0-gprsize
add r1d, r1d
add r3d, r3d
@@ -2386,7 +2588,7 @@
mova m7, [pd_n32768]
- mov byte [rsp], %2/4
+ mov dword [rsp], %2/4
.loopH:
mov r4d, (%1/4)
.loopW:
@@ -2421,7 +2623,7 @@
lea r0, [r0 + 4 * r1 - 2 * %1]
lea r2, [r2 + 4 * r3 - 2 * %1]
- dec byte [rsp]
+ dec dword [rsp]
jnz .loopH
RET
@@ -2461,7 +2663,7 @@
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_SP 2
INIT_XMM sse4
-cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-1
+cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
add r1d, r1d
add r3d, r3d
@@ -2478,7 +2680,7 @@
mova m7, [tab_c_524800]
- mov byte [rsp], %2/4
+ mov dword [rsp], %2/4
.loopH:
mov r4d, (%1/4)
.loopW:
@@ -2517,7 +2719,7 @@
lea r0, [r0 + 4 * r1 - 2 * %1]
lea r2, [r2 + 4 * r3 - 2 * %1]
- dec byte [rsp]
+ dec dword [rsp]
jnz .loopH
RET
@@ -2557,7 +2759,7 @@
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_SS 2
INIT_XMM sse2
-cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize
add r1d, r1d
add r3d, r3d
@@ -2572,7 +2774,7 @@
lea r6, [tab_LumaCoeffV + r4]
%endif
- mov byte [rsp], %2/4
+ mov dword [rsp], %2/4
.loopH:
mov r4d, (%1/4)
.loopW:
@@ -2601,7 +2803,7 @@
lea r0, [r0 + 4 * r1 - 2 * %1]
lea r2, [r2 + 4 * r3 - 2 * %1]
- dec byte [rsp]
+ dec dword [rsp]
jnz .loopH
RET
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/ipfilter8.asm Tue Aug 05 21:41:53 2014 +0900
@@ -211,6 +211,41 @@
RET
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride ; x86inc: 4 args loaded, 6 GPRs, 5 XMM regs
+%define coef2 m4
+%define Tm0 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m ; fifth argument (coeffIdx in the prototype above) is fetched by hand
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff] ; PIC build: take the table address first, then index it
+movd coef2, [r5 + r4 * 4] ; one dword at tab_ChromaCoeff + coeffIdx * 4
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4] ; one dword at tab_ChromaCoeff + coeffIdx * 4
+%endif
+
+pshufd coef2, coef2, 0 ; replicate that dword into all four dword lanes
+mova t2, [tab_c_512] ; constant handed to FILTER_H4_w2_2 as its third operand
+mova Tm0, [tab_Tm] ; shuffle mask; FILTER_H4_w2_2 is defined outside this hunk - presumably it reads Tm0/coef2
+
+mov r5d, 16/2 ; 8 iterations, src/dst step two rows per iteration
+
+.loop:
+FILTER_H4_w2_2 t0, t1, t2 ; presumably filters two rows per call (matches the 2-row step below) - confirm in the macro
+lea srcq, [srcq + srcstrideq * 2] ; src += 2 * srcStride
+lea dstq, [dstq + dststrideq * 2] ; dst += 2 * dstStride
+dec r5d
+jnz .loop
+
+RET
+
%macro FILTER_H4_w4_2 3
movh %2, [srcq - 1]
pshufb %2, %2, Tm0
@@ -350,6 +385,42 @@
RET
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride ; x86inc: 4 args loaded, 6 GPRs, 5 XMM regs
+%define coef2 m4
+%define Tm0 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m ; fifth argument (coeffIdx in the prototype above) is fetched by hand
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff] ; PIC build: take the table address first, then index it
+movd coef2, [r5 + r4 * 4] ; one dword at tab_ChromaCoeff + coeffIdx * 4
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4] ; one dword at tab_ChromaCoeff + coeffIdx * 4
+%endif
+
+pshufd coef2, coef2, 0 ; replicate that dword into all four dword lanes
+mova t2, [tab_c_512] ; constant handed to FILTER_H4_w4_2 as its third operand
+mova Tm0, [tab_Tm] ; shuffle mask applied by FILTER_H4_w4_2 (pshufb ..., Tm0)
+
+mov r5d, 32/2 ; 16 iterations, src/dst step two rows per iteration
+
+.loop:
+FILTER_H4_w4_2 t0, t1, t2 ; presumably filters two rows per call (matches the 2-row step below) - confirm in the macro
+lea srcq, [srcq + srcstrideq * 2] ; src += 2 * srcStride
+lea dstq, [dstq + dststrideq * 2] ; dst += 2 * dstStride
+dec r5d
+jnz .loop
+
+RET
+
+
%macro FILTER_H4_w6 3
movu %1, [srcq - 1]
pshufb %2, %1, Tm0
@@ -475,6 +546,38 @@
movu [dstq + 16], %2
%endmacro
+%macro FILTER_H4_w16o 5 ; %1, %2, %4 = scratch regs, %3 = pmulhrsw multiplier, %5 = column offset in bytes
+ movu %1, [srcq + %5 - 1] ; 16 source bytes starting one byte left of column %5
+ pshufb %2, %1, Tm0 ; arrange bytes with mask Tm0 (mask contents are defined by the caller)
+ pmaddubsw %2, coef2 ; multiply by the coefficient bytes and add adjacent pairs into words
+ pshufb %1, %1, Tm1 ; same source bytes, second arrangement (mask Tm1)
+ pmaddubsw %1, coef2
+ phaddw %2, %1 ; horizontal add of word pairs -> eight word sums in %2
+ movu %1, [srcq + %5 - 1 + 8] ; repeat the sequence 8 bytes further right
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1 ; eight word sums in %4
+ pmulhrsw %2, %3 ; scale with rounding; if %3 holds words of 512 this equals (x + 32) >> 6 - confirm at the call site
+ pmulhrsw %4, %3
+ packuswb %2, %4 ; saturate both halves to unsigned bytes -> 16 output bytes
+ movu [dstq + %5], %2 ; store 16 output bytes at column %5
+%endmacro
+
+%macro FILTER_H4_w48 4 ; one 48-byte-wide row: three FILTER_H4_w16o groups
+ FILTER_H4_w16o %1, %2, %3, %4, 0 ; columns 0..15
+ FILTER_H4_w16o %1, %2, %3, %4, 16 ; columns 16..31
+ FILTER_H4_w16o %1, %2, %3, %4, 32 ; columns 32..47
+%endmacro
+
+%macro FILTER_H4_w64 4 ; one 64-byte-wide row: four FILTER_H4_w16o groups
+ FILTER_H4_w16o %1, %2, %3, %4, 0 ; columns 0..15
+ FILTER_H4_w16o %1, %2, %3, %4, 16 ; columns 16..31
+ FILTER_H4_w16o %1, %2, %3, %4, 32 ; columns 32..47
+ FILTER_H4_w16o %1, %2, %3, %4, 48 ; columns 48..63
+%endmacro
+
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
@@ -525,6 +628,11 @@
IPFILTER_CHROMA 8, 32
IPFILTER_CHROMA 12, 16
+IPFILTER_CHROMA 6, 16
+IPFILTER_CHROMA 8, 12
+IPFILTER_CHROMA 8, 64
+IPFILTER_CHROMA 12, 32
+
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
@@ -577,6 +685,18 @@
IPFILTER_CHROMA_W 24, 32
IPFILTER_CHROMA_W 32, 32
+IPFILTER_CHROMA_W 16, 24
+IPFILTER_CHROMA_W 16, 64
+IPFILTER_CHROMA_W 32, 48
+IPFILTER_CHROMA_W 24, 64
+IPFILTER_CHROMA_W 32, 64
+
+IPFILTER_CHROMA_W 64, 64
+IPFILTER_CHROMA_W 64, 32
+IPFILTER_CHROMA_W 64, 48
+IPFILTER_CHROMA_W 48, 64
+IPFILTER_CHROMA_W 64, 16
+
%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst
movu %1, %7
@@ -987,7 +1107,7 @@
;-----------------------------------------------------------------------------
%macro FILTER_V4_W2_H4 2
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_2x8, 4, 6, 8
+cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -1067,6 +1187,8 @@
FILTER_V4_W2_H4 2, 8
+FILTER_V4_W2_H4 2, 16
+
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
@@ -1273,6 +1395,8 @@
FILTER_V4_W4_H4 4, 8
FILTER_V4_W4_H4 4, 16
+FILTER_V4_W4_H4 4, 32
+
%macro FILTER_V4_W8_H2 0
punpcklbw m1, m2
punpcklbw m7, m3, m0
@@ -1640,6 +1764,8 @@
FILTER_V_PS_W4_H4 4, 8
FILTER_V_PS_W4_H4 4, 16
+FILTER_V_PS_W4_H4 4, 32
+
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
@@ -1708,6 +1834,9 @@
FILTER_V_PS_W8_H8_H16_H2 8, 4
FILTER_V_PS_W8_H8_H16_H2 8, 6
+FILTER_V_PS_W8_H8_H16_H2 8, 12
+FILTER_V_PS_W8_H8_H16_H2 8, 64
+
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
@@ -1803,8 +1932,9 @@
;------------------------------------------------------------------------------------------------------------
;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W6 2
INIT_XMM sse4
-cglobal interp_4tap_vert_ps_6x8, 4, 6, 8
+cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -1821,7 +1951,7 @@
pshufb m5, [tab_Vm + 16]
mova m4, [pw_2000]
lea r5, [3 * r1]
- mov r4d, 2
+ mov r4d, %2/4
.loop:
movq m0, [r0]
@@ -1889,12 +2019,17 @@
dec r4d
jnz .loop
RET
+%endmacro
+
+FILTER_V_PS_W6 6, 8
+FILTER_V_PS_W6 6, 16
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;---------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W12 2
INIT_XMM sse4
-cglobal interp_4tap_vert_ps_12x16, 4, 6, 8
+cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -1910,7 +2045,7 @@
pshufb m1, m0, [tab_Vm]
pshufb m0, [tab_Vm + 16]
- mov r4d, 16/2
+ mov r4d, %2/2
.loop:
movu m2, [r0]
@@ -1970,6 +2105,10 @@
dec r4d
jnz .loop
RET
+%endmacro
+
+FILTER_V_PS_W12 12, 16
+FILTER_V_PS_W12 12, 32
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -2059,11 +2198,15 @@
FILTER_V_PS_W16 16, 16
FILTER_V_PS_W16 16, 32
+FILTER_V_PS_W16 16, 24
+FILTER_V_PS_W16 16, 64
+
;--------------------------------------------------------------------------------------------------------------
;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_V4_PS_W24 2
INIT_XMM sse4
-cglobal interp_4tap_vert_ps_24x32, 4, 6, 8
+cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -2079,7 +2222,7 @@
pshufb m1, m0, [tab_Vm]
pshufb m0, [tab_Vm + 16]
- mov r4d, 32/2
+ mov r4d, %2/2
.loop:
movu m2, [r0]
@@ -2170,6 +2313,11 @@
dec r4d
jnz .loop
RET
+%endmacro
+
+FILTER_V4_PS_W24 24, 32
+
+FILTER_V4_PS_W24 24, 64
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -2265,6 +2413,9 @@
FILTER_V_PS_W32 32, 24
FILTER_V_PS_W32 32, 32
+FILTER_V_PS_W32 32, 48
+FILTER_V_PS_W32 32, 64
+
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
@@ -2359,12 +2510,16 @@
FILTER_V4_W8_H8_H16_H32 8, 16
FILTER_V4_W8_H8_H16_H32 8, 32
+FILTER_V4_W8_H8_H16_H32 8, 12
+FILTER_V4_W8_H8_H16_H32 8, 64
+
+
;-----------------------------------------------------------------------------
;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W6_H4 2
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_6x8, 4, 6, 8
+cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -2455,12 +2610,14 @@
FILTER_V4_W6_H4 6, 8
+FILTER_V4_W6_H4 6, 16
+
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W12_H2 2
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_12x16, 4, 6, 8
+cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -2543,12 +2700,14 @@
FILTER_V4_W12_H2 12, 16
+FILTER_V4_W12_H2 12, 32
+
;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W16_H2 2
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
+cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -2563,7 +2722,7 @@
pshufb m1, m0, [tab_Vm]
pshufb m0, [tab_Vm + 16]
-mov r4d, %2
+mov r4d, %2/2
.loop:
movu m2, [r0]
@@ -2622,7 +2781,7 @@
lea r2, [r2 + 2 * r3]
-sub r4, 2
+dec r4d
jnz .loop
RET
%endmacro
@@ -2633,12 +2792,15 @@
FILTER_V4_W16_H2 16, 16
FILTER_V4_W16_H2 16, 32
+FILTER_V4_W16_H2 16, 24
+FILTER_V4_W16_H2 16, 64
+
;-----------------------------------------------------------------------------
;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W24 2
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_24x32, 4, 6, 8
+cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -2754,8 +2916,10 @@
FILTER_V4_W24 24, 32
+FILTER_V4_W24 24, 64
+
;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W32 2
INIT_XMM sse4
@@ -2849,6 +3013,111 @@
FILTER_V4_W32 32, 24
FILTER_V4_W32 32, 32
+FILTER_V4_W32 32, 48
+FILTER_V4_W32 32, 64
+
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W16n_H2 2 ; %1 = width (walked in 16-byte strips), %2 = height (two rows per outer pass)
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 ; x86inc: 4 args loaded, 7 GPRs, 8 XMM regs
+
+mov r4d, r4m ; fifth argument (coeffIdx in the prototype above)
+sub r0, r1 ; src -= srcStride: begin one row above the first output row
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4] ; one dword at tab_ChromaCoeff + coeffIdx * 4
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4] ; one dword at tab_ChromaCoeff + coeffIdx * 4
+%endif
+
+pshufb m1, m0, [tab_Vm] ; m1 = multiplier for the first interleaved row pair (tab_Vm layout not shown here)
+pshufb m0, [tab_Vm + 16] ; m0 = multiplier for the second interleaved row pair
+
+mov r4d, %2/2 ; r4 is reused as the outer counter: one pass per two output rows
+
+.loop:
+
+mov r6d, %1/16 ; inner counter: number of 16-byte strips in a row
+
+.loopW:
+
+movu m2, [r0] ; input row 0
+movu m3, [r0 + r1] ; input row 1
+
+punpcklbw m4, m2, m3 ; interleave rows 0/1, low 8 columns
+punpckhbw m2, m3 ; interleave rows 0/1, high 8 columns
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+lea r5, [r0 + 2 * r1] ; r5 -> input row 2
+movu m5, [r5] ; input row 2
+movu m6, [r5 + r1] ; input row 3
+
+punpckhbw m7, m5, m6 ; interleave rows 2/3, high 8 columns
+pmaddubsw m7, m0
+paddw m2, m7 ; high half: rows 0/1 term + rows 2/3 term
+
+punpcklbw m7, m5, m6 ; interleave rows 2/3, low 8 columns
+pmaddubsw m7, m0
+paddw m4, m7 ; low half: rows 0/1 term + rows 2/3 term
+
+mova m7, [tab_c_512] ; loaded inside the loop because m7 served as scratch just above
+
+pmulhrsw m4, m7 ; scale with rounding
+pmulhrsw m2, m7
+
+packuswb m4, m2 ; saturate to 16 unsigned bytes
+
+movu [r2], m4 ; first output row of this strip
+
+punpcklbw m4, m3, m5 ; second output row: interleave rows 1/2, low 8 columns
+punpckhbw m3, m5 ; interleave rows 1/2, high 8 columns
+
+pmaddubsw m4, m1
+pmaddubsw m3, m1
+
+movu m5, [r5 + 2 * r1] ; input row 4
+
+punpcklbw m2, m6, m5 ; interleave rows 3/4, low 8 columns
+punpckhbw m6, m5 ; interleave rows 3/4, high 8 columns
+
+pmaddubsw m2, m0
+pmaddubsw m6, m0
+
+paddw m4, m2
+paddw m3, m6
+
+pmulhrsw m4, m7
+pmulhrsw m3, m7
+
+packuswb m4, m3
+
+movu [r2 + r3], m4 ; second output row of this strip
+
+add r0, 16 ; next 16-byte strip
+add r2, 16
+dec r6d
+jnz .loopW
+
+lea r0, [r0 + r1 * 2 - %1] ; undo the %1 bytes walked by .loopW and move down two rows
+lea r2, [r2 + r3 * 2 - %1]
+
+dec r4d
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W16n_H2 64, 64
+FILTER_V4_W16n_H2 64, 32
+FILTER_V4_W16n_H2 64, 48
+FILTER_V4_W16n_H2 48, 64
+FILTER_V4_W16n_H2 64, 16
+
;-----------------------------------------------------------------------------
; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
@@ -3350,7 +3619,7 @@
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA 3
INIT_XMM sse4
-cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-1
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
lea r5, [3 * r1]
sub r0, r5
shl r4d, 6
@@ -3370,7 +3639,7 @@
%else
mova m3, [pw_2000]
%endif
- mov byte [rsp], %2/4
+ mov dword [rsp], %2/4
.loopH:
mov r4d, (%1/8)
@@ -3420,7 +3689,7 @@
lea r2, [r2 + 4 * r3 - 2 * %1]
%endif
- dec byte [rsp]
+ dec dword [rsp]
jnz .loopH
RET
@@ -3532,7 +3801,7 @@
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_SP 2
INIT_XMM sse4
-cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-1
+cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
add r1d, r1d
lea r5, [r1 + 2 * r1]
@@ -3548,7 +3817,7 @@
mova m7, [tab_c_526336]
- mov byte [rsp], %2/4
+ mov dword [rsp], %2/4
.loopH:
mov r4d, (%1/4)
.loopW:
@@ -3585,7 +3854,7 @@
lea r0, [r0 + 4 * r1 - 2 * %1]
lea r2, [r2 + 4 * r3 - %1]
- dec byte [rsp]
+ dec dword [rsp]
jnz .loopH
RET
@@ -3720,7 +3989,7 @@
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SP 2
INIT_XMM sse4
-cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-1
+cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize
add r1d, r1d
sub r0, r1
@@ -3735,7 +4004,7 @@
mova m6, [tab_c_526336]
- mov byte [rsp], %2/4
+ mov dword [rsp], %2/4
.loopH:
mov r4d, (%1/4)
@@ -3773,7 +4042,7 @@
lea r0, [r0 + 4 * r1 - 2 * %1]
lea r2, [r2 + 4 * r3 - %1]
- dec byte [rsp]
+ dec dword [rsp]
jnz .loopH
RET
@@ -3794,6 +4063,20 @@
FILTER_VER_CHROMA_SP 24, 32
FILTER_VER_CHROMA_SP 32, 8
+ FILTER_VER_CHROMA_SP 16, 24
+ FILTER_VER_CHROMA_SP 16, 64
+ FILTER_VER_CHROMA_SP 12, 32
+ FILTER_VER_CHROMA_SP 4, 32
+ FILTER_VER_CHROMA_SP 32, 64
+ FILTER_VER_CHROMA_SP 32, 48
+ FILTER_VER_CHROMA_SP 24, 64
+
+ FILTER_VER_CHROMA_SP 64, 64
+ FILTER_VER_CHROMA_SP 64, 32
+ FILTER_VER_CHROMA_SP 64, 48
+ FILTER_VER_CHROMA_SP 48, 64
+ FILTER_VER_CHROMA_SP 64, 16
+
%macro PROCESS_CHROMA_SP_W2_4R 1
movd m0, [r0]
@@ -3879,6 +4162,8 @@
FILTER_VER_CHROMA_SP_W2_4R 2, 4
FILTER_VER_CHROMA_SP_W2_4R 2, 8
+FILTER_VER_CHROMA_SP_W2_4R 2, 16
+
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
@@ -3931,10 +4216,11 @@
RET
;-------------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vertical_sp_6x8(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_4tap_vert_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SP_W6_H4 2
INIT_XMM sse4
-cglobal interp_4tap_vert_sp_6x8, 5, 7, 7
+cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7
add r1d, r1d
sub r0, r1
@@ -3949,7 +4235,7 @@
mova m6, [tab_c_526336]
- mov r4d, 8/4
+ mov r4d, %2/4
.loopH:
PROCESS_CHROMA_SP_W4_4R
@@ -4003,6 +4289,11 @@
jnz .loopH
RET
+%endmacro
+
+FILTER_VER_CHROMA_SP_W6_H4 6, 8
+
+FILTER_VER_CHROMA_SP_W6_H4 6, 16
%macro PROCESS_CHROMA_SP_W8_2R 0
movu m1, [r0]
@@ -4093,6 +4384,10 @@
FILTER_VER_CHROMA_SP_W8_H2 8, 16
FILTER_VER_CHROMA_SP_W8_H2 8, 32
+FILTER_VER_CHROMA_SP_W8_H2 8, 12
+FILTER_VER_CHROMA_SP_W8_H2 8, 64
+
+
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-----------------------------------------------------------------------------------------------------------------------------
@@ -4145,6 +4440,8 @@
FILTER_HORIZ_CHROMA_2xN 2, 4
FILTER_HORIZ_CHROMA_2xN 2, 8
+FILTER_HORIZ_CHROMA_2xN 2, 16
+
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-----------------------------------------------------------------------------------------------------------------------------
@@ -4198,6 +4495,8 @@
FILTER_HORIZ_CHROMA_4xN 4, 8
FILTER_HORIZ_CHROMA_4xN 4, 16
+FILTER_HORIZ_CHROMA_4xN 4, 32
+
%macro PROCESS_CHROMA_W6 3
movu %1, [srcq]
pshufb %2, %1, Tm0
@@ -4277,6 +4576,9 @@
FILTER_HORIZ_CHROMA 6, 8
FILTER_HORIZ_CHROMA 12, 16
+FILTER_HORIZ_CHROMA 6, 16
+FILTER_HORIZ_CHROMA 12, 32
+
%macro PROCESS_CHROMA_W8 3
movu %1, [srcq]
pshufb %2, %1, Tm0
@@ -4341,6 +4643,9 @@
FILTER_HORIZ_CHROMA_8xN 8, 16
FILTER_HORIZ_CHROMA_8xN 8, 32
+FILTER_HORIZ_CHROMA_8xN 8, 12
+FILTER_HORIZ_CHROMA_8xN 8, 64
+
%macro PROCESS_CHROMA_W16 4
movu %1, [srcq]
pshufb %2, %1, Tm0
@@ -4422,6 +4727,38 @@
movu [dstq + 48], %4
%endmacro
+%macro PROCESS_CHROMA_W16o 5 ; %1, %2, %4 = scratch regs, %3 = word constant subtracted from the sums, %5 = column offset
+ movu %1, [srcq + %5] ; 16 source bytes at column %5 (no -1 here; NOTE(review): caller presumably pre-adjusts srcq - confirm)
+ pshufb %2, %1, Tm0 ; arrange bytes with mask Tm0 (mask contents are defined by the caller)
+ pmaddubsw %2, coef2 ; multiply by the coefficient bytes and add adjacent pairs into words
+ pshufb %1, %1, Tm1 ; same source bytes, second arrangement (mask Tm1)
+ pmaddubsw %1, coef2
+ phaddw %2, %1 ; horizontal add of word pairs -> eight word sums in %2
+ movu %1, [srcq + %5 + 8] ; repeat the sequence 8 bytes further right
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1 ; eight word sums in %4
+ psubw %2, %3 ; subtract the constant in %3; results stay 16-bit (no pack)
+ psubw %4, %3
+ movu [dstq + %5 * 2], %2 ; eight words = 16 bytes at byte offset %5 * 2
+ movu [dstq + %5 * 2 + 16], %4 ; the following eight words
+%endmacro
+
+%macro PROCESS_CHROMA_W48 4 ; one 48-column row: three PROCESS_CHROMA_W16o groups
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 ; columns 0..15
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 ; columns 16..31
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 ; columns 32..47
+%endmacro
+
+%macro PROCESS_CHROMA_W64 4 ; one 64-column row: four PROCESS_CHROMA_W16o groups
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 ; columns 0..15
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 ; columns 16..31
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 ; columns 32..47
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 48 ; columns 48..63
+%endmacro
+
;------------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;------------------------------------------------------------------------------------------------------------------------------
@@ -4480,6 +4817,119 @@
FILTER_HORIZ_CHROMA_WxN 32, 24
FILTER_HORIZ_CHROMA_WxN 32, 32
+FILTER_HORIZ_CHROMA_WxN 16, 24
+FILTER_HORIZ_CHROMA_WxN 16, 64
+FILTER_HORIZ_CHROMA_WxN 24, 64
+FILTER_HORIZ_CHROMA_WxN 32, 48
+FILTER_HORIZ_CHROMA_WxN 32, 64
+
+FILTER_HORIZ_CHROMA_WxN 64, 64
+FILTER_HORIZ_CHROMA_WxN 64, 32
+FILTER_HORIZ_CHROMA_WxN 64, 48
+FILTER_HORIZ_CHROMA_WxN 48, 64
+FILTER_HORIZ_CHROMA_WxN 64, 16
+
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W16n 2 ; %1 = width (walked in 16-column strips), %2 = height (two rows per outer pass)
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 ; x86inc: 4 args loaded, 7 GPRs, 8 XMM regs
+
+ mov r4d, r4m ; fifth argument (coeffIdx in the prototype above)
+ sub r0, r1 ; src -= srcStride: begin one row above the first output row
+ add r3d, r3d ; double dstStride (dst is int16_t in the prototype, so presumably elements -> bytes)
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4] ; one dword at tab_ChromaCoeff + coeffIdx * 4
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4] ; one dword at tab_ChromaCoeff + coeffIdx * 4
+%endif
+
+ pshufb m1, m0, [tab_Vm] ; m1 = multiplier for the first interleaved row pair (tab_Vm layout not shown here)
+ pshufb m0, [tab_Vm + 16] ; m0 = multiplier for the second interleaved row pair
+ mov r4d, %2/2 ; r4 is reused as the outer counter: one pass per two output rows
+
+.loop:
+
+ mov r6d, %1/16 ; inner counter: number of 16-column strips in a row
+
+.loopW:
+
+ movu m2, [r0] ; input row 0
+ movu m3, [r0 + r1] ; input row 1
+
+ punpcklbw m4, m2, m3 ; interleave rows 0/1, low 8 columns
+ punpckhbw m2, m3 ; interleave rows 0/1, high 8 columns
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ lea r5, [r0 + 2 * r1] ; r5 -> input row 2
+ movu m5, [r5] ; input row 2
+ movu m7, [r5 + r1] ; input row 3
+
+ punpcklbw m6, m5, m7 ; interleave rows 2/3, low 8 columns
+ pmaddubsw m6, m0
+ paddw m4, m6 ; low half: rows 0/1 term + rows 2/3 term
+
+ punpckhbw m6, m5, m7 ; interleave rows 2/3, high 8 columns
+ pmaddubsw m6, m0
+ paddw m2, m6 ; high half: rows 0/1 term + rows 2/3 term
+
+ mova m6, [pw_2000] ; loaded inside the loop because m6 served as scratch just above
+
+ psubw m4, m6 ; subtract the pw_2000 constant; output stays 16-bit
+ psubw m2, m6
+
+ movu [r2], m4 ; first output row: words 0..7
+ movu [r2 + 16], m2 ; first output row: words 8..15
+
+ punpcklbw m4, m3, m5 ; second output row: interleave rows 1/2, low 8 columns
+ punpckhbw m3, m5 ; interleave rows 1/2, high 8 columns
+
+ pmaddubsw m4, m1
+ pmaddubsw m3, m1
+
+ movu m5, [r5 + 2 * r1] ; input row 4
+
+ punpcklbw m2, m7, m5 ; interleave rows 3/4, low 8 columns
+ punpckhbw m7, m5 ; interleave rows 3/4, high 8 columns
+
+ pmaddubsw m2, m0
+ pmaddubsw m7, m0
+
+ paddw m4, m2
+ paddw m3, m7
+
+ psubw m4, m6
+ psubw m3, m6
+
+ movu [r2 + r3], m4 ; second output row: words 0..7
+ movu [r2 + r3 + 16], m3 ; second output row: words 8..15
+
+ add r0, 16 ; next strip: 16 source bytes ...
+ add r2, 32 ; ... produce 16 words = 32 destination bytes
+ dec r6d
+ jnz .loopW
+
+ lea r0, [r0 + r1 * 2 - %1] ; undo the %1 bytes walked by .loopW and move down two rows
+ lea r2, [r2 + r3 * 2 - %1 * 2] ; dst walked %1 * 2 bytes (32 per strip)
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+FILTER_V_PS_W16n 64, 64
+FILTER_V_PS_W16n 64, 32
+FILTER_V_PS_W16n 64, 48
+FILTER_V_PS_W16n 48, 64
+FILTER_V_PS_W16n 64, 16
+
+
;------------------------------------------------------------------------------------------------------------
;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;------------------------------------------------------------------------------------------------------------
@@ -4556,8 +5006,9 @@
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W2 2
INIT_XMM sse4
-cglobal interp_4tap_vert_ps_2x8, 4, 6, 8
+cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -4574,7 +5025,7 @@
mova m1, [pw_2000]
lea r5, [3 * r1]
- mov r4d, 2
+ mov r4d, %2/4
.loop:
movd m2, [r0]
movd m3, [r0 + r1]
@@ -4635,13 +5086,18 @@
jnz .loop
RET
+%endmacro
+
+FILTER_V_PS_W2 2, 8
+
+FILTER_V_PS_W2 2, 16
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SS 2
INIT_XMM sse2
-cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-1
+cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize
add r1d, r1d
add r3d, r3d
@@ -4655,7 +5111,7 @@
lea r6, [tab_ChromaCoeffV + r4]
%endif
- mov byte [rsp], %2/4
+ mov dword [rsp], %2/4
.loopH:
mov r4d, (%1/4)
@@ -4686,7 +5142,7 @@
lea r0, [r0 + 4 * r1 - 2 * %1]
lea r2, [r2 + 4 * r3 - 2 * %1]
- dec byte [rsp]
+ dec dword [rsp]
jnz .loopH
RET
@@ -4707,6 +5163,21 @@
FILTER_VER_CHROMA_SS 24, 32
FILTER_VER_CHROMA_SS 32, 8
+ FILTER_VER_CHROMA_SS 16, 24
+ FILTER_VER_CHROMA_SS 12, 32
+ FILTER_VER_CHROMA_SS 4, 32
+ FILTER_VER_CHROMA_SS 32, 64
+ FILTER_VER_CHROMA_SS 16, 64
+ FILTER_VER_CHROMA_SS 32, 48
+ FILTER_VER_CHROMA_SS 24, 64
+
+ FILTER_VER_CHROMA_SS 64, 64
+ FILTER_VER_CHROMA_SS 64, 32
+ FILTER_VER_CHROMA_SS 64, 48
+ FILTER_VER_CHROMA_SS 48, 64
+ FILTER_VER_CHROMA_SS 64, 16
+
+
;---------------------------------------------------------------------------------------------------------------------
; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;---------------------------------------------------------------------------------------------------------------------
@@ -4753,6 +5224,8 @@
FILTER_VER_CHROMA_SS_W2_4R 2, 4
FILTER_VER_CHROMA_SS_W2_4R 2, 8
+FILTER_VER_CHROMA_SS_W2_4R 2, 16
+
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;---------------------------------------------------------------------------------------------------------------
@@ -4803,8 +5276,9 @@
;-------------------------------------------------------------------------------------------------------------------
; void interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SS_W6_H4 2
INIT_XMM sse4
-cglobal interp_4tap_vert_ss_6x8, 5, 7, 6
+cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6
add r1d, r1d
add r3d, r3d
@@ -4818,7 +5292,7 @@
lea r6, [tab_ChromaCoeffV + r4]
%endif
- mov r4d, 8/4
+ mov r4d, %2/4
.loopH:
PROCESS_CHROMA_SP_W4_4R
@@ -4861,6 +5335,12 @@
jnz .loopH
RET
+%endmacro
+
+FILTER_VER_CHROMA_SS_W6_H4 6, 8
+
+FILTER_VER_CHROMA_SS_W6_H4 6, 16
+
;----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -4911,12 +5391,15 @@
FILTER_VER_CHROMA_SS_W8_H2 8, 16
FILTER_VER_CHROMA_SS_W8_H2 8, 32
+FILTER_VER_CHROMA_SS_W8_H2 8, 12
+FILTER_VER_CHROMA_SS_W8_H2 8, 64
+
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_SS 2
INIT_XMM sse2
-cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize
add r1d, r1d
add r3d, r3d
@@ -4931,7 +5414,7 @@
lea r6, [tab_LumaCoeffV + r4]
%endif
- mov byte [rsp], %2/4
+ mov dword [rsp], %2/4
.loopH:
mov r4d, (%1/4)
.loopW:
@@ -5023,7 +5506,7 @@
lea r0, [r0 + 4 * r1 - 2 * %1]
lea r2, [r2 + 4 * r3 - 2 * %1]
- dec byte [rsp]
+ dec dword [rsp]
jnz .loopH
RET
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/ipfilter8.h Tue Aug 05 21:41:53 2014 +0900
@@ -153,6 +153,60 @@
SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \
SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu);
+#define CHROMA_VERT_FILTERS_422(cpu) /* expands SETUP_CHROMA_VERT_FUNC_DEF once per WxH below; sizes presumably the 4:2:2 chroma set - confirm */ \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 12, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 24, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(12, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 48, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(24, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 64, cpu); /* NOTE(review): body ends in ';' and the use site adds another */
+
+#define CHROMA_VERT_FILTERS_SSE4_422(cpu) /* extra sizes not listed in CHROMA_VERT_FILTERS_422; used below with the _sse4 suffix */ \
+ SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(2, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(6, 16, cpu); /* NOTE(review): body ends in ';' and the use site adds another */
+
+#define CHROMA_VERT_FILTERS_444(cpu) /* expands SETUP_CHROMA_VERT_FUNC_DEF once per WxH below; sizes presumably the 4:4:4 chroma set - confirm */ \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(64, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(64, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(64, 48, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(48, 64, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(64, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 64, cpu)
+
#define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt);
@@ -183,6 +237,58 @@
SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu)
+#define CHROMA_HORIZ_FILTERS_422(cpu) /* declares x265_interp_4tap_horiz_pp/ps prototypes for each WxH below; sizes presumably the 4:2:2 chroma set - confirm */ \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 12, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(6, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(2, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 24, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(12, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 48, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(24, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 64, cpu)
+
+#define CHROMA_HORIZ_FILTERS_444(cpu) /* declares x265_interp_4tap_horiz_pp/ps prototypes for each WxH below; sizes presumably the 4:4:4 chroma set - confirm */ \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(64, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(64, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(64, 48, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(48, 64, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(64, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 64, cpu)
+
void x265_chroma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
void x265_luma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
@@ -190,12 +296,26 @@
CHROMA_HORIZ_FILTERS(_sse4);
CHROMA_VERT_FILTERS_SSE4(_sse4);
+CHROMA_VERT_FILTERS_422(_sse2);
+CHROMA_HORIZ_FILTERS_422(_sse4);
+CHROMA_VERT_FILTERS_SSE4_422(_sse4);
+
+CHROMA_VERT_FILTERS_444(_sse2);
+CHROMA_HORIZ_FILTERS_444(_sse4);
+
#undef CHROMA_VERT_FILTERS_SSE4
#undef CHROMA_VERT_FILTERS
#undef SETUP_CHROMA_VERT_FUNC_DEF
#undef CHROMA_HORIZ_FILTERS
#undef SETUP_CHROMA_HORIZ_FUNC_DEF
+#undef CHROMA_VERT_FILTERS_422
+#undef CHROMA_VERT_FILTERS_SSE4_422
+#undef CHROMA_HORIZ_FILTERS_422
+
+#undef CHROMA_VERT_FILTERS_444
+#undef CHROMA_HORIZ_FILTERS_444
+
#else // if HIGH_BIT_DEPTH
#define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
@@ -230,6 +350,58 @@
SETUP_CHROMA_FUNC_DEF(32, 8, cpu); \
SETUP_CHROMA_FUNC_DEF(8, 32, cpu)
+#define CHROMA_FILTERS_422(cpu) /* one SETUP_CHROMA_FUNC_DEF per WxH size, sorted by width then height */ \
+ SETUP_CHROMA_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(2, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(4, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(6, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 12, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF(12, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 24, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF(24, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 48, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 64, cpu);
+
+#define CHROMA_FILTERS_444(cpu) /* one SETUP_CHROMA_FUNC_DEF per WxH size, sorted by width then height */ \
+ SETUP_CHROMA_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF(48, 64, cpu); \
+ SETUP_CHROMA_FUNC_DEF(64, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(64, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(64, 48, cpu); \
+ SETUP_CHROMA_FUNC_DEF(64, 64, cpu);
+
#define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
@@ -261,6 +433,60 @@
SETUP_CHROMA_SP_FUNC_DEF(24, 32, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(32, 8, cpu);
+#define CHROMA_SP_FILTERS_422(cpu) /* x265_interp_4tap_vert_sp_8xH prototypes, tallest block first */ \
+ SETUP_CHROMA_SP_FUNC_DEF(8, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(8, 12, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(8, 4, cpu);
+
+#define CHROMA_SP_FILTERS_422_SSE4(cpu) /* x265_interp_4tap_vert_sp_WxH prototypes for the non-8-wide sizes, sorted by width then height */ \
+ SETUP_CHROMA_SP_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(2, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(4, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(6, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(12, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(16, 24, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(24, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(32, 48, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(32, 64, cpu);
+
+#define CHROMA_SP_FILTERS_444(cpu) /* x265_interp_4tap_vert_sp_WxH prototypes, sorted by width then height */ \
+ SETUP_CHROMA_SP_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(48, 64, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(64, 16, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(64, 32, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(64, 48, cpu); \
+ SETUP_CHROMA_SP_FUNC_DEF(64, 64, cpu);
+
#define SETUP_CHROMA_SS_FUNC_DEF(W, H, cpu) \
void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
@@ -285,19 +511,83 @@
SETUP_CHROMA_SS_FUNC_DEF(32, 24, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(24, 32, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(32, 8, cpu); \
- SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu)
+ SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu);
#define CHROMA_SS_FILTERS_SSE4(cpu) \
SETUP_CHROMA_SS_FUNC_DEF(2, 4, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(2, 8, cpu); \
SETUP_CHROMA_SS_FUNC_DEF(6, 8, cpu);
+#define CHROMA_SS_FILTERS_422(cpu) /* x265_interp_4tap_vert_ss_WxH prototypes, sorted by width then height */ \
+ SETUP_CHROMA_SS_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(4, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(8, 12, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(8, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(12, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(16, 24, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(24, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(32, 48, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(32, 64, cpu);
+
+#define CHROMA_SS_FILTERS_422_SSE4(cpu) /* x265_interp_4tap_vert_ss prototypes for the 2- and 6-wide sizes */ \
+ SETUP_CHROMA_SS_FUNC_DEF(6, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(2, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(2, 8, cpu);
+
+#define CHROMA_SS_FILTERS_444(cpu) /* x265_interp_4tap_vert_ss_WxH prototypes, sorted by width then height */ \
+ SETUP_CHROMA_SS_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(32, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(48, 64, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(64, 16, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(64, 32, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(64, 48, cpu); \
+ SETUP_CHROMA_SS_FUNC_DEF(64, 64, cpu);
+
CHROMA_FILTERS(_sse4);
CHROMA_SP_FILTERS(_sse2);
CHROMA_SP_FILTERS_SSE4(_sse4);
CHROMA_SS_FILTERS(_sse2);
CHROMA_SS_FILTERS_SSE4(_sse4);
+CHROMA_FILTERS_422(_sse4);
+CHROMA_SP_FILTERS_422(_sse2);
+CHROMA_SP_FILTERS_422_SSE4(_sse4);
+CHROMA_SS_FILTERS_422(_sse2);
+CHROMA_SS_FILTERS_422_SSE4(_sse4);
+
+CHROMA_FILTERS_444(_sse4);
+CHROMA_SP_FILTERS_444(_sse4);
+CHROMA_SS_FILTERS_444(_sse2);
+
void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
#undef SETUP_CHROMA_FUNC_DEF
@@ -308,6 +598,17 @@
#undef CHROMA_SS_FILTERS
#undef CHROMA_SS_FILTERS_SSE4
#undef CHROMA_SP_FILTERS_SSE4
+
+#undef CHROMA_FILTERS_422
+#undef CHROMA_SP_FILTERS_422
+#undef CHROMA_SS_FILTERS_422
+#undef CHROMA_SS_FILTERS_422_SSE4
+#undef CHROMA_SP_FILTERS_422_SSE4
+
+#undef CHROMA_FILTERS_444
+#undef CHROMA_SP_FILTERS_444
+#undef CHROMA_SS_FILTERS_444
+
#endif // if HIGH_BIT_DEPTH
LUMA_FILTERS(_sse4);
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/mc-a.asm Tue Aug 05 21:41:53 2014 +0900
@@ -154,6 +154,50 @@
%endrep
RET
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride ; 6 args in r0-r5, r6 = loop counter, m0-m7 used
+ mova m6, [pw_1023] ; upper clamp bound (used by pminsw below)
+ mova m7, [pw_1024] ; multiplier for the rounding pmulhrsw
+ mov r6d, 16/4 ; 16 rows, four rows handled per loop iteration
+ add r3, r3 ; double all three strides (presumably element -> byte strides for 16-bit samples)
+ add r4, r4
+ add r5, r5
+.loop:
+ movd m1, [r0] ; 4 bytes from src0 row 0
+ movd m2, [r0 + r3] ; 4 bytes from src0 row 1
+ movd m3, [r1] ; 4 bytes from src1 row 0
+ movd m4, [r1 + r4] ; 4 bytes from src1 row 1
+ lea r0, [r0 + r3 * 2] ; advance both sources by two rows
+ lea r1, [r1 + r4 * 2]
+ punpckldq m1, m2 ; m1 = src0 rows 0,1
+ punpckldq m3, m4 ; m3 = src1 rows 0,1
+ movd m2, [r0] ; src0 row 2
+ movd m4, [r0 + r3] ; src0 row 3
+ movd m5, [r1] ; src1 row 2
+ movd m0, [r1 + r4] ; src1 row 3 (m0 is re-zeroed before the clamp)
+ lea r0, [r0 + r3 * 2]
+ lea r1, [r1 + r4 * 2]
+ punpckldq m2, m4 ; m2 = src0 rows 2,3
+ punpckldq m5, m0 ; m5 = src1 rows 2,3
+ punpcklqdq m1, m2 ; m1 = src0 rows 0..3, one dword per row
+ punpcklqdq m3, m5 ; m3 = src1 rows 0..3
+ paddw m1, m3 ; word-wise src0 + src1
+ pmulhrsw m1, m7 ; rounded scale by pw_1024
+ paddw m1, [pw_512] ; add the pw_512 offset
+ pxor m0, m0 ; zero = lower clamp bound
+ pmaxsw m1, m0 ; clamp below at 0
+ pminsw m1, m6 ; clamp above at pw_1023
+ movd [r2], m1 ; store dword 0 -> dst row 0
+ pextrd [r2 + r5], m1, 1 ; dword 1 -> dst row 1
+ lea r2, [r2 + r5 * 2]
+ pextrd [r2], m1, 2 ; dword 2 -> dst row 2
+ pextrd [r2 + r5], m1, 3 ; dword 3 -> dst row 3
+ lea r2, [r2 + r5 * 2]
+ dec r6d
+ jnz .loop ; repeat until all 16 rows are written
+ RET
+;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
INIT_XMM sse4
@@ -181,7 +225,7 @@
RET
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal addAvg_6x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
mova m4, [pw_512]
mova m5, [pw_1023]
mova m7, [pw_1024]
@@ -220,6 +264,42 @@
RET
;-----------------------------------------------------------------------------
INIT_XMM sse4
+cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride ; 6 args in r0-r5, r6 = loop counter
+ mova m4, [pw_512] ; offset added after scaling
+ mova m5, [pw_1023] ; upper clamp bound
+ mova m7, [pw_1024] ; multiplier for the rounding pmulhrsw
+ pxor m6, m6 ; zero = lower clamp bound
+ mov r6d, 16/2 ; 16 rows, two rows handled per loop iteration
+ add r3, r3 ; double all three strides (presumably element -> byte strides for 16-bit samples)
+ add r4, r4
+ add r5, r5
+.loop:
+ movu m0, [r0] ; full 16-byte load of src0 row 0 (only 6 words are stored later)
+ movu m2, [r1] ; src1 row 0
+ movu m1, [r0 + r3] ; src0 row 1
+ movu m3, [r1 + r4] ; src1 row 1
+ dec r6d ; sets ZF for the jnz at the bottom; nothing in between writes EFLAGS
+ lea r0, [r0 + r3 * 2] ; advance both sources by two rows
+ lea r1, [r1 + r4 * 2]
+ paddw m0, m2 ; word-wise src0 + src1
+ paddw m1, m3
+ pmulhrsw m0, m7 ; rounded scale by pw_1024
+ pmulhrsw m1, m7
+ paddw m0, m4 ; add the pw_512 offset
+ paddw m1, m4
+ pmaxsw m0, m6 ; clamp to [0, pw_1023]
+ pmaxsw m1, m6
+ pminsw m0, m5
+ pminsw m1, m5
+ movh [r2], m0 ; row 0: low 8 bytes (words 0-3)
+ pextrd [r2 + 8], m0, 2 ; row 0: dword 2 (words 4-5) -> 6 words total
+ movh [r2 + r5], m1 ; row 1: words 0-3
+ pextrd [r2 + r5 + 8], m1, 2 ; row 1: words 4-5
+ lea r2, [r2 + r5 * 2] ; advance dst by two rows
+ jnz .loop ; consumes the flags produced by dec r6d above
+ RET
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
mova m4, [pw_512]
mova m5, [pw_1023]
@@ -335,6 +415,8 @@
ADDAVG_W4_H4 8
ADDAVG_W4_H4 16
+ADDAVG_W4_H4 32
+
;-----------------------------------------------------------------------------
%macro ADDAVG_W8_H4 1
INIT_XMM sse4
@@ -382,6 +464,9 @@
ADDAVG_W8_H4 16
ADDAVG_W8_H4 32
+ADDAVG_W8_H4 12
+ADDAVG_W8_H4 64
+
;-----------------------------------------------------------------------------
%macro ADDAVG_W12_H4 1
INIT_XMM sse4
@@ -442,6 +527,8 @@
ADDAVG_W12_H4 16
+ADDAVG_W12_H4 32
+
;-----------------------------------------------------------------------------
%macro ADDAVG_W16_H4 1
INIT_XMM sse4
@@ -509,6 +596,8 @@
ADDAVG_W16_H4 32
ADDAVG_W16_H4 64
+ADDAVG_W16_H4 24
+
;-----------------------------------------------------------------------------
%macro ADDAVG_W24_H2 2
INIT_XMM sse4
@@ -589,6 +678,8 @@
ADDAVG_W24_H2 24, 32
+ADDAVG_W24_H2 24, 64
+
;-----------------------------------------------------------------------------
%macro ADDAVG_W32_H2 1
INIT_XMM sse4
@@ -691,6 +782,8 @@
ADDAVG_W32_H2 32
ADDAVG_W32_H2 64
+ADDAVG_W32_H2 48
+
;-----------------------------------------------------------------------------
%macro ADDAVG_W48_H2 1
INIT_XMM sse4
@@ -1052,6 +1145,48 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
+cglobal addAvg_2x16, 6,7,8, src0, src1, dst, src0Stride, src1Stride, dstStride ; 6 args in r0-r5, r6 = loop counter, m0-m7 used
+ mova m0, [pw_256] ; multiplier for the rounding pmulhrsw
+ mova m7, [pw_128] ; offset added after scaling
+ mov r6d, 16/4 ; 16 rows, four rows handled per loop iteration
+ add r3, r3 ; double the two source strides (presumably element -> byte strides for int16 input)
+ add r4, r4 ; r5 (dst stride) is left unscaled
+.loop:
+ movd m1, [r0] ; 4 bytes from src0 row 0
+ movd m2, [r0 + r3] ; src0 row 1
+ movd m3, [r1] ; src1 row 0
+ movd m4, [r1 + r4] ; src1 row 1
+ lea r0, [r0 + r3 * 2] ; advance both sources by two rows
+ lea r1, [r1 + r4 * 2]
+ punpckldq m1, m2 ; m1 = src0 rows 0,1
+ punpckldq m3, m4 ; m3 = src1 rows 0,1
+ movd m2, [r0] ; src0 row 2
+ movd m4, [r0 + r3] ; src0 row 3
+ movd m5, [r1] ; src1 row 2
+ movd m6, [r1 + r4] ; src1 row 3
+ lea r0, [r0 + r3 * 2]
+ lea r1, [r1 + r4 * 2]
+ punpckldq m2, m4 ; m2 = src0 rows 2,3
+ punpckldq m5, m6 ; m5 = src1 rows 2,3
+ punpcklqdq m1, m2 ; m1 = src0 rows 0..3, one dword per row
+ punpcklqdq m3, m5 ; m3 = src1 rows 0..3
+ paddw m1, m3 ; word-wise src0 + src1
+ pmulhrsw m1, m0 ; rounded scale by pw_256
+ paddw m1, m7 ; add the pw_128 offset
+ packuswb m1, m1 ; saturate words to unsigned bytes: one word (2 bytes) per row
+ pextrw [r2], m1, 0 ; 2 bytes -> dst row 0
+ pextrw [r2 + r5], m1, 1 ; dst row 1
+ lea r2, [r2 + r5 * 2]
+ pextrw [r2], m1, 2 ; dst row 2
+ pextrw [r2 + r5], m1, 3 ; dst row 3
+ lea r2, [r2 + r5 * 2]
+ dec r6d
+ jnz .loop ; repeat until all 16 rows are written
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
cglobal addAvg_4x2, 6,6,4, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
mova m1, [pw_256]
@@ -1132,6 +1267,9 @@
ADDAVG_W4_H4 4
ADDAVG_W4_H4 8
ADDAVG_W4_H4 16
+
+ADDAVG_W4_H4 32
+
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
@@ -1232,6 +1370,39 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
+cglobal addAvg_6x16, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride ; 6 names for 6 args (r0-r5); r6 = loop counter
+ mova m4, [pw_256] ; multiplier for the rounding pmulhrsw
+ mova m5, [pw_128] ; offset added after scaling
+ mov r6d, 16/2 ; 16 rows, two rows handled per loop iteration
+ add r3, r3 ; double the two source strides (presumably element -> byte strides for int16 input)
+ add r4, r4 ; r5 (dst stride) is left unscaled
+.loop:
+ movu m0, [r0] ; full 16-byte load of src0 row 0 (only 6 results are stored later)
+ movu m2, [r1] ; src1 row 0
+ movu m1, [r0 + r3] ; src0 row 1
+ movu m3, [r1 + r4] ; src1 row 1
+ dec r6d ; sets ZF for the jnz at the bottom; nothing in between writes EFLAGS
+ lea r0, [r0 + r3 * 2] ; advance both sources by two rows
+ lea r1, [r1 + r4 * 2]
+ paddw m0, m2 ; word-wise src0 + src1
+ paddw m1, m3
+ pmulhrsw m0, m4 ; rounded scale by pw_256
+ pmulhrsw m1, m4
+ paddw m0, m5 ; add the pw_128 offset
+ paddw m1, m5
+ packuswb m0, m0 ; saturate words to unsigned bytes
+ packuswb m1, m1
+ movd [r2], m0 ; row 0: bytes 0-3
+ pextrw [r2 + 4], m0, 2 ; row 0: bytes 4-5 -> 6 bytes total
+ movd [r2 + r5], m1 ; row 1: bytes 0-3
+ pextrw [r2 + r5 + 4], m1, 2 ; row 1: bytes 4-5
+ lea r2, [r2 + r5 * 2] ; advance dst by two rows
+ jnz .loop ; consumes the flags produced by dec r6d above
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
cglobal addAvg_8x2, 6,6,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
mova m4, [pw_256]
mova m5, [pw_128]
@@ -1392,6 +1563,9 @@
ADDAVG_W8_H4 16
ADDAVG_W8_H4 32
+ADDAVG_W8_H4 12
+ADDAVG_W8_H4 64
+
;-----------------------------------------------------------------------------
@@ -1485,6 +1659,8 @@
ADDAVG_W12_H4 16
+ADDAVG_W12_H4 32
+
;-----------------------------------------------------------------------------
@@ -1580,6 +1756,8 @@
ADDAVG_W16_H4 32
ADDAVG_W16_H4 64
+ADDAVG_W16_H4 24
+
;-----------------------------------------------------------------------------
@@ -1654,6 +1832,8 @@
ADDAVG_W24_H2 24, 32
+ADDAVG_W24_H2 24, 64
+
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
@@ -1743,6 +1923,8 @@
ADDAVG_W32_H2 32
ADDAVG_W32_H2 64
+ADDAVG_W32_H2 48
+
;-----------------------------------------------------------------------------
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/pixel-util.h Tue Aug 05 21:41:53 2014 +0900
@@ -94,6 +94,32 @@
SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 8, cpu); \
SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 32, cpu);
+#define CHROMA_PIXELSUB_DEF_422(cpu) /* one SETUP_CHROMA_PIXELSUB_PS_FUNC per WxH size, sorted by width then height */ \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(6, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 4, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 12, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 64, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(12, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 8, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 24, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 64, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(24, 64, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 48, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu);
+
#define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \
void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \
void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1);
@@ -130,6 +156,9 @@
CHROMA_PIXELSUB_DEF(_sse2);
LUMA_PIXELSUB_DEF(_sse2);
+CHROMA_PIXELSUB_DEF_422(_sse4);
+CHROMA_PIXELSUB_DEF_422(_sse2);
+
#define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \
uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel * pix, intptr_t pixstride);
@@ -142,6 +171,7 @@
LUMA_PIXELVAR_DEF(_sse2);
#undef CHROMA_PIXELSUB_DEF
+#undef CHROMA_PIXELSUB_DEF_422
#undef LUMA_PIXELSUB_DEF
#undef LUMA_PIXELVAR_DEF
#undef SETUP_CHROMA_PIXELSUB_PS_FUNC
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/pixel-util8.asm Tue Aug 05 21:41:53 2014 +0900
@@ -2878,6 +2878,61 @@
RET
;-----------------------------------------------------------------------------
+; void pixel_sub_ps_2x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
+%macro PIXEL_SUB_PS_W2_H2 2 ; emits pixel_sub_ps_2x%2; %1 is not referenced in the body, %2 is the row count
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1, srcStride0, srcStride1 ; 6 args in r0-r5, r6 = loop counter
+ add r1, r1 ; double dest and both source strides (presumably element -> byte strides, 16-bit samples)
+ add r4, r4
+ add r5, r5
+ mov r6d, %2/2 ; two rows handled per loop iteration
+.loop:
+ movd m0, [r2] ; 4 bytes from the r2 source, row 0
+ movd m1, [r3] ; 4 bytes from the r3 source, row 0
+ movd m2, [r2 + r4] ; row 1 of each source
+ movd m3, [r3 + r5]
+ dec r6d ; sets ZF for the jnz at the bottom; nothing in between writes EFLAGS
+ lea r2, [r2 + r4 * 2] ; advance both sources by two rows
+ lea r3, [r3 + r5 * 2]
+ psubw m0, m1 ; word-wise difference, row 0
+ psubw m2, m3 ; word-wise difference, row 1
+ movd [r0], m0 ; store two 16-bit differences per row
+ movd [r0 + r1], m2
+ lea r0, [r0 + 2 * r1] ; advance dest by two rows
+ jnz .loop ; consumes the flags produced by dec r6d above
+ RET
+%else
+INIT_XMM sse4
+cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1, srcStride0, srcStride1 ; 6 args in r0-r5, r6 = loop counter
+ add r1, r1 ; double the dest stride only (dest is written as 16-bit words below)
+ mov r6d, %2/2 ; two rows handled per loop iteration
+.loop:
+ pinsrw m0, [r2], 0 ; 2 bytes from the r2 source, row 0, into word 0 (other words left as-is and never stored)
+ pinsrw m1, [r3], 0 ; 2 bytes from the r3 source, row 0
+ pinsrw m2, [r2 + r4], 0 ; row 1 of each source
+ pinsrw m3, [r3 + r5], 0
+ dec r6d ; sets ZF for the jnz at the bottom; nothing in between writes EFLAGS
+ lea r2, [r2 + r4 * 2] ; advance both sources by two rows
+ lea r3, [r3 + r5 * 2]
+ pmovzxbw m0, m0 ; zero-extend bytes to words
+ pmovzxbw m1, m1
+ pmovzxbw m2, m2
+ pmovzxbw m3, m3
+ psubw m0, m1 ; word-wise difference, row 0
+ psubw m2, m3 ; word-wise difference, row 1
+ movd [r0], m0 ; store the low two 16-bit differences per row
+ movd [r0 + r1], m2
+ lea r0, [r0 + r1 * 2] ; advance dest by two rows
+ jnz .loop ; consumes the flags produced by dec r6d above
+ RET
+%endif
+%endmacro
+
+PIXEL_SUB_PS_W2_H2 2, 16
+
+;-----------------------------------------------------------------------------
; void pixel_sub_sp_c_4x2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
@@ -2991,11 +3046,17 @@
PIXELSUB_PS_W4_H4 4, 4
PIXELSUB_PS_W4_H4 4, 8
PIXELSUB_PS_W4_H4 4, 16
+;
+PIXELSUB_PS_W4_H4 4, 12
+PIXELSUB_PS_W4_H4 4, 32
%else
INIT_XMM sse4
PIXELSUB_PS_W4_H4 4, 4
PIXELSUB_PS_W4_H4 4, 8
PIXELSUB_PS_W4_H4 4, 16
+;
+PIXELSUB_PS_W4_H4 4, 12
+PIXELSUB_PS_W4_H4 4, 32
%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
@@ -3087,9 +3148,13 @@
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W6_H4 6, 8
+;
+PIXELSUB_PS_W6_H4 6, 16
%else
INIT_XMM sse4
PIXELSUB_PS_W6_H4 6, 8
+;
+PIXELSUB_PS_W6_H4 6, 16
%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_c_8x2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
@@ -3285,12 +3350,18 @@
PIXELSUB_PS_W8_H4 8, 8
PIXELSUB_PS_W8_H4 8, 16
PIXELSUB_PS_W8_H4 8, 32
+;
+PIXELSUB_PS_W8_H4 8, 12
+PIXELSUB_PS_W8_H4 8, 64
%else
INIT_XMM sse4
PIXELSUB_PS_W8_H4 8, 4
PIXELSUB_PS_W8_H4 8, 8
PIXELSUB_PS_W8_H4 8, 16
PIXELSUB_PS_W8_H4 8, 32
+;
+PIXELSUB_PS_W8_H4 8, 12
+PIXELSUB_PS_W8_H4 8, 64
%endif
;-----------------------------------------------------------------------------
@@ -3404,9 +3475,13 @@
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W12_H4 12, 16
+;
+PIXELSUB_PS_W12_H4 12, 32
%else
INIT_XMM sse4
PIXELSUB_PS_W12_H4 12, 16
+;
+PIXELSUB_PS_W12_H4 12, 32
%endif
;-----------------------------------------------------------------------------
@@ -3529,6 +3604,8 @@
PIXELSUB_PS_W16_H4 16, 16
PIXELSUB_PS_W16_H4 16, 32
PIXELSUB_PS_W16_H4 16, 64
+;
+PIXELSUB_PS_W16_H4 16, 24
%else
INIT_XMM sse4
PIXELSUB_PS_W16_H4 16, 4
@@ -3537,6 +3614,8 @@
PIXELSUB_PS_W16_H4 16, 16
PIXELSUB_PS_W16_H4 16, 32
PIXELSUB_PS_W16_H4 16, 64
+;
+PIXELSUB_PS_W16_H4 16, 24
%endif
;-----------------------------------------------------------------------------
@@ -3632,9 +3711,13 @@
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W24_H2 24, 32
+;
+PIXELSUB_PS_W24_H2 24, 64
%else
INIT_XMM sse4
PIXELSUB_PS_W24_H2 24, 32
+;
+PIXELSUB_PS_W24_H2 24, 64
%endif
;-----------------------------------------------------------------------------
@@ -3752,6 +3835,8 @@
PIXELSUB_PS_W32_H2 32, 24
PIXELSUB_PS_W32_H2 32, 32
PIXELSUB_PS_W32_H2 32, 64
+;
+PIXELSUB_PS_W32_H2 32, 48
%else
INIT_XMM sse4
PIXELSUB_PS_W32_H2 32, 8
@@ -3759,6 +3844,8 @@
PIXELSUB_PS_W32_H2 32, 24
PIXELSUB_PS_W32_H2 32, 32
PIXELSUB_PS_W32_H2 32, 64
+;
+PIXELSUB_PS_W32_H2 32, 48
%endif
;-----------------------------------------------------------------------------
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/pixel.h Tue Aug 05 21:41:53 2014 +0900
@@ -206,6 +206,16 @@
ADDAVG(addAvg_64x48)
ADDAVG(addAvg_64x64)
+ADDAVG(addAvg_2x16)
+ADDAVG(addAvg_4x32)
+ADDAVG(addAvg_6x16)
+ADDAVG(addAvg_8x12)
+ADDAVG(addAvg_8x64)
+ADDAVG(addAvg_12x32)
+ADDAVG(addAvg_16x24)
+ADDAVG(addAvg_24x64)
+ADDAVG(addAvg_32x48)
+
void x265_downShift_16_sse2(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void x265_upShift_8_sse4(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift);
diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm Tue Aug 05 01:05:47 2014 -0500
+++ b/source/common/x86/pixeladd8.asm Tue Aug 05 21:41:53 2014 +0900
@@ -212,6 +212,8 @@
PIXEL_ADD_PS_W2_H4 2, 8
+PIXEL_ADD_PS_W2_H4 2, 16
+
;-----------------------------------------------------------------------------
; void pixel_add_ps_4x2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
@@ -359,6 +361,7 @@
PIXEL_ADD_PS_W4_H4 4, 8
PIXEL_ADD_PS_W4_H4 4, 16
+PIXEL_ADD_PS_W4_H4 4, 32
;-----------------------------------------------------------------------------
; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
@@ -469,6 +472,8 @@
PIXEL_ADD_PS_W6_H4 6, 8
+PIXEL_ADD_PS_W6_H4 6, 16
+
;-----------------------------------------------------------------------------
; void pixel_add_ps_8x2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
@@ -736,6 +741,8 @@
PIXEL_ADD_PS_W8_H4 8, 16
PIXEL_ADD_PS_W8_H4 8, 32
+PIXEL_ADD_PS_W8_H4 8, 12
+PIXEL_ADD_PS_W8_H4 8, 64
;-----------------------------------------------------------------------------
; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
@@ -886,6 +893,8 @@
PIXEL_ADD_PS_W12_H4 12, 16
+PIXEL_ADD_PS_W12_H4 12, 32
+
;-----------------------------------------------------------------------------
; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
@@ -1033,6 +1042,8 @@
PIXEL_ADD_PS_W16_H4 16, 32
PIXEL_ADD_PS_W16_H4 16, 64
+PIXEL_ADD_PS_W16_H4 16, 24
+
;-----------------------------------------------------------------------------
; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
@@ -1138,6 +1149,8 @@
PIXEL_ADD_PS_W24_H2 24, 32
+PIXEL_ADD_PS_W24_H2 24, 64
+
;-----------------------------------------------------------------------------
; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
@@ -1265,6 +1278,8 @@
PIXEL_ADD_PS_W32_H2 32, 32
PIXEL_ADD_PS_W32_H2 32, 64
+PIXEL_ADD_PS_W32_H2 32, 48
+
;-----------------------------------------------------------------------------
; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
diff -r 0d4723a0080c -r 770c40d768d5 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Tue Aug 05 01:05:47 2014 -0500
+++ b/source/test/ipfilterharness.cpp Tue Aug 05 21:41:53 2014 +0900
@@ -171,7 +171,7 @@
for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
{
rand_srcStride = rand() % 100 + 2;
- rand_dstStride = rand() % 100 + 32;
+ rand_dstStride = rand() % 100 + 64;
checked(opt, pixel_test_buff[index] + 3 * rand_srcStride,
rand_srcStride,
@@ -206,7 +206,7 @@
for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
{
rand_srcStride = rand() % 100;
- rand_dstStride = rand() % 100 + 32;
+ rand_dstStride = rand() % 100 + 64;
ref(pixel_test_buff[index] + 3 * rand_srcStride,
rand_srcStride,
@@ -244,7 +244,7 @@
for (int isRowExt = 0; isRowExt < 2; isRowExt++)
{
rand_srcStride = rand() % 100 + 2;
- rand_dstStride = rand() % 100;
+ rand_dstStride = rand() % 100 + 64;
ref(pixel_test_buff[index] + 3 * rand_srcStride,
rand_srcStride,
@@ -282,7 +282,7 @@
for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
{
rand_srcStride = rand() % 100;
- rand_dstStride = rand() % 100 + 32;
+ rand_dstStride = rand() % 100 + 64;
ref(short_test_buff[index] + 3 * rand_srcStride,
rand_srcStride,
@@ -317,7 +317,7 @@
for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
{
rand_srcStride = rand() % 100;
- rand_dstStride = rand() % 100 + 32;
+ rand_dstStride = rand() % 100 + 64;
ref(short_test_buff[index] + 3 * rand_srcStride,
rand_srcStride,
@@ -535,7 +535,7 @@
for (int coeffIdxY = 0; coeffIdxY < 4; coeffIdxY++)
{
rand_srcStride = rand() % 100;
- rand_dstStride = rand() % 100;
+ rand_dstStride = rand() % 100 + 64;
ref(pixel_test_buff[index] + 3 * rand_srcStride,
rand_srcStride,
@@ -650,7 +650,7 @@
{
if (!check_IPFilterChroma_primitive(ref.chroma[csp].filter_hpp[value], opt.chroma[csp].filter_hpp[value]))
{
- printf("chroma_hpp[%s]", chromaPartStr[value]);
+ printf("chroma_hpp[%s]", chromaPartStr[csp][value]);
return false;
}
}
@@ -658,7 +658,7 @@
{
if (!check_IPFilterChroma_hps_primitive(ref.chroma[csp].filter_hps[value], opt.chroma[csp].filter_hps[value]))
{
- printf("chroma_hps[%s]", chromaPartStr[value]);
+ printf("chroma_hps[%s]", chromaPartStr[csp][value]);
return false;
}
}
@@ -666,7 +666,7 @@
{
if (!check_IPFilterChroma_primitive(ref.chroma[csp].filter_vpp[value], opt.chroma[csp].filter_vpp[value]))
{
- printf("chroma_vpp[%s]", chromaPartStr[value]);
+ printf("chroma_vpp[%s]", chromaPartStr[csp][value]);
return false;
}
}
@@ -674,7 +674,7 @@
{
if (!check_IPFilterChroma_ps_primitive(ref.chroma[csp].filter_vps[value], opt.chroma[csp].filter_vps[value]))
{
- printf("chroma_vps[%s]", chromaPartStr[value]);
+ printf("chroma_vps[%s]", chromaPartStr[csp][value]);
return false;
}
}
@@ -682,7 +682,7 @@
{
if (!check_IPFilterChroma_sp_primitive(ref.chroma[csp].filter_vsp[value], opt.chroma[csp].filter_vsp[value]))
{
- printf("chroma_vsp[%s]", chromaPartStr[value]);
+ printf("chroma_vsp[%s]", chromaPartStr[csp][value]);
return false;
}
}
@@ -690,7 +690,7 @@
{
if (!check_IPFilterChroma_ss_primitive(ref.chroma[csp].filter_vss[value], opt.chroma[csp].filter_vss[value]))
{
- printf("chroma_vss[%s]", chromaPartStr[value]);
+ printf("chroma_vss[%s]", chromaPartStr[csp][value]);
return false;
}
}
@@ -785,40 +785,40 @@
{
if (opt.chroma[csp].filter_hpp[value])
{
- printf("chroma_hpp[%s]", chromaPartStr[value]);
+ printf("chroma_hpp[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].filter_hpp[value], ref.chroma[csp].filter_hpp[value],
pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
}
if (opt.chroma[csp].filter_hps[value])
{
- printf("chroma_hps[%s]", chromaPartStr[value]);
+ printf("chroma_hps[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].filter_hps[value], ref.chroma[csp].filter_hps[value],
pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1);
}
if (opt.chroma[csp].filter_vpp[value])
{
- printf("chroma_vpp[%s]", chromaPartStr[value]);
+ printf("chroma_vpp[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].filter_vpp[value], ref.chroma[csp].filter_vpp[value],
pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_p, dstStride, 1);
}
if (opt.chroma[csp].filter_vps[value])
{
- printf("chroma_vps[%s]", chromaPartStr[value]);
+ printf("chroma_vps[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].filter_vps[value], ref.chroma[csp].filter_vps[value],
pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_s, dstStride, 1);
}
if (opt.chroma[csp].filter_vsp[value])
{
- printf("chroma_vsp[%s]", chromaPartStr[value]);
+ printf("chroma_vsp[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].filter_vsp[value], ref.chroma[csp].filter_vsp[value],
short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_p, dstStride, 1);
}
if (opt.chroma[csp].filter_vss[value])
{
- printf("chroma_vss[%s]", chromaPartStr[value]);
+ printf("chroma_vss[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].filter_vss[value], ref.chroma[csp].filter_vss[value],
short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_s, dstStride, 1);
diff -r 0d4723a0080c -r 770c40d768d5 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Aug 05 01:05:47 2014 -0500
+++ b/source/test/pixelharness.cpp Tue Aug 05 21:41:53 2014 +0900
@@ -1250,7 +1250,7 @@
{
if (!check_copy_pp(ref.chroma[i].copy_pp[part], opt.chroma[i].copy_pp[part]))
{
- printf("chroma_copy_pp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+ printf("chroma_copy_pp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
return false;
}
}
@@ -1258,7 +1258,7 @@
{
if (!check_copy_sp(ref.chroma[i].copy_sp[part], opt.chroma[i].copy_sp[part]))
{
- printf("chroma_copy_sp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+ printf("chroma_copy_sp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
return false;
}
}
@@ -1266,7 +1266,7 @@
{
if (!check_copy_ps(ref.chroma[i].copy_ps[part], opt.chroma[i].copy_ps[part]))
{
- printf("chroma_copy_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+ printf("chroma_copy_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
return false;
}
}
@@ -1274,7 +1274,7 @@
{
if (!check_copy_ss(ref.chroma[i].copy_ss[part], opt.chroma[i].copy_ss[part]))
{
- printf("chroma_copy_ss[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+ printf("chroma_copy_ss[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
return false;
}
}
@@ -1282,7 +1282,7 @@
{
if (!check_pixel_sub_ps(ref.chroma[i].sub_ps[part], opt.chroma[i].sub_ps[part]))
{
- printf("chroma_sub_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+ printf("chroma_sub_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
return false;
}
}
@@ -1290,7 +1290,7 @@
{
if (!check_pixel_add_ps(ref.chroma[i].add_ps[part], opt.chroma[i].add_ps[part]))
{
- printf("chroma_add_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+ printf("chroma_add_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
return false;
}
}
@@ -1298,7 +1298,7 @@
{
if (!check_addAvg(ref.chroma[i].addAvg[part], opt.chroma[i].addAvg[part]))
{
- printf("chroma_addAvg[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[part]);
+ printf("chroma_addAvg[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
return false;
}
}
@@ -1651,37 +1651,37 @@
{
if (opt.chroma[i].copy_pp[part])
{
- HEADER("[%s] copy_pp[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+ HEADER("[%s] copy_pp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
REPORT_SPEEDUP(opt.chroma[i].copy_pp[part], ref.chroma[i].copy_pp[part], pbuf1, 64, pbuf2, 128);
}
if (opt.chroma[i].copy_sp[part])
{
- HEADER("[%s] copy_sp[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+ HEADER("[%s] copy_sp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
REPORT_SPEEDUP(opt.chroma[i].copy_sp[part], ref.chroma[i].copy_sp[part], pbuf1, 64, sbuf3, 128);
}
if (opt.chroma[i].copy_ps[part])
{
- HEADER("[%s] copy_ps[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+ HEADER("[%s] copy_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
REPORT_SPEEDUP(opt.chroma[i].copy_ps[part], ref.chroma[i].copy_ps[part], sbuf1, 64, pbuf1, 128);
}
if (opt.chroma[i].copy_ss[part])
{
- HEADER("[%s] copy_ss[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+ HEADER("[%s] copy_ss[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
REPORT_SPEEDUP(opt.chroma[i].copy_ss[part], ref.chroma[i].copy_ss[part], sbuf1, 64, sbuf2, 128);
}
if (opt.chroma[i].sub_ps[part])
{
- HEADER("[%s] sub_ps[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+ HEADER("[%s] sub_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
REPORT_SPEEDUP(opt.chroma[i].sub_ps[part], ref.chroma[i].sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
}
if (opt.chroma[i].add_ps[part])
{
- HEADER("[%s] add_ps[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+ HEADER("[%s] add_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
REPORT_SPEEDUP(opt.chroma[i].add_ps[part], ref.chroma[i].add_ps[part], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
}
if (opt.chroma[i].addAvg[part])
{
- HEADER("[%s] addAvg[%s]", x265_source_csp_names[i], chromaPartStr[part]);
+ HEADER("[%s] addAvg[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
REPORT_SPEEDUP(opt.chroma[i].addAvg[part], ref.chroma[i].addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
}
}
diff -r 0d4723a0080c -r 770c40d768d5 source/test/testbench.cpp
--- a/source/test/testbench.cpp Tue Aug 05 01:05:47 2014 -0500
+++ b/source/test/testbench.cpp Tue Aug 05 21:41:53 2014 +0900
@@ -36,20 +36,46 @@
const char* lumaPartStr[NUM_LUMA_PARTITIONS] =
{
- " 4x4",
- " 8x8", " 8x4", " 4x8",
- "16x16", " 16x8", " 8x16", "16x12", "12x16", " 16x4", " 4x16",
- "32x32", "32x16", "16x32", "32x24", "24x32", " 32x8", " 8x32",
- "64x64", "64x32", "32x64", "64x48", "48x64", "64x16", "16x64",
+ " 4x4", " 8x8", "16x16", "32x32", "64x64",
+ " 8x4", " 4x8",
+ " 16x8", " 8x16",
+ "32x16", "16x32",
+ "64x32", "32x64",
+ "16x12", "12x16", " 16x4", " 4x16",
+ "32x24", "24x32", " 32x8", " 8x32",
+ "64x48", "48x64", "64x16", "16x64",
};
-const char* chromaPartStr[NUM_CHROMA_PARTITIONS] =
+const char* chromaPartStr420[NUM_CHROMA_PARTITIONS] =
{
- " 2x2", // never used by HEVC
- " 4x4", " 4x2", " 2x4",
- " 8x8", " 8x4", " 4x8", " 8x6", " 6x8", " 8x2", " 2x8",
- "16x16", " 16x8", " 8x16", "16x12", "12x16", " 16x4", " 4x16",
- "32x32", "32x16", "16x32", "32x24", "24x32", " 32x8", " 8x32",
+ " 2x2", " 4x4", " 8x8", "16x16", "32x32",
+ " 4x2", " 2x4",
+ " 8x4", " 4x8",
+ " 16x8", " 8x16",
+ "32x16", "16x32",
+ " 8x6", " 6x8", " 8x2", " 2x8",
+ "16x12", "12x16", " 16x4", " 4x16",
+ "32x24", "24x32", " 32x8", " 8x32",
+};
+
+const char* chromaPartStr422[NUM_CHROMA_PARTITIONS] =
+{
+ " 2x4", " 4x8", " 8x16", "16x32", "32x64",
+ " 4x4", " 2x8",
+ " 8x8", " 4x16",
+ "16x16", " 8x32",
+ "32x32", "16x64",
+ " 8x12", " 6x16", " 8x4", " 2x16",
+ "16x24", "12x32", " 16x8", " 4x32",
+ "32x48", "24x64", "32x16", " 8x64",
+};
+
+const char* const* chromaPartStr[X265_CSP_COUNT] =
+{
+ lumaPartStr,
+ chromaPartStr420,
+ chromaPartStr422,
+ lumaPartStr
};
void do_help()
diff -r 0d4723a0080c -r 770c40d768d5 source/test/testharness.h
--- a/source/test/testharness.h Tue Aug 05 01:05:47 2014 -0500
+++ b/source/test/testharness.h Tue Aug 05 21:41:53 2014 +0900
@@ -40,7 +40,7 @@
using namespace x265;
extern const char* lumaPartStr[NUM_LUMA_PARTITIONS];
-extern const char* chromaPartStr[NUM_CHROMA_PARTITIONS];
+extern const char* const* chromaPartStr[X265_CSP_COUNT];
class TestHarness
{
More information about the x265-devel mailing list