[x265] [PATCH] asm: code for addAvg luma and chroma all sizes
dnyaneshwar at multicorewareinc.com
Mon Feb 3 09:09:23 CET 2014
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1391413535 -19800
# Mon Feb 03 13:15:35 2014 +0530
# Node ID b1471c2ff0b4fd3ff0357a3b497bcd6b462ac3c6
# Parent 7ad3e3504ea6e5f7355b21c4c7de44ad9e1c0a2a
asm: code for addAvg luma and chroma all sizes
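
The argument order of the primitive changes to (src0, src1, dst, src0Stride, src1Stride, dstStride)
so the asm can consume the two 16-bit sources first. For reference, the operation is the HEVC
bi-prediction average: add the two 16-bit motion-compensated intermediates, add the rounding offset,
shift down to pixel precision, and clip. The C sketch below roughly mirrors the addAvg<bx, by>
template in pixel.cpp whose signature this patch reorders; the constants 7 and 8192 are what
IF_INTERNAL_PREC = 14 and IF_INTERNAL_OFFS reduce to for an 8-bit build, and the names here are
illustrative, not part of the patch:

    #include <stdint.h>
    #include <algorithm>

    typedef uint8_t pixel;

    // addAvg sketch: average two 14-bit intermediate blocks into 8-bit pixels.
    // src0/src1 were biased by -8192 each, so 2 * 8192 is added back before
    // the rounding shift of IF_INTERNAL_PREC + 1 - 8 = 7.
    static void addAvg_sketch(const int16_t* src0, const int16_t* src1, pixel* dst,
                              intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                              int width, int height)
    {
        const int shift  = 7;
        const int offset = (1 << (shift - 1)) + 2 * 8192;

        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int v = (src0[x] + src1[x] + offset) >> shift;
                dst[x] = (pixel)std::min(255, std::max(0, v));
            }
            src0 += src0Stride;
            src1 += src1Stride;
            dst  += dstStride;
        }
    }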
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp Mon Feb 03 12:32:41 2014 +0530
+++ b/source/Lib/TLibCommon/TComYuv.cpp Mon Feb 03 13:15:35 2014 +0530
@@ -594,7 +594,7 @@
src1Stride = srcYuv1->m_width;
dststride = getStride();
- primitives.luma_addAvg[part](dstY, dststride, srcY0, src0Stride, srcY1, src1Stride);
+ primitives.luma_addAvg[part](srcY0, srcY1, dstY, src0Stride, src1Stride, dststride);
}
if (bChroma)
{
@@ -602,8 +602,8 @@
src1Stride = srcYuv1->m_cwidth;
dststride = getCStride();
- primitives.chroma[m_csp].addAvg[part](dstU, dststride, srcU0, src0Stride, srcU1, src1Stride);
- primitives.chroma[m_csp].addAvg[part](dstV, dststride, srcV0, src0Stride, srcV1, src1Stride);
+ primitives.chroma[m_csp].addAvg[part](srcU0, srcU1, dstU, src0Stride, src1Stride, dststride);
+ primitives.chroma[m_csp].addAvg[part](srcV0, srcV1, dstV, src0Stride, src1Stride, dststride);
}
}
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/pixel.cpp Mon Feb 03 13:15:35 2014 +0530
@@ -802,7 +802,7 @@
}
template<int bx, int by>
-void addAvg(pixel* dst, intptr_t dstStride, int16_t* src0, intptr_t src0Stride, int16_t* src1, intptr_t src1Stride)
+void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
int shiftNum, offset;
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/primitives.h
--- a/source/common/primitives.h Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/primitives.h Mon Feb 03 13:15:35 2014 +0530
@@ -184,7 +184,7 @@
typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
typedef void (*pixel_add_ps_t)(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1);
-typedef void (*addAvg_t)(pixel* dst, intptr_t dstStride, int16_t* src0, intptr_t src0Stride, int16_t* src1, intptr_t src1Stride);
+typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
/* Define a structure containing function pointers to optimized encoder
* primitives. Each pointer can reference either an assembly routine,
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Feb 03 13:15:35 2014 +0530
@@ -563,6 +563,62 @@
SETUP_PIXEL_SSE_SP_DEF(64, 16, cpu); \
SETUP_PIXEL_SSE_SP_DEF(16, 64, cpu);
+#define SETUP_LUMA_ADDAVG_FUNC_DEF(W, H, cpu) \
+ p.luma_addAvg[LUMA_## W ## x ## H] = x265_addAvg_## W ## x ## H ## cpu;
+
+#define LUMA_ADDAVG(cpu) \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(4, 4, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(4, 8, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(4, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(8, 4, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(8, 8, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(8, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(8, 32, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 4, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 8, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 64, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(32, 8, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(32, 32, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(32, 64, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(48, 64, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(64, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(64, 32, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(64, 48, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(64, 64, cpu);
+
+#define SETUP_CHROMA_ADDAVG_FUNC_DEF(W, H, cpu) \
+ p.chroma[X265_CSP_I420].addAvg[CHROMA_## W ## x ## H] = x265_addAvg_## W ## x ## H ## cpu;
+
+#define CHROMA_ADDAVG(cpu) \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
+
#define SETUP_INTRA_ANG4(mode, fno, cpu) \
p.intra_pred[BLOCK_4x4][mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
#define SETUP_INTRA_ANG8(mode, fno, cpu) \
@@ -887,6 +943,9 @@
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
+
+ LUMA_ADDAVG(_sse2);
+ CHROMA_ADDAVG(_sse2);
}
if (cpuMask & X265_CPU_SSSE3)
{
@@ -1043,6 +1102,10 @@
SETUP_INTRA_ANG32(33, 33, sse4);
p.dct[DCT_8x8] = x265_dct8_sse4;
+
+ p.chroma[X265_CSP_I420].addAvg[CHROMA_2x4] = x265_addAvg_2x4_sse4;
+ p.chroma[X265_CSP_I420].addAvg[CHROMA_2x8] = x265_addAvg_2x8_sse4;
+ p.chroma[X265_CSP_I420].addAvg[CHROMA_6x8] = x265_addAvg_6x8_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/x86/const-a.asm Mon Feb 03 13:15:35 2014 +0530
@@ -33,6 +33,8 @@
const pw_1, times 16 dw 1
const pw_16, times 16 dw 16
const pw_32, times 16 dw 32
+const pw_128, times 16 dw 128
+const pw_256, times 16 dw 256
const pw_512, times 16 dw 512
const pw_1024, times 16 dw 1024
const pw_4096, times 16 dw 4096
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/x86/mc-a.asm Mon Feb 03 13:15:35 2014 +0530
@@ -49,6 +49,8 @@
cextern pw_8
cextern pw_32
cextern pw_64
+cextern pw_128
+cextern pw_256
cextern pw_512
cextern pw_00ff
cextern pw_pixel_max
@@ -56,6 +58,1036 @@
cextern pd_32
cextern deinterleave_shufd
+;====================================================================================================================
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+;====================================================================================================================
+; r0 = src0,       r1 = src1
+; r2 = dst,        r3 = src0Stride
+; r4 = src1Stride, r5 = dstStride
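+;
+; For the 8-bit build, every kernel below uses the same rounding scheme:
+; pmulhrsw with pw_256 computes (x * 256 + 16384) >> 15, i.e. the rounding
+; shift (x + 64) >> 7, and the following paddw with pw_128 adds back the
+; two 8192 bi-prediction offsets scaled into the output domain
+; (2 * 8192 >> 7 = 128); packuswb then clamps the result to [0, 255].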
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x4, 6,6,8, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+ mova m0, [pw_256]
+ mova m7, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ movd m1, [r0]
+ movd m2, [r0 + r3]
+ movd m3, [r1]
+ movd m4, [r1 + r4]
+
+ punpckldq m1, m2
+ punpckldq m3, m4
+
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movd m2, [r0]
+ movd m4, [r0 + r3]
+ movd m5, [r1]
+ movd m6, [r1 + r4]
+
+ punpckldq m2, m4
+ punpckldq m5, m6
+ punpcklqdq m1, m2
+ punpcklqdq m3, m5
+
+ paddw m1, m3
+ pmulhrsw m1, m0
+ paddw m1, m7
+ packuswb m1, m1
+
+ pextrw [r2], m1, 0
+ pextrw [r2 + r5], m1, 1
+ lea r2, [r2 + 2 * r5]
+ pextrw [r2], m1, 2
+ pextrw [r2 + r5], m1, 3
+
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x8, 6,6,8, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+ mova m0, [pw_256]
+ mova m7, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ movd m1, [r0]
+ movd m2, [r0 + r3]
+ movd m3, [r1]
+ movd m4, [r1 + r4]
+
+ punpckldq m1, m2
+ punpckldq m3, m4
+
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movd m2, [r0]
+ movd m4, [r0 + r3]
+ movd m5, [r1]
+ movd m6, [r1 + r4]
+
+ punpckldq m2, m4
+ punpckldq m5, m6
+ punpcklqdq m1, m2
+ punpcklqdq m3, m5
+
+ paddw m1, m3
+ pmulhrsw m1, m0
+ paddw m1, m7
+ packuswb m1, m1
+
+ pextrw [r2], m1, 0
+ pextrw [r2 + r5], m1, 1
+ lea r2, [r2 + 2 * r5]
+ pextrw [r2], m1, 2
+ pextrw [r2 + r5], m1, 3
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movd m1, [r0]
+ movd m2, [r0 + r3]
+ movd m3, [r1]
+ movd m4, [r1 + r4]
+
+ punpckldq m1, m2
+ punpckldq m3, m4
+
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movd m2, [r0]
+ movd m4, [r0 + r3]
+ movd m5, [r1]
+ movd m6, [r1 + r4]
+
+ punpckldq m2, m4
+ punpckldq m5, m6
+ punpcklqdq m1, m2
+ punpcklqdq m3, m5
+
+ paddw m1, m3
+ pmulhrsw m1, m0
+ paddw m1, m7
+ packuswb m1, m1
+
+ pextrw [r2], m1, 0
+ pextrw [r2 + r5], m1, 1
+ lea r2, [r2 + 2 * r5]
+ pextrw [r2], m1, 2
+ pextrw [r2 + r5], m1, 3
+
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_4x2, 6,6,4, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+ mova m1, [pw_256]
+ mova m3, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ movh m0, [r0]
+ movhps m0, [r0 + r3]
+ movh m2, [r1]
+ movhps m2, [r1 + r4]
+
+ paddw m0, m2
+ pmulhrsw m0, m1
+ paddw m0, m3
+
+ packuswb m0, m0
+ movd [r2], m0
+ pshufd m0, m0, 1
+ movd [r2 + r5], m0
+
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W4_H4 1
+INIT_XMM sse2
+cglobal addAvg_4x%1, 6,7,4, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m1, [pw_256]
+ mova m3, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ mov r6d, %1/4
+
+.loop
+ movh m0, [r0]
+ movhps m0, [r0 + r3]
+ movh m2, [r1]
+ movhps m2, [r1 + r4]
+
+ paddw m0, m2
+ pmulhrsw m0, m1
+ paddw m0, m3
+
+ packuswb m0, m0
+ movd [r2], m0
+ pshufd m0, m0, 1
+ movd [r2 + r5], m0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movh m0, [r0]
+ movhps m0, [r0 + r3]
+ movh m2, [r1]
+ movhps m2, [r1 + r4]
+
+ paddw m0, m2
+ pmulhrsw m0, m1
+ paddw m0, m3
+
+ packuswb m0, m0
+ movd [r2], m0
+ pshufd m0, m0, 1
+ movd [r2 + r5], m0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W4_H4 4
+ADDAVG_W4_H4 8
+ADDAVG_W4_H4 16
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_6x8, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+ packuswb m0, m0
+ movd [r2], m0
+ pextrw [r2 + 4], m0, 2
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+ packuswb m1, m1
+ movd [r2 + r5], m1
+ pextrw [r2 + r5 + 4], m1, 2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+ packuswb m0, m0
+ movd [r2], m0
+ pextrw [r2 + 4], m0, 2
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+ packuswb m1, m1
+ movd [r2 + r5], m1
+ pextrw [r2 + r5 + 4], m1, 2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+ packuswb m0, m0
+ movd [r2], m0
+ pextrw [r2 + 4], m0, 2
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+ packuswb m1, m1
+ movd [r2 + r5], m1
+ pextrw [r2 + r5 + 4], m1, 2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+ packuswb m0, m0
+ movd [r2], m0
+ pextrw [r2 + 4], m0, 2
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+ packuswb m1, m1
+ movd [r2 + r5], m1
+ pextrw [r2 + r5 + 4], m1, 2
+
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_8x2, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+ packuswb m0, m0
+ movh [r2], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+ packuswb m1, m1
+ movh [r2 + r5], m1
+
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_8x6, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+ packuswb m0, m0
+ movh [r2], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+ packuswb m1, m1
+ movh [r2 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+ packuswb m0, m0
+ movh [r2], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+ packuswb m1, m1
+ movh [r2 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+ packuswb m0, m0
+ movh [r2], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+ packuswb m1, m1
+ movh [r2 + r5], m1
+
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W8_H4 1
+INIT_XMM sse2
+cglobal addAvg_8x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ mov r6d, %1/4
+
+.loop
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ movh [r2], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m1, m1
+ movh [r2 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ movh [r2], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m1, m1
+ movh [r2 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W8_H4 4
+ADDAVG_W8_H4 8
+ADDAVG_W8_H4 16
+ADDAVG_W8_H4 32
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W12_H4 1
+INIT_XMM sse2
+cglobal addAvg_12x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ mov r6d, %1/4
+
+.loop
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+ packuswb m0, m0
+ movh [r2], m0
+
+ movh m0, [r0 + 16]
+ movhps m0, [r0 + 16 + r3]
+ movh m2, [r1 + 16]
+ movhps m2, [r1 + 16 + r4]
+
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ movd [r2 + 8], m0
+ pshufd m0, m0, 1
+ movd [r2 + 8 + r5], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m1, m1
+ movh [r2 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ movh [r2], m0
+
+ movh m0, [r0 + 16]
+ movhps m0, [r0 + 16 + r3]
+ movh m2, [r1 + 16]
+ movhps m2, [r1 + 16 + r4]
+
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ movd [r2 + 8], m0
+ pshufd m0, m0, 1
+ movd [r2 + 8 + r5], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m1, m1
+ movh [r2 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W12_H4 16
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W16_H4 1
+INIT_XMM sse2
+cglobal addAvg_16x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ mov r6d, %1/4
+
+.loop
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 16]
+ movu m2, [r1 + 16]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ movu m2, [r0 + 16 + r3]
+ movu m3, [r1 + 16 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m4
+ paddw m2, m5
+
+ packuswb m1, m2
+ movu [r2 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 16]
+ movu m2, [r1 + 16]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ movu m2, [r0 + 16 + r3]
+ movu m3, [r1 + 16 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m4
+ paddw m2, m5
+
+ packuswb m1, m2
+ movu [r2 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W16_H4 4
+ADDAVG_W16_H4 8
+ADDAVG_W16_H4 12
+ADDAVG_W16_H4 16
+ADDAVG_W16_H4 32
+ADDAVG_W16_H4 64
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W24_H2 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ mov r6d, %2/2
+
+.loop
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 16]
+ movu m2, [r1 + 16]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2], m0
+
+ movu m0, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ movh [r2 + 16], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ movu m2, [r0 + 16 + r3]
+ movu m3, [r1 + 16 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m4
+ paddw m2, m5
+
+ packuswb m1, m2
+ movu [r2 + r5], m1
+
+ movu m1, [r0 + 32 + r3]
+ movu m3, [r1 + 32 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m1, m1
+ movh [r2 + 16 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W24_H2 24, 32
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W32_H2 1
+INIT_XMM sse2
+cglobal addAvg_32x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ mov r6d, %1/2
+
+.loop
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 16]
+ movu m2, [r1 + 16]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2], m0
+
+ movu m0, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 48]
+ movu m2, [r1 + 48]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2 + 16], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ movu m2, [r0 + 16 + r3]
+ movu m3, [r1 + 16 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m4
+ paddw m2, m5
+
+ packuswb m1, m2
+ movu [r2 + r5], m1
+
+ movu m1, [r0 + 32 + r3]
+ movu m3, [r1 + 32 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ movu m2, [r0 + 48 + r3]
+ movu m3, [r1 + 48 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m4
+ paddw m2, m5
+
+ packuswb m1, m2
+ movu [r2 + 16 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W32_H2 8
+ADDAVG_W32_H2 16
+ADDAVG_W32_H2 24
+ADDAVG_W32_H2 32
+ADDAVG_W32_H2 64
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W48_H2 1
+INIT_XMM sse2
+cglobal addAvg_48x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ mov r6d, %1/2
+
+.loop
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 16]
+ movu m2, [r1 + 16]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2], m0
+
+ movu m0, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 48]
+ movu m2, [r1 + 48]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2 + 16], m0
+
+ movu m0, [r0 + 64]
+ movu m2, [r1 + 64]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 80]
+ movu m2, [r1 + 80]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2 + 32], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ movu m2, [r0 + 16 + r3]
+ movu m3, [r1 + 16 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m4
+ paddw m2, m5
+
+ packuswb m1, m2
+ movu [r2 + r5], m1
+
+ movu m1, [r0 + 32 + r3]
+ movu m3, [r1 + 32 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ movu m2, [r0 + 48 + r3]
+ movu m3, [r1 + 48 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m4
+ paddw m2, m5
+
+ packuswb m1, m2
+ movu [r2 + 16 + r5], m1
+
+ movu m1, [r0 + 64 + r3]
+ movu m3, [r1 + 64 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ movu m2, [r0 + 80 + r3]
+ movu m3, [r1 + 80 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m4
+ paddw m2, m5
+
+ packuswb m1, m2
+ movu [r2 + 32 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W48_H2 64
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W64_H1 1
+INIT_XMM sse2
+cglobal addAvg_64x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+
+ mov r6d, %1
+
+.loop
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 16]
+ movu m2, [r1 + 16]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2], m0
+
+ movu m0, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 48]
+ movu m2, [r1 + 48]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2 + 16], m0
+
+ movu m0, [r0 + 64]
+ movu m2, [r1 + 64]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 80]
+ movu m2, [r1 + 80]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2 + 32], m0
+
+ movu m0, [r0 + 96]
+ movu m2, [r1 + 96]
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 112]
+ movu m2, [r1 + 112]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ movu [r2 + 48], m0
+
+ add r2, r5
+ add r0, r3
+ add r1, r4
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W64_H1 16
+ADDAVG_W64_H1 32
+ADDAVG_W64_H1 48
+ADDAVG_W64_H1 64
+;-----------------------------------------------------------------------------
+
;=============================================================================
; implicit weighted biprediction
;=============================================================================
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/x86/pixel.h Mon Feb 03 13:15:35 2014 +0530
@@ -166,6 +166,41 @@
int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
+#define ADDAVG(func) \
+ void x265_ ## func ## _sse2 (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
+ void x265_ ## func ## _sse4 (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
+ADDAVG(addAvg_2x4)
+ADDAVG(addAvg_2x8)
+ADDAVG(addAvg_4x2)
+ADDAVG(addAvg_4x4)
+ADDAVG(addAvg_4x8)
+ADDAVG(addAvg_4x16)
+ADDAVG(addAvg_6x8)
+ADDAVG(addAvg_8x2)
+ADDAVG(addAvg_8x4)
+ADDAVG(addAvg_8x6)
+ADDAVG(addAvg_8x8)
+ADDAVG(addAvg_8x16)
+ADDAVG(addAvg_8x32)
+ADDAVG(addAvg_12x16)
+ADDAVG(addAvg_16x4)
+ADDAVG(addAvg_16x8)
+ADDAVG(addAvg_16x12)
+ADDAVG(addAvg_16x16)
+ADDAVG(addAvg_16x32)
+ADDAVG(addAvg_16x64)
+ADDAVG(addAvg_24x32)
+ADDAVG(addAvg_32x8)
+ADDAVG(addAvg_32x16)
+ADDAVG(addAvg_32x24)
+ADDAVG(addAvg_32x32)
+ADDAVG(addAvg_32x64)
+ADDAVG(addAvg_48x64)
+ADDAVG(addAvg_64x16)
+ADDAVG(addAvg_64x32)
+ADDAVG(addAvg_64x48)
+ADDAVG(addAvg_64x64)
+
#undef DECL_PIXELS
#undef DECL_HEVC_SSD
#undef DECL_X1
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Feb 03 12:32:41 2014 +0530
+++ b/source/test/pixelharness.cpp Mon Feb 03 13:15:35 2014 +0530
@@ -776,8 +776,8 @@
for (int i = 0; i < ITERS; i++)
{
- ref(ref_dest, STRIDE, sbuf1 + j, STRIDE, sbuf2 + j, STRIDE);
- opt(opt_dest, STRIDE, sbuf1 + j, STRIDE, sbuf2 + j, STRIDE);
+ ref(sbuf1 + j, sbuf2 + j, ref_dest, STRIDE, STRIDE, STRIDE);
+ opt(sbuf1 + j, sbuf2 + j, opt_dest, STRIDE, STRIDE, STRIDE);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
return false;
@@ -1301,14 +1301,14 @@
if (opt.chroma[i].addAvg[part])
{
HEADER("[%s] add_ps[%s]", x265_source_csp_names[i], chromaPartStr[part]);
- REPORT_SPEEDUP(opt.chroma[i].addAvg[part], ref.chroma[i].addAvg[part], pbuf1, STRIDE, sbuf1, STRIDE, sbuf2, STRIDE);
+ REPORT_SPEEDUP(opt.chroma[i].addAvg[part], ref.chroma[i].addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
}
}
if (opt.luma_addAvg[part])
{
printf("luma_addAvg[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.luma_addAvg[part], ref.luma_addAvg[part], pbuf1, STRIDE, sbuf1, STRIDE, sbuf2, STRIDE);
+ REPORT_SPEEDUP(opt.luma_addAvg[part], ref.luma_addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
}
#undef HEADER