[x265] [PATCH] asm: code for addAvg luma and chroma all sizes
Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
Sat Jan 18 08:39:24 CET 2014
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1390030530 -19800
# Sat Jan 18 13:05:30 2014 +0530
# Node ID 806eb643b88cfef7752f45ee02c9d7f3d9bf2b27
# Parent 2e4e32a50cd6d80dbcf96338c934e5e5e1f05be5
asm: code for addAvg luma and chroma all sizes
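
All block sizes perform the same per-pixel operation; the kernels only differ
in how many rows and columns they unroll per iteration. As a reference, a
minimal scalar sketch of that operation, assuming 8-bit pixel depth (so the
shift is 7 and the bias is 4040h = 2 * 2000h + 40h, i.e. 2 * IF_INTERNAL_OFFS
plus a rounding term in terms of the C primitive's constants). The name
addAvg_ref and the explicit width/height parameters are for illustration only;
the asm bakes the block size into each symbol name:

    #include <stdint.h>

    /* Scalar sketch: sum the two 16-bit intermediate predictions, add the
     * 0x4040 bias, shift right by 7 and clamp to an 8-bit pixel. */
    static void addAvg_ref(uint8_t* dst, intptr_t dstStride,
                           const int16_t* src0, intptr_t src0Stride,
                           const int16_t* src1, intptr_t src1Stride,
                           int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int v = (src0[x] + src1[x] + 0x4040) >> 7;           /* paddw, paddw OFFSET, psraw 7 */
                dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* packuswb clamp */
            }
            src0 += src0Stride;  /* source strides are in int16_t elements */
            src1 += src1Stride;
            dst  += dstStride;   /* dst stride is in pixels */
        }
    }
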
diff -r 2e4e32a50cd6 -r 806eb643b88c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Sat Jan 18 13:05:30 2014 +0530
@@ -547,6 +547,62 @@
SETUP_PIXEL_SSE_SP_DEF(64, 16, cpu); \
SETUP_PIXEL_SSE_SP_DEF(16, 64, cpu);
+#define SETUP_LUMA_ADDAVG_FUNC_DEF(W, H, cpu) \
+ p.luma_addAvg[LUMA_## W ## x ## H] = x265_addAvg_## W ## x ## H ## cpu;
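+// e.g. SETUP_LUMA_ADDAVG_FUNC_DEF(8, 8, _sse2) expands to: p.luma_addAvg[LUMA_8x8] = x265_addAvg_8x8_sse2;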
+
+#define LUMA_ADDAVG(cpu) \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(4, 4, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(4, 8, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(4, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(8, 4, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(8, 8, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(8, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(8, 32, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 4, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 8, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(16, 64, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(32, 8, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(32, 32, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(32, 64, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(48, 64, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(64, 16, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(64, 32, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(64, 48, cpu); \
+ SETUP_LUMA_ADDAVG_FUNC_DEF(64, 64, cpu);
+
+#define SETUP_CHROMA_ADDAVG_FUNC_DEF(W, H, cpu) \
+ p.chroma_addAvg[CHROMA_## W ## x ## H] = x265_addAvg_## W ## x ## H ## cpu;
+
+#define CHROMA_ADDAVG(cpu) \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
+
#define SETUP_INTRA_ANG4(mode, fno, cpu) \
p.intra_pred[BLOCK_4x4][mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
#define SETUP_INTRA_ANG8(mode, fno, cpu) \
@@ -883,6 +939,9 @@
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
+
+ LUMA_ADDAVG(_sse2);
+ CHROMA_ADDAVG(_sse2);
}
if (cpuMask & X265_CPU_SSSE3)
{
@@ -1012,6 +1071,10 @@
SETUP_INTRA_ANG4(33, 3, sse4);
p.dct[DCT_8x8] = x265_dct8_sse4;
+
+ p.chroma_addAvg[CHROMA_2x4] = x265_addAvg_2x4_sse4;
+ p.chroma_addAvg[CHROMA_2x8] = x265_addAvg_2x8_sse4;
+ p.chroma_addAvg[CHROMA_6x8] = x265_addAvg_6x8_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r 2e4e32a50cd6 -r 806eb643b88c source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/x86/mc-a.asm Sat Jan 18 13:05:30 2014 +0530
@@ -39,6 +39,9 @@
times 8 db 2
times 8 db 4
times 8 db 6
+
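+; addAvg bias: 4040h = 2 * 2000h (2 * IF_INTERNAL_OFFS) + 40h of rounding for the psraw 7 in the kernels below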
+OFFSET times 8 dw 4040h
+
sq_1: times 1 dq 1
SECTION .text
@@ -56,6 +59,1008 @@
cextern pd_32
cextern deinterleave_shufd
+;=================================================================================
+;void addAvg (pixel *dst, intptr_t dstStride, int16_t* src0, intptr_t src0Stride,
+; int16_t* src1, intptr_t src1Stride)
+;=================================================================================
+; r0 = dst, r1 = dstStride
+; r2 = src0, r3 = src0Stride
+; r4 = src1, r5 = src1Stride
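+;
+; every kernel below follows the same pattern: load rows of int16_t samples
+; from src0 and src1, paddw them, add the OFFSET bias, psraw by 7, then
+; packuswb to clamp to unsigned 8-bit pixels before storing to dst. The
+; source strides are scaled by 2 because they are given in int16_t units;
+; dst and dstStride are in pixels.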
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x4, 6,7,7, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m0, [OFFSET]
+
+ movd m1, [r2]
+ movd m2, [r2 + 2 * r3]
+ movd m3, [r4]
+ movd m4, [r4 + 2 * r5]
+
+ punpckldq m1, m2
+ punpckldq m3, m4
+
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movd m2, [r2]
+ movd m4, [r2 + 2 * r3]
+ movd m5, [r4]
+ movd m6, [r4 + 2 * r5]
+
+ punpckldq m2, m4
+ punpckldq m5, m6
+ punpcklqdq m1, m2
+ punpcklqdq m3, m5
+
+ paddw m1, m3
+ paddw m1, m0
+ psraw m1, 7
+ packuswb m1, m1
+
+ pextrw [r0], m1, 0
+ pextrw [r0 + r1], m1, 1
+ lea r0, [r0 + 2 * r1]
+ pextrw [r0], m1, 2
+ pextrw [r0 + r1], m1, 3
+
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x8, 6,7,7, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m0, [OFFSET]
+
+ movd m1, [r2]
+ movd m2, [r2 + 2 * r3]
+ movd m3, [r4]
+ movd m4, [r4 + 2 * r5]
+
+ punpckldq m1, m2
+ punpckldq m3, m4
+
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movd m2, [r2]
+ movd m4, [r2 + 2 * r3]
+ movd m5, [r4]
+ movd m6, [r4 + 2 * r5]
+
+ punpckldq m2, m4
+ punpckldq m5, m6
+ punpcklqdq m1, m2
+ punpcklqdq m3, m5
+
+ paddw m1, m3
+ paddw m1, m0
+ psraw m1, 7
+ packuswb m1, m1
+
+ pextrw [r0], m1, 0
+ pextrw [r0 + r1], m1, 1
+ lea r0, [r0 + 2 * r1]
+ pextrw [r0], m1, 2
+ pextrw [r0 + r1], m1, 3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movd m1, [r2]
+ movd m2, [r2 + 2 * r3]
+ movd m3, [r4]
+ movd m4, [r4 + 2 * r5]
+
+ punpckldq m1, m2
+ punpckldq m3, m4
+
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movd m2, [r2]
+ movd m4, [r2 + 2 * r3]
+ movd m5, [r4]
+ movd m6, [r4 + 2 * r5]
+
+ punpckldq m2, m4
+ punpckldq m5, m6
+ punpcklqdq m1, m2
+ punpcklqdq m3, m5
+
+ paddw m1, m3
+ paddw m1, m0
+ psraw m1, 7
+ packuswb m1, m1
+
+ pextrw [r0], m1, 0
+ pextrw [r0 + r1], m1, 1
+ lea r0, [r0 + 2 * r1]
+ pextrw [r0], m1, 2
+ pextrw [r0 + r1], m1, 3
+
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_4x2, 6,6,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ movh m0, [r2]
+ movh m1, [r2 + 2 * r3]
+ movh m2, [r4]
+ movh m3, [r4 + 2 * r5]
+
+ punpcklqdq m0, m1
+ punpcklqdq m2, m3
+
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ packuswb m0, m0
+ movd [r0], m0
+ pshufd m0, m0, 1
+ movd [r0 + r1], m0
+
+ RET
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W4_H4 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ mov r6d, %2/4
+
+.loop:
+ movh m0, [r2]
+ movh m1, [r2 + 2 * r3]
+ movh m2, [r4]
+ movh m3, [r4 + 2 * r5]
+
+ punpcklqdq m0, m1
+ punpcklqdq m2, m3
+
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ packuswb m0, m0
+ movd [r0], m0
+ pshufd m0, m0, 1
+ movd [r0 + r1], m0
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movh m0, [r2]
+ movh m1, [r2 + 2 * r3]
+ movh m2, [r4]
+ movh m3, [r4 + 2 * r5]
+
+ punpcklqdq m0, m1
+ punpcklqdq m2, m3
+
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ packuswb m0, m0
+ movd [r0], m0
+ pshufd m0, m0, 1
+ movd [r0 + r1], m0
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ dec r6d
+ jnz .loop
+
+ RET
+
+%endmacro
+
+ADDAVG_W4_H4 4, 4
+ADDAVG_W4_H4 4, 8
+ADDAVG_W4_H4 4, 16
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_6x8, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movd [r0], m0
+ pextrw [r0 + 4], m0, 2
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movd [r0 + r1], m1
+ pextrw [r0 + r1 + 4], m1, 2
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movd [r0], m0
+ pextrw [r0 + 4], m0, 2
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movd [r0 + r1], m1
+ pextrw [r0 + r1 + 4], m1, 2
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movd [r0], m0
+ pextrw [r0 + 4], m0, 2
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movd [r0 + r1], m1
+ pextrw [r0 + r1 + 4], m1, 2
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movd [r0], m0
+ pextrw [r0 + 4], m0, 2
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movd [r0 + r1], m1
+ pextrw [r0 + r1 + 4], m1, 2
+
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_8x2, 6,6,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movh [r0], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movh [r0 + r1], m1
+
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_8x6, 6,6,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movh [r0], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movh [r0 + r1], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movh [r0], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movh [r0 + r1], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movh [r0], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movh [r0 + r1], m1
+
+ RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+
+%macro ADDAVG_W8_H4 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ mov r6d, %2/4
+
+.loop:
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movh [r0], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movh [r0 + r1], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movh [r0], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movh [r0 + r1], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ dec r6d
+ jnz .loop
+
+ RET
+
+%endmacro
+
+ADDAVG_W8_H4 8, 4
+ADDAVG_W8_H4 8, 8
+ADDAVG_W8_H4 8, 16
+ADDAVG_W8_H4 8, 32
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W12_H4 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ mov r6d, %2/4
+
+.loop:
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movh [r0], m0
+
+ movh m0, [r2 + 16]
+ movh m1, [r2 + 16 + 2 * r3]
+ movh m2, [r4 + 16]
+ movh m3, [r4 + 16 + 2 * r5]
+
+ punpcklqdq m0, m1
+ punpcklqdq m2, m3
+
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ packuswb m0, m0
+ movd [r0 + 8], m0
+ pshufd m0, m0, 1
+ movd [r0 + 8 + r1], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movh [r0 + r1], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movh [r0], m0
+
+ movh m0, [r2 + 16]
+ movh m1, [r2 + 16 + 2 * r3]
+ movh m2, [r4 + 16]
+ movh m3, [r4 + 16 + 2 * r5]
+
+ punpcklqdq m0, m1
+ punpcklqdq m2, m3
+
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ packuswb m0, m0
+ movd [r0 + 8], m0
+ pshufd m0, m0, 1
+ movd [r0 + 8 + r1], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movh [r0 + r1], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ dec r6d
+ jnz .loop
+
+ RET
+
+%endmacro
+
+ADDAVG_W12_H4 12, 16
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W16_H4 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ mov r6d, %2/4
+
+.loop:
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 16]
+ movu m2, [r4 + 16]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+
+ movu m2, [r2 + 16 + 2 * r3]
+ movu m3, [r4 + 16 + 2 * r5]
+ paddw m2, m3
+ paddw m2, m4
+ psraw m2, 7
+ packuswb m1, m2
+ movu [r0 + r1], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 16]
+ movu m2, [r4 + 16]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+
+ movu m2, [r2 + 16 + 2 * r3]
+ movu m3, [r4 + 16 + 2 * r5]
+ paddw m2, m3
+ paddw m2, m4
+ psraw m2, 7
+ packuswb m1, m2
+ movu [r0 + r1], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ dec r6d
+ jnz .loop
+
+ RET
+
+%endmacro
+
+ADDAVG_W16_H4 16, 4
+ADDAVG_W16_H4 16, 8
+ADDAVG_W16_H4 16, 12
+ADDAVG_W16_H4 16, 16
+ADDAVG_W16_H4 16, 32
+ADDAVG_W16_H4 16, 64
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W24_H2 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ mov r6d, %2/2
+
+.loop:
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 16]
+ movu m2, [r4 + 16]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0], m0
+
+ movu m0, [r2 + 32]
+ movu m2, [r4 + 32]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+ packuswb m0, m0
+ movh [r0 + 16], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+
+ movu m2, [r2 + 16 + 2 * r3]
+ movu m3, [r4 + 16 + 2 * r5]
+ paddw m2, m3
+ paddw m2, m4
+ psraw m2, 7
+ packuswb m1, m2
+ movu [r0 + r1], m1
+
+ movu m1, [r2 + 32 + 2 * r3]
+ movu m3, [r4 + 32 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m1, m1
+ movh [r0 + 16 + r1], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ dec r6d
+ jnz .loop
+
+ RET
+
+%endmacro
+
+ADDAVG_W24_H2 24, 32
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W32_H2 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ mov r6d, %2/2
+
+.loop:
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 16]
+ movu m2, [r4 + 16]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0], m0
+
+ movu m0, [r2 + 32]
+ movu m2, [r4 + 32]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 48]
+ movu m2, [r4 + 48]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0 + 16], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+
+ movu m2, [r2 + 16 + 2 * r3]
+ movu m3, [r4 + 16 + 2 * r5]
+ paddw m2, m3
+ paddw m2, m4
+ psraw m2, 7
+ packuswb m1, m2
+ movu [r0 + r1], m1
+
+ movu m1, [r2 + 32 + 2 * r3]
+ movu m3, [r4 + 32 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+
+ movu m2, [r2 + 48 + 2 * r3]
+ movu m3, [r4 + 48 + 2 * r5]
+ paddw m2, m3
+ paddw m2, m4
+ psraw m2, 7
+ packuswb m1, m2
+ movu [r0 + 16 + r1], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ dec r6d
+ jnz .loop
+
+ RET
+
+%endmacro
+
+ADDAVG_W32_H2 32, 8
+ADDAVG_W32_H2 32, 16
+ADDAVG_W32_H2 32, 24
+ADDAVG_W32_H2 32, 32
+ADDAVG_W32_H2 32, 64
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W48_H2 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ mov r6d, %2/2
+
+.loop:
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 16]
+ movu m2, [r4 + 16]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0], m0
+
+ movu m0, [r2 + 32]
+ movu m2, [r4 + 32]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 48]
+ movu m2, [r4 + 48]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0 + 16], m0
+
+ movu m0, [r2 + 64]
+ movu m2, [r4 + 64]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 80]
+ movu m2, [r4 + 80]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0 + 32], m0
+
+ movu m1, [r2 + 2 * r3]
+ movu m3, [r4 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+
+ movu m2, [r2 + 16 + 2 * r3]
+ movu m3, [r4 + 16 + 2 * r5]
+ paddw m2, m3
+ paddw m2, m4
+ psraw m2, 7
+ packuswb m1, m2
+ movu [r0 + r1], m1
+
+ movu m1, [r2 + 32 + 2 * r3]
+ movu m3, [r4 + 32 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+
+ movu m2, [r2 + 48 + 2 * r3]
+ movu m3, [r4 + 48 + 2 * r5]
+ paddw m2, m3
+ paddw m2, m4
+ psraw m2, 7
+ packuswb m1, m2
+ movu [r0 + 16 + r1], m1
+
+ movu m1, [r2 + 64 + 2 * r3]
+ movu m3, [r4 + 64 + 2 * r5]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 7
+
+ movu m2, [r2 + 80 + 2 * r3]
+ movu m3, [r4 + 80 + 2 * r5]
+ paddw m2, m3
+ paddw m2, m4
+ psraw m2, 7
+ packuswb m1, m2
+ movu [r0 + 32 + r1], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ dec r6d
+ jnz .loop
+
+ RET
+
+%endmacro
+
+ADDAVG_W48_H2 48, 64
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W64_H1 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+ mova m4, [OFFSET]
+
+ mov r6d, %2
+
+.loop:
+ movu m0, [r2]
+ movu m2, [r4]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 16]
+ movu m2, [r4 + 16]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0], m0
+
+ movu m0, [r2 + 32]
+ movu m2, [r4 + 32]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 48]
+ movu m2, [r4 + 48]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0 + 16], m0
+
+ movu m0, [r2 + 64]
+ movu m2, [r4 + 64]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 80]
+ movu m2, [r4 + 80]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0 + 32], m0
+
+ movu m0, [r2 + 96]
+ movu m2, [r4 + 96]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 7
+
+ movu m1, [r2 + 112]
+ movu m2, [r4 + 112]
+ paddw m1, m2
+ paddw m1, m4
+ psraw m1, 7
+ packuswb m0, m1
+ movu [r0 + 48], m0
+
+ lea r0, [r0 + r1]
+ lea r2, [r2 + 2 * r3]
+ lea r4, [r4 + 2 * r5]
+
+ dec r6d
+ jnz .loop
+
+ RET
+
+%endmacro
+
+ADDAVG_W64_H1 64, 16
+ADDAVG_W64_H1 64, 32
+ADDAVG_W64_H1 64, 48
+ADDAVG_W64_H1 64, 64
+
+;-----------------------------------------------------------------------------
+
;=============================================================================
; implicit weighted biprediction
;=============================================================================
diff -r 2e4e32a50cd6 -r 806eb643b88c source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/x86/pixel.h Sat Jan 18 13:05:30 2014 +0530
@@ -166,6 +166,41 @@
int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
+#define ADDAVG(func, args) \
+ void func ## _sse2 args; \
+ void func ## _sse4 args;
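+// each ADDAVG(...) line below declares both the _sse2 and _sse4 prototypes, e.g. x265_addAvg_8x8_sse2 and x265_addAvg_8x8_sse4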
+ADDAVG(x265_addAvg_2x4, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_2x8, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_4x2, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_4x4, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_4x8, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_4x16, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_6x8, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x2, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x4, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x6, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x8, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x16, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x32, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_12x16, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x4, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x8, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x12, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x16, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x32, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x64, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_24x32, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_32x8, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_32x16, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_32x24, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_32x32, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_32x64, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_64x16, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_64x32, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_64x48, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_64x64, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_48x64, (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+
#undef DECL_PIXELS
#undef DECL_HEVC_SSD
#undef DECL_X1