[x265] [PATCH] asm-16bpp: addAvg code for all luma and chroma sizes
dnyaneshwar at multicorewareinc.com
Wed Feb 19 11:53:47 CET 2014
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1392807092 -19800
# Wed Feb 19 16:21:32 2014 +0530
# Node ID cede20cde62ba0a96ac181bcf78a508097de0e7c
# Parent 6150985c3d535f0ea7a1dc0b8f3c69e65e30d25b
asm-16bpp: addAvg code for all luma and chroma sizes
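
For reference, the scalar computation these SSE4 kernels vectorize is roughly the following (a sketch only, not the x265 C primitive itself; it assumes 10-bit depth, so shift = IF_INTERNAL_PREC + 1 - bitDepth = 5 and offset = 2 * IF_INTERNAL_OFFS + (1 << 4) = 16400, matching the new pw_16400 and pw_1023 constants; the helper name addAvg_ref is illustrative only):

    #include <stdint.h>

    /* Sketch of the scalar form of the 16bpp addAvg implemented below.
     * 10-bit only: the asm hard-codes the 16400 offset and the 1023 clamp. */
    static void addAvg_ref(const int16_t* src0, const int16_t* src1, uint16_t* dst,
                           intptr_t stride0, intptr_t stride1, intptr_t dstStride,
                           int width, int height)
    {
        const int shift  = 5;      /* IF_INTERNAL_PREC + 1 - bitDepth            */
        const int offset = 16400;  /* 2 * IF_INTERNAL_OFFS + (1 << (shift - 1))  */
        const int maxVal = 1023;   /* (1 << bitDepth) - 1, the asm's pw_1023     */

        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int v = (src0[x] + src1[x] + offset) >> shift;
                dst[x] = (uint16_t)(v < 0 ? 0 : (v > maxVal ? maxVal : v));
            }
            src0 += stride0;
            src1 += stride1;
            dst  += dstStride;
        }
    }

The narrow kernels (2xN, 4x2, 6x8, 8x2, 8x6) add pw_16400 and arithmetic-shift right by 5 directly. The wider macros split the same rounding into pmulhrsw with pw_1024, which computes ((src0 + src1) * 1024 + 16384) >> 15 == (src0 + src1 + 16) >> 5, followed by adding pw_512 (= 16384 >> 5); the result is identical, and both paths then clamp to [0, pw_1023].
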
diff -r 6150985c3d53 -r cede20cde62b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Feb 19 16:21:32 2014 +0530
@@ -679,10 +679,13 @@
p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
#define CHROMA_ADDAVG(cpu) \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(2, 8, cpu); \
SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 2, cpu); \
SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 4, cpu); \
SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 8, cpu); \
SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_ADDAVG_FUNC_DEF(6, 8, cpu); \
SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 2, cpu); \
SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 4, cpu); \
SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 6, cpu); \
@@ -831,6 +834,9 @@
}
if (cpuMask & X265_CPU_SSE4)
{
+ LUMA_ADDAVG(_sse4);
+ CHROMA_ADDAVG(_sse4);
+
p.dct[DCT_8x8] = x265_dct8_sse4;
p.quant = x265_quant_sse4;
p.dequant_normal = x265_dequant_normal_sse4;
@@ -1330,10 +1336,6 @@
SETUP_INTRA_ANG32(33, 33, sse4);
p.dct[DCT_8x8] = x265_dct8_sse4;
-
- p.chroma[X265_CSP_I420].addAvg[CHROMA_2x4] = x265_addAvg_2x4_sse4;
- p.chroma[X265_CSP_I420].addAvg[CHROMA_2x8] = x265_addAvg_2x8_sse4;
- p.chroma[X265_CSP_I420].addAvg[CHROMA_6x8] = x265_addAvg_6x8_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r 6150985c3d53 -r cede20cde62b source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/const-a.asm Wed Feb 19 16:21:32 2014 +0530
@@ -36,8 +36,10 @@
const pw_128, times 16 dw 128
const pw_256, times 16 dw 256
const pw_512, times 16 dw 512
+const pw_1023, times 8 dw 1023
const pw_1024, times 16 dw 1024
const pw_4096, times 16 dw 4096
+const pw_16400, times 8 dw 16400
const pw_00ff, times 16 dw 0x00ff
const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
diff -r 6150985c3d53 -r cede20cde62b source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Wed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/intrapred16.asm Wed Feb 19 16:21:32 2014 +0530
@@ -45,7 +45,6 @@
const c_mode32_10_0, db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
const pw_unpackwdq, times 8 db 0,1
-const pw_1023, times 8 dw 1023
const pw_ang8_12, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 1
const pw_ang8_13, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 8, 9, 0, 1
const pw_ang8_14, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 10, 11, 4, 5, 0, 1
@@ -58,6 +57,7 @@
cextern pw_1
cextern pw_8
+cextern pw_1023
cextern pd_16
cextern pd_32
cextern pw_4096
diff -r 6150985c3d53 -r cede20cde62b source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Wed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/mc-a.asm Wed Feb 19 16:21:32 2014 +0530
@@ -52,6 +52,9 @@
cextern pw_128
cextern pw_256
cextern pw_512
+cextern pw_1023
+cextern pw_1024
+cextern pw_16400
cextern pw_00ff
cextern pw_pixel_max
cextern sw_64
@@ -65,6 +68,873 @@
; r2 = pDst, r3 = iStride0
; r4 = iStride1, r5 = iDstStride
+%if HIGH_BIT_DEPTH
+INIT_XMM sse4
+cglobal addAvg_2x4, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m7, [pw_16400]
+ mova m0, [pw_1023]
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+ movd m1, [r0]
+ movd m2, [r0 + r3]
+ movd m3, [r1]
+ movd m4, [r1 + r4]
+
+ punpckldq m1, m2
+ punpckldq m3, m4
+
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movd m2, [r0]
+ movd m4, [r0 + r3]
+ movd m5, [r1]
+ movd m6, [r1 + r4]
+
+ punpckldq m2, m4
+ punpckldq m5, m6
+ punpcklqdq m1, m2
+ punpcklqdq m3, m5
+
+ paddw m1, m3
+ paddw m1, m7
+ psraw m1, 5
+ pxor m6, m6
+ pmaxsw m1, m6
+ pminsw m1, m0
+
+ movd [r2], m1
+ pextrd [r2 + r5], m1, 1
+ lea r2, [r2 + 2 * r5]
+ pextrd [r2], m1, 2
+ pextrd [r2 + r5], m1, 3
+
+ RET
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+
+ mova m7, [pw_16400]
+ mova m0, [pw_1023]
+
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+%rep 2
+ movd m1, [r0]
+ movd m2, [r0 + r3]
+ movd m3, [r1]
+ movd m4, [r1 + r4]
+
+ punpckldq m1, m2
+ punpckldq m3, m4
+
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movd m2, [r0]
+ movd m4, [r0 + r3]
+ movd m5, [r1]
+ movd m6, [r1 + r4]
+
+ punpckldq m2, m4
+ punpckldq m5, m6
+ punpcklqdq m1, m2
+ punpcklqdq m3, m5
+
+ paddw m1, m3
+ paddw m1, m7
+ psraw m1, 5
+ pxor m6, m6
+ pmaxsw m1, m6
+ pminsw m1, m0
+
+ movd [r2], m1
+ pextrd [r2 + r5], m1, 1
+ lea r2, [r2 + 2 * r5]
+ pextrd [r2], m1, 2
+ pextrd [r2 + r5], m1, 3
+
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+ lea r2, [r2 + 2 * r5]
+%endrep
+ RET
+
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_4x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+
+ mova m4, [pw_16400]
+ mova m5, [pw_1023]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+ movh m0, [r0]
+ movh m1, [r0 + r3]
+ movh m2, [r1]
+ movh m3, [r1 + r4]
+
+ punpcklqdq m0, m1
+ punpcklqdq m2, m3
+
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 5
+ pmaxsw m0, m6
+ pminsw m0, m5
+
+ movh [r2], m0
+ movhps [r2 + r5], m0
+ RET
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_6x8, 6,7,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_16400]
+ mova m5, [pw_1023]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+%rep 4
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 5
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movh [r2], m0
+ pextrd [r2 + 8], m0, 2
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 5
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movh [r2 + r5], m1
+ pextrd [r2 + r5 + 8], m1, 2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+%endrep
+ RET
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_8x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_16400]
+ mova m5, [pw_1023]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 5
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 5
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + r5], m1
+ RET
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_8x6, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_16400]
+ mova m5, [pw_1023]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+%rep 3
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ paddw m0, m4
+ psraw m0, 5
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ paddw m1, m4
+ psraw m1, 5
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+%endrep
+ RET
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W4_H4 1
+INIT_XMM sse4
+cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m7, [pw_1024]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+ mov r6d, %1/4
+
+.loop
+%rep 2
+ movh m0, [r0]
+ movh m1, [r0 + r3]
+ movh m2, [r1]
+ movh m3, [r1 + r4]
+
+ punpcklqdq m0, m1
+ punpcklqdq m2, m3
+
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+
+ pmaxsw m0, m6
+ pminsw m0, m5
+
+ movh [r2], m0
+ movhps [r2 + r5], m0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+%endrep
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W4_H4 4
+ADDAVG_W4_H4 8
+ADDAVG_W4_H4 16
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W8_H4 1
+INIT_XMM sse4
+cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m7, [pw_1024]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ mov r6d, %1/4
+
+.loop
+%rep 2
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+%endrep
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W8_H4 4
+ADDAVG_W8_H4 8
+ADDAVG_W8_H4 16
+ADDAVG_W8_H4 32
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W12_H4 1
+INIT_XMM sse4
+cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m7, [pw_1024]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ mov r6d, %1/4
+
+.loop
+%rep 2
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2], m0
+
+ movh m0, [r0 + 16]
+ movh m1, [r0 + 16 + r3]
+ movh m2, [r1 + 16]
+ movh m3, [r1 + 16 + r4]
+
+ punpcklqdq m0, m1
+ punpcklqdq m2, m3
+
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movh [r2 + 16], m0
+ movhps [r2 + r5 + 16], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + r5], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+%endrep
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W12_H4 16
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W16_H4 1
+INIT_XMM sse4
+cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m7, [pw_1024]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ mov r6d, %1/4
+
+.loop
+%rep 2
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m1, [r0 + 16]
+ movu m2, [r1 + 16]
+ paddw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 16], m1
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + r5], m1
+
+ movu m2, [r0 + 16 + r3]
+ movu m3, [r1 + 16 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m7
+ paddw m2, m4
+ pmaxsw m2, m6
+ pminsw m2, m5
+ movu [r2 + r5 + 16], m2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+%endrep
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W16_H4 4
+ADDAVG_W16_H4 8
+ADDAVG_W16_H4 12
+ADDAVG_W16_H4 16
+ADDAVG_W16_H4 32
+ADDAVG_W16_H4 64
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W24_H2 2
+INIT_XMM sse4
+cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m7, [pw_1024]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+ mov r6d, %2/2
+
+.loop
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m1, [r0 + 16]
+ movu m2, [r1 + 16]
+ paddw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 16], m1
+
+ movu m0, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2 + 32], m0
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + r5], m1
+
+ movu m2, [r0 + r3 + 16]
+ movu m3, [r1 + r4 + 16]
+ paddw m2, m3
+ pmulhrsw m2, m7
+ paddw m2, m4
+ pmaxsw m2, m6
+ pminsw m2, m5
+ movu [r2 + r5 + 16], m2
+
+ movu m1, [r0 + r3 + 32]
+ movu m3, [r1 + r4 + 32]
+ paddw m1, m3
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + r5 + 32], m1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W24_H2 24, 32
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W32_H2 1
+INIT_XMM sse4
+cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m7, [pw_1024]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+ mov r6d, %1/2
+
+.loop
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m1, [r0 + 16]
+ movu m2, [r1 + 16]
+ paddw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 16], m1
+
+ movu m0, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2 + 32], m0
+
+ movu m1, [r0 + 48]
+ movu m2, [r1 + 48]
+ paddw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 48], m1
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + r5], m1
+
+ movu m2, [r0 + 16 + r3]
+ movu m3, [r1 + 16 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m7
+ paddw m2, m4
+ pmaxsw m2, m6
+ pminsw m2, m5
+ movu [r2 + r5 + 16], m2
+
+ movu m1, [r0 + 32 + r3]
+ movu m3, [r1 + 32 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + r5 + 32], m1
+
+ movu m2, [r0 + 48 + r3]
+ movu m3, [r1 + 48 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m7
+ paddw m2, m4
+ pmaxsw m2, m6
+ pminsw m2, m5
+ movu [r2 + r5 + 48], m2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W32_H2 8
+ADDAVG_W32_H2 16
+ADDAVG_W32_H2 24
+ADDAVG_W32_H2 32
+ADDAVG_W32_H2 64
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W48_H2 1
+INIT_XMM sse4
+cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m7, [pw_1024]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+ mov r6d, %1/2
+
+.loop
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m1, [r0 + 16]
+ movu m2, [r1 + 16]
+ paddw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 16], m1
+
+ movu m0, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2 + 32], m0
+
+ movu m1, [r0 + 48]
+ movu m2, [r1 + 48]
+ paddw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 48], m1
+
+ movu m0, [r0 + 64]
+ movu m2, [r1 + 64]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2 + 64], m0
+
+ movu m1, [r0 + 80]
+ movu m2, [r1 + 80]
+ paddw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 80], m1
+
+ movu m1, [r0 + r3]
+ movu m3, [r1 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + r5], m1
+
+ movu m2, [r0 + 16 + r3]
+ movu m3, [r1 + 16 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m7
+ paddw m2, m4
+ pmaxsw m2, m6
+ pminsw m2, m5
+ movu [r2 + 16 + r5], m2
+
+ movu m1, [r0 + 32 + r3]
+ movu m3, [r1 + 32 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 32 + r5], m1
+
+ movu m2, [r0 + 48 + r3]
+ movu m3, [r1 + 48 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m7
+ paddw m2, m4
+ pmaxsw m2, m6
+ pminsw m2, m5
+ movu [r2 + 48 + r5], m2
+
+ movu m1, [r0 + 64 + r3]
+ movu m3, [r1 + 64 + r4]
+ paddw m1, m3
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 64 + r5], m1
+
+ movu m2, [r0 + 80 + r3]
+ movu m3, [r1 + 80 + r4]
+ paddw m2, m3
+ pmulhrsw m2, m7
+ paddw m2, m4
+ pmaxsw m2, m6
+ pminsw m2, m5
+ movu [r2 + 80 + r5], m2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W48_H2 64
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W64_H1 1
+INIT_XMM sse4
+cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m7, [pw_1024]
+ pxor m6, m6
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ mov r6d, %1
+
+.loop
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m1, [r0 + 16]
+ movu m2, [r1 + 16]
+ paddw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 16], m1
+
+ movu m0, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2 + 32], m0
+
+ movu m1, [r0 + 48]
+ movu m2, [r1 + 48]
+ paddw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 48], m1
+
+ movu m0, [r0 + 64]
+ movu m2, [r1 + 64]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2 + 64], m0
+
+ movu m1, [r0 + 80]
+ movu m2, [r1 + 80]
+ paddw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 80], m1
+
+ movu m0, [r0 + 96]
+ movu m2, [r1 + 96]
+ paddw m0, m2
+ pmulhrsw m0, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m5
+ movu [r2 + 96], m0
+
+ movu m1, [r0 + 112]
+ movu m2, [r1 + 112]
+ paddw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m4
+ pmaxsw m1, m6
+ pminsw m1, m5
+ movu [r2 + 112], m1
+
+ add r2, r5
+ add r0, r3
+ add r1, r4
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W64_H1 16
+ADDAVG_W64_H1 32
+ADDAVG_W64_H1 48
+ADDAVG_W64_H1 64
+;-----------------------------------------------------------------------------
+%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal addAvg_2x4, 6,6,8, src0, src1, dst, src0Stride, src1tride, dstStride
@@ -1087,6 +1957,7 @@
ADDAVG_W64_H1 48
ADDAVG_W64_H1 64
;-----------------------------------------------------------------------------
+%endif ; HIGH_BIT_DEPTH
;=============================================================================
; implicit weighted biprediction
diff -r 6150985c3d53 -r cede20cde62b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Feb 19 12:21:13 2014 +0530
+++ b/source/test/pixelharness.cpp Wed Feb 19 16:21:32 2014 +0530
@@ -881,6 +881,11 @@
bool PixelHarness::check_addAvg(addAvg_t ref, addAvg_t opt)
{
+#if HIGH_BIT_DEPTH
+ int old_depth = X265_DEPTH;
+ X265_DEPTH = 10;
+#endif
+
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
@@ -897,11 +902,19 @@
opt(short_test_buff[index1] + j, short_test_buff[index2] + j, opt_dest, STRIDE, STRIDE, STRIDE);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ {
+#if HIGH_BIT_DEPTH
+ X265_DEPTH = old_depth;
+#endif
return false;
+ }
j += INCR;
}
+#if HIGH_BIT_DEPTH
+ X265_DEPTH = old_depth;
+#endif
return true;
}
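
Note on the pixelharness change: the new kernels round with the literal pw_16400 and clamp against pw_1023 rather than the depth-dependent pw_pixel_max, so they are only valid at 10-bit depth. Pinning X265_DEPTH to 10 for the duration of the check appears intended to keep the C reference, whose shift and clamp ceiling both derive from X265_DEPTH, on the same values (shift 5, max 1023) as the hard-coded constants in the assembly.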