[x265] [PATCH] 16bpp: assembly code for intra_pred_dc32
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Dec 6 10:56:32 CET 2013
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1386323560 -19800
# Fri Dec 06 15:22:40 2013 +0530
# Node ID 64690c4d2c8cb8faad603cc05b5f437f97746349
# Parent 53c7147e0e34ec247ab0c9f355c11245773e2fcf
16bpp: assembly code for intra_pred_dc32
diff -r 53c7147e0e34 -r 64690c4d2c8c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Dec 06 15:03:57 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Dec 06 15:22:40 2013 +0530
@@ -657,6 +657,7 @@
p.intra_pred[BLOCK_4x4][1] = x265_intra_pred_dc4_sse4;
p.intra_pred[BLOCK_8x8][1] = x265_intra_pred_dc8_sse4;
p.intra_pred[BLOCK_16x16][1] = x265_intra_pred_dc16_sse4;
+ p.intra_pred[BLOCK_32x32][1] = x265_intra_pred_dc32_sse4;
}
if (cpuMask & X265_CPU_XOP)
{
diff -r 53c7147e0e34 -r 64690c4d2c8c source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Dec 06 15:03:57 2013 +0530
+++ b/source/common/x86/intrapred16.asm Fri Dec 06 15:22:40 2013 +0530
@@ -29,6 +29,7 @@
SECTION .text
cextern pw_1
+cextern pd_32
cextern pw_4096
@@ -317,3 +318,83 @@
.end
RET
+
+;-------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter) ; NOTE(review): argument order looks stale - the body stores through r0, steps rows by r1, reads r2/r3 and loads r5m; confirm against the C prototype
+;-------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc32, 4, 5, 6 ; x86inc cglobal: name, 4 args loaded into r0-r3, 5 GPRs, 6 XMM registers (m0-m5)
+ mov r4d, r5m ; r4 is never read again in this routine - presumably the filter flag, unused here; TODO confirm
+ add r2, 2 ; skip the first 2 bytes (one 16-bit word) of the r2 source row
+ add r3, 2 ; same for the r3 source row
+ add r1, r1 ; double r1; it is the per-row advance of r0 below - presumably pixels -> bytes at 2 bytes/pixel, confirm
+ movu m0, [r3] ; load 64 bytes (32 words) from r3 into m0-m3
+ movu m1, [r3 + 16]
+ movu m2, [r3 + 32]
+ movu m3, [r3 + 48]
+ paddw m0, m1 ; word-wise reduction of the r3 row ...
+ paddw m2, m3
+ paddw m0, m2 ; ... m0 = 8 word lanes, each the sum of 4 inputs
+ movu m1, [r2] ; load 64 bytes (32 words) from r2
+ movu m3, [r2 + 16]
+ movu m4, [r2 + 32]
+ movu m5, [r2 + 48]
+ paddw m1, m3 ; word-wise reduction of the r2 row ...
+ paddw m4, m5
+ paddw m1, m4 ; ... m1 = 8 word lanes, each the sum of 4 inputs
+ paddw m0, m1 ; m0 = 8 lanes, each the sum of 8 inputs (both rows combined)
+ movhlps m1, m0 ; m1[63:0] = m0[127:64]
+ paddw m0, m1 ; low 4 words of m0 = sums of 16 inputs each; upper words are not needed for the result
+ phaddw m0, m0 ; words 0 and 1 = sums of 32 inputs each
+ pmaddwd m0, [pw_1] ; signed word multiply + pair add: with pw_1 presumably all-ones words, dword 0 = sum of all 64 inputs. Exact only while 32 * max_sample <= 32767 (e.g. samples <= 1023) - assumes at most 10-bit input, TODO confirm
+
+ paddd m0, [pd_32] ; sum = sum + 32
+ psrld m0, 6 ; sum = sum / 64
+ pshuflw m0, m0, 0 ; broadcast word 0 across the low 4 words
+ pshufd m0, m0, 0 ; broadcast dword 0 -> all 8 words of m0 hold the DC value
+
+%rep 4
+ ; store DC: each repetition writes 8 rows of 64 bytes (32 words); 4 x 8 = 32 rows in total
+ movu [r0 + 0], m0 ; one row = 4 x 16-byte stores
+ movu [r0 + 16], m0
+ movu [r0 + 32], m0
+ movu [r0 + 48], m0
+ add r0, r1 ; advance r0 to the next row
+ movu [r0 + 0], m0
+ movu [r0 + 16], m0
+ movu [r0 + 32], m0
+ movu [r0 + 48], m0
+ add r0, r1
+ movu [r0 + 0], m0
+ movu [r0 + 16], m0
+ movu [r0 + 32], m0
+ movu [r0 + 48], m0
+ add r0, r1
+ movu [r0 + 0], m0
+ movu [r0 + 16], m0
+ movu [r0 + 32], m0
+ movu [r0 + 48], m0
+ add r0, r1
+ movu [r0 + 0], m0
+ movu [r0 + 16], m0
+ movu [r0 + 32], m0
+ movu [r0 + 48], m0
+ add r0, r1
+ movu [r0 + 0], m0
+ movu [r0 + 16], m0
+ movu [r0 + 32], m0
+ movu [r0 + 48], m0
+ add r0, r1
+ movu [r0 + 0], m0
+ movu [r0 + 16], m0
+ movu [r0 + 32], m0
+ movu [r0 + 48], m0
+ add r0, r1
+ movu [r0 + 0], m0
+ movu [r0 + 16], m0
+ movu [r0 + 32], m0
+ movu [r0 + 48], m0
+ add r0, r1
+%endrep
+
+ RET
More information about the x265-devel
mailing list