[x265] [PATCH 07 of 29] high_bit_depth: intra_pred_dc_new updated asm and unit test code
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue Jan 13 08:11:15 CET 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1418725213 -19800
# Tue Dec 16 15:50:13 2014 +0530
# Node ID 49cb8574e8f4453f700a52dc8a47fadc966a9c0b
# Parent 70b4e0c84320df0b7443e5aea6e110c1bf483684
high_bit_depth: intra_pred_dc_new updated asm and unit test code
diff -r 70b4e0c84320 -r 49cb8574e8f4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 16 14:02:19 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 16 15:50:13 2014 +0530
@@ -1418,6 +1418,12 @@
p.intra_pred[1][BLOCK_8x8] = x265_intra_pred_dc8_sse4;
p.intra_pred[1][BLOCK_16x16] = x265_intra_pred_dc16_sse4;
p.intra_pred[1][BLOCK_32x32] = x265_intra_pred_dc32_sse4;
+
+ p.intra_pred_new[1][BLOCK_4x4] = x265_intra_pred_dc4_new_sse4;
+ p.intra_pred_new[1][BLOCK_8x8] = x265_intra_pred_dc8_new_sse4;
+ p.intra_pred_new[1][BLOCK_16x16] = x265_intra_pred_dc16_new_sse4;
+ p.intra_pred_new[1][BLOCK_32x32] = x265_intra_pred_dc32_new_sse4;
+
p.planecopy_cp = x265_upShift_8_sse4;
INTRA_ANG_SSE4_COMMON(sse4);
diff -r 70b4e0c84320 -r 49cb8574e8f4 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Tue Dec 16 14:02:19 2014 +0530
+++ b/source/common/x86/intrapred16.asm Tue Dec 16 15:50:13 2014 +0530
@@ -138,7 +138,69 @@
RET
-
+;-----------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
+;-----------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc4_new, 5,6,2 ; r0 = dst, r1 = dstStride, r2 = neighbour buffer, r4 = filter flag
+ lea r3, [r2 + 18] ; r3 = neighbours written down column 0 by "filter left" below
+ add r2, 2 ; r2 = neighbours written across row 0 by "filter top" below (word 0 is skipped)
+
+ movh m0, [r3] ; 4 left neighbours (16-bit each)
+ movh m1, [r2] ; 4 top neighbours (16-bit each)
+
+ paddw m0, m1 ; 4 partial sums: left[i] + top[i]
+ pshufd m1, m0, 1 ; bring partial sums 2,3 down to words 0,1
+ paddw m0, m1 ; words 0,1 = two halves of the total
+ phaddw m0, m0 ; m0 = sum
+
+ test r4d, r4d ; set ZF from the filter flag now; r4d is reused below and nothing up to the jz modifies flags
+
+ pmulhrsw m0, [pw_4096] ; m0 = (sum + 4) / 8
+ movd r4d, m0 ; r4d = dc_val
+ movzx r4d, r4w ; keep only the low word (dc_val)
+ pshuflw m0, m0, 0 ; m0 = word [dc_val ...]
+
+ ; store DC 4x4
+ movh [r0], m0 ; row 0
+ movh [r0 + r1 * 2], m0 ; row 1 (r1 is scaled by 2 here: 16-bit pixels)
+ movh [r0 + r1 * 4], m0 ; row 2
+ lea r5, [r0 + r1 * 4] ; r5 = address of row 2
+ movh [r5 + r1 * 2], m0 ; row 3
+
+ ; do DC filter
+ jz .end ; filter flag was zero: plain DC block is the final result
+ lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2
+ add r4d, r5d ; r4d = DC * 3 + 2
+ movd m0, r4d
+ pshuflw m0, m0, 0 ; m0 = pixDCx3
+
+ ; filter top
+ movu m1, [r2] ; top neighbours (only the low 4 words are stored)
+ paddw m1, m0
+ psraw m1, 2 ; (top[x] + DC * 3 + 2) >> 2
+ movh [r0], m1 ; overwrite top-left pixel, we will update it later
+
+ ; filter top-left
+ movzx r4d, word [r3] ; first left neighbour
+ add r5d, r4d ; r5d = DC * 2 + 2 + left[0]
+ movzx r4d, word [r2] ; first top neighbour
+ add r4d, r5d ; r4d = top[0] + left[0] + DC * 2 + 2
+ shr r4d, 2
+ mov [r0], r4w ; corner pixel of row 0
+
+ ; filter left
+ lea r0, [r0 + r1 * 2] ; r0 = row 1
+ movu m1, [r3 + 2] ; left neighbours starting at the second one
+ paddw m1, m0
+ psraw m1, 2 ; (left[y] + DC * 3 + 2) >> 2
+ movd r3d, m1 ; r3d = filtered values for rows 1 (low word) and 2 (high word)
+ mov [r0], r3w ; row 1, column 0
+ shr r3d, 16
+ mov [r0 + r1 * 2], r3w ; row 2, column 0
+ pextrw [r0 + r1 * 4], m1, 2 ; row 3, column 0
+.end:
+ RET
;-------------------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
@@ -221,6 +283,84 @@
.end:
RET
+;-----------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
+;-----------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc8_new, 5, 7, 2 ; r0 = dst, r1 = dstStride, r2 = neighbour buffer, r4 = filter flag
+ lea r3, [r2 + 34] ; r3 = neighbours written down column 0 by "filter left" below
+ add r2, 2 ; r2 = neighbours written across row 0 by "filter top" below (word 0 is skipped)
+ add r1, r1 ; double the stride - presumably pixels to bytes for 16-bit pixels
+ movu m0, [r3] ; 8 left neighbours
+ movu m1, [r2] ; 8 top neighbours
+
+ paddw m0, m1 ; 8 partial sums: left[i] + top[i]
+ movhlps m1, m0 ; upper 4 partial sums moved to the low half
+ paddw m0, m1 ; 4 partial sums in the low half
+ phaddw m0, m0 ; 2 partial sums in words 0,1
+ pmaddwd m0, [pw_1] ; pair-add words into dwords (assuming pw_1 holds words of 1): dword 0 = sum
+
+ movd r5d, m0
+ add r5d, 8
+ shr r5d, 4 ; r5d = dc_val = (sum + 8) / 16
+ movd m1, r5d
+ pshuflw m1, m1, 0 ; m1 = word [dc_val ...]
+ pshufd m1, m1, 0 ; broadcast to all 8 words
+
+ test r4d, r4d ; set ZF from the filter flag; the stores/lea/mov up to the jz do not modify flags
+
+ ; store DC 8x8
+ mov r6, r0 ; r6 keeps the address of row 0 for the filter stage
+ movu [r0], m1 ; row 0
+ movu [r0 + r1], m1 ; row 1
+ movu [r0 + r1 * 2], m1 ; row 2
+ lea r0, [r0 + r1 * 2] ; r0 = row 2
+ movu [r0 + r1], m1 ; row 3
+ movu [r0 + r1 * 2], m1 ; row 4
+ lea r0, [r0 + r1 * 2] ; r0 = row 4
+ movu [r0 + r1], m1 ; row 5
+ movu [r0 + r1 * 2], m1 ; row 6
+ lea r0, [r0 + r1 * 2] ; r0 = row 6
+ movu [r0 + r1], m1 ; row 7
+
+ ; Do DC Filter
+ jz .end ; filter flag was zero: plain DC block is the final result
+ lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
+ add r5d, r4d ; r5d = DC * 3 + 2
+ movd m1, r5d
+ pshuflw m1, m1, 0 ; m1 = pixDCx3
+ pshufd m1, m1, 0 ; broadcast to all 8 words
+
+ ; filter top
+ movu m0, [r2] ; 8 top neighbours
+ paddw m0, m1
+ psraw m0, 2 ; (top[x] + DC * 3 + 2) >> 2
+ movu [r6], m0 ; row 0; the corner pixel is replaced just below
+
+ ; filter top-left
+ movzx r5d, word [r3] ; first left neighbour
+ add r4d, r5d ; r4d = DC * 2 + 2 + left[0]
+ movzx r5d, word [r2] ; first top neighbour
+ add r5d, r4d ; r5d = top[0] + left[0] + DC * 2 + 2
+ shr r5d, 2
+ mov [r6], r5w ; corner pixel of row 0
+
+ ; filter left
+ add r6, r1 ; r6 = row 1
+ movu m0, [r3 + 2] ; left neighbours starting at the second one
+ paddw m0, m1
+ psraw m0, 2 ; (left[y] + DC * 3 + 2) >> 2
+ pextrw [r6], m0, 0 ; row 1, column 0
+ pextrw [r6 + r1], m0, 1 ; row 2
+ pextrw [r6 + r1 * 2], m0, 2 ; row 3
+ lea r6, [r6 + r1 * 2] ; r6 = row 3
+ pextrw [r6 + r1], m0, 3 ; row 4
+ pextrw [r6 + r1 * 2], m0, 4 ; row 5
+ lea r6, [r6 + r1 * 2] ; r6 = row 5
+ pextrw [r6 + r1], m0, 5 ; row 6
+ pextrw [r6 + r1 * 2], m0, 6 ; row 7 (word 7 of m0 is not used)
+.end:
+ RET
;-------------------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
@@ -358,11 +498,219 @@
.end:
RET
+;-------------------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
+;-------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc16_new, 5, 7, 4 ; r0 = dst, r1 = dstStride, r2 = neighbour buffer, r4 = filter flag
+ lea r3, [r2 + 66] ; r3 = neighbours written down column 0 by "filter left" below
+ add r2, 2 ; r2 = neighbours written across row 0 by "filter top" below (word 0 is skipped)
+ add r1, r1 ; double the stride - presumably pixels to bytes for 16-bit pixels
+ movu m0, [r3] ; left neighbours 0..7
+ movu m1, [r3 + 16] ; left neighbours 8..15
+ movu m2, [r2] ; top neighbours 0..7
+ movu m3, [r2 + 16] ; top neighbours 8..15
+
+ paddw m0, m1 ; 8 word partial sums of the left column
+ paddw m2, m3 ; 8 word partial sums of the top row
+ paddw m0, m2 ; 8 word partial sums of all 32 neighbours
+ movhlps m1, m0 ; upper 4 partial sums moved to the low half
+ paddw m0, m1 ; 4 partial sums in the low half
+ phaddw m0, m0 ; 2 partial sums in words 0,1
+ pmaddwd m0, [pw_1] ; pair-add words into dwords (assuming pw_1 holds words of 1): dword 0 = sum
+
+ movd r5d, m0
+ add r5d, 16
+ shr r5d, 5 ; r5d = dc_val = (sum + 16) / 32
+ movd m1, r5d
+ pshuflw m1, m1, 0 ; m1 = word [dc_val ...]
+ pshufd m1, m1, 0 ; broadcast to all 8 words
+
+ test r4d, r4d ; set ZF from the filter flag; the stores/lea/mov up to the jz do not modify flags
+
+ ; store DC 16x16
+ mov r6, r0 ; r6 keeps the address of row 0 for the filter stage
+ movu [r0], m1 ; rows 0-1, two 16-byte stores per row
+ movu [r0 + 16], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16 + r1], m1
+ lea r0, [r0 + r1 * 2] ; rows 2-3
+ movu [r0], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16 + r1], m1
+ lea r0, [r0 + r1 * 2] ; rows 4-5
+ movu [r0], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16 + r1], m1
+ lea r0, [r0 + r1 * 2] ; rows 6-7
+ movu [r0], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16 + r1], m1
+ lea r0, [r0 + r1 * 2] ; rows 8-9
+ movu [r0], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16 + r1], m1
+ lea r0, [r0 + r1 * 2] ; rows 10-11
+ movu [r0], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16 + r1], m1
+ lea r0, [r0 + r1 * 2] ; rows 12-13
+ movu [r0], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16 + r1], m1
+ lea r0, [r0 + r1 * 2] ; rows 14-15
+ movu [r0], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16 + r1], m1
+
+ ; Do DC Filter
+ jz .end ; filter flag was zero: plain DC block is the final result
+ lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
+ add r5d, r4d ; r5d = DC * 3 + 2
+ movd m1, r5d
+ pshuflw m1, m1, 0 ; m1 = pixDCx3
+ pshufd m1, m1, 0 ; broadcast to all 8 words
+
+ ; filter top
+ movu m2, [r2] ; top neighbours 0..7
+ paddw m2, m1
+ psraw m2, 2 ; (top[x] + DC * 3 + 2) >> 2
+ movu [r6], m2 ; row 0, columns 0..7; the corner pixel is replaced just below
+ movu m3, [r2 + 16] ; top neighbours 8..15
+ paddw m3, m1
+ psraw m3, 2
+ movu [r6 + 16], m3 ; row 0, columns 8..15
+
+ ; filter top-left
+ movzx r5d, word [r3] ; first left neighbour
+ add r4d, r5d ; r4d = DC * 2 + 2 + left[0]
+ movzx r5d, word [r2] ; first top neighbour
+ add r5d, r4d ; r5d = top[0] + left[0] + DC * 2 + 2
+ shr r5d, 2
+ mov [r6], r5w ; corner pixel of row 0
+
+ ; filter left
+ add r6, r1 ; r6 = row 1
+ movu m2, [r3 + 2] ; left neighbours 1..8
+ paddw m2, m1
+ psraw m2, 2 ; (left[y] + DC * 3 + 2) >> 2
+
+ pextrw [r6], m2, 0 ; row 1, column 0
+ pextrw [r6 + r1], m2, 1 ; row 2
+ lea r6, [r6 + r1 * 2]
+ pextrw [r6], m2, 2 ; row 3
+ pextrw [r6 + r1], m2, 3 ; row 4
+ lea r6, [r6 + r1 * 2]
+ pextrw [r6], m2, 4 ; row 5
+ pextrw [r6 + r1], m2, 5 ; row 6
+ lea r6, [r6 + r1 * 2]
+ pextrw [r6], m2, 6 ; row 7
+ pextrw [r6 + r1], m2, 7 ; row 8
+
+ lea r6, [r6 + r1 * 2] ; r6 = row 9
+ movu m3, [r3 + 18] ; left neighbours 9..16 (only the first 7 words are used)
+ paddw m3, m1
+ psraw m3, 2
+
+ pextrw [r6], m3, 0 ; row 9, column 0
+ pextrw [r6 + r1], m3, 1 ; row 10
+ lea r6, [r6 + r1 * 2]
+ pextrw [r6], m3, 2 ; row 11
+ pextrw [r6 + r1], m3, 3 ; row 12
+ lea r6, [r6 + r1 * 2]
+ pextrw [r6], m3, 4 ; row 13
+ pextrw [r6 + r1], m3, 5 ; row 14
+ lea r6, [r6 + r1 * 2]
+ pextrw [r6], m3, 6 ; row 15
+.end:
+ RET
;-------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
;-------------------------------------------------------------------------------------------
INIT_XMM sse4
+cglobal intra_pred_dc32_new, 3, 5, 6 ; only 3 args are loaded: r0 = dst, r1 = dstStride, r2 = neighbour buffer; no edge filter here
+ lea r3, [r2 + 130] ; r3 = second group of 32 neighbours (64 bytes after the first group)
+ add r2, 2 ; r2 = first group of 32 neighbours (word 0 is skipped)
+ add r1, r1 ; double the stride - presumably pixels to bytes for 16-bit pixels
+ movu m0, [r3] ; second group, words 0..7
+ movu m1, [r3 + 16] ; words 8..15
+ movu m2, [r3 + 32] ; words 16..23
+ movu m3, [r3 + 48] ; words 24..31
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2 ; m0 = 8 word partial sums of the second group
+ movu m1, [r2] ; first group, words 0..7
+ movu m3, [r2 + 16] ; words 8..15
+ movu m4, [r2 + 32] ; words 16..23
+ movu m5, [r2 + 48] ; words 24..31
+ paddw m1, m3
+ paddw m4, m5
+ paddw m1, m4 ; m1 = 8 word partial sums of the first group
+ paddw m0, m1 ; 8 word partial sums of all 64 neighbours
+ movhlps m1, m0 ; upper 4 partial sums moved to the low half
+ paddw m0, m1 ; 4 partial sums in the low half
+ phaddw m0, m0 ; 2 partial sums in words 0,1
+ pmaddwd m0, [pw_1] ; pair-add words into dwords (assuming pw_1 holds words of 1): dword 0 = sum
+
+ paddd m0, [pd_32] ; sum = sum + 32
+ psrld m0, 6 ; sum = sum / 64
+ pshuflw m0, m0, 0 ; replicate dc_val across the low 4 words
+ pshufd m0, m0, 0 ; broadcast to all 8 words
+
+ lea r2, [r1 * 3] ; r2 = 3 * stride in bytes (the neighbour pointer is no longer needed)
+ mov r3d, 4 ; 4 iterations x 8 rows = 32 rows
+.loop:
+ ; store DC 32x32
+ movu [r0 + 0], m0 ; row 0 of this group: 4 x 16 bytes = 32 pixels
+ movu [r0 + 16], m0
+ movu [r0 + 32], m0
+ movu [r0 + 48], m0
+ movu [r0 + r1 + 0], m0 ; row 1
+ movu [r0 + r1 + 16], m0
+ movu [r0 + r1 + 32], m0
+ movu [r0 + r1 + 48], m0
+ movu [r0 + r1 * 2 + 0], m0 ; row 2
+ movu [r0 + r1 * 2 + 16], m0
+ movu [r0 + r1 * 2 + 32], m0
+ movu [r0 + r1 * 2 + 48], m0
+ movu [r0 + r2 + 0], m0 ; row 3
+ movu [r0 + r2 + 16], m0
+ movu [r0 + r2 + 32], m0
+ movu [r0 + r2 + 48], m0
+ lea r0, [r0 + r1 * 4] ; advance 4 rows
+ movu [r0 + 0], m0 ; row 4
+ movu [r0 + 16], m0
+ movu [r0 + 32], m0
+ movu [r0 + 48], m0
+ movu [r0 + r1 + 0], m0 ; row 5
+ movu [r0 + r1 + 16], m0
+ movu [r0 + r1 + 32], m0
+ movu [r0 + r1 + 48], m0
+ movu [r0 + r1 * 2 + 0], m0 ; row 6
+ movu [r0 + r1 * 2 + 16], m0
+ movu [r0 + r1 * 2 + 32], m0
+ movu [r0 + r1 * 2 + 48], m0
+ movu [r0 + r2 + 0], m0 ; row 7
+ movu [r0 + r2 + 16], m0
+ movu [r0 + r2 + 32], m0
+ movu [r0 + r2 + 48], m0
+ lea r0, [r0 + r1 * 4] ; advance 4 more rows to the next group of 8
+ dec r3d
+ jnz .loop
+ RET
+
+;-------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
+;-------------------------------------------------------------------------------------------
+INIT_XMM sse4
cglobal intra_pred_dc32, 4, 5, 6
mov r4d, r5m
add r2, 2
More information about the x265-devel
mailing list