[x265] [PATCH 5 of 9] asm: intrapred dc32 sse2 high bit
dtyx265 at gmail.com
dtyx265 at gmail.com
Fri Mar 6 01:19:58 CET 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1425596109 28800
# Node ID 9fb1aba51e2ab4ebde4ab65a3cca9db2ed122ec4
# Parent 912c42dcb4d9b399515e6c1ed6be70db3bf5f675
asm: intrapred dc32 sse2 high bit
This patch moves x265_intra_pred_dc32_sse2 within the file to group it with the other sse2 primitives.
It also adds it to asm-primitives.cpp.
diff -r 912c42dcb4d9 -r 9fb1aba51e2a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Mar 05 14:31:59 2015 -0800
+++ b/source/common/x86/asm-primitives.cpp Thu Mar 05 14:55:09 2015 -0800
@@ -871,6 +871,7 @@
p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2;
p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2;
p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_sse2;
+ p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_sse2;
p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = x265_intra_pred_planar4_sse2;
diff -r 912c42dcb4d9 -r 9fb1aba51e2a source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Thu Mar 05 14:31:59 2015 -0800
+++ b/source/common/x86/intrapred16.asm Thu Mar 05 14:55:09 2015 -0800
@@ -380,70 +380,6 @@
.end:
RET
-;-----------------------------------------------------------------------------------
-; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
-;-----------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_dc4, 5,6,2
- lea r3, [r2 + 18]
- add r2, 2
-
- movh m0, [r3] ; sumAbove
- movh m1, [r2] ; sumLeft
-
- paddw m0, m1
- pshufd m1, m0, 1
- paddw m0, m1
- phaddw m0, m0 ; m0 = sum
-
- test r4d, r4d
-
- pmulhrsw m0, [pw_4096] ; m0 = (sum + 4) / 8
- movd r4d, m0 ; r4d = dc_val
- movzx r4d, r4w
- pshuflw m0, m0, 0 ; m0 = word [dc_val ...]
-
- ; store DC 4x4
- movh [r0], m0
- movh [r0 + r1 * 2], m0
- movh [r0 + r1 * 4], m0
- lea r5, [r0 + r1 * 4]
- movh [r5 + r1 * 2], m0
-
- ; do DC filter
- jz .end
- lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2
- add r4d, r5d ; r4d = DC * 3 + 2
- movd m0, r4d
- pshuflw m0, m0, 0 ; m0 = pixDCx3
-
- ; filter top
- movu m1, [r2]
- paddw m1, m0
- psraw m1, 2
- movh [r0], m1 ; overwrite top-left pixel, we will update it later
-
- ; filter top-left
- movzx r4d, word [r3]
- add r5d, r4d
- movzx r4d, word [r2]
- add r4d, r5d
- shr r4d, 2
- mov [r0], r4w
-
- ; filter left
- lea r0, [r0 + r1 * 2]
- movu m1, [r3 + 2]
- paddw m1, m0
- psraw m1, 2
- movd r3d, m1
- mov [r0], r3w
- shr r3d, 16
- mov [r0 + r1 * 2], r3w
- pextrw [r0 + r1 * 4], m1, 2
-.end:
- RET
-
;-------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
;-------------------------------------------------------------------------------------------
@@ -505,6 +441,70 @@
%endrep
RET
+;-----------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter) ; 4x4 block, 16-bit samples; r0 = dst, r1 = dstStride, r2 = above, r4 = filter
+;-----------------------------------------------------------------------------------
+INIT_XMM sse4 ; pextrw-to-memory below needs SSE4.1; phaddw/pmulhrsw need SSSE3
+cglobal intra_pred_dc4, 5,6,2 ; x86inc: 5 args, 6 GPRs (r0-r5), 2 XMM regs (m0-m1)
+ lea r3, [r2 + 18] ; r3 = second neighbour run, 9 words past the base pointer (feeds the left-column filter below)
+ add r2, 2 ; r2 = first neighbour run, skipping the word at offset 0 (feeds the top-row filter below)
+
+ movh m0, [r3] ; 4 words at [r3]; NOTE(review): originally labelled "sumAbove", but the code below uses r3 for the left column
+ movh m1, [r2] ; 4 words at [r2]; NOTE(review): originally labelled "sumLeft", but the code below uses r2 for the top row
+
+ paddw m0, m1 ; words 0-3 = pairwise sums s0..s3 of the two runs
+ pshufd m1, m0, 1 ; move words 2-3 down to word positions 0-1
+ paddw m0, m1 ; word0 = s0 + s2, word1 = s1 + s3
+ phaddw m0, m0 ; word0 of m0 = sum of all 8 neighbour samples
+
+ test r4d, r4d ; ZF = (filter == 0); no instruction before the jz below modifies flags
+
+ pmulhrsw m0, [pw_4096] ; m0 = (sum + 4) / 8, i.e. (sum * 4096 + 0x4000) >> 15 -- assumes pw_4096 holds words of 4096
+ movd r4d, m0 ; r4d = dc_val (in the low word; upper word is cleared next)
+ movzx r4d, r4w ; keep only the low word so r4d = dc_val
+ pshuflw m0, m0, 0 ; m0 = word [dc_val ...] (low 4 words)
+
+ ; store DC 4x4 (four rows; row step is r1 * 2 bytes, i.e. r1 counts 16-bit samples)
+ movh [r0], m0 ; row 0
+ movh [r0 + r1 * 2], m0 ; row 1
+ movh [r0 + r1 * 4], m0 ; row 2
+ lea r5, [r0 + r1 * 4] ; r5 = address of row 2 (lea leaves flags untouched)
+ movh [r5 + r1 * 2], m0 ; row 3
+
+ ; do DC filter (skipped entirely when filter == 0)
+ jz .end ; consumes ZF from the earlier "test r4d, r4d"
+ lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2
+ add r4d, r5d ; r4d = DC * 3 + 2
+ movd m0, r4d
+ pshuflw m0, m0, 0 ; m0 = pixDCx3 (low 4 words = DC * 3 + 2)
+
+ ; filter top: row 0 = (neighbour + 3 * DC + 2) >> 2
+ movu m1, [r2] ; 16-byte load; only the low 4 words are stored
+ paddw m1, m0
+ psraw m1, 2
+ movh [r0], m1 ; overwrite top-left pixel, we will update it later
+
+ ; filter top-left: ([r2] + [r3] + 2 * DC + 2) >> 2
+ movzx r4d, word [r3] ; first sample of the r3 run
+ add r5d, r4d ; r5d = 2 * DC + 2 + [r3]
+ movzx r4d, word [r2] ; first sample of the r2 run
+ add r4d, r5d ; r4d = [r2] + [r3] + 2 * DC + 2
+ shr r4d, 2
+ mov [r0], r4w ; final value of the top-left pixel
+
+ ; filter left: column 0 of rows 1-3 = (neighbour + 3 * DC + 2) >> 2
+ lea r0, [r0 + r1 * 2] ; r0 = row 1
+ movu m1, [r3 + 2] ; samples 1.. of the r3 run (sample 0 was used for the top-left pixel)
+ paddw m1, m0
+ psraw m1, 2
+ movd r3d, m1 ; r3d = filtered words 0 and 1 (low / high half)
+ mov [r0], r3w ; row 1, column 0
+ shr r3d, 16 ; bring word 1 down
+ mov [r0 + r1 * 2], r3w ; row 2, column 0
+ pextrw [r0 + r1 * 4], m1, 2 ; row 3, column 0 (word 2 of m1)
+.end:
+ RET
+
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
More information about the x265-devel
mailing list