[x265] [PATCH] saoCuOrgE0 asm code, improved 500.43 -> 466.58
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Dec 30 14:45:20 CET 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1419246771 -19800
# Node ID ed7405ed73e062a32c1413f8868d5f73edde79ac
# Parent 32ed3f21039a5b93a54da8961442825e4db69d88
saoCuOrgE0 asm code, improved 500.43 -> 466.58
diff -r 32ed3f21039a -r ed7405ed73e0 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon Dec 29 13:49:02 2014 +0530
+++ b/source/common/x86/const-a.asm Mon Dec 22 16:42:51 2014 +0530
@@ -50,6 +50,7 @@
const pb_unpackwq2, db 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7
const pw_swap, times 2 db 6,7,4,5,2,3,0,1
+const pb_2, times 16 db 2
const pb_4, times 16 db 4
const pb_16, times 16 db 16
const pb_64, times 16 db 64
diff -r 32ed3f21039a -r ed7405ed73e0 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Mon Dec 29 13:49:02 2014 +0530
+++ b/source/common/x86/loopfilter.asm Mon Dec 22 16:42:51 2014 +0530
@@ -29,9 +29,12 @@
SECTION_RODATA 32
-pw_2: times 16 db 2
SECTION .text
+cextern pb_1
+cextern pb_128
+cextern pb_2
+
;============================================================================================================
; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft)
@@ -39,47 +42,44 @@
INIT_XMM sse4
cglobal saoCuOrgE0, 4, 4, 8, rec, offsetEo, lcuWidth, signLeft
- neg r3 ; r3 = -iSignLeft
- movd m0, r3d
- pslldq m0, 15 ; m0 = [iSignLeft x .. x]
- pcmpeqb m4, m4 ; m4 = [pb -1]
- pxor m5, m5 ; m5 = 0
- movh m6, [r1] ; m6 = m_offsetEo
+ neg r3 ; r3 = -signLeft
+ movzx r3d, r3b
+ movd m0, r3d
+ mova m4, [pb_128] ; m4 = [80]
+ pxor m5, m5 ; m5 = 0
+ movu m6, [r1] ; m6 = offsetEo
.loop:
- movu m7, [r0] ; m1 = pRec[x]
- mova m1, m7
- movu m2, [r0+1] ; m2 = pRec[x+1]
+ movu m7, [r0] ; m1 = rec[x]
+ movu m2, [r0 + 1] ; m2 = rec[x+1]
- psubusb m3, m2, m7
- psubusb m1, m2
- pcmpeqb m3, m5
- pcmpeqb m1, m5
- pcmpeqb m2, m7
+ pxor m1, m7, m4
+ pxor m3, m2, m4
+ pcmpgtb m2, m1, m3
+ pcmpgtb m3, m1
+ pand m2, [pb_1]
+ por m2, m3
- pabsb m3, m3 ; m1 = (pRec[x] - pRec[x+1]) > 0) ? 1 : 0
- por m1, m3 ; m1 = iSignRight
- pandn m2, m1
+ pslldq m3, m2, 1
+ por m3, m0
- palignr m3, m2, m0, 15 ; m3 = -iSignLeft
- psignb m3, m4 ; m3 = iSignLeft
- mova m0, m4
- pslldq m0, 15
- pand m0, m2 ; [pb 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1]
- paddb m2, m3
- paddb m2, [pw_2] ; m1 = uiEdgeType
- pshufb m3, m6, m2
- pmovzxbw m2, m7 ; rec
- punpckhbw m7, m5
- pmovsxbw m1, m3 ; iOffsetEo
- punpckhbw m3, m3
- psraw m3, 8
- paddw m2, m1
- paddw m7, m3
- packuswb m2, m7
- movu [r0], m2
+ psignb m3, m4 ; m3 = signLeft
+ pxor m0, m0
+ palignr m0, m2, 15
+ paddb m2, m3
+ paddb m2, [pb_2] ; m1 = uiEdgeType
+ pshufb m3, m6, m2
+ pmovzxbw m2, m7 ; rec
+ punpckhbw m7, m5
+ pmovsxbw m1, m3 ; offsetEo
+ punpckhbw m3, m3
+ psraw m3, 8
+ paddw m2, m1
+ paddw m7, m3
+ packuswb m2, m7
+ movu [r0], m2
- add r0q, 16
- sub r2d, 16
+ add r0q, 16
+ sub r2d, 16
jnz .loop
RET
More information about the x265-devel
mailing list