[x265] [PATCH 3 of 3] asm: asm code for pelFilterLumaStrong_V/H & pelFilterChroma_V/H for main10 & main12

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Feb 26 10:11:31 CET 2016


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1456466696 -19800
#      Fri Feb 26 11:34:56 2016 +0530
# Node ID d7d0c03b5e6e7fd0258d609ad5e9f4d7c0a40390
# Parent  59d9eca3d144e71f11d509a5dd40b634bb9ab500
asm: asm code for pelFilterLumaStrong_V/H & pelFilterChroma_V/H for main10 & main12

diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Feb 26 11:34:39 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Feb 26 11:34:56 2016 +0530
@@ -1101,6 +1101,11 @@
     }
     if (cpuMask & X265_CPU_SSE4)
     {
+        p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
+        p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
+        p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
+        p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
+
         p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
         p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
         p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Fri Feb 26 11:34:39 2016 +0530
+++ b/source/common/x86/const-a.asm	Fri Feb 26 11:34:56 2016 +0530
@@ -69,6 +69,7 @@
 const pb_000000000000000F,           db 0xff
                             times 15 db 0x00
 const pb_shuf_off4,         times  2 db   0,   4,   1,   5,   2,   6,   3,   7
+const pw_shuf_off4,         times  1 db   0,   1,   8,   9,   2,   3,  10,  11,   4,   5,  12,  13,   6,   7,  14,  15
 
 ;; 16-bit constants
 
diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Fri Feb 26 11:34:39 2016 +0530
+++ b/source/common/x86/loopfilter.asm	Fri Feb 26 11:34:56 2016 +0530
@@ -51,6 +51,8 @@
 cextern hmul_16p
 cextern pw_1_ffff
 cextern pb_shuf_off4
+cextern pw_shuf_off4
+
 ;============================================================================================================
 ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
 ;============================================================================================================
@@ -3758,6 +3760,9 @@
 
 INIT_XMM sse4
 cglobal pelFilterLumaStrong_H, 5,7,10
+%if HIGH_BIT_DEPTH
+    add             r2d, r2d
+%endif
     mov             r1, r2
     neg             r3d
     neg             r4d
@@ -3766,6 +3771,16 @@
     lea             r5, [r2 * 3]
     lea             r6, [r1 * 3]
 
+%if HIGH_BIT_DEPTH
+    movu            m4, [r0]                ; src[0]
+    movu            m3, [r0 + r1]           ; src[-offset]
+    movu            m2, [r0 + r1 * 2]       ; src[-offset * 2]
+    movu            m1, [r0 + r6]           ; src[-offset * 3]
+    movu            m0, [r0 + r1 * 4]       ; src[-offset * 4]
+    movu            m5, [r0 + r2]           ; src[offset]
+    movu            m6, [r0 + r2 * 2]       ; src[offset * 2]
+    movu            m7, [r0 + r5]           ; src[offset * 3]
+%else
     pmovzxbw        m4, [r0]                ; src[0]
     pmovzxbw        m3, [r0 + r1]           ; src[-offset]
     pmovzxbw        m2, [r0 + r1 * 2]       ; src[-offset * 2]
@@ -3774,6 +3789,7 @@
     pmovzxbw        m5, [r0 + r2]           ; src[offset]
     pmovzxbw        m6, [r0 + r2 * 2]       ; src[offset * 2]
     pmovzxbw        m7, [r0 + r5]           ; src[offset * 3]
+%endif
 
     paddw           m0, m0                  ; m0*2
     mova            m8, m2
@@ -3841,6 +3857,15 @@
     paddw           m0, m1
     paddw           m3, m4
     paddw           m9, m5
+
+%if HIGH_BIT_DEPTH
+    movh            [r0 + r6], m0           ; src[-offset * 3] <- low  8 bytes of m0
+    movhps          [r0 + r1], m0           ; src[-offset]     <- high 8 bytes of m0
+    movh            [r0], m3                ; src[0]           <- low  8 bytes of m3
+    movhps          [r0 + r2 * 2], m3       ; src[offset * 2]  <- high 8 bytes of m3
+    movh            [r0 + r2 * 1], m9       ; src[offset]      <- low  8 bytes of m9
+    movhps          [r0 + r1 * 2], m9       ; src[-offset * 2] <- high 8 bytes of m9
+%else
     packuswb        m0, m0
     packuswb        m3, m9
 
@@ -3850,14 +3875,41 @@
     pextrd          [r0 + r2 * 2], m3, 1
     pextrd          [r0 + r2 * 1], m3, 2
     pextrd          [r0 + r1 * 2], m3, 3
+%endif
     RET
 
 INIT_XMM sse4
 cglobal pelFilterLumaStrong_V, 5,5,10
+%if HIGH_BIT_DEPTH
+    add             r1d, r1d
+%endif
     neg             r3d
     neg             r4d
     lea             r2, [r1 * 3]
 
+%if HIGH_BIT_DEPTH
+    movu            m0, [r0 - 8]            ; row 0: 8 words starting 4 pixels left of src
+    movu            m1, [r0 + r1 * 1 - 8]   ; row 1: same 8 columns
+    movu            m2, [r0 + r1 * 2 - 8]   ; row 2: same 8 columns
+    movu            m3, [r0 + r2 * 1 - 8]   ; row 3: same 8 columns (r2 = r1 * 3)
+
+    punpckhwd       m4, m0, m1              ; rows 0-1, columns 4-7: [m4 m4 m5 m5 m6 m6 m7 m7]
+    punpcklwd       m0, m1                  ; rows 0-1, columns 0-3: [m0 m0 m1 m1 m2 m2 m3 m3]
+
+    punpckhwd       m5, m2, m3              ; rows 2-3, columns 4-7: [m4 m4 m5 m5 m6 m6 m7 m7]
+    punpcklwd       m2, m3                  ; rows 2-3, columns 0-3: [m0 m0 m1 m1 m2 m2 m3 m3]
+
+    punpckhdq       m3, m0, m2              ; [m2 m2 m2 m2 m3 m3 m3 m3]
+    punpckldq       m0, m2                  ; [m0 m0 m0 m0 m1 m1 m1 m1]
+    psrldq          m1, m0, 8               ; [m1 m1 m1 m1 x x x x]
+    mova            m2, m3                  ; [m2 m2 m2 m2 x x x x]
+    punpckhqdq      m3, m3                  ; [m3 m3 m3 m3 x x x x]
+
+    punpckhdq       m6, m4, m5              ; [m6 m6 m6 m6 m7 m7 m7 m7]
+    punpckldq       m4, m5                  ; [m4 m4 m4 m4 m5 m5 m5 m5]
+    psrldq          m7, m6, 8               ; [m7 m7 m7 m7 x x x x]
+    psrldq          m5, m4, 8               ; [m5 m5 m5 m5 x x x x]
+%else
     movh            m0, [r0 - 4]            ; src[-offset * 4] row 0
     movh            m1, [r0 + r1 * 1 - 4]   ; src[-offset * 4] row 1
     movh            m2, [r0 + r1 * 2 - 4]   ; src[-offset * 4] row 2
@@ -3890,6 +3942,7 @@
     pmovzxbw        m5, m5
     pmovzxbw        m6, m6
     pmovzxbw        m7, m7
+%endif
 
     paddw           m0, m0                  ; m0*2
     mova            m8, m2
@@ -3957,6 +4010,35 @@
     paddw           m0, m1
     paddw           m3, m4
     paddw           m9, m5
+
+%if HIGH_BIT_DEPTH
+    ; Each result register packs two output columns (4 rows each), low | high qword:
+    ;   m0 = col -3 | col -1,   m3 = col 0 | col +2,   m9 = col +1 | col -2
+    ; (column numbers are pixel positions relative to src, as implied by the stores below)
+
+    psrldq           m1, m0, 8              ; m1 = col -1
+    psrldq           m2, m3, 8              ; m2 = col +2
+
+    mova            m4, m9                  ; m4 = col +1 (low qword)
+    psrldq          m5, m9, 8               ; m5 = col -2
+
+    ; interleave column pairs so that each row's pixels become adjacent words
+    punpcklwd       m0, m5                  ; per row: [col -3, col -2]
+    punpcklwd       m1, m3                  ; per row: [col -1, col  0]
+    punpcklwd       m4, m2                  ; per row: [col +1, col +2]
+
+    punpckldq       m9, m0, m1              ; rows 0-1: [col -3 .. col 0] each
+    punpckhdq       m0, m1                  ; rows 2-3: [col -3 .. col 0] each
+
+    movh            [r0 + r1 * 0 - 6], m9   ; row 0, 4 words starting 3 pixels left of src
+    movhps          [r0 + r1 * 1 - 6], m9   ; row 1
+    movh            [r0 + r1 * 2 - 6], m0   ; row 2
+    movhps          [r0 + r2 * 1 - 6], m0   ; row 3
+    pextrd          [r0 + r1 * 0 + 2], m4, 0 ; row 0, 2 words starting 1 pixel right of src
+    pextrd          [r0 + r1 * 1 + 2], m4, 1 ; row 1
+    pextrd          [r0 + r1 * 2 + 2], m4, 2 ; row 2
+    pextrd          [r0 + r2 * 1 + 2], m4, 3 ; row 3
+%else
     packuswb        m0, m0
     packuswb        m3, m9
 
@@ -3986,20 +4068,31 @@
     pextrw          [r0 + r1 * 1 + 1], m4, 1
     pextrw          [r0 + r1 * 2 + 1], m4, 2
     pextrw          [r0 + r2 * 1 + 1], m4, 3
+%endif
     RET
 %endif ; ARCH_X86_64
 
 %if ARCH_X86_64
 INIT_XMM sse4
 cglobal pelFilterChroma_H, 6,6,5
+%if HIGH_BIT_DEPTH
+    add             r2d, r2d
+%endif
     mov             r1, r2
     neg             r3d
     neg             r1
 
+%if HIGH_BIT_DEPTH
+    movu            m4, [r0]                ; src[0]
+    movu            m3, [r0 + r1]           ; src[-offset]
+    movu            m0, [r0 + r2]           ; src[offset]
+    movu            m2, [r0 + r1 * 2]       ; src[-offset * 2]
+%else
     pmovzxbw        m4, [r0]                ; src[0]
     pmovzxbw        m3, [r0 + r1]           ; src[-offset]
     pmovzxbw        m0, [r0 + r2]           ; src[offset]
     pmovzxbw        m2, [r0 + r1 * 2]       ; src[-offset * 2]
+%endif
 
     psubw           m1, m4, m3              ; m4 - m3
     psubw           m2, m0                  ; m2 - m5
@@ -4032,21 +4125,35 @@
     pmaxsw          m3, m0
     pminsw          m3, [pw_pixel_max]
 
+%if HIGH_BIT_DEPTH
+    movh            [r0 + r1], m3           ; src[-offset] <- low  8 bytes of m3
+    movhps          [r0], m3                ; src[0]       <- high 8 bytes of m3
+%else
     packuswb        m3, m3
     movd            [r0 + r1], m3
     pextrd          [r0], m3, 1
+%endif
     RET
 
 INIT_XMM sse4
 cglobal pelFilterChroma_V, 6,6,5
+%if HIGH_BIT_DEPTH
+    add             r1d, r1d
+%endif
     neg             r3d
     lea             r2, [r1 * 3]
 
+%if HIGH_BIT_DEPTH
+    movu            m4, [r0 + r1 * 0 - 4]   ; src[-offset*2, -offset, 0, offset] [m2 m3 m4 m5]
+    movu            m3, [r0 + r1 * 1 - 4]
+    movu            m0, [r0 + r1 * 2 - 4]
+    movu            m2, [r0 + r2 * 1 - 4]
+%else
     pmovzxbw        m4, [r0 + r1 * 0 - 2]   ; src[-offset*2, -offset, 0, offset] [m2 m3 m4 m5]
     pmovzxbw        m3, [r0 + r1 * 1 - 2]
     pmovzxbw        m0, [r0 + r1 * 2 - 2]
     pmovzxbw        m2, [r0 + r2 * 1 - 2]
-
+%endif
     punpcklwd       m4, m3
     punpcklwd       m0, m2
     punpckldq       m2, m4, m0              ; [m2 m2 m2 m2 m3 m3 m3 m3]
@@ -4085,11 +4192,19 @@
     pmaxsw          m3, m0
     pminsw          m3, [pw_pixel_max]
 
+%if HIGH_BIT_DEPTH
+    pshufb          m3, [pw_shuf_off4]      ; reorder words to [0 4 1 5 2 6 3 7]: pairs word i with word i+4
+    pextrd          [r0 + r1 * 0 - 2], m3, 0 ; row 0: 2 words starting 1 pixel left of src
+    pextrd          [r0 + r1 * 1 - 2], m3, 1 ; row 1
+    pextrd          [r0 + r1 * 2 - 2], m3, 2 ; row 2
+    pextrd          [r0 + r2 * 1 - 2], m3, 3 ; row 3 (r2 = r1 * 3)
+%else
     packuswb        m3, m3
     pshufb          m3, [pb_shuf_off4]
     pextrw          [r0 + r1 * 0 - 1], m3, 0
     pextrw          [r0 + r1 * 1 - 1], m3, 1
     pextrw          [r0 + r1 * 2 - 1], m3, 2
     pextrw          [r0 + r2 * 1 - 1], m3, 3
+%endif
     RET
 %endif ; ARCH_X86_64


More information about the x265-devel mailing list