[x265] [PATCH 2 of 2] asm: asm code for deblocking filter horizontal and vertical

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Oct 9 14:47:08 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1444286180 -19800
#      Thu Oct 08 12:06:20 2015 +0530
# Node ID 86627e458e6e2e357fe1746067392c6984b8915f
# Parent  38e4b94377fa6ffe57472c49ecff6c909ed4f6dc
asm: asm code for deblocking filter horizontal and vertical

diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Oct 06 14:19:56 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Oct 08 12:06:20 2015 +0530
@@ -2541,6 +2541,9 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
 
 #if X86_64
+        p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
+        p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
+
         p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
         p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
         p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Tue Oct 06 14:19:56 2015 +0530
+++ b/source/common/x86/const-a.asm	Thu Oct 08 12:06:20 2015 +0530
@@ -67,6 +67,7 @@
 
 ;; 16-bit constants
 
+const pw_n1,                times 16 dw -1
 const pw_1,                 times 16 dw 1
 const pw_2,                 times 16 dw 2
 const pw_3,                 times 16 dw 3
diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Oct 06 14:19:56 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Thu Oct 08 12:06:20 2015 +0530
@@ -37,6 +37,7 @@
 
 SECTION .text
 cextern pb_1
+cextern pb_01
 cextern pb_128
 cextern pb_2
 cextern pw_2
@@ -45,6 +46,8 @@
 cextern pw_1
 cextern hmul_16p
 cextern pb_4
+cextern pw_4
+cextern pw_n1
 
 
 ;============================================================================================================
@@ -2231,6 +2234,248 @@
     RET
 %endif ; ARCH_X86_64
 
+%if ARCH_X86_64
+;; argument registers used -
+; r0    - src
+; r1    - srcStep
+; r2    - offset
+; r3    - tcP
+; r4    - tcQ
+
+INIT_XMM sse4
+cglobal pelFilterLumaStrong_H, 5,7,10
+    mov             r1, r2
+    neg             r3d
+    neg             r4d
+    neg             r1
+
+    lea             r5, [r2 * 3]
+    lea             r6, [r1 * 3]
+
+    pmovzxbw        m4, [r0]                ; src[0]
+    pmovzxbw        m3, [r0 + r1]           ; src[-offset]
+    pmovzxbw        m2, [r0 + r1 * 2]       ; src[-offset * 2]
+    pmovzxbw        m1, [r0 + r6]           ; src[-offset * 3]
+    pmovzxbw        m0, [r0 + r1 * 4]       ; src[-offset * 4]
+    pmovzxbw        m5, [r0 + r2]           ; src[offset]
+    pmovzxbw        m6, [r0 + r2 * 2]       ; src[offset * 2]
+    pmovzxbw        m7, [r0 + r5]           ; src[offset * 3]
+
+    paddw           m0, m0                  ; m0*2
+    mova            m8, m2
+    paddw           m8, m3                  ; m2 + m3
+    paddw           m8, m4                  ; m2 + m3 + m4
+    mova            m9, m8
+    paddw           m9, m9                  ; 2*m2 + 2*m3 + 2*m4
+    paddw           m8, m1                  ; m2 + m3 + m4 + m1
+    paddw           m0, m8                  ; 2*m0 + m2+ m3 + m4 + m1
+    paddw           m9, m1
+    paddw           m0, m1
+    paddw           m9, m5                  ; m1 + 2*m2 + 2*m3 + 2*m4 + m5
+    paddw           m0, m1                  ; 2*m0 + 3*m1 + m2 + m3 + m4
+
+    punpcklqdq      m0, m9
+    punpcklqdq      m1, m3
+
+    paddw           m3, m4
+    mova            m9, m5
+    paddw           m9, m6
+    paddw           m7, m7                  ; 2*m7
+    paddw           m9, m3                  ; m3 + m4 + m5 + m6
+    mova            m3, m9
+    paddw           m3, m3                  ; 2*m3 + 2*m4 + 2*m5 + 2*m6
+    paddw           m7, m9                  ; 2*m7 + m3 + m4 + m5 + m6
+    paddw           m7, m6
+    psubw           m3, m6                  ; 2*m3 + 2*m4 + 2*m5 + m6
+    paddw           m7, m6                  ; m3 + m4 + m5 + 3*m6 + 2*m7
+    paddw           m3, m2                  ; m2 + 2*m3 + 2*m4 + 2*m5 + m6
+
+    punpcklqdq      m9, m8
+    punpcklqdq      m3, m7
+    punpcklqdq      m5, m2
+    punpcklqdq      m4, m6
+
+    movd            m7, r3d                 ; -tcP
+    movd            m2, r4d                 ; -tcQ
+    pshufb          m7, [pb_01]
+    pshufb          m2, [pb_01]
+    mova            m6, m2
+    punpcklqdq      m6, m7
+
+    paddw           m0, [pw_4]
+    paddw           m3, [pw_4]
+    paddw           m9, [pw_2]
+
+    psraw           m0, 3
+    psraw           m3, 3
+    psraw           m9, 2
+
+    psubw           m0, m1
+    psubw           m3, m4
+    psubw           m9, m5
+
+    pmaxsw          m0, m7
+    pmaxsw          m3, m2
+    pmaxsw          m9, m6
+    psignw          m7, [pw_n1]
+    psignw          m2, [pw_n1]
+    psignw          m6, [pw_n1]
+    pminsw          m0, m7
+    pminsw          m3, m2
+    pminsw          m9, m6
+
+    paddw           m0, m1
+    paddw           m3, m4
+    paddw           m9, m5
+    packuswb        m0, m0
+    packuswb        m3, m9
+
+    movd            [r0 + r6], m0
+    pextrd          [r0 + r1], m0, 1
+    movd            [r0], m3
+    pextrd          [r0 + r2 * 2], m3, 1
+    pextrd          [r0 + r2 * 1], m3, 2
+    pextrd          [r0 + r1 * 2], m3, 3
+    RET
+
+INIT_XMM sse4
+cglobal pelFilterLumaStrong_V, 5,5,10
+    neg             r3d
+    neg             r4d
+    lea             r2, [r1 * 3]
+
+    movh            m0, [r0 - 4]            ; src[-offset * 4] row 0
+    movh            m1, [r0 + r1 * 1 - 4]   ; src[-offset * 4] row 1
+    movh            m2, [r0 + r1 * 2 - 4]   ; src[-offset * 4] row 2
+    movh            m3, [r0 + r2 * 1 - 4]   ; src[-offset * 4] row 3
+
+    punpcklbw       m0, m1
+    punpcklbw       m2, m3
+    mova            m4, m0
+    punpcklwd       m0, m2
+    punpckhwd       m4, m2
+    mova            m1, m0
+    mova            m2, m0
+    mova            m3, m0
+    pshufd          m0, m0, 0
+    pshufd          m1, m1, 1
+    pshufd          m2, m2, 2
+    pshufd          m3, m3, 3
+    mova            m5, m4
+    mova            m6, m4
+    mova            m7, m4
+    pshufd          m4, m4, 0
+    pshufd          m5, m5, 1
+    pshufd          m6, m6, 2
+    pshufd          m7, m7, 3
+    pmovzxbw        m0, m0
+    pmovzxbw        m1, m1
+    pmovzxbw        m2, m2
+    pmovzxbw        m3, m3
+    pmovzxbw        m4, m4
+    pmovzxbw        m5, m5
+    pmovzxbw        m6, m6
+    pmovzxbw        m7, m7
+
+    paddw           m0, m0                  ; m0*2
+    mova            m8, m2
+    paddw           m8, m3                  ; m2 + m3
+    paddw           m8, m4                  ; m2 + m3 + m4
+    mova            m9, m8
+    paddw           m9, m9                  ; 2*m2 + 2*m3 + 2*m4
+    paddw           m8, m1                  ; m2 + m3 + m4 + m1
+    paddw           m0, m8                  ; 2*m0 + m2+ m3 + m4 + m1
+    paddw           m9, m1
+    paddw           m0, m1
+    paddw           m9, m5                  ; m1 + 2*m2 + 2*m3 + 2*m4 + m5
+    paddw           m0, m1                  ; 2*m0 + 3*m1 + m2 + m3 + m4
+
+    punpcklqdq      m0, m9
+    punpcklqdq      m1, m3
+
+    paddw           m3, m4
+    mova            m9, m5
+    paddw           m9, m6
+    paddw           m7, m7                  ; 2*m7
+    paddw           m9, m3                  ; m3 + m4 + m5 + m6
+    mova            m3, m9
+    paddw           m3, m3                  ; 2*m3 + 2*m4 + 2*m5 + 2*m6
+    paddw           m7, m9                  ; 2*m7 + m3 + m4 + m5 + m6
+    paddw           m7, m6
+    psubw           m3, m6                  ; 2*m3 + 2*m4 + 2*m5 + m6
+    paddw           m7, m6                  ; m3 + m4 + m5 + 3*m6 + 2*m7
+    paddw           m3, m2                  ; m2 + 2*m3 + 2*m4 + 2*m5 + m6
+
+    punpcklqdq      m9, m8
+    punpcklqdq      m3, m7
+    punpcklqdq      m5, m2
+    punpcklqdq      m4, m6
+
+    movd            m7, r3d                 ; -tcP
+    movd            m2, r4d                 ; -tcQ
+    pshufb          m7, [pb_01]
+    pshufb          m2, [pb_01]
+    mova            m6, m2
+    punpcklqdq      m6, m7
+
+    paddw           m0, [pw_4]
+    paddw           m3, [pw_4]
+    paddw           m9, [pw_2]
+
+    psraw           m0, 3
+    psraw           m3, 3
+    psraw           m9, 2
+
+    psubw           m0, m1
+    psubw           m3, m4
+    psubw           m9, m5
+
+    pmaxsw          m0, m7
+    pmaxsw          m3, m2
+    pmaxsw          m9, m6
+    psignw          m7, [pw_n1]
+    psignw          m2, [pw_n1]
+    psignw          m6, [pw_n1]
+    pminsw          m0, m7
+    pminsw          m3, m2
+    pminsw          m9, m6
+
+    paddw           m0, m1
+    paddw           m3, m4
+    paddw           m9, m5
+    packuswb        m0, m0
+    packuswb        m3, m9
+
+    ; 4x6 output rows -
+    ; m0 - col 0
+    ; m3 - col 3
+    mova            m1, m0
+    mova            m2, m3
+    mova            m4, m3
+    mova            m5, m3
+    pshufd          m1, m1, 1               ; col 2
+    pshufd          m2, m2, 1               ; col 5
+    pshufd          m4, m4, 2               ; col 4
+    pshufd          m5, m5, 3               ; col 1
+
+    ; transpose 4x6 to 6x4
+    punpcklbw       m0, m5
+    punpcklbw       m1, m3
+    punpcklbw       m4, m2
+    punpcklwd       m0, m1
+
+    movd            [r0 + r1 * 0 - 3], m0
+    pextrd          [r0 + r1 * 1 - 3], m0, 1
+    pextrd          [r0 + r1 * 2 - 3], m0, 2
+    pextrd          [r0 + r2 * 1 - 3], m0, 3
+    pextrw          [r0 + r1 * 0 + 1], m4, 0
+    pextrw          [r0 + r1 * 1 + 1], m4, 1
+    pextrw          [r0 + r1 * 2 + 1], m4, 2
+    pextrw          [r0 + r2 * 1 + 1], m4, 3
+    RET
+%endif ; ARCH_X86_64
+
+
 
 ;void saoCuStatsE2_c(const int16_t *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
 ;{
diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Oct 06 14:19:56 2015 +0530
+++ b/source/common/x86/loopfilter.h	Thu Oct 08 12:06:20 2015 +0530
@@ -46,4 +46,7 @@
 DECL_SAO(sse4);
 DECL_SAO(avx2);
 
+void PFX(pelFilterLumaStrong_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+void PFX(pelFilterLumaStrong_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
 #endif // ifndef X265_LOOPFILTER_H
diff -r 38e4b94377fa -r 86627e458e6e source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Oct 06 14:19:56 2015 +0530
+++ b/source/test/pixelharness.cpp	Thu Oct 08 12:06:20 2015 +0530
@@ -1835,6 +1835,72 @@
     return true;
 }
 
+bool PixelHarness::check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt)
+{
+    intptr_t srcStep = 1, offset = 64;
+    int32_t tcP, tcQ, maskP, maskQ, tc;
+    int j = 0;
+
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
+    for (int i = 0; i < TEST_CASES; i++)
+        memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel) * BUFFSIZE);
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        tc      = rand() % PIXEL_MAX;
+        maskP   = (rand() % PIXEL_MAX) - 1;
+        maskQ   = (rand() % PIXEL_MAX) - 1;
+        tcP     = (tc & maskP);
+        tcQ     = (tc & maskQ);
+
+        int index = rand() % 3;
+
+        ref(pixel_test_buff[index]  + 4 * offset + j, srcStep, offset, tcP, tcQ);
+        checked(opt, pixel_test_buff1[index] + 4 * offset + j, srcStep, offset, tcP, tcQ);
+
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel) * BUFFSIZE))
+            return false;
+
+        reportfail()
+        j += INCR;
+    }
+
+    return true;
+}
+
+bool PixelHarness::check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt)
+{
+    intptr_t srcStep = 64, offset = 1;
+    int32_t tcP, tcQ, maskP, maskQ, tc;
+    int j = 0;
+
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
+    for (int i = 0; i < TEST_CASES; i++)
+        memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel) * BUFFSIZE);
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        tc      = rand() % PIXEL_MAX;
+        maskP   = (rand() % PIXEL_MAX) - 1;
+        maskQ   = (rand() % PIXEL_MAX) - 1;
+        tcP     = (tc & maskP);
+        tcQ     = (tc & maskQ);
+
+        int index = rand() % 3;
+
+        ref(pixel_test_buff[index]  + 4 + j, srcStep, offset, tcP, tcQ);
+        checked(opt, pixel_test_buff1[index] + 4 + j, srcStep, offset, tcP, tcQ);
+
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel) * BUFFSIZE))
+            return false;
+
+        reportfail()
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     if (opt.pu[part].satd)
@@ -2484,6 +2550,24 @@
         }
     }
 
+    if (opt.pelFilterLumaStrong[0])
+    {
+        if (!check_pelFilterLumaStrong_V(ref.pelFilterLumaStrong[0], opt.pelFilterLumaStrong[0]))
+        {
+            printf("pelFilterLumaStrong Vertical failed!\n");
+            return false;
+        }
+    }
+
+    if (opt.pelFilterLumaStrong[1])
+    {
+        if (!check_pelFilterLumaStrong_H(ref.pelFilterLumaStrong[1], opt.pelFilterLumaStrong[1]))
+        {
+            printf("pelFilterLumaStrong Horizontal failed!\n");
+            return false;
+        }
+    }
+
     return true;
 }
 
@@ -2973,4 +3057,20 @@
         uint64_t dummy;
         REPORT_SPEEDUP(opt.planeClipAndMax, ref.planeClipAndMax, pbuf1, 128, 63, 62, &dummy, 1, PIXEL_MAX - 1);
     }
+
+    if (opt.pelFilterLumaStrong[0])
+    {
+        int32_t tcP = (rand() % PIXEL_MAX) - 1;
+        int32_t tcQ = (rand() % PIXEL_MAX) - 1;
+        HEADER0("pelFilterLumaStrong_Vertical");
+        REPORT_SPEEDUP(opt.pelFilterLumaStrong[0], ref.pelFilterLumaStrong[0], pbuf1, STRIDE, 1, tcP, tcQ);
+    }
+
+    if (opt.pelFilterLumaStrong[1])
+    {
+        int32_t tcP = (rand() % PIXEL_MAX) - 1;
+        int32_t tcQ = (rand() % PIXEL_MAX) - 1;
+        HEADER0("pelFilterLumaStrong_Horizontal");
+        REPORT_SPEEDUP(opt.pelFilterLumaStrong[1], ref.pelFilterLumaStrong[1], pbuf1, 1, STRIDE, tcP, tcQ);
+    }
 }
diff -r 38e4b94377fa -r 86627e458e6e source/test/pixelharness.h
--- a/source/test/pixelharness.h	Tue Oct 06 14:19:56 2015 +0530
+++ b/source/test/pixelharness.h	Thu Oct 08 12:06:20 2015 +0530
@@ -119,6 +119,8 @@
     bool check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt);
     bool check_costC1C2Flag(costC1C2Flag_t ref, costC1C2Flag_t opt);
     bool check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt);
+    bool check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
+    bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
 
 public:
 


More information about the x265-devel mailing list