[x265] [PATCH 2 of 3] asm: asm code for pelFilterChroma_V/H

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Feb 26 10:11:30 CET 2016


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1456466679 -19800
#      Fri Feb 26 11:34:39 2016 +0530
# Node ID 59d9eca3d144e71f11d509a5dd40b634bb9ab500
# Parent  5ff8ee940ad7f4d34b106ae4999b996245c87919
asm: asm code for pelFilterChroma_V/H

diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Feb 26 11:33:33 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Feb 26 11:34:39 2016 +0530
@@ -2535,6 +2535,8 @@
 #if X86_64
         p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
         p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
+        p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
+        p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
 
 //        p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
         p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Fri Feb 26 11:33:33 2016 +0530
+++ b/source/common/x86/const-a.asm	Fri Feb 26 11:34:39 2016 +0530
@@ -48,6 +48,8 @@
 const pb_a1,                times 16 db 0xa1
 
 const pb_01,                times  8 db   0,   1
+const pb_0123,              times  4 db   0,   1                ; pshufb mask: source word 0 -> words 0-3
+                            times  4 db   2,   3                ;              source word 1 -> words 4-7
 const hsub_mul,             times 16 db   1,  -1
 const pw_swap,              times  2 db   6,   7,   4,   5,   2,   3,   0,   1
 const pb_unpackbd1,         times  2 db   0,   0,   0,   0,   1,   1,   1,   1,   2,   2,   2,   2,   3,   3,   3,   3
@@ -66,6 +68,7 @@
                             times 12 db 0x00
 const pb_000000000000000F,           db 0xff
                             times 15 db 0x00
+const pb_shuf_off4,         times  2 db   0,   4,   1,   5,   2,   6,   3,   7 ; pshufb mask: interleave bytes 0-3 with bytes 4-7 (both halves index bytes 0-7)
 
 ;; 16-bit constants
 
@@ -117,6 +120,8 @@
 const hmul_16p,             times 16 db   1
                             times  8 db   1,  -1
 const pw_exp2_0_15,                  dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768
+const pw_1_ffff,            times  4 dw 1                       ; low 4 words  = +1
+                            times  4 dw 0xFFFF                  ; high 4 words = 0xFFFF (-1 as a signed word)
 
 
 ;; 32-bit constants
diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Fri Feb 26 11:33:33 2016 +0530
+++ b/source/common/x86/loopfilter.asm	Fri Feb 26 11:34:39 2016 +0530
@@ -36,6 +36,7 @@
 cextern pb_3
 cextern pb_4
 cextern pb_01
+cextern pb_0123
 cextern pb_15
 cextern pb_31
 cextern pb_124
@@ -48,8 +49,8 @@
 cextern pb_movemask
 cextern pb_movemask_32
 cextern hmul_16p
-
-
+cextern pw_1_ffff
+cextern pb_shuf_off4
 ;============================================================================================================
 ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
 ;============================================================================================================
@@ -3987,3 +3988,108 @@
     pextrw          [r0 + r2 * 1 + 1], m4, 3
     RET
 %endif ; ARCH_X86_64
+
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal pelFilterChroma_H, 6,6,5            ; r0=src r1=srcStep r2=offset r3=tc r4=maskP r5=maskQ; uses m0-m4
+    mov             r1, r2                  ; incoming srcStep is never read; r1 becomes scratch
+    neg             r3d                     ; r3d = -tc
+    neg             r1                      ; r1 = -offset
+    ; In the comments, m2..m5 label the taps src[-offset*2], src[-offset], src[0], src[offset]; tap m5 sits in register m0.
+    pmovzxbw        m4, [r0]                ; src[0]            (each load widens 8 bytes to 8 words)
+    pmovzxbw        m3, [r0 + r1]           ; src[-offset]
+    pmovzxbw        m0, [r0 + r2]           ; src[offset]
+    pmovzxbw        m2, [r0 + r1 * 2]       ; src[-offset * 2]
+
+    psubw           m1, m4, m3              ; m4 - m3
+    psubw           m2, m0                  ; m2 - m5 (tap m5 is held in register m0)
+    paddw           m2, [pw_4]              ; add rounding term (pw_4 is defined elsewhere; presumably words of 4)
+    psllw           m1, 2                   ; (m4 - m3) * 4
+    paddw           m1, m2                  ; (m4 - m3) * 4 + (m2 - m5 + rounding)
+    psraw           m1, 3                   ; arithmetic >> 3: unclipped delta
+
+    movd            m0, r3d
+    pshufb          m0, [pb_01]             ; -tc (low word of r3d broadcast to all 8 words)
+
+    pmaxsw          m1, m0                  ; lower clamp: max(delta, -tc)
+    psignw          m0, [pw_n1]             ; flip sign to get +tc (pw_n1 is defined elsewhere; presumably words of -1)
+    pminsw          m1, m0                  ; delta
+    punpcklqdq      m1, m1                  ; copy the 4 low deltas into the high half
+
+    shl             r5d, 16                 ; low 16 bits of maskQ -> bits 31:16
+    or              r5w, r4w                ; bits 15:0 = low 16 bits of maskP
+    punpcklqdq      m3, m4                  ; low 4 words = src[-offset] pixels, high 4 words = src[0] pixels
+    mova            m2, [pw_1_ffff]         ; words: 1,1,1,1,-1,-1,-1,-1
+
+    movd            m0, r5d
+    pshufb          m0, [pb_0123]           ; words 0-3 = maskP, words 4-7 = maskQ (16 bits of each)
+
+    pand            m0, m1                  ; (delta & maskP) (delta & maskQ)
+    psignw          m0, m2                  ; negate the maskQ half only
+    paddw           m3, m0                  ; src[-offset] + (delta & maskP) | src[0] - (delta & maskQ)
+
+    pxor            m0, m0
+    pmaxsw          m3, m0                  ; clamp to >= 0
+    pminsw          m3, [pw_pixel_max]      ; clamp to <= pw_pixel_max (defined elsewhere)
+
+    packuswb        m3, m3                  ; words -> bytes with unsigned saturation
+    movd            [r0 + r1], m3           ; write 4 pixels at src[-offset]
+    pextrd          [r0], m3, 1             ; write 4 pixels at src[0]
+    RET
+
+INIT_XMM sse4
+cglobal pelFilterChroma_V, 6,6,5            ; r0=src r1=srcStep r2=offset r3=tc r4=maskP r5=maskQ; uses m0-m4
+    neg             r3d                     ; r3d = -tc
+    lea             r2, [r1 * 3]            ; r2 = 3 * srcStep; the offset argument is overwritten unread
+    ; In the comments, m2..m5 label the four taps of a row (one word per row after the transpose); tap m5 ends up in register m0.
+    pmovzxbw        m4, [r0 + r1 * 0 - 2]   ; src[-offset*2, -offset, 0, offset] [m2 m3 m4 m5]  (row 0)
+    pmovzxbw        m3, [r0 + r1 * 1 - 2]   ; row 1 (taps are read at fixed bytes -2..+1, i.e. as if offset == 1)
+    pmovzxbw        m0, [r0 + r1 * 2 - 2]   ; row 2
+    pmovzxbw        m2, [r0 + r2 * 1 - 2]   ; row 3
+
+    punpcklwd       m4, m3                  ; interleave rows 0/1 tap by tap
+    punpcklwd       m0, m2                  ; interleave rows 2/3 tap by tap
+    punpckldq       m2, m4, m0              ; [m2 m2 m2 m2 m3 m3 m3 m3]
+    punpckhdq       m4, m0                  ; [m4 m4 m4 m4 m5 m5 m5 m5]
+    psrldq          m3, m2, 8               ; tap m3 of rows 0-3 moved to the low half
+    psrldq          m0, m4, 8               ; tap m5 of rows 0-3 moved to the low half
+
+    psubw           m1, m4, m3              ; m4 - m3
+    psubw           m2, m0                  ; m2 - m5 (tap m5 is held in register m0)
+    paddw           m2, [pw_4]              ; add rounding term (pw_4 is defined elsewhere; presumably words of 4)
+    psllw           m1, 2                   ; (m4 - m3) * 4
+    paddw           m1, m2                  ; (m4 - m3) * 4 + (m2 - m5 + rounding)
+    psraw           m1, 3                   ; arithmetic >> 3: unclipped delta
+
+    movd            m0, r3d
+    pshufb          m0, [pb_01]             ; -tc (low word of r3d broadcast to all 8 words)
+
+    pmaxsw          m1, m0                  ; lower clamp: max(delta, -tc)
+    psignw          m0, [pw_n1]             ; flip sign to get +tc (pw_n1 is defined elsewhere; presumably words of -1)
+    pminsw          m1, m0                  ; delta
+    punpcklqdq      m1, m1                  ; copy the 4 low deltas into the high half
+
+    shl             r5d, 16                 ; low 16 bits of maskQ -> bits 31:16
+    or              r5w, r4w                ; bits 15:0 = low 16 bits of maskP
+    punpcklqdq      m3, m4                  ; low 4 words = tap m3 of rows 0-3, high 4 words = tap m4 of rows 0-3
+    mova            m2, [pw_1_ffff]         ; words: 1,1,1,1,-1,-1,-1,-1
+
+    movd            m0, r5d
+    pshufb          m0, [pb_0123]           ; words 0-3 = maskP, words 4-7 = maskQ (16 bits of each)
+
+    pand            m0, m1                  ; (delta & maskP) (delta & maskQ)
+    psignw          m0, m2                  ; negate the maskQ half only
+    paddw           m3, m0                  ; m3 + (delta & maskP) | m4 - (delta & maskQ)
+
+    pxor            m0, m0
+    pmaxsw          m3, m0                  ; clamp to >= 0
+    pminsw          m3, [pw_pixel_max]      ; clamp to <= pw_pixel_max (defined elsewhere)
+
+    packuswb        m3, m3                  ; bytes 0-3 = new tap m3 of rows 0-3, bytes 4-7 = new tap m4 of rows 0-3
+    pshufb          m3, [pb_shuf_off4]      ; regroup per row: word k = (new m3, new m4) of row k
+    pextrw          [r0 + r1 * 0 - 1], m3, 0 ; row 0: write 2 pixels starting at src - 1
+    pextrw          [r0 + r1 * 1 - 1], m3, 1 ; row 1
+    pextrw          [r0 + r1 * 2 - 1], m3, 2 ; row 2
+    pextrw          [r0 + r2 * 1 - 1], m3, 3 ; row 3
+    RET
+%endif ; ARCH_X86_64
diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Fri Feb 26 11:33:33 2016 +0530
+++ b/source/common/x86/loopfilter.h	Fri Feb 26 11:34:39 2016 +0530
@@ -48,5 +48,7 @@
 
 void PFX(pelFilterLumaStrong_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
 void PFX(pelFilterLumaStrong_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+void PFX(pelFilterChroma_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
+void PFX(pelFilterChroma_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
 
 #endif // ifndef X265_LOOPFILTER_H
diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Fri Feb 26 11:33:33 2016 +0530
+++ b/source/test/pixelharness.cpp	Fri Feb 26 11:34:39 2016 +0530
@@ -1912,6 +1912,68 @@
     return true;
 }
 
+bool PixelHarness::check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt)
+{
+    // Edge pixels are adjacent in memory (srcStep); the filter taps lie 64 pixels apart (offset).
+    const intptr_t srcStep = 1, offset = 64;
+
+    // opt works on its own copy so that both routines filter identical data in place.
+    pixel opt_buff[TEST_CASES][BUFFSIZE];
+    for (int t = 0; t < TEST_CASES; t++)
+        memcpy(opt_buff[t], pixel_test_buff[t], sizeof(pixel) * BUFFSIZE);
+
+    for (int iter = 0, pos = 0; iter < ITERS; iter++, pos += INCR)
+    {
+        int32_t tc = rand() % PIXEL_MAX;
+        int32_t maskP = (rand() % PIXEL_MAX) - 1;
+        int32_t maskQ = (rand() % PIXEL_MAX) - 1;
+        int bufIdx = rand() % 3;
+
+        // Begin 4 * offset into the buffer, leaving room for the src[-offset * 2] tap.
+        intptr_t start = 4 * offset + pos;
+        ref(pixel_test_buff[bufIdx] + start, srcStep, offset, tc, maskP, maskQ);
+        checked(opt, opt_buff[bufIdx] + start, srcStep, offset, tc, maskP, maskQ);
+
+        // Whole buffers are compared, so writes outside the filtered pixels are caught too.
+        if (memcmp(pixel_test_buff[bufIdx], opt_buff[bufIdx], sizeof(pixel) * BUFFSIZE) != 0)
+            return false;
+        reportfail()
+    }
+
+    return true;
+}
+
+bool PixelHarness::check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChroma_t opt)
+{
+    // Edge pixels are one 64-pixel row apart (srcStep); the filter taps are adjacent (offset).
+    const intptr_t srcStep = 64, offset = 1;
+
+    // opt works on its own copy so that both routines filter identical data in place.
+    pixel opt_buff[TEST_CASES][BUFFSIZE];
+    for (int t = 0; t < TEST_CASES; t++)
+        memcpy(opt_buff[t], pixel_test_buff[t], sizeof(pixel) * BUFFSIZE);
+
+    for (int iter = 0, pos = 0; iter < ITERS; iter++, pos += INCR)
+    {
+        int32_t tc = rand() % PIXEL_MAX;
+        int32_t maskP = (rand() % PIXEL_MAX) - 1;
+        int32_t maskQ = (rand() % PIXEL_MAX) - 1;
+        int bufIdx = rand() % 3;
+
+        // Begin 4 pixels into the buffer, leaving room for the taps left of src.
+        intptr_t start = 4 + pos;
+        ref(pixel_test_buff[bufIdx] + start, srcStep, offset, tc, maskP, maskQ);
+        checked(opt, opt_buff[bufIdx] + start, srcStep, offset, tc, maskP, maskQ);
+
+        // Whole buffers are compared, so writes outside the filtered pixels are caught too.
+        if (memcmp(pixel_test_buff[bufIdx], opt_buff[bufIdx], sizeof(pixel) * BUFFSIZE) != 0)
+            return false;
+        reportfail()
+    }
+
+    return true;
+}
+
 bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     if (opt.pu[part].satd)
@@ -2570,6 +2632,24 @@
         }
     }
 
+    if (opt.pelFilterChroma[0])
+    {
+        if (!check_pelFilterChroma_V(ref.pelFilterChroma[0], opt.pelFilterChroma[0]))
+        {
+            printf("pelFilterChroma Vertical failed!\n");
+            return false;
+        }
+    }
+
+    if (opt.pelFilterChroma[1])
+    {
+        if (!check_pelFilterChroma_H(ref.pelFilterChroma[1], opt.pelFilterChroma[1]))
+        {
+            printf("pelFilterChroma Horizontal failed!\n");
+            return false;
+        }
+    }
+
     return true;
 }
 
@@ -3069,4 +3149,22 @@
         HEADER0("pelFilterLumaStrong_Horizontal");
         REPORT_SPEEDUP(opt.pelFilterLumaStrong[1], ref.pelFilterLumaStrong[1], pbuf1, 1, STRIDE, tcP, tcQ);
     }
+
+    if (opt.pelFilterChroma[0])
+    {
+        int32_t tc = (rand() % PIXEL_MAX);
+        int32_t maskP = (rand() % PIXEL_MAX) - 1;
+        int32_t maskQ = (rand() % PIXEL_MAX) - 1;
+        HEADER0("pelFilterChroma_Vertical");
+        REPORT_SPEEDUP(opt.pelFilterChroma[0], ref.pelFilterChroma[0], pbuf1, STRIDE, 1, tc, maskP, maskQ);
+    }
+
+    if (opt.pelFilterChroma[1])
+    {
+        int32_t tc = (rand() % PIXEL_MAX);
+        int32_t maskP = (rand() % PIXEL_MAX) - 1;
+        int32_t maskQ = (rand() % PIXEL_MAX) - 1;
+        HEADER0("pelFilterChroma_Horizontal");
+        REPORT_SPEEDUP(opt.pelFilterChroma[1], ref.pelFilterChroma[1], pbuf1, 1, STRIDE, tc, maskP, maskQ);
+    }
 }
diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Fri Feb 26 11:33:33 2016 +0530
+++ b/source/test/pixelharness.h	Fri Feb 26 11:34:39 2016 +0530
@@ -123,6 +123,8 @@
     bool check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt);
     bool check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
     bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
+    bool check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChroma_t opt);
+    bool check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt);
 
 public:
 


More information about the x265-devel mailing list