[x265] [PATCH] cvt16to32_cnt optimization

praveen at multicorewareinc.com praveen at multicorewareinc.com
Tue Sep 2 16:11:31 CEST 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1409571425 -19800
# Node ID 51b5a6d820da97a4178dc42d2ef98ffe1970511b
# Parent  c09f34b0ab57b4ce2f5cf4aa59c25d20eb6cbd54
cvt16to32_cnt optimization

diff -r c09f34b0ab57 -r 51b5a6d820da source/common/dct.cpp
--- a/source/common/dct.cpp	Mon Aug 25 16:54:19 2014 +0530
+++ b/source/common/dct.cpp	Mon Sep 01 17:07:05 2014 +0530
@@ -834,7 +834,7 @@
     {
         for (int j = 0; j < trSize; j++)
         {
-            coeff[k * trSize + j] = ((int16_t)residual[k * stride + j]);
+            coeff[k * trSize + j] = residual[k * stride + j];
             numSig += (residual[k * stride + j] != 0);
         }
     }
diff -r c09f34b0ab57 -r 51b5a6d820da source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Mon Aug 25 16:54:19 2014 +0530
+++ b/source/common/x86/blockcopy8.asm	Mon Sep 01 17:07:05 2014 +0530
@@ -29,6 +29,10 @@
 
 tab_Vm:    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
 
+cextern pb_4
+cextern pb_1
+cextern pb_16
+cextern pb_64
 cextern pw_4
 cextern pb_8
 cextern pb_32
@@ -3946,52 +3950,47 @@
 
 
 ;--------------------------------------------------------------------------------------
-; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t cvt16to32_cnt(int16_t *dst, int16_t *src, intptr_t stride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal cvt16to32_cnt_4, 3,3,5
     add         r2d, r2d
     pxor        m4, m4
 
-    ; row 0 & 1
-    movh        m0, [r1]
-    movhps      m0, [r1 + r2]
-    mova        m2, m0
-    pmovsxwd    m1, m0
-    punpckhwd   m0, m0
-    psrad       m0, 16
-    movu        [r0 + 0 * mmsize], m1
-    movu        [r0 + 1 * mmsize], m0
-
-    ; row 2 & 3
-    movh        m0, [r1 + r2 * 2]
-    lea         r2, [r2 * 3]
-    movhps      m0, [r1 + r2]
-    packsswb    m2, m0
-    pcmpeqb     m2, m4
-    pmovsxwd    m1, m0
-    punpckhwd   m0, m0
-    psrad       m0, 16
-    movu        [r0 + 2 * mmsize], m1
-    movu        [r0 + 3 * mmsize], m0
-
-    ; get count
-    ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
-%if 1
-    pmovmskb    eax, m2
-    not         ax
-    popcnt      ax, ax
+     ; row 0 & 1
+     movh        m0, [r1]
+     movh        m1, [r1 + r2]
+     movh        [r0], m0
+     movh        [r0 + 8], m1
+
+     mova        m2, [r0]
+
+     ; row 2 & 3
+     movh        m0, [r1 + r2 * 2]
+     lea         r2, [r2 * 3]
+     movh        m1, [r1 + r2]
+     movh        [r0 + 16], m0
+     movh        [r0 + 24], m1
+ 
+     mova        m0, [r0 + 16]
+     packsswb    m2, m0
+     pcmpeqb     m2, m4
+
+     ; get count
+     ; CHECK_ME: Intel documents POPCNT alongside SSE4.2, but it is only implemented from Nehalem onward
+%if 0
+     pmovmskb    eax, m2
+     not         ax
+     popcnt      ax, ax
 %else
-    movhlps     m3, m2
-    paddw       m2, m3
-
-    mova        m3, [pw_4]
-    paddw       m3, m2
-    psadbw      m3, m4
-
-    movd        eax, m3
-%endif
-    RET
+     mova        m0, [pb_1]
+     paddb       m2, m0
+     psadbw      m2, m4
+     pshufd      m0, m2, 2
+     paddw       m2, m0
+     movd        eax, m2
+ %endif
+     RET
 
 
 INIT_YMM avx2
@@ -4023,71 +4022,65 @@
 
 
 ;--------------------------------------------------------------------------------------
-; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t cvt16to32_cnt(int16_t *dst, int16_t *src, intptr_t stride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal cvt16to32_cnt_8, 3,5,6
+cglobal cvt16to32_cnt_8, 3,3,6
     add         r2d, r2d
     pxor        m4, m4
-    mov         r3d, 8/4
-    lea         r4, [r2 * 3]
     pxor        m5, m5
 
-.loop
-    ; row 0
+   ; row 0 & 1
+    movu         m0, [r1]
+    movu        m1, [r1 + r2]
+    movu        [r0], m0
+    movu        [r0 + 16], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
+
+    ; row 2 & 3
+    lea         r1, [r1 + 2 * r2]
     movu        m0, [r1]
-    mova        m2, m0
-    pmovsxwd    m1, m0
-    punpckhwd   m0, m0
-    psrad       m0, 16
-    movu        [r0 + 0 * mmsize], m1
-    movu        [r0 + 1 * mmsize], m0
-
-    ; row 1
-    movu        m0, [r1 + r2]
-    packsswb    m2, m0
-    pcmpeqb     m2, m4
-    paddb       m5, m2
-    pmovsxwd    m1, m0
-    punpckhwd   m0, m0
-    psrad       m0, 16
-    movu        [r0 + 2 * mmsize], m1
-    movu        [r0 + 3 * mmsize], m0
-
-    ; row 2
-    movu        m0, [r1 + r2 * 2]
-    mova        m2, m0
-    pmovsxwd    m1, m0
-    punpckhwd   m0, m0
-    psrad       m0, 16
-    movu        [r0 + 4 * mmsize], m1
-    movu        [r0 + 5 * mmsize], m0
-
-    ; row 3
-    movu        m0, [r1 + r4]
-    packsswb    m2, m0
-    pcmpeqb     m2, m4
-    paddb       m5, m2
-    pmovsxwd    m1, m0
-    punpckhwd   m0, m0
-    psrad       m0, 16
-    movu        [r0 + 6 * mmsize], m1
-    movu        [r0 + 7 * mmsize], m0
-
-    add         r0, 8 * mmsize
-    lea         r1, [r1 + r2 * 4]
-    dec         r3d
-    jnz        .loop
+    movu        m1, [r1 + r2]
+    movu        [r0 + 32], m0
+    movu        [r0 + 48], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
+
+    ; row 4 & 5
+    lea         r1, [r1 + 2 * r2]
+    movu        m0, [r1]
+    movu        m1, [r1 + r2]
+    movu        [r0 + 64], m0
+    movu        [r0 + 80], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
+
+    ; row 6 & 7
+    lea         r1, [r1 + 2 * r2]
+    movu        m0, [r1]
+    movu        m1, [r1 + r2]
+    movu        [r0 + 96], m0
+    movu        [r0 + 112], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
 
     ; get count
-    movhlps     m3, m5
-    paddb       m3, m5
-
-    paddb       m3, [pb_8]
-    psadbw      m3, m4
-
-    movd        eax, m3
-    RET
+    mova        m0, [pb_4]
+    paddb       m5, m0
+    psadbw      m5, m4
+    pshufd      m0, m5, 2
+    paddw       m5, m0
+    movd        eax, m5
+     RET
 
 
 INIT_YMM avx2
@@ -4174,58 +4167,69 @@
 
 
 ;--------------------------------------------------------------------------------------
-; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t cvt16to32_cnt(int16_t *dst, int16_t *src, intptr_t stride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal cvt16to32_cnt_16, 3,4,7
-    add         r2d, r2d
-    mov         r3d, 16/2
-    pxor        m5, m5
-    pxor        m6, m6
+cglobal cvt16to32_cnt_16, 3,4,6
+     add         r2d, r2d
+     mov         r3d, 4
+     pxor        m4, m4
+     pxor        m5, m5
 
 .loop
     ; row 0
     movu        m0, [r1]
-    movu        m1, [r1 + mmsize]
-    packsswb    m4, m0, m1
-    pcmpeqb     m4, m6
-    paddb       m5, m4
-    pmovsxwd    m2, m0
-    pmovsxwd    m0, [r1 + 8]
-    pmovsxwd    m3, m1
-    pmovsxwd    m1, [r1 + mmsize + 8]
-    movu        [r0 + 0 * mmsize], m2
-    movu        [r0 + 1 * mmsize], m0
-    movu        [r0 + 2 * mmsize], m3
-    movu        [r0 + 3 * mmsize], m1
-
-    ; row 1
+    movu        m1, [r1 + 16]
+    movu        [r0], m0
+    movu        [r0 + 16], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
+
+     ; row 1
     movu        m0, [r1 + r2]
-    movu        m1, [r1 + r2 + mmsize]
-    packsswb    m4, m0, m1
-    pcmpeqb     m4, m6
-    paddb       m5, m4
-    pmovsxwd    m2, m0
-    pmovsxwd    m0, [r1 + r2 + 8]
-    pmovsxwd    m3, m1
-    pmovsxwd    m1, [r1 + r2 + mmsize + 8]
-    movu        [r0 + 4 * mmsize], m2
-    movu        [r0 + 5 * mmsize], m0
-    movu        [r0 + 6 * mmsize], m3
-    movu        [r0 + 7 * mmsize], m1
-
-    add         r0, 8 * mmsize
-    lea         r1, [r1 + r2 * 2]
-    dec         r3d
-    jnz        .loop
-
-    ; get count
-    movhlps     m0, m5
-    paddb       m0, m5
-    paddb       m0, [pb_32]
-    psadbw      m0, m6
-    movd        eax, m0
-    RET
+    movu        m1, [r1 + r2 + 16]
+    movu        [r0 + 32], m0
+    movu        [r0 + 48], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
+
+    ; row 2
+    movu        m0, [r1 + 2 * r2]
+    movu        m1, [r1 + 2 * r2 + 16]
+    movu        [r0 + 64], m0
+    movu        [r0 + 80], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
+
+    ; row 3
+    lea         r1, [r1 + 2 * r2]
+    movu        m0, [r1 + r2]
+    movu        m1, [r1 + r2 + 16]
+    movu        [r0 + 96], m0
+    movu        [r0 + 112], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
+
+    add         r0, 128
+    lea         r1, [r1 + 2 * r2]
+     dec         r3d
+     jnz        .loop
+
+    mova        m0, [pb_16]
+    paddb       m5, m0
+    psadbw      m5, m4
+    pshufd      m0, m5, 2
+    paddw       m5, m0
+    movd        eax, m5
+     RET
 
 
 INIT_YMM avx2
@@ -4294,59 +4298,68 @@
     movd        eax, xm0
     RET
 
-
 ;--------------------------------------------------------------------------------------
-; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t cvt16to32_cnt(int16_t *dst, int16_t *src, intptr_t stride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal cvt16to32_cnt_32, 3,4,8
+cglobal cvt16to32_cnt_32, 3,4,6
     add         r2d, r2d
-    mov         r3d, 32/1
-    pxor        m6, m6
-    pxor        m7, m7
+    mov         r3d, 16
+    pxor        m4, m4
+    pxor        m5, m5
 
 .loop
     ; row 0
-    movu        m0, [r1 + 0 * mmsize]
-    movu        m1, [r1 + 1 * mmsize]
-    movu        m2, [r1 + 2 * mmsize]
-    movu        m3, [r1 + 3 * mmsize]
-    packsswb    m4, m0, m1
-    packsswb    m5, m2, m3
-    pcmpeqb     m4, m7
-    pcmpeqb     m5, m7
-    paddb       m6, m4
-    paddb       m6, m5
-
-    pmovsxwd    m4, m0
-    pmovsxwd    m5, [r1 + 0 * mmsize + mmsize/2]
-    movu        [r0 + 0 * mmsize], m4
-    movu        [r0 + 1 * mmsize], m5
-    pmovsxwd    m4, m1
-    pmovsxwd    m5, [r1 + 1 * mmsize + mmsize/2]
-    movu        [r0 + 2 * mmsize], m4
-    movu        [r0 + 3 * mmsize], m5
-    pmovsxwd    m4, m2
-    pmovsxwd    m5, [r1 + 2 * mmsize + mmsize/2]
-    movu        [r0 + 4 * mmsize], m4
-    movu        [r0 + 5 * mmsize], m5
-    pmovsxwd    m4, m3
-    pmovsxwd    m5, [r1 + 3 * mmsize + mmsize/2]
-    movu        [r0 + 6 * mmsize], m4
-    movu        [r0 + 7 * mmsize], m5
-
-    add         r0, 8 * mmsize
-    add         r1, r2
-    dec         r3d
-    jnz        .loop
-
-    ; get count
-    movhlps     m0, m6
-    paddb       m0, m6
-    paddb       m0, [pb_128]
-    psadbw      m0, m7
-    movd        eax, m0
-    RET
+    movu        m0, [r1]
+    movu        m1, [r1 + 16]
+    movu        [r0], m0
+    movu        [r0 + 16], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
+
+    movu        m0, [r1 + 32]
+    movu        m1, [r1 + 48]
+    movu        [r0 + 32], m0
+    movu        [r0 + 48], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
+
+    ; row 1
+    movu        m0, [r1 + r2]
+    movu        m1, [r1 + r2 + 16]
+    movu        [r0 + 64], m0
+    movu        [r0 + 80], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
+
+    movu        m0, [r1 + r2 + 32]
+    movu        m1, [r1 + r2 + 48]
+    movu        [r0 + 96], m0
+    movu        [r0 + 112], m1
+
+    packsswb    m0, m1
+    pcmpeqb     m0, m4
+    paddb       m5, m0
+
+    add         r0, 128
+    lea         r1, [r1 + 2 * r2]
+     dec         r3d
+     jnz        .loop
+
+     ; get count
+    mova        m0, [pb_64]
+    paddb       m5, m0
+    psadbw      m5, m4
+    pshufd      m0, m5, 2
+    paddw       m5, m0
+    movd        eax, m5
+     RET
 
 
 INIT_YMM avx2
diff -r c09f34b0ab57 -r 51b5a6d820da source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Mon Aug 25 16:54:19 2014 +0530
+++ b/source/common/x86/const-a.asm	Mon Sep 01 17:07:05 2014 +0530
@@ -48,6 +48,9 @@
 const pb_unpackwq2, db 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7
 const pw_swap,      times 2 db 6,7,4,5,2,3,0,1
 
+const pb_4,        times 16 db 4
+const pb_16,       times 16 db 16
+const pb_64,       times 16 db 64
 const pb_01,       times  8 db 0,1
 const pb_0,        times 16 db 0
 const pb_1,        times 32 db 1
diff -r c09f34b0ab57 -r 51b5a6d820da source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Aug 25 16:54:19 2014 +0530
+++ b/source/test/pixelharness.cpp	Mon Sep 01 17:07:05 2014 +0530
@@ -592,14 +592,11 @@
     intptr_t stride = STRIDE;
     for (int i = 0; i < ITERS; i++)
     {
-#ifdef _DEBUG
-        memset(ref_dest, 0xCD, sizeof(ref_dest));
-        memset(opt_dest, 0xCD, sizeof(opt_dest));
-#endif
-        int opt_cnt = (int)checked(opt, opt_dest, sbuf1 + j, stride);
-        int ref_cnt = ref(ref_dest, sbuf1 + j, stride);
+        int index = i % TEST_CASES;
+        int opt_cnt = (int)checked(opt, opt_dest, short_test_buff1[index] + j, stride);
+        int ref_cnt = ref(ref_dest, short_test_buff1[index] + j, stride);
 
-        if ((ref_cnt != opt_cnt) || memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
+        if ((ref_cnt != opt_cnt) || memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
             return false;
 
         reportfail();


More information about the x265-devel mailing list