[x265] [PATCH] asm-ssse3: filter_p2s[12x16](9.64x), filter_p2s[24x32](10.30x), filter_p2s[48x64](7.79x)

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Mon Mar 2 14:22:46 CET 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1425273779 -19800
#      Mon Mar 02 10:52:59 2015 +0530
# Node ID 70be3fa2ee550ec1b954c420e3c7a915589163a7
# Parent  018e8bbaa854b1a4bd82b3a2e23f7775a77da5cc
asm-ssse3: filter_p2s[12x16](9.64x), filter_p2s[24x32](10.30x),
           filter_p2s[48x64](7.79x)

diff -r 018e8bbaa854 -r 70be3fa2ee55 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Mar 02 10:52:59 2015 +0530
@@ -1272,6 +1272,9 @@
         p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_ssse3;
         p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_ssse3;
         p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_ssse3;
+        p.pu[LUMA_12x16].filter_p2s = x265_pixelToShort_12x16_ssse3;
+        p.pu[LUMA_24x32].filter_p2s = x265_pixelToShort_24x32_ssse3;
+        p.pu[LUMA_48x64].filter_p2s = x265_pixelToShort_48x64_ssse3;
 
         p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
         p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
diff -r 018e8bbaa854 -r 70be3fa2ee55 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Mon Mar 02 10:52:59 2015 +0530
@@ -5416,8 +5416,9 @@
 FILTER_V4_W16n_H2 64, 48
 FILTER_V4_W16n_H2 48, 64
 FILTER_V4_W16n_H2 64, 16
-;-----------------------------------------------------------------------------
-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_WH_4xN 2
 INIT_XMM ssse3
@@ -5480,7 +5481,7 @@
 PIXEL_WH_4xN 4, 16
 
 ;-----------------------------------------------------------------------------
-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_WH_8xN 2
 INIT_XMM ssse3
@@ -5541,9 +5542,8 @@
 PIXEL_WH_8xN 8, 16
 PIXEL_WH_8xN 8, 32
 
-
-;-----------------------------------------------------------------------------
-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_WH_16xN 2
 INIT_XMM ssse3
@@ -5607,7 +5607,7 @@
 PIXEL_WH_16xN 16, 64
 
 ;-----------------------------------------------------------------------------
-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_WH_32xN 2
 INIT_XMM ssse3
@@ -5670,7 +5670,7 @@
 PIXEL_WH_32xN 32, 64
 
 ;-----------------------------------------------------------------------------
-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_WH_64xN 2
 INIT_XMM ssse3
@@ -5731,6 +5731,173 @@
 PIXEL_WH_64xN 64, 32
 PIXEL_WH_64xN 64, 48
 
+;-----------------------------------------------------------------------------
+; void pixelToShort_12x16(const pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal pixelToShort_12x16, 3, 7, 6
+    ; r0 = src, r1 = srcStride, r2 = dst (rows are FENC_STRIDE * 2 bytes apart)
+    ; load constants (presumably pb_128 = 128 bytes, tab_c_64_n64 = 64/-64 pairs)
+    mova        m4, [pb_128]
+    mova        m5, [tab_c_64_n64]
+    ; each repetition below converts 4 rows; 4 repetitions cover all 16 rows
+%rep 4
+    mov         r6, r0
+
+    movu        m0, [r6]                ; row 0: loads 16 bytes, 12 results stored
+    ; 3-operand unpack overwrites m1 entirely, so no prior copy of m0 is needed
+    punpcklbw   m1, m0, m4
+    punpckhbw   m0, m4
+    pmaddubsw   m0, m5
+    pmaddubsw   m1, m5
+
+    movu        m2, [r6 + r1]           ; row 1
+    ; m3 = low 8 source bytes interleaved with m4, m2 = high 8 likewise
+    punpcklbw   m3, m2, m4
+    punpckhbw   m2, m4
+    pmaddubsw   m2, m5
+    pmaddubsw   m3, m5
+
+    movu        [r2 + FENC_STRIDE * 0], m1      ; columns 0-7 of rows 0 and 1
+    movu        [r2 + FENC_STRIDE * 2], m3
+
+    movh        [r2 + FENC_STRIDE * 0 + 16], m0 ; columns 8-11 (low 8 bytes only)
+    movh        [r2 + FENC_STRIDE * 2 + 16], m2
+
+    movu        m0, [r6 + r1 * 2]       ; row 2
+    ; same unpack/multiply-add sequence as rows 0 and 1
+    punpcklbw   m1, m0, m4
+    punpckhbw   m0, m4
+    pmaddubsw   m0, m5
+    pmaddubsw   m1, m5
+
+    lea         r6, [r6 + r1 * 2]
+    movu        m2, [r6 + r1]           ; row 3 (r6 now points at row 2)
+    ; pmaddubsw sums each (source byte, m4 byte) pair weighted by the m5 bytes
+    punpcklbw   m3, m2, m4
+    punpckhbw   m2, m4
+    pmaddubsw   m2, m5
+    pmaddubsw   m3, m5
+
+    movu        [r2 + FENC_STRIDE * 4], m1
+    movu        [r2 + FENC_STRIDE * 6], m3
+
+    movh        [r2 + FENC_STRIDE * 4 + 16], m0
+    movh        [r2 + FENC_STRIDE * 6 + 16], m2
+
+    lea         r0, [r0 + r1 * 4]       ; advance src by 4 rows
+    add         r2, FENC_STRIDE * 8     ; advance dst by 4 rows
+%endrep
+    RET
+
+;-----------------------------------------------------------------------------
+; void pixelToShort_24x32(const pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal pixelToShort_24x32, 3, 7, 6
+    ; r0 = src, r1 = srcStride, r2 = dst (rows are FENC_STRIDE * 2 bytes apart)
+    ; each .loopW pass converts one tile of 8 columns by 4 rows
+    ; load width and height
+    mov         r3d, 24
+    mov         r4d, 32
+
+    ; load constant
+    mova        m4, [pb_128]
+    mova        m5, [tab_c_64_n64]
+
+.loopH:
+
+    xor         r5d, r5d                ; r5 = column offset within the row
+.loopW:
+    lea         r6, [r0 + r5]           ; r6 = first row of the current tile
+
+    movh        m0, [r6]                ; 8 source bytes of row 0
+    punpcklbw   m0, m4
+    pmaddubsw   m0, m5
+
+    movh        m1, [r6 + r1]           ; row 1
+    punpcklbw   m1, m4
+    pmaddubsw   m1, m5
+
+    movh        m2, [r6 + r1 * 2]       ; row 2
+    punpcklbw   m2, m4
+    pmaddubsw   m2, m5
+
+    lea         r6, [r6 + r1 * 2]
+    movh        m3, [r6 + r1]           ; row 3 (r6 now points at row 2)
+    punpcklbw   m3, m4
+    pmaddubsw   m3, m5
+
+    add         r5, 8
+    cmp         r5, r3                  ; flags survive the movu stores below
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0   ; -16: r5 already advanced
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+    jne         .loopW                  ; more columns left in this group of rows
+
+    ; all columns done: step src and dst down by 4 rows
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 8
+
+    sub         r4d, 4                  ; r4d = rows remaining
+    jnz         .loopH
+    RET
+
+;-----------------------------------------------------------------------------
+; void pixelToShort_48x64(const pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal pixelToShort_48x64, 3, 7, 6
+    ; r0 = src, r1 = srcStride, r2 = dst (rows are FENC_STRIDE * 2 bytes apart)
+    ; each .loopW pass converts one tile of 8 columns by 4 rows
+    ; load width and height
+    mov         r3d, 48
+    mov         r4d, 64
+
+    ; load constant
+    mova        m4, [pb_128]
+    mova        m5, [tab_c_64_n64]
+
+.loopH:
+
+    xor         r5d, r5d                ; r5 = column offset within the row
+.loopW:
+    lea         r6, [r0 + r5]           ; r6 = first row of the current tile
+
+    movh        m0, [r6]                ; 8 source bytes of row 0
+    punpcklbw   m0, m4
+    pmaddubsw   m0, m5
+
+    movh        m1, [r6 + r1]           ; row 1
+    punpcklbw   m1, m4
+    pmaddubsw   m1, m5
+
+    movh        m2, [r6 + r1 * 2]       ; row 2
+    punpcklbw   m2, m4
+    pmaddubsw   m2, m5
+
+    lea         r6, [r6 + r1 * 2]
+    movh        m3, [r6 + r1]           ; row 3 (r6 now points at row 2)
+    punpcklbw   m3, m4
+    pmaddubsw   m3, m5
+
+    add         r5, 8
+    cmp         r5, r3                  ; flags survive the movu stores below
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0   ; -16: r5 already advanced
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+    jne         .loopW                  ; more columns left in this group of rows
+
+    ; all columns done: step src and dst down by 4 rows
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 8
+
+    sub         r4d, 4                  ; r4d = rows remaining
+    jnz         .loopH
+    RET
+
 %macro PROCESS_LUMA_W4_4R 0
     movd        m0, [r0]
     movd        m1, [r0 + r1]
diff -r 018e8bbaa854 -r 70be3fa2ee55 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/x86/ipfilter8.h	Mon Mar 02 10:52:59 2015 +0530
@@ -642,6 +642,10 @@
 void x265_pixelToShort_64x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
 void x265_pixelToShort_64x48_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
 void x265_pixelToShort_64x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_12x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_24x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_48x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+
 #undef LUMA_FILTERS
 #undef LUMA_SP_FILTERS
 #undef LUMA_SS_FILTERS
diff -r 018e8bbaa854 -r 70be3fa2ee55 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp	Fri Feb 27 11:46:09 2015 +0530
+++ b/source/test/ipfilterharness.cpp	Mon Mar 02 10:52:59 2015 +0530
@@ -523,7 +523,7 @@
 
         checked(opt, pixel_test_buff[index] + i, rand_srcStride, IPF_vec_output_s);
 
-        if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(pixel)))
+        if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
             return false;
 
         reportfail();


More information about the x265-devel mailing list