[x265] [PATCH] asm : asm routine for chroma_p2s for 4:4:4 color space format

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Mon Feb 17 13:44:29 CET 2014


# HG changeset patch
# User Nabajit Deka
# Date 1392641037 -19800
#      Mon Feb 17 18:13:57 2014 +0530
# Node ID f5275ca8f2985bb0daf563738e6071b81967c2cd
# Parent  ce96cdb390fe26aee6effa731e51303c1d9056b0
asm : asm routine for chroma_p2s for 4:4:4 color space format

diff -r ce96cdb390fe -r f5275ca8f298 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sun Feb 16 22:47:32 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp	Mon Feb 17 18:13:57 2014 +0530
@@ -1119,8 +1119,8 @@
 
         p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
         p.luma_p2s = x265_luma_p2s_ssse3;
-        p.chroma_p2s[X265_CSP_I444] = x265_chroma_p2s_ssse3;
         p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_ssse3;
+        p.chroma_p2s[X265_CSP_I444] = x265_chroma_p2s_i444_ssse3;
 
         CHROMA_SP_FILTERS_420(_ssse3);
         CHROMA_SP_FILTERS_444(_ssse3);
diff -r ce96cdb390fe -r f5275ca8f298 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Sun Feb 16 22:47:32 2014 -0600
+++ b/source/common/x86/ipfilter8.asm	Mon Feb 17 18:13:57 2014 +0530
@@ -3680,6 +3680,64 @@
 
     RET
 
+INIT_XMM ssse3
+cglobal chroma_p2s_i444, 3, 7, 4
+    ; pixel -> int16 conversion, two rows per pass; r0 = src, r1 = srcStride, r2 = dst (argument order as in the C prototype)
+    ; load width and height (arguments 3 and 4) explicitly from r3m / r4m
+    mov         r3d, r3m
+    mov         r4d, r4m
+
+    ; load constants (tables are defined outside this hunk)
+    mova        m2, [tab_c_128]                         ; bytes interleaved with the pixels (presumably 128 in every byte - verify)
+    mova        m3, [tab_c_64_n64]                      ; signed byte multipliers for pmaddubsw (presumably +64, -64 pairs - verify)
+
+.loopH:                                                 ; one iteration handles two source rows
+
+    xor         r5d, r5d                                ; r5 = column offset, restarted for every row pair
+.loopW:                                                 ; one iteration handles up to 8 columns of both rows
+    lea         r6, [r0 + r5]                           ; r6 = src + column offset (first row of the pair)
+
+    movh        m0, [r6]                                ; 8 source bytes of the first row
+    punpcklbw   m0, m2                                  ; pair every pixel byte with a byte from m2
+    pmaddubsw   m0, m3                                  ; per 16-bit lane: pixel * m3[even byte] + m2 byte * m3[odd byte]
+
+    movh        m1, [r6 + r1]                           ; same 8 columns, one srcStride further (second row)
+    punpcklbw   m1, m2
+    pmaddubsw   m1, m3
+
+    add         r5d, 8                                  ; column offset now points past this group
+    cmp         r5d, r3d                                ; flags are consumed by jg below and, after the stores, by je
+    lea         r6, [r2 + r5 * 2]                       ; r6 = dst just past this group (2 bytes per sample); lea leaves flags intact
+    jg          .width4                                 ; overshot width: fewer than 8 columns remain in this group
+    movu        [r6 + FENC_STRIDE * 0 - 16], m0         ; -16 steps back to the start of the group; first row
+    movu        [r6 + FENC_STRIDE * 2 - 16], m1         ; second row, FENC_STRIDE * 2 bytes further
+    je          .nextH                                  ; column offset == width: row pair finished (flags still from cmp)
+    jmp         .loopW
+
+.width4:                                                ; partial group: width bits 2 and 1 select a 4- and/or 2-sample store
+    test        r3d, 4
+    jz          .width2                                 ; no 4-sample part
+    test        r3d, 2                                  ; ZF is read by the jz after the stores; nothing in between changes flags
+    movh        [r6 + FENC_STRIDE * 0 - 16], m0         ; store samples 0-3 (8 bytes), first row
+    movh        [r6 + FENC_STRIDE * 2 - 16], m1         ; store samples 0-3 (8 bytes), second row
+    lea         r6, [r6 + 8]                            ; step past the 4 samples just written
+    pshufd      m0, m0, 2                               ; move dword 2 (samples 4-5) into dword 0 for the movd below
+    pshufd      m1, m1, 2
+    jz          .nextH                                  ; width bit 1 clear: no 2-sample part
+
+.width2:
+    movd        [r6 + FENC_STRIDE * 0 - 16], m0         ; store 2 samples (4 bytes), first row
+    movd        [r6 + FENC_STRIDE * 2 - 16], m1         ; store 2 samples (4 bytes), second row
+
+.nextH:
+    lea         r0, [r0 + r1 * 2]                       ; src advances two rows
+    add         r2, FENC_STRIDE * 4                     ; dst advances two rows of FENC_STRIDE * 2 bytes each
+
+    sub         r4d, 2                                  ; two rows consumed
+    jnz         .loopH                                  ; NOTE(review): exits only when the count reaches exactly 0 - assumes height is even and nonzero
+
+    RET
+
 %macro PROCESS_CHROMA_SP_W4_4R 0
     movq       m0, [r0]
     movq       m1, [r0 + r1]
diff -r ce96cdb390fe -r f5275ca8f298 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Sun Feb 16 22:47:32 2014 -0600
+++ b/source/common/x86/ipfilter8.h	Mon Feb 17 18:13:57 2014 +0530
@@ -214,6 +214,7 @@
 void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
 void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
 void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_chroma_p2s_i444_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
 void x265_interp_4tap_vert_sp_2x4_sse4(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_vert_sp_2x8_sse4(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_vert_sp_6x8_sse4(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);


More information about the x265-devel mailing list