[x265] [PATCH] asm: assembly code for pixel_sse_ss_4x4

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Fri Nov 22 14:28:27 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1385126838 -19800
#      Fri Nov 22 18:57:18 2013 +0530
# Node ID 98bcf33302ef613f814056d152c8b8deee2ee0a6
# Parent  d2173ec27a151a1c1adaaba387f4f20a43742246
asm: assembly code for pixel_sse_ss_4x4

diff -r d2173ec27a15 -r 98bcf33302ef source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 21 20:16:39 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Nov 22 18:57:18 2013 +0530
@@ -87,6 +87,9 @@
     p.sse_pp[LUMA_32x64] = x265_pixel_ssd_32x64_ ## cpu; \
     p.sse_pp[LUMA_16x64] = x265_pixel_ssd_16x64_ ## cpu
 
+#define ASSGN_SSE_SS(cpu) \
+    p.sse_ss[LUMA_4x4]   = x265_pixel_ssd_ss_4x4_ ## cpu;
+
 #define SA8D_INTER_FROM_BLOCK(cpu) \
     p.sa8d[BLOCK_32x32] = x265_pixel_sa8d_32x32_ ## cpu; \
     p.sa8d[BLOCK_64x64] = x265_pixel_sa8d_64x64_ ## cpu; \
@@ -489,6 +492,7 @@
         p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2;
 
         ASSGN_SSE(sse2);
+        ASSGN_SSE_SS(sse2);
         INIT2(sad, _sse2);
         INIT2(sad_x3, _sse2);
         INIT2(sad_x4, _sse2);
@@ -564,6 +568,7 @@
         SA8D_INTER_FROM_BLOCK(ssse3);
         p.sse_pp[LUMA_4x4] = x265_pixel_ssd_4x4_ssse3;
         ASSGN_SSE(ssse3);
+        ASSGN_SSE_SS(ssse3);
         PIXEL_AVG(ssse3);
         PIXEL_AVG_W4(ssse3);
 
@@ -625,7 +630,7 @@
         p.sa8d[BLOCK_8x8]   = x265_pixel_sa8d_8x8_sse4;
         p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_sse4;
         SA8D_INTER_FROM_BLOCK(sse4);
-
+        ASSGN_SSE_SS(sse4);
         CHROMA_PIXELSUB_PS(_sse4);
 
         CHROMA_FILTERS(_sse4);
@@ -675,6 +680,7 @@
         SA8D_INTER_FROM_BLOCK(avx);
         ASSGN_SSE(avx);
         HEVC_SATD(avx);
+        ASSGN_SSE_SS(avx);
 
         p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
         p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
diff -r d2173ec27a15 -r 98bcf33302ef source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Thu Nov 21 20:16:39 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Fri Nov 22 18:57:18 2013 +0530
@@ -258,6 +258,45 @@
 SSD_ONE    16, 16
 %endif ; HIGH_BIT_DEPTH
 
+;-----------------------------------------------------------------------------
+; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;-----------------------------------------------------------------------------
+
+%macro HEVC_SSD_SS 0
+cglobal pixel_ssd_ss_4x4, 4,7,6
+    pxor    m0, m0
+    pmovsxwd  m1, [r0]
+    pmovsxwd  m2, [r2]
+    psubd     m1, m2
+    pmulld    m1, m1
+    paddd     m0, m1
+    lea     r0, [r0 + r1*2]
+    lea     r2, [r2 + r3*2]
+    pmovsxwd  m1, [r0]
+    pmovsxwd  m2, [r2]
+    psubd     m1, m2
+    pmulld    m1, m1
+    paddd     m0, m1
+    lea     r0, [r0 + r1*2]
+    lea     r2, [r2 + r3*2]
+    pmovsxwd  m1, [r0]
+    pmovsxwd  m2, [r2]
+    psubd     m1, m2
+    pmulld    m1, m1
+    paddd     m0, m1
+    lea     r0, [r0 + r1*2]
+    lea     r2, [r2 + r3*2]
+    pmovsxwd  m1, [r0]
+    pmovsxwd  m2, [r2]
+    psubd     m1, m2
+    pmulld    m1, m1
+    paddd     m0, m1
+    phaddd  m0, m0
+    phaddd  m0, m0
+    movd   eax, m0
+    RET
+%endmacro
+
 %if HIGH_BIT_DEPTH == 0
 %macro SSD_LOAD_FULL 5
     mova      m1, [t0+%1]
@@ -512,12 +551,17 @@
 %define SSD_CORE SSD_CORE_SSE2
 %define JOIN JOIN_SSE2
 HEVC_SSD
+HEVC_SSD_SS
 INIT_XMM ssse3
 %define SSD_CORE SSD_CORE_SSSE3
 %define JOIN JOIN_SSSE3
 HEVC_SSD
+HEVC_SSD_SS
+INIT_XMM sse4
+HEVC_SSD_SS
 INIT_XMM avx
 HEVC_SSD
+HEVC_SSD_SS
 INIT_MMX ssse3
 SSD  4,  4
 SSD  4,  8
diff -r d2173ec27a15 -r 98bcf33302ef source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Thu Nov 21 20:16:39 2013 +0530
+++ b/source/common/x86/pixel.h	Fri Nov 22 18:57:18 2013 +0530
@@ -59,6 +59,9 @@
 #define DECL_X1(name, suffix) \
     DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t))
 
+#define DECL_X1_SS(name, suffix) \
+    DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, int16_t *, intptr_t))
+
 #define DECL_X4(name, suffix) \
     DECL_PIXELS(void, name ## _x3, suffix, (pixel *, pixel *, pixel *, pixel *, intptr_t, int *)) \
     DECL_PIXELS(void, name ## _x4, suffix, (pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int *))
@@ -86,6 +89,15 @@
 DECL_X1(ssd, avx)
 DECL_X1(ssd, xop)
 DECL_X1(ssd, avx2)
+DECL_X1_SS(ssd_ss, mmx)
+DECL_X1_SS(ssd_ss, mmx2)
+DECL_X1_SS(ssd_ss, sse2slow)
+DECL_X1_SS(ssd_ss, sse2)
+DECL_X1_SS(ssd_ss, ssse3)
+DECL_X1_SS(ssd_ss, sse4)
+DECL_X1_SS(ssd_ss, avx)
+DECL_X1_SS(ssd_ss, xop)
+DECL_X1_SS(ssd_ss, avx2)
 DECL_X1(satd, mmx2)
 DECL_X1(satd, sse2)
 DECL_X1(satd, ssse3)


More information about the x265-devel mailing list