[x265] [PATCH] asm: assembly code for pixel_sse_ss_12x16

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Mon Nov 25 14:24:53 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1385385872 -19800
#      Mon Nov 25 18:54:32 2013 +0530
# Node ID fea660d227b842c411240ff17297ddfbb738b540
# Parent  a69a8392ffeb32d5b136bd315b456b2067cceb29
asm: assembly code for pixel_sse_ss_12x16

diff -r a69a8392ffeb -r fea660d227b8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Nov 25 18:30:49 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Nov 25 18:54:32 2013 +0530
@@ -95,6 +95,7 @@
     p.sse_ss[LUMA_8x8]   = x265_pixel_ssd_ss_8x8_ ## cpu; \
     p.sse_ss[LUMA_8x16]   = x265_pixel_ssd_ss_8x16_ ## cpu; \
     p.sse_ss[LUMA_8x32]   = x265_pixel_ssd_ss_8x32_ ## cpu; \
+    p.sse_ss[LUMA_12x16]   = x265_pixel_ssd_ss_12x16_ ## cpu; \
     p.sse_ss[LUMA_16x4]   = x265_pixel_ssd_ss_16x4_ ## cpu; \
     p.sse_ss[LUMA_16x8]   = x265_pixel_ssd_ss_16x8_ ## cpu; \
     p.sse_ss[LUMA_16x12]   = x265_pixel_ssd_ss_16x12_ ## cpu; \
diff -r a69a8392ffeb -r fea660d227b8 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Mon Nov 25 18:30:49 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Mon Nov 25 18:54:32 2013 +0530
@@ -378,12 +378,63 @@
 SSD_SS    16, 64
 %endmacro
 
+;-----------------------------------------------------------------------------
+; SSD_SS_12x16: sum of squared differences between two 12x16 blocks of
+; signed 16-bit samples.
+; In:  r0, r2 = block pointers; r1, r3 = their strides (after FIX_STRIDES)
+; Out: eax    = low 32 bits of the sum of (a - b)^2 over all 192 samples
+; Modifies r0, r2, r4, m0-m2 and flags.
+; Only r0-r4 and m0-m2 are touched, so cglobal requests exactly that many.
+; The 2*stride row step assumes strides count 16-bit elements -- TODO confirm.
+; NOTE(review): pmovsxwd/pmulld are SSE4.1 and phaddd is SSSE3, yet this macro
+; is also expanded under INIT_XMM sse2 -- confirm the _sse2 symbol is never
+; dispatched on CPUs without SSE4.1.
+;-----------------------------------------------------------------------------
+%macro SSD_SS_12x16 0
+cglobal pixel_ssd_ss_12x16, 4,5,3
+    FIX_STRIDES r1, r3
+    mov       r4d, 8                ; 8 iterations x 2 rows = 16 rows
+    pxor      m0, m0                ; m0 = four 32-bit partial sums
+
+.loop:
+    ; Two rows per iteration; each row is three 4-sample (8-byte) chunks.
+%rep 2
+    pmovsxwd  m1, [r0]              ; samples 0-3, sign-extended to 32 bits
+    pmovsxwd  m2, [r2]
+    psubd     m1, m2
+    pmulld    m1, m1
+    paddd     m0, m1
+    pmovsxwd  m1, [r0 + 8]          ; samples 4-7
+    pmovsxwd  m2, [r2 + 8]
+    psubd     m1, m2
+    pmulld    m1, m1
+    paddd     m0, m1
+    pmovsxwd  m1, [r0 + 16]         ; samples 8-11
+    pmovsxwd  m2, [r2 + 16]
+    psubd     m1, m2
+    pmulld    m1, m1
+    paddd     m0, m1
+    lea       r0, [r0 + 2*r1]       ; step both pointers by 2*stride
+    lea       r2, [r2 + 2*r3]
+%endrep
+    dec       r4d
+    jnz       .loop
+
+    phaddd    m0, m0                ; fold the four lane sums ...
+    phaddd    m0, m0                ; ... into lane 0
+    movd      eax, m0
+    RET
+%endmacro
+
 INIT_XMM sse2
 SSD_SS_ONE
+SSD_SS_12x16
 INIT_XMM sse4
 SSD_SS_ONE
+SSD_SS_12x16
 INIT_XMM avx
 SSD_SS_ONE
+SSD_SS_12x16
 %endif ; !HIGH_BIT_DEPTH
 
 %if HIGH_BIT_DEPTH == 0


More information about the x265-devel mailing list