[x265] [PATCH] asm: assembly code for pixel_satd_64x16

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Wed Nov 13 12:34:39 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1384342448 -19800
#      Wed Nov 13 17:04:08 2013 +0530
# Node ID 32e01ab333a6f2b49ead3c9f3f7de500de188f35
# Parent  4ee655b93b0388268bbec051205f02d83861549b
asm: assembly code for pixel_satd_64x16

diff -r 4ee655b93b03 -r 32e01ab333a6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Nov 13 16:43:37 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Nov 13 17:04:08 2013 +0530
@@ -66,7 +66,7 @@
     p.satd[LUMA_32x64] = cmp<32, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
     p.satd[LUMA_64x48] = cmp<64, 48, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
     p.satd[LUMA_48x64] = cmp<48, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
-    p.satd[LUMA_64x16] = cmp < 64, 16, 16, 16, x265_pixel_satd_16x16_ ## cpu >
+    p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu
 
 #define ASSGN_SSE(cpu) \
     p.sse_pp[LUMA_8x8]   = x265_pixel_ssd_8x8_ ## cpu; \
diff -r 4ee655b93b03 -r 32e01ab333a6 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Nov 13 16:43:37 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Wed Nov 13 17:04:08 2013 +0530
@@ -1819,6 +1819,37 @@
     call pixel_satd_16x4_internal
     SATD_END_SSE2 m10
 
+cglobal pixel_satd_64x16, 4,8,12   ;if WIN64 && notcpuflag(avx); 4 args, 8 GPRs, 12 XMM declared because m10 is used below
+    SATD_START_SSE2 m10, m7        ; WIN64 treats xmm6-xmm15 as callee-saved and x86inc only preserves the declared ones
+    mov r6, r0                     ; r6 = original arg 0 (presumably first pixel pointer -- confirm against the C prototype)
+    mov r7, r2                     ; r7 = original arg 2 (presumably second pixel pointer -- confirm)
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal  ; column at byte offset 0: four helper calls (presumably 4 rows each, helper advancing r0/r2)
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    lea r0, [r6 + 16]              ; rewind to the saved bases and step to the column at byte offset 16
+    lea r2, [r7 + 16]
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    lea r0, [r6 + 32]              ; column at byte offset 32
+    lea r2, [r7 + 32]
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    lea r0, [r6 + 48]              ; column at byte offset 48 (last of the four 16-wide columns)
+    lea r2, [r7 + 48]
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    call pixel_satd_16x4_internal
+    SATD_END_SSE2 m10              ; NOTE(review): 12 assumes the helper touches nothing above m11 (as in upstream x264) -- confirm
+
 %else
 
 cglobal pixel_satd_32x8, 4,6,8    ;if !WIN64
@@ -2005,6 +2036,92 @@
     SATD_END_SSE2 m6
 %endif
 
+%if ARCH_X86_64                    ; both 64-bit ABIs: r0mp/r2mp alias the argument registers there, so bases must be kept in r6/r7
+cglobal pixel_satd_64x16, 4,8,8    ;if x86-64 (WIN64 or UNIX64) on this code path; 4 args, 8 GPRs (r6/r7 hold the bases), 8 XMM
+    SATD_START_SSE2 m6, m7
+    mov r6, r0                     ; r6 = original arg 0 (presumably first pixel pointer -- confirm against the C prototype)
+    mov r7, r2                     ; r7 = original arg 2 (presumably second pixel pointer -- confirm)
+    call pixel_satd_8x8_internal   ; column at byte offset 0: two helper calls (presumably 8 rows each, helper advancing r0/r2)
+    call pixel_satd_8x8_internal
+    lea r0, [r6 + 8]               ; rewind to the saved bases and step to the column at byte offset 8
+    lea r2, [r7 + 8]
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    lea r0, [r6 + 16]              ; column at byte offset 16
+    lea r2, [r7 + 16]
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    lea r0, [r6 + 24]              ; column at byte offset 24
+    lea r2, [r7 + 24]
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    lea r0, [r6 + 32]              ; column at byte offset 32
+    lea r2, [r7 + 32]
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    lea r0, [r6 + 40]              ; column at byte offset 40
+    lea r2, [r7 + 40]
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    lea r0, [r6 + 48]              ; column at byte offset 48
+    lea r2, [r7 + 48]
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    lea r0, [r6 + 56]              ; column at byte offset 56 (last of the eight 8-wide columns)
+    lea r2, [r7 + 56]
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    SATD_END_SSE2 m6               ; macro epilogue; m6 is the register given to SATD_START_SSE2 above
+%else                              ; x86-32 only: arguments live on the stack, so r0mp/r2mp are memory operands
+cglobal pixel_satd_64x16, 4,6,8    ;if !ARCH_X86_64; 4 args, 6 GPRs, 8 XMM -- no spare GPRs, bases are reloaded from the stack
+    SATD_START_SSE2 m6, m7
+    call pixel_satd_8x8_internal   ; column at byte offset 0: two helper calls (presumably 8 rows each, helper advancing r0/r2)
+    call pixel_satd_8x8_internal
+    mov r0, r0mp                   ; reload the original arg 0 from its stack slot
+    mov r2, r2mp                   ; reload the original arg 2 from its stack slot
+    add r0, 8                      ; step to the column at byte offset 8
+    add r2, 8
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 16                     ; column at byte offset 16
+    add r2, 16
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 24                     ; column at byte offset 24
+    add r2, 24
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 32                     ; column at byte offset 32
+    add r2, 32
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 40                     ; column at byte offset 40
+    add r2, 40
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 48                     ; column at byte offset 48
+    add r2, 48
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 56                     ; column at byte offset 56 (last of the eight 8-wide columns)
+    add r2, 56
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    SATD_END_SSE2 m6               ; macro epilogue; m6 is the register given to SATD_START_SSE2 above
+%endif                             ; ARCH_X86_64
+
 cglobal pixel_satd_16x4, 4,6,8
     SATD_START_SSE2 m6, m7
     BACKUP_POINTERS


More information about the x265-devel mailing list