[x265] [PATCH 6 of 6] asm: AVX2 of sa8d[32x32], 7.6K -> 6.7K cycles
Min Chen
chenm003 at 163.com
Thu Jun 4 21:13:48 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1433445200 25200
# Node ID 99e4204426f66ba45ca5c208fa4b33c120f76c07
# Parent c9b76e7b55f921a8eeb57659e61f818512c5c660
asm: AVX2 of sa8d[32x32], 7.6K -> 6.7K cycles
---
source/common/x86/asm-primitives.cpp | 2 +
source/common/x86/pixel-a.asm | 72 ++++++++++++++++++++++++++++++++++
2 files changed, 74 insertions(+), 0 deletions(-)
diff -r c9b76e7b55f9 -r 99e4204426f6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jun 04 12:13:17 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 04 12:13:20 2015 -0700
@@ -2264,8 +2264,10 @@
p.cu[BLOCK_8x8].sa8d = x265_pixel_sa8d_8x8_avx2;
p.cu[BLOCK_16x16].sa8d = x265_pixel_sa8d_16x16_avx2;
+ p.cu[BLOCK_32x32].sa8d = x265_pixel_sa8d_32x32_avx2;
p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = x265_pixel_sa8d_8x8_avx2;
p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = x265_pixel_sa8d_16x16_avx2;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = x265_pixel_sa8d_32x32_avx2;
p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
diff -r c9b76e7b55f9 -r 99e4204426f6 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Jun 04 12:13:17 2015 -0700
+++ b/source/common/x86/pixel-a.asm Thu Jun 04 12:13:20 2015 -0700
@@ -9,6 +9,7 @@
;* Alex Izvorski <aizvorksi at gmail.com>
;* Fiona Glaser <fiona at x264.com>
;* Oskar Arvidsson <oskar at irock.se>
+;* Min Chen <chenm003 at 163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -7015,6 +7016,77 @@
shr eax, 1
RET
+cglobal pixel_sa8d_16x16_internal ; helper: runs the 8x8 kernel on four 8x8 quadrants of a 16x16 block; returns the rounded, halved total in eax
+ call pixel_sa8d_8x8_internal ; pix[0] (top-left quadrant; helper defined elsewhere, adds into m6 - presumably, confirm)
+
+ sub r0, r1 ; r0 -= 2*r1 over these two subs; presumably rewinds the row advance left by the 8x8 helper - confirm
+ sub r0, r1
+ add r0, 8*SIZEOF_PIXEL ; step 8 pixels right in the first source
+ sub r2, r3 ; same rewind for the second source (r2 pointer, r3 stride)
+ sub r2, r3
+ add r2, 8*SIZEOF_PIXEL ; step 8 pixels right in the second source
+ call pixel_sa8d_8x8_internal ; pix[8] (top-right quadrant)
+
+ add r0, r4 ; r0 += r4 + r1; r4 is prepared by the caller (a stride multiple, presumably - confirm against SATD_START_AVX2)
+ add r0, r1
+ add r2, r5 ; r2 += r5 + r3; r5 is the second-source counterpart of r4
+ add r2, r3
+ call pixel_sa8d_8x8_internal ; pix[8*stride+8] (bottom-right quadrant)
+
+ sub r0, r1 ; r0 -= 2*r1 again, then 8 pixels back to the left column
+ sub r0, r1
+ sub r0, 8*SIZEOF_PIXEL
+ sub r2, r3 ; mirror of the above for the second source
+ sub r2, r3
+ sub r2, 8*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal ; pix[8*stride] (bottom-left quadrant)
+
+ ; TODO: analyze dynamic range (the paddusw below saturates at 16 bits instead of wrapping)
+ vextracti128 xm0, m6, 1 ; xm0 = upper 128-bit half of the m6 accumulator
+ paddusw xm6, xm0 ; fold upper half into lower half with unsigned saturating word adds
+ HADDUW xm6, xm0 ; macro defined elsewhere; presumably horizontal sum of the unsigned words of xm6 (xm0 as scratch) - confirm
+ movd eax, xm6 ; eax = low dword of xm6
+ add eax, 1 ; eax = (eax + 1) >> 1 : halve with rounding
+ shr eax, 1
+ ret ; result in eax; r0/r2 are left where the last quadrant call put them, m6 is not cleared here
+
+%if ARCH_X86_64
+cglobal pixel_sa8d_32x32, 4,8,8 ; 32x32 SA8D as the sum of four 16x16 helper results; args r0/r1 = first source ptr/stride, r2/r3 = second source ptr/stride; total returned in eax
+ ; TODO: R6 is RAX on x64 platform, so we use it directly (rax is both the helper's return register and the offset scratch below)
+
+ SATD_START_AVX2 m6, m7, 1 ; macro defined elsewhere; presumably clears m6 and derives r4/r5 from strides r1/r3 - confirm
+ xor r7d, r7d ; r7d = running total of the 16x16 results
+
+ call pixel_sa8d_16x16_internal ; [0]
+ pxor m6, m6 ; the helper reduces m6 but does not clear it: reset before the next 16x16 block
+ add r7d, eax ; accumulate helper result
+
+ add r0, r4 ; r0 += r4 + r1, r2 += r5 + r3: step down to the next 16x16 block (relies on where the helper left r0/r2)
+ add r0, r1
+ add r2, r5
+ add r2, r3
+ call pixel_sa8d_16x16_internal ; [2]
+ pxor m6, m6 ; reset accumulator
+ add r7d, eax
+
+ lea rax, [r4 * 5 - 16*SIZEOF_PIXEL] ; 64-bit offset (a 32-bit lea would truncate/zero-extend it before the pointer subtract); 16 pixels scaled like the helper's 8*SIZEOF_PIXEL
+ sub r0, rax ; r0 -= 5*r4 + r1 - 16 pixels: back up the rows and move 16 pixels right
+ sub r0, r1
+ lea rax, [r5 * 5 - 16*SIZEOF_PIXEL] ; same offset for the second source, built from r5
+ sub r2, rax
+ sub r2, r3
+ call pixel_sa8d_16x16_internal ; [1]
+ pxor m6, m6 ; reset accumulator
+ add r7d, eax
+
+ add r0, r4 ; step down to the last 16x16 block, same stride step as before
+ add r0, r1
+ add r2, r5
+ add r2, r3
+ call pixel_sa8d_16x16_internal ; [3]
+ add eax, r7d ; return value = last block's result + running total
+ RET
+%endif ; ARCH_X86_64=1
%endif ; HIGH_BIT_DEPTH
; Input 16bpp, Output 8bpp
More information about the x265-devel
mailing list