[x265] [PATCH] asm-avx2: filter_vps[4x4]: improve 201c->156c
Divya Manivannan
divya at multicorewareinc.com
Tue Feb 24 07:47:05 CET 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1424760406 -19800
# Tue Feb 24 12:16:46 2015 +0530
# Node ID a2db9c6435f95b5b02c056c6641d19808e5a41ff
# Parent edc794a061474f75e57d94c927e6d1f866ebfb16
asm-avx2: filter_vps[4x4]: improve 201c->156c
diff -r edc794a06147 -r a2db9c6435f9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Feb 24 09:25:06 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Feb 24 12:16:46 2015 +0530
@@ -1801,6 +1801,7 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vpp = x265_interp_4tap_vert_pp_8x4_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vps = x265_interp_4tap_vert_ps_2x4_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_avx2;
diff -r edc794a06147 -r a2db9c6435f9 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Tue Feb 24 09:25:06 2015 +0530
+++ b/source/common/x86/const-a.asm Tue Feb 24 12:16:46 2015 +0530
@@ -74,6 +74,7 @@
const pw_32_0, times 4 dw 32,
times 4 dw 0
const pw_2000, times 8 dw 0x2000
+const sw_2000, times 16 dw 0x2000
const pw_8000, times 8 dw 0x8000
const pw_3fff, times 8 dw 0x3fff
const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1
diff -r edc794a06147 -r a2db9c6435f9 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue Feb 24 09:25:06 2015 +0530
+++ b/source/common/x86/ipfilter8.asm Tue Feb 24 12:16:46 2015 +0530
@@ -245,6 +245,7 @@
cextern pw_1
cextern pw_512
cextern pw_2000
+cextern sw_2000
%macro FILTER_H4_w2_2 3
movh %2, [srcq - 1]
@@ -2556,8 +2557,9 @@
RET
+%macro FILTER_VER_CHROMA_AVX2_4x4 1
INIT_YMM avx2
-cglobal interp_4tap_vert_pp_4x4, 4, 6, 3
+cglobal interp_4tap_vert_%1_4x4, 4, 6, 3
mov r4d, r4m
shl r4d, 6
sub r0, r1
@@ -2591,6 +2593,7 @@
pmaddubsw m0, [r5]
pmaddubsw m1, [r5 + mmsize]
paddw m0, m1 ; m0 = WORD ROW[3 2 1 0]
+%ifidn %1,pp
pmulhrsw m0, [pw_512]
vextracti128 xm1, m0, 1
packuswb xm0, xm1
@@ -2599,7 +2602,21 @@
pextrd [r2 + r3], xm0, 1
pextrd [r2 + r3 * 2], xm0, 2
pextrd [r2 + r5], xm0, 3
- RET
+%else
+ add r3d, r3d
+ psubw m0, [sw_2000]
+ vextracti128 xm1, m0, 1
+ lea r5, [r3 * 3]
+ movq [r2], xm0
+ movhps [r2 + r3], xm0
+ movq [r2 + r3 * 2], xm1
+ movhps [r2 + r5], xm1
+%endif
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_AVX2_4x4 pp
+FILTER_VER_CHROMA_AVX2_4x4 ps
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -5359,8 +5376,7 @@
paddw m0, m4
paddw m0, m2 ; m0 = WORD ROW[3 2 1 0]
- vbroadcasti128 m3, [pw_2000]
- psubw m0, m3
+ psubw m0, [sw_2000]
vextracti128 xm2, m0, 1
lea r5, [r3 * 3]
movq [r2], xm0
More information about the x265-devel mailing list