[x265] [PATCH 1 of 5] cleanup: remove unused code in mc-a2.asm
Min Chen
chenm003 at 163.com
Mon Nov 25 07:37:22 CET 2013

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1385275951 -28800
# Node ID 464af047f7b12a0a0e105d7550d454f30cf16eea
# Parent 10f605bd053009c8c981c7529322fecd1e54af7b
cleanup: remove unused code in mc-a2.asm

diff -r 10f605bd0530 -r 464af047f7b1 source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm Fri Nov 22 14:59:34 2013 -0600
+++ b/source/common/x86/mc-a2.asm Sun Nov 24 14:52:31 2013 +0800
@@ -32,10 +32,6 @@
SECTION_RODATA 32
-filt_mul20: times 32 db 20
-filt_mul15: times 16 db 1, -5
-filt_mul51: times 16 db -5, 1
-hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
%if HIGH_BIT_DEPTH
@@ -51,15 +47,6 @@
pd_0f: times 4 dd 0xffff
pf_inv256: times 8 dd 0.00390625
-pad10: times 8 dw 10*PIXEL_MAX
-pad20: times 8 dw 20*PIXEL_MAX
-pad30: times 8 dw 30*PIXEL_MAX
-depad: times 4 dd 32*20*PIXEL_MAX + 512
-
-tap1: times 4 dw 1, -5
-tap2: times 4 dw 20, 20
-tap3: times 4 dw -5, 1
-
SECTION .text
cextern pb_0
@@ -72,86 +59,6 @@
cextern pw_pixel_max
cextern pd_ffff
-%macro LOAD_ADD 4
- movh %4, %3
- movh %1, %2
- punpcklbw %4, m0
- punpcklbw %1, m0
- paddw %1, %4
-%endmacro
-
-%macro LOAD_ADD_2 6
- mova %5, %3
- mova %1, %4
- punpckhbw %6, %5, m0
- punpcklbw %5, m0
- punpckhbw %2, %1, m0
- punpcklbw %1, m0
- paddw %1, %5
- paddw %2, %6
-%endmacro
-
-%macro FILT_V2 6
- psubw %1, %2 ; a-b
- psubw %4, %5
- psubw %2, %3 ; b-c
- psubw %5, %6
- psllw %2, 2
- psllw %5, 2
- psubw %1, %2 ; a-5*b+4*c
- psllw %3, 4
- psubw %4, %5
- psllw %6, 4
- paddw %1, %3 ; a-5*b+20*c
- paddw %4, %6
-%endmacro
-
-%macro FILT_H 3
- psubw %1, %2 ; a-b
- psraw %1, 2 ; (a-b)/4
- psubw %1, %2 ; (a-b)/4-b
- paddw %1, %3 ; (a-b)/4-b+c
- psraw %1, 2 ; ((a-b)/4-b+c)/4
- paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-%endmacro
-
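For the record, the FILT_H macro deleted above evaluates the 6-tap sum (a-5*b+20*c)/16 with nothing but subtracts, adds, and arithmetic shifts, exactly as its inline comments derive. A scalar C model of that decomposition (illustrative only; filt_h is not a name from the source, and the identity assumes arithmetic right shift as psraw performs, with equality to the closed form holding up to shift truncation):

    /* ((a-b)>>2 - b + c)>>2 + c  ==  (a - 5*b + 20*c)/16, modulo the
     * truncation of each arithmetic shift (psraw rounds toward -inf). */
    static short filt_h(short a, short b, short c)
    {
        short t = a - b;    /* a-b              */
        t >>= 2;            /* (a-b)/4          */
        t -= b;             /* (a-b)/4 - b      */
        t += c;             /* (a-b)/4 - b + c  */
        t >>= 2;            /* ((a-b)/4-b+c)/4  */
        return t + c;       /* (a-5*b+20*c)/16  */
    }

Expanding confirms the asm comment: ((a-b)/4 - b + c)/4 + c = a/16 - b/16 - 4*b/16 + 4*c/16 + 16*c/16 = (a - 5*b + 20*c)/16.
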
-%macro FILT_H2 6
- psubw %1, %2
- psubw %4, %5
- psraw %1, 2
- psraw %4, 2
- psubw %1, %2
- psubw %4, %5
- paddw %1, %3
- paddw %4, %6
- psraw %1, 2
- psraw %4, 2
- paddw %1, %3
- paddw %4, %6
-%endmacro
-
-%macro FILT_PACK 3-5
-%if cpuflag(ssse3)
- pmulhrsw %1, %3
- pmulhrsw %2, %3
-%else
- paddw %1, %3
- paddw %2, %3
-%if %0 == 5
- psubusw %1, %5
- psubusw %2, %5
- psrlw %1, %4
- psrlw %2, %4
-%else
- psraw %1, %4
- psraw %2, %4
-%endif
-%endif
-%if HIGH_BIT_DEPTH == 0
- packuswb %1, %2
-%endif
-%endmacro
-
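FILT_PACK, also deleted above, rounds and narrows two registers at once; the SSSE3 branch folds the rounding constant and the final shift into one pmulhrsw. For the constant pairing this file used (pw_1024 against a shift of 5), the two branches agree, as the following sketch illustrates (filt_pack_round is an illustrative name, not from the source):

    /* pmulhrsw x, 1024:  (((x*1024) >> 14) + 1) >> 1
     * paddw/psraw path:  (x + 16) >> 5
     * Both equal floor((x+16)/32) for any 16-bit x, assuming arithmetic
     * right shift on signed values, which is what psraw performs. */
    static short filt_pack_round(short x)
    {
        short ssse3    = (short)((((x * 1024) >> 14) + 1) >> 1);
        short fallback = (short)((x + 16) >> 5);
        return ssse3 == fallback ? ssse3 : fallback; /* always equal */
    }

The unsigned psubusw/psrlw variant taken when a fifth argument is passed (the HIGH_BIT_DEPTH depad path) is not modeled here.
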
;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Doing the hpel_filter temporal may be a win if the last level cache
@@ -161,738 +68,7 @@
;%define movntps movaps
;%define sfence
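Those defines swap the streaming stores (movntq/movntps) for ordinary cached ones. In intrinsics terms, the pattern being toggled looks roughly like the sketch below (SSE2; copy_nt is a hypothetical name, and _mm_stream_si128 emits movntdq, the integer analogue of movntps):

    #include <stddef.h>
    #include <emmintrin.h>

    /* Streaming copy: stores bypass the cache, and the trailing
     * _mm_sfence() (the asm's sfence) makes them globally visible.
     * Assumes 16-byte-aligned buffers and a multiple-of-16 size. */
    static void copy_nt(void *dst, const void *src, size_t bytes)
    {
        __m128i       *d = (__m128i *)dst;
        const __m128i *s = (const __m128i *)src;
        for (size_t i = 0; i < bytes / 16; i++)
            _mm_stream_si128(d + i, _mm_load_si128(s + i));
        _mm_sfence();
    }

Redefining movntps to movaps (and sfence to nothing) turns exactly this back into temporal stores for the last-level-cache experiment the comment describes.
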
-%if HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
-;-----------------------------------------------------------------------------
-%macro HPEL_FILTER 0
-cglobal hpel_filter_v, 5,6,11
- FIX_STRIDES r3, r4
- lea r5, [r1+r3]
- sub r1, r3
- sub r1, r3
-%if num_mmregs > 8
- mova m8, [pad10]
- mova m9, [pad20]
- mova m10, [pad30]
- %define s10 m8
- %define s20 m9
- %define s30 m10
-%else
- %define s10 [pad10]
- %define s20 [pad20]
- %define s30 [pad30]
-%endif
- add r0, r4
- add r2, r4
- neg r4
- mova m7, [pw_pixel_max]
- pxor m0, m0
-.loop:
- mova m1, [r1]
- mova m2, [r1+r3]
- mova m3, [r1+r3*2]
- mova m4, [r1+mmsize]
- mova m5, [r1+r3+mmsize]
- mova m6, [r1+r3*2+mmsize]
- paddw m1, [r5+r3*2]
- paddw m2, [r5+r3]
- paddw m3, [r5]
- paddw m4, [r5+r3*2+mmsize]
- paddw m5, [r5+r3+mmsize]
- paddw m6, [r5+mmsize]
- add r1, 2*mmsize
- add r5, 2*mmsize
- FILT_V2 m1, m2, m3, m4, m5, m6
- mova m6, [pw_16]
- psubw m1, s20
- psubw m4, s20
- mova [r2+r4], m1
- mova [r2+r4+mmsize], m4
- paddw m1, s30
- paddw m4, s30
- FILT_PACK m1, m4, m6, 5, s10
- CLIPW m1, m0, m7
- CLIPW m4, m0, m7
- mova [r0+r4], m1
- mova [r0+r4+mmsize], m4
- add r4, 2*mmsize
- jl .loop
- RET
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
-;-----------------------------------------------------------------------------
-cglobal hpel_filter_c, 3,3,10
- add r2, r2
- add r0, r2
- add r1, r2
- neg r2
- mova m0, [tap1]
- mova m7, [tap3]
-%if num_mmregs > 8
- mova m8, [tap2]
- mova m9, [depad]
- %define s1 m8
- %define s2 m9
-%else
- %define s1 [tap2]
- %define s2 [depad]
-%endif
-.loop:
- movu m1, [r1+r2-4]
- movu m2, [r1+r2-2]
- mova m3, [r1+r2+0]
- movu m4, [r1+r2+2]
- movu m5, [r1+r2+4]
- movu m6, [r1+r2+6]
- pmaddwd m1, m0
- pmaddwd m2, m0
- pmaddwd m3, s1
- pmaddwd m4, s1
- pmaddwd m5, m7
- pmaddwd m6, m7
- paddd m1, s2
- paddd m2, s2
- paddd m3, m5
- paddd m4, m6
- paddd m1, m3
- paddd m2, m4
- psrad m1, 10
- psrad m2, 10
- pslld m2, 16
- pand m1, [pd_0f]
- por m1, m2
- CLIPW m1, [pb_0], [pw_pixel_max]
- mova [r0+r2], m1
- add r2, mmsize
- jl .loop
- RET
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
-;-----------------------------------------------------------------------------
-cglobal hpel_filter_h, 3,4,8
- %define src r1+r2
- add r2, r2
- add r0, r2
- add r1, r2
- neg r2
- mova m0, [pw_pixel_max]
-.loop:
- movu m1, [src-4]
- movu m2, [src-2]
- mova m3, [src+0]
- movu m6, [src+2]
- movu m4, [src+4]
- movu m5, [src+6]
- paddw m3, m6 ; c0
- paddw m2, m4 ; b0
- paddw m1, m5 ; a0
-%if mmsize == 16
- movu m4, [src-4+mmsize]
- movu m5, [src-2+mmsize]
-%endif
- movu m7, [src+4+mmsize]
- movu m6, [src+6+mmsize]
- paddw m5, m7 ; b1
- paddw m4, m6 ; a1
- movu m7, [src+2+mmsize]
- mova m6, [src+0+mmsize]
- paddw m6, m7 ; c1
- FILT_H2 m1, m2, m3, m4, m5, m6
- mova m7, [pw_1]
- pxor m2, m2
- FILT_PACK m1, m4, m7, 1
- CLIPW m1, m2, m0
- CLIPW m4, m2, m0
- mova [r0+r2], m1
- mova [r0+r2+mmsize], m4
- add r2, mmsize*2
- jl .loop
- RET
-%endmacro ; HPEL_FILTER
-
-INIT_MMX mmx2
-HPEL_FILTER
-INIT_XMM sse2
-HPEL_FILTER
-%endif ; HIGH_BIT_DEPTH
-
%if HIGH_BIT_DEPTH == 0
-%macro HPEL_V 1
-;-----------------------------------------------------------------------------
-; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
-;-----------------------------------------------------------------------------
-cglobal hpel_filter_v, 5,6,%1
- lea r5, [r1+r3]
- sub r1, r3
- sub r1, r3
- add r0, r4
- lea r2, [r2+r4*2]
- neg r4
-%if cpuflag(ssse3)
- mova m0, [filt_mul15]
-%else
- pxor m0, m0
-%endif
-.loop:
-%if cpuflag(ssse3)
- mova m1, [r1]
- mova m4, [r1+r3]
- mova m2, [r5+r3*2]
- mova m5, [r5+r3]
- mova m3, [r1+r3*2]
- mova m6, [r5]
- SBUTTERFLY bw, 1, 4, 7
- SBUTTERFLY bw, 2, 5, 7
- SBUTTERFLY bw, 3, 6, 7
- pmaddubsw m1, m0
- pmaddubsw m4, m0
- pmaddubsw m2, m0
- pmaddubsw m5, m0
- pmaddubsw m3, [filt_mul20]
- pmaddubsw m6, [filt_mul20]
- paddw m1, m2
- paddw m4, m5
- paddw m1, m3
- paddw m4, m6
- mova m7, [pw_1024]
-%else
- LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
- LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
- LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
- LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
- FILT_V2 m1, m2, m3, m4, m5, m6
- mova m7, [pw_16]
-%endif
-%if mmsize==32
- mova [r2+r4*2], xm1
- mova [r2+r4*2+mmsize/2], xm4
- vextracti128 [r2+r4*2+mmsize], m1, 1
- vextracti128 [r2+r4*2+mmsize*3/2], m4, 1
-%else
- mova [r2+r4*2], m1
- mova [r2+r4*2+mmsize], m4
-%endif
- FILT_PACK m1, m4, m7, 5
- movnta [r0+r4], m1
- add r1, mmsize
- add r5, mmsize
- add r4, mmsize
- jl .loop
- RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal hpel_filter_c, 3,3
- add r0, r2
- lea r1, [r1+r2*2]
- neg r2
- %define src r1+r2*2
- movq m7, [pw_32]
-.loop:
- movq m1, [src-4]
- movq m2, [src-2]
- movq m3, [src ]
- movq m4, [src+4]
- movq m5, [src+6]
- paddw m3, [src+2] ; c0
- paddw m2, m4 ; b0
- paddw m1, m5 ; a0
- movq m6, [src+8]
- paddw m4, [src+14] ; a1
- paddw m5, [src+12] ; b1
- paddw m6, [src+10] ; c1
- FILT_H2 m1, m2, m3, m4, m5, m6
- FILT_PACK m1, m4, m7, 6
- movntq [r0+r2], m1
- add r2, 8
- jl .loop
- RET
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal hpel_filter_h, 3,3
- add r0, r2
- add r1, r2
- neg r2
- %define src r1+r2
- pxor m0, m0
-.loop:
- movd m1, [src-2]
- movd m2, [src-1]
- movd m3, [src ]
- movd m6, [src+1]
- movd m4, [src+2]
- movd m5, [src+3]
- punpcklbw m1, m0
- punpcklbw m2, m0
- punpcklbw m3, m0
- punpcklbw m6, m0
- punpcklbw m4, m0
- punpcklbw m5, m0
- paddw m3, m6 ; c0
- paddw m2, m4 ; b0
- paddw m1, m5 ; a0
- movd m7, [src+7]
- movd m6, [src+6]
- punpcklbw m7, m0
- punpcklbw m6, m0
- paddw m4, m7 ; c1
- paddw m5, m6 ; b1
- movd m7, [src+5]
- movd m6, [src+4]
- punpcklbw m7, m0
- punpcklbw m6, m0
- paddw m6, m7 ; a1
- movq m7, [pw_1]
- FILT_H2 m1, m2, m3, m4, m5, m6
- FILT_PACK m1, m4, m7, 1
- movntq [r0+r2], m1
- add r2, 8
- jl .loop
- RET
-
-%macro HPEL_C 0
-;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
-;-----------------------------------------------------------------------------
-cglobal hpel_filter_c, 3,3,9
- add r0, r2
- lea r1, [r1+r2*2]
- neg r2
- %define src r1+r2*2
-%ifnidn cpuname, sse2
-%if cpuflag(ssse3)
- mova m7, [pw_512]
-%else
- mova m7, [pw_32]
-%endif
- %define pw_rnd m7
-%elif ARCH_X86_64
- mova m8, [pw_32]
- %define pw_rnd m8
-%else
- %define pw_rnd [pw_32]
-%endif
-; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
-%if mmsize==32
-.loop:
- movu m4, [src-4]
- movu m5, [src-2]
- mova m6, [src+0]
- movu m3, [src-4+mmsize]
- movu m2, [src-2+mmsize]
- mova m1, [src+0+mmsize]
- paddw m4, [src+6]
- paddw m5, [src+4]
- paddw m6, [src+2]
- paddw m3, [src+6+mmsize]
- paddw m2, [src+4+mmsize]
- paddw m1, [src+2+mmsize]
- FILT_H2 m4, m5, m6, m3, m2, m1
-%else
- mova m0, [src-16]
- mova m1, [src]
-.loop:
- mova m2, [src+16]
- PALIGNR m4, m1, m0, 12, m7
- PALIGNR m5, m1, m0, 14, m0
- PALIGNR m0, m2, m1, 6, m7
- paddw m4, m0
- PALIGNR m0, m2, m1, 4, m7
- paddw m5, m0
- PALIGNR m6, m2, m1, 2, m7
- paddw m6, m1
- FILT_H m4, m5, m6
-
- mova m0, m2
- mova m5, m2
- PALIGNR m2, m1, 12, m7
- PALIGNR m5, m1, 14, m1
- mova m1, [src+32]
- PALIGNR m3, m1, m0, 6, m7
- paddw m3, m2
- PALIGNR m6, m1, m0, 4, m7
- paddw m5, m6
- PALIGNR m6, m1, m0, 2, m7
- paddw m6, m0
- FILT_H m3, m5, m6
-%endif
- FILT_PACK m4, m3, pw_rnd, 6
-%if mmsize==32
- vpermq m4, m4, q3120
-%endif
- movnta [r0+r2], m4
- add r2, mmsize
- jl .loop
- RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal hpel_filter_h, 3,3,8
- add r0, r2
- add r1, r2
- neg r2
- %define src r1+r2
- pxor m0, m0
-.loop:
- movh m1, [src-2]
- movh m2, [src-1]
- movh m3, [src ]
- movh m4, [src+1]
- movh m5, [src+2]
- movh m6, [src+3]
- punpcklbw m1, m0
- punpcklbw m2, m0
- punpcklbw m3, m0
- punpcklbw m4, m0
- punpcklbw m5, m0
- punpcklbw m6, m0
- paddw m3, m4 ; c0
- paddw m2, m5 ; b0
- paddw m1, m6 ; a0
- movh m4, [src+6]
- movh m5, [src+7]
- movh m6, [src+10]
- movh m7, [src+11]
- punpcklbw m4, m0
- punpcklbw m5, m0
- punpcklbw m6, m0
- punpcklbw m7, m0
- paddw m5, m6 ; b1
- paddw m4, m7 ; a1
- movh m6, [src+8]
- movh m7, [src+9]
- punpcklbw m6, m0
- punpcklbw m7, m0
- paddw m6, m7 ; c1
- mova m7, [pw_1] ; FIXME xmm8
- FILT_H2 m1, m2, m3, m4, m5, m6
- FILT_PACK m1, m4, m7, 1
- movntps [r0+r2], m1
- add r2, 16
- jl .loop
- RET
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
-;-----------------------------------------------------------------------------
-%macro HPEL_H 0
-cglobal hpel_filter_h, 3,3
- add r0, r2
- add r1, r2
- neg r2
- %define src r1+r2
- mova m0, [src-16]
- mova m1, [src]
- mova m7, [pw_1024]
-.loop:
- mova m2, [src+16]
- ; Using unaligned loads instead of palignr is marginally slower on SB and significantly
- ; slower on Bulldozer, despite their fast load units -- even though it would let us avoid
- ; the repeated loads of constants for pmaddubsw.
- palignr m3, m1, m0, 14
- palignr m4, m1, m0, 15
- palignr m0, m2, m1, 2
- pmaddubsw m3, [filt_mul15]
- pmaddubsw m4, [filt_mul15]
- pmaddubsw m0, [filt_mul51]
- palignr m5, m2, m1, 1
- palignr m6, m2, m1, 3
- paddw m3, m0
- mova m0, m1
- pmaddubsw m1, [filt_mul20]
- pmaddubsw m5, [filt_mul20]
- pmaddubsw m6, [filt_mul51]
- paddw m3, m1
- paddw m4, m5
- paddw m4, m6
- FILT_PACK m3, m4, m7, 5
- pshufb m3, [hpel_shuf]
- mova m1, m2
- movntps [r0+r2], m3
- add r2, 16
- jl .loop
- RET
-%endmacro
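The palignr remark inside HPEL_H above is easy to restate with intrinsics: one unaligned load at p+1 returns the same bytes as palignr applied to the two aligned vectors straddling it. A minimal sketch (SSSE3; load_off1 is an illustrative name, not from the source):

    #include <tmmintrin.h>

    /* Equivalent to _mm_loadu_si128((const __m128i *)(p + 1)) when p is
     * 16-byte aligned; the asm favors this form because the unaligned
     * load was measurably slower on Sandy Bridge and Bulldozer. */
    static __m128i load_off1(const unsigned char *p)
    {
        __m128i lo = _mm_load_si128((const __m128i *)p);        /* [p, p+16)    */
        __m128i hi = _mm_load_si128((const __m128i *)(p + 16)); /* [p+16, p+32) */
        return _mm_alignr_epi8(hi, lo, 1);  /* bytes 1..16 of lo:hi */
    }
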
-
-INIT_MMX mmx2
-HPEL_V 0
-INIT_XMM sse2
-HPEL_V 8
-%if ARCH_X86_64 == 0
-INIT_XMM sse2
-HPEL_C
-INIT_XMM ssse3
-HPEL_C
-HPEL_V 0
-HPEL_H
-INIT_XMM avx
-HPEL_C
-HPEL_V 0
-HPEL_H
-INIT_YMM avx2
-HPEL_V 8
-HPEL_C
-
-INIT_YMM avx2
-cglobal hpel_filter_h, 3,3,8
- add r0, r2
- add r1, r2
- neg r2
- %define src r1+r2
- mova m5, [filt_mul15]
- mova m6, [filt_mul20]
- mova m7, [filt_mul51]
-.loop:
- movu m0, [src-2]
- movu m1, [src-1]
- movu m2, [src+2]
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m7
- paddw m0, m2
-
- mova m2, [src+0]
- movu m3, [src+1]
- movu m4, [src+3]
- pmaddubsw m2, m6
- pmaddubsw m3, m6
- pmaddubsw m4, m7
- paddw m0, m2
- paddw m1, m3
- paddw m1, m4
-
- mova m2, [pw_1024]
- FILT_PACK m0, m1, m2, 5
- pshufb m0, [hpel_shuf]
- movnta [r0+r2], m0
- add r2, mmsize
- jl .loop
- RET
-%endif
-
-%if ARCH_X86_64
-%macro DO_FILT_V 5
- ;The optimum prefetch distance is difficult to determine in checkasm:
- ;any prefetch seems slower than not prefetching.
- ;In real use, the prefetch seems to be a slight win.
- ;+mmsize is picked somewhat arbitrarily here based on the fact that even one
- ;loop iteration is going to take longer than the prefetch.
- prefetcht0 [r1+r2*2+mmsize]
-%if cpuflag(ssse3)
- mova m1, [r3]
- mova m2, [r3+r2]
- mova %3, [r3+r2*2]
- mova m3, [r1]
- mova %1, [r1+r2]
- mova %2, [r1+r2*2]
- punpckhbw m4, m1, m2
- punpcklbw m1, m2
- punpckhbw m2, %1, %2
- punpcklbw %1, %2
- punpckhbw %2, m3, %3
- punpcklbw m3, %3
-
- pmaddubsw m1, m12
- pmaddubsw m4, m12
- pmaddubsw %1, m0
- pmaddubsw m2, m0
- pmaddubsw m3, m14
- pmaddubsw %2, m14
-
- paddw m1, %1
- paddw m4, m2
- paddw m1, m3
- paddw m4, %2
-%else
- LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
- LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
- LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
- packuswb %3, %4
- FILT_V2 m1, m2, m3, m4, m5, m6
-%endif
- add r3, mmsize
- add r1, mmsize
-%if mmsize==32
- vinserti128 %1, m1, xm4, 1
- vperm2i128 %2, m1, m4, q0301
-%else
- mova %1, m1
- mova %2, m4
-%endif
- FILT_PACK m1, m4, m15, 5
- movntps [r8+r4+%5], m1
-%endmacro
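DO_FILT_V's prefetcht0 is the only prefetch in these routines, and the comment at its top explains why the distance is a guess: checkasm never rewards it, while real encodes slightly do. The same idea in C, with the distance as an explicit knob (sum_rows and dist are illustrative, not from the source):

    #include <stddef.h>
    #include <xmmintrin.h>

    /* Touch src[i+dist] ahead of use, like prefetcht0 [r1+r2*2+mmsize];
     * prefetch hints never fault, so running past the buffer end is safe. */
    static unsigned sum_rows(const unsigned char *src, size_t n, size_t dist)
    {
        unsigned acc = 0;
        for (size_t i = 0; i < n; i++) {
            _mm_prefetch((const char *)(src + i + dist), _MM_HINT_T0);
            acc += src[i];  /* stand-in for the vertical filter work */
        }
        return acc;
    }
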
-
-%macro FILT_C 3
-%if mmsize==32
- vperm2i128 m3, %2, %1, q0003
-%endif
- PALIGNR m1, %2, %1, (mmsize-4), m3
- PALIGNR m2, %2, %1, (mmsize-2), m3
-%if mmsize==32
- vperm2i128 %1, %3, %2, q0003
-%endif
- PALIGNR m3, %3, %2, 4, %1
- PALIGNR m4, %3, %2, 2, %1
- paddw m3, m2
-%if mmsize==32
- mova m2, %1
-%endif
- mova %1, %3
- PALIGNR %3, %3, %2, 6, m2
- paddw m4, %2
- paddw %3, m1
- FILT_H %3, m3, m4
-%endmacro
-
-%macro DO_FILT_C 4
- FILT_C %1, %2, %3
- FILT_C %2, %1, %4
- FILT_PACK %3, %4, m15, 6
-%if mmsize==32
- vpermq %3, %3, q3120
-%endif
- movntps [r5+r4], %3
-%endmacro
-
-%macro ADD8TO16 5
- punpckhbw %3, %1, %5
- punpcklbw %1, %5
- punpcklbw %4, %2, %5
- punpckhbw %2, %5
- paddw %2, %3
- paddw %1, %4
-%endmacro
-
-%macro DO_FILT_H 3
-%if mmsize==32
- vperm2i128 m3, %2, %1, q0003
-%endif
- PALIGNR m1, %2, %1, (mmsize-2), m3
- PALIGNR m2, %2, %1, (mmsize-1), m3
-%if mmsize==32
- vperm2i128 m3, %3, %2, q0003
-%endif
- PALIGNR m4, %3, %2, 1 , m3
- PALIGNR m5, %3, %2, 2 , m3
- PALIGNR m6, %3, %2, 3 , m3
- mova %1, %2
-%if cpuflag(ssse3)
- pmaddubsw m1, m12
- pmaddubsw m2, m12
- pmaddubsw %2, m14
- pmaddubsw m4, m14
- pmaddubsw m5, m0
- pmaddubsw m6, m0
- paddw m1, %2
- paddw m2, m4
- paddw m1, m5
- paddw m2, m6
- FILT_PACK m1, m2, m15, 5
- pshufb m1, [hpel_shuf]
-%else ; ssse3, avx
- ADD8TO16 m1, m6, m12, m3, m0 ; a
- ADD8TO16 m2, m5, m12, m3, m0 ; b
- ADD8TO16 %2, m4, m12, m3, m0 ; c
- FILT_V2 m1, m2, %2, m6, m5, m4
- FILT_PACK m1, m6, m15, 5
-%endif
- movntps [r0+r4], m1
- mova %2, %3
-%endmacro
-
-%macro HPEL 0
-;-----------------------------------------------------------------------------
-; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-; uint8_t *src, intptr_t stride, int width, int height )
-;-----------------------------------------------------------------------------
-cglobal hpel_filter, 7,9,16
- mov r7, r3
- sub r5d, mmsize
- mov r8, r1
- and r7, mmsize-1
- sub r3, r7
- add r0, r5
- add r8, r5
- add r7, r5
- add r5, r2
- mov r2, r4
- neg r7
- lea r1, [r3+r2]
- sub r3, r2
- sub r3, r2
- mov r4, r7
-%if cpuflag(ssse3)
- mova m0, [filt_mul51]
- mova m12, [filt_mul15]
- mova m14, [filt_mul20]
- mova m15, [pw_1024]
-%else
- pxor m0, m0
- mova m15, [pw_16]
-%endif
-;ALIGN 16
-.loopy:
-; first filter_v
- DO_FILT_V m8, m7, m13, m12, 0
-;ALIGN 16
-.loopx:
- DO_FILT_V m6, m5, m11, m12, mmsize
-.lastx:
-%if cpuflag(ssse3)
- psrlw m15, 1 ; pw_512
-%else
- paddw m15, m15 ; pw_32
-%endif
- DO_FILT_C m9, m8, m7, m6
-%if cpuflag(ssse3)
- paddw m15, m15 ; pw_1024
-%else
- psrlw m15, 1 ; pw_16
-%endif
- mova m7, m5
- DO_FILT_H m10, m13, m11
- add r4, mmsize
- jl .loopx
- cmp r4, mmsize
- jl .lastx
-; setup regs for next y
- sub r4, r7
- sub r4, r2
- sub r1, r4
- sub r3, r4
- add r0, r2
- add r8, r2
- add r5, r2
- mov r4, r7
- sub r6d, 1
- jg .loopy
- sfence
- RET
-%endmacro
-
-INIT_XMM sse2
-HPEL
-INIT_XMM ssse3
-HPEL
-INIT_XMM avx
-HPEL
-INIT_YMM avx2
-HPEL
-%endif ; ARCH_X86_64
-
%undef movntq
%undef movntps
%undef sfence