[x265] [PATCH] asm: removed unused code from pixel-a.asm
murugan at multicorewareinc.com
Mon Dec 2 07:49:53 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1385966974 -19800
# Mon Dec 02 12:19:34 2013 +0530
# Node ID 1695371f63a6cdef5ece9d17f94b286fc17cc29e
# Parent ace5b9ee099d1539f020e68971a27577148a4a29
asm: removed unused code from pixel-a.asm
diff -r ace5b9ee099d -r 1695371f63a6 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Dec 02 11:25:00 2013 +0530
+++ b/source/common/x86/pixel-a.asm Mon Dec 02 12:19:34 2013 +0530
@@ -2242,182 +2242,6 @@
movd eax, m7
RET
-;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
-; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
-;
-; The maximum width this function can handle without risk of overflow is given
-; in the following equation: (mmsize in bits)
-;
-; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
-;
-; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
-; distortion levels it will take much more than that though.
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-%macro SSD_NV12 0
-cglobal pixel_ssd_nv12_core, 6,7,7
- shl r4d, 2
- FIX_STRIDES r1, r3
- add r0, r4
- add r2, r4
- xor r6, r6
- pxor m4, m4
- pxor m5, m5
- pxor m6, m6
-.loopy:
- mov r6, r4
- neg r6
- pxor m2, m2
- pxor m3, m3
-.loopx:
- mova m0, [r0+r6]
- mova m1, [r0+r6+mmsize]
- psubw m0, [r2+r6]
- psubw m1, [r2+r6+mmsize]
- PSHUFLW m0, m0, q3120
- PSHUFLW m1, m1, q3120
-%if mmsize >= 16
- pshufhw m0, m0, q3120
- pshufhw m1, m1, q3120
-%endif
- pmaddwd m0, m0
- pmaddwd m1, m1
- paddd m2, m0
- paddd m3, m1
- add r6, 2*mmsize
- jl .loopx
-%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
- jz .no_overread
- psubd m3, m1
-.no_overread:
-%endif
-%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
- ; equation above, putting the width limit at 8208
- punpckhdq m0, m2, m6
- punpckhdq m1, m3, m6
- punpckldq m2, m6
- punpckldq m3, m6
- paddq m3, m2
- paddq m1, m0
- paddq m4, m3
- paddq m4, m1
-%else ; unfortunately paddq is sse2
- ; emulate 48 bit precision for mmx2 instead
- mova m0, m2
- mova m1, m3
- punpcklwd m2, m6
- punpcklwd m3, m6
- punpckhwd m0, m6
- punpckhwd m1, m6
- paddd m3, m2
- paddd m1, m0
- paddd m4, m3
- paddd m5, m1
-%endif
- add r0, r1
- add r2, r3
- dec r5d
- jg .loopy
- mov r3, r6m
- mov r4, r7m
-%if mmsize == 32
- vextracti128 xm0, m4, 1
- paddq xm4, xm0
-%endif
-%if mmsize >= 16
- movq [r3], xm4
- movhps [r4], xm4
-%else ; fixup for mmx2
- SBUTTERFLY dq, 4, 5, 0
- mova m0, m4
- psrld m4, 16
- paddd m5, m4
- pslld m0, 16
- SBUTTERFLY dq, 0, 5, 4
- psrlq m0, 16
- psrlq m5, 16
- movq [r3], m0
- movq [r4], m5
-%endif
- RET
-%endmacro ; SSD_NV12
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
-; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
-;
-; This implementation can potentially overflow on image widths >= 11008 (or
-; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
-; 20). At sane distortion levels it will take much more than that though.
-;-----------------------------------------------------------------------------
-%macro SSD_NV12 0
-cglobal pixel_ssd_nv12_core, 6,7
- add r4d, r4d
- add r0, r4
- add r2, r4
- pxor m3, m3
- pxor m4, m4
- mova m5, [pw_00ff]
-.loopy:
- mov r6, r4
- neg r6
-.loopx:
-%if mmsize == 32 ; only 16-byte alignment is guaranteed
- movu m2, [r0+r6]
- movu m1, [r2+r6]
-%else
- mova m2, [r0+r6]
- mova m1, [r2+r6]
-%endif
- psubusb m0, m2, m1
- psubusb m1, m2
- por m0, m1
- psrlw m2, m0, 8
- pand m0, m5
- pmaddwd m2, m2
- pmaddwd m0, m0
- paddd m3, m0
- paddd m4, m2
- add r6, mmsize
- jl .loopx
-%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
- jz .no_overread
- pcmpeqb xm1, xm1
- pandn m0, m1, m0 ; zero the lower half
- pandn m2, m1, m2
- psubd m3, m0
- psubd m4, m2
-.no_overread:
-%endif
- add r0, r1
- add r2, r3
- dec r5d
- jg .loopy
- mov r3, r6m
- mov r4, r7m
- HADDD m3, m0
- HADDD m4, m0
- pxor xm0, xm0
- punpckldq xm3, xm0
- punpckldq xm4, xm0
- movq [r3], xm3
- movq [r4], xm4
- RET
-%endmacro ; SSD_NV12
-%endif ; !HIGH_BIT_DEPTH
-
-INIT_MMX mmx2
-SSD_NV12
-INIT_XMM sse2
-SSD_NV12
-INIT_XMM avx
-SSD_NV12
-INIT_YMM avx2
-SSD_NV12
-
;=============================================================================
; variance
;=============================================================================
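[Editorial aside, not part of the patch: the removed HIGH_BIT_DEPTH comment above gives the overflow bound as 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2, with mmsize in bits. A minimal C sketch that just plugs the numbers in, to show where the quoted 16416 (MMX) and 32832 (XMM) figures come from; the function name max_width is illustrative, not from the source.]

#include <stdio.h>

/* Evaluate the overflow bound quoted in the removed ssd_nv12 comment:
 * 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2, mmsize in bits. */
static double max_width(int mmsize_bits, int bit_depth)
{
    double acc_max = 4294967295.0;                 /* 2^32 - 1, per dword accumulator */
    double max_sq  = (double)((1 << bit_depth) - 1); /* largest per-sample difference */
    return 2.0 * (mmsize_bits / 32) * acc_max / (max_sq * max_sq);
}

int main(void)
{
    printf("10-bit MMX  (mmsize =  64 bits): width ~ %.0f\n", max_width(64, 10));  /* ~16416 */
    printf("10-bit XMM  (mmsize = 128 bits): width ~ %.0f\n", max_width(128, 10)); /* ~32832 */
    return 0;
}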
@@ -2841,183 +2665,6 @@
RET
%endmacro
-;-----------------------------------------------------------------------------
-; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
-;-----------------------------------------------------------------------------
-%macro VAR2_8x8_MMX 2
-cglobal pixel_var2_8x%1, 5,6
- FIX_STRIDES r1, r3
- VAR_START 0
- mov r5d, %1
-.loop:
-%if HIGH_BIT_DEPTH
- mova m0, [r0]
- mova m1, [r0+mmsize]
- psubw m0, [r2]
- psubw m1, [r2+mmsize]
-%else ; !HIGH_BIT_DEPTH
- movq m0, [r0]
- movq m1, m0
- movq m2, [r2]
- movq m3, m2
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- psubw m0, m2
- psubw m1, m3
-%endif ; HIGH_BIT_DEPTH
- paddw m5, m0
- paddw m5, m1
- pmaddwd m0, m0
- pmaddwd m1, m1
- paddd m6, m0
- paddd m6, m1
- add r0, r1
- add r2, r3
- dec r5d
- jg .loop
- VAR2_END %2, m5, m6
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-VAR2_8x8_MMX 8, 6
-VAR2_8x8_MMX 16, 7
-%endif
-
-%macro VAR2_8x8_SSE2 2
-cglobal pixel_var2_8x%1, 5,6,8
- VAR_START 1
- mov r5d, %1/2
-.loop:
-%if HIGH_BIT_DEPTH
- mova m0, [r0]
- mova m1, [r0+r1*2]
- mova m2, [r2]
- mova m3, [r2+r3*2]
-%else ; !HIGH_BIT_DEPTH
- movq m1, [r0]
- movhps m1, [r0+r1]
- movq m3, [r2]
- movhps m3, [r2+r3]
- DEINTB 0, 1, 2, 3, 7
-%endif ; HIGH_BIT_DEPTH
- psubw m0, m2
- psubw m1, m3
- paddw m5, m0
- paddw m5, m1
- pmaddwd m0, m0
- pmaddwd m1, m1
- paddd m6, m0
- paddd m6, m1
- lea r0, [r0+r1*2*SIZEOF_PIXEL]
- lea r2, [r2+r3*2*SIZEOF_PIXEL]
- dec r5d
- jg .loop
- VAR2_END %2, m5, m6
-%endmacro
-
-INIT_XMM sse2
-VAR2_8x8_SSE2 8, 6
-VAR2_8x8_SSE2 16, 7
-
-%if HIGH_BIT_DEPTH == 0
-%macro VAR2_8x8_SSSE3 2
-cglobal pixel_var2_8x%1, 5,6,8
- pxor m5, m5 ; sum
- pxor m6, m6 ; sum squared
- mova m7, [hsub_mul]
- mov r5d, %1/4
-.loop:
- movq m0, [r0]
- movq m2, [r2]
- movq m1, [r0+r1]
- movq m3, [r2+r3]
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- punpcklbw m0, m2
- punpcklbw m1, m3
- movq m2, [r0]
- movq m3, [r2]
- punpcklbw m2, m3
- movq m3, [r0+r1]
- movq m4, [r2+r3]
- punpcklbw m3, m4
- pmaddubsw m0, m7
- pmaddubsw m1, m7
- pmaddubsw m2, m7
- pmaddubsw m3, m7
- paddw m5, m0
- paddw m5, m1
- paddw m5, m2
- paddw m5, m3
- pmaddwd m0, m0
- pmaddwd m1, m1
- pmaddwd m2, m2
- pmaddwd m3, m3
- paddd m6, m0
- paddd m6, m1
- paddd m6, m2
- paddd m6, m3
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- dec r5d
- jg .loop
- VAR2_END %2, m5, m6
-%endmacro
-
-INIT_XMM ssse3
-VAR2_8x8_SSSE3 8, 6
-VAR2_8x8_SSSE3 16, 7
-INIT_XMM xop
-VAR2_8x8_SSSE3 8, 6
-VAR2_8x8_SSSE3 16, 7
-
-%macro VAR2_8x8_AVX2 2
-cglobal pixel_var2_8x%1, 5,6,6
- pxor m3, m3 ; sum
- pxor m4, m4 ; sum squared
- mova m5, [hsub_mul]
- mov r5d, %1/4
-.loop:
- movq xm0, [r0]
- movq xm1, [r2]
- vinserti128 m0, m0, [r0+r1], 1
- vinserti128 m1, m1, [r2+r3], 1
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- punpcklbw m0, m1
- movq xm1, [r0]
- movq xm2, [r2]
- vinserti128 m1, m1, [r0+r1], 1
- vinserti128 m2, m2, [r2+r3], 1
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- punpcklbw m1, m2
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- paddw m3, m0
- paddw m3, m1
- pmaddwd m0, m0
- pmaddwd m1, m1
- paddd m4, m0
- paddd m4, m1
- dec r5d
- jg .loop
- vextracti128 xm0, m3, 1
- vextracti128 xm1, m4, 1
- paddw xm3, xm0
- paddd xm4, xm1
- VAR2_END %2, xm3, xm4
-%endmacro
-
-INIT_YMM avx2
-VAR2_8x8_AVX2 8, 6
-VAR2_8x8_AVX2 16, 7
-
-%endif ; !HIGH_BIT_DEPTH
-
;=============================================================================
; SATD
;=============================================================================
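[Editorial note on the hunk above: the removed pixel_var2_8x8 variants accumulate the sum and the sum of squares of the residual (pix1 - pix2) over an 8-wide block; the final combine is done by VAR2_END, which is defined outside this hunk. Assuming VAR2_END behaves like the usual x264 definition (return sqr - (sum*sum >> shift) and store the SSD through the int pointer), a hedged C reference follows. The name var2_8xh_ref and the height/shift parameters are illustrative; the shift values 6 and 7 mirror the macro's second argument for the 8x8 and 8x16 cases.]

#include <stdint.h>

typedef uint8_t pixel;

/* Hedged C reference for the removed pixel_var2_8x8 assembly. The final
 * combine step assumes VAR2_END's behaviour, which is not shown in this hunk. */
static int var2_8xh_ref(const pixel *pix1, intptr_t i_stride1,
                        const pixel *pix2, intptr_t i_stride2,
                        int *ssd, int height, int shift)
{
    int sum = 0;
    uint32_t sqr = 0;
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < 8; x++)
        {
            int diff = pix1[x] - pix2[x];
            sum += diff;          /* running sum of the residual        */
            sqr += diff * diff;   /* running sum of squares (the SSD)   */
        }
        pix1 += i_stride1;
        pix2 += i_stride2;
    }
    *ssd = (int)sqr;
    /* shift = 6 for 8x8, 7 for 8x16, matching VAR2_8x8_* %2 above */
    return (int)(sqr - (((int64_t)sum * sum) >> shift));
}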
@@ -9009,446 +8656,6 @@
INIT_XMM avx
SSIM
-;-----------------------------------------------------------------------------
-; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
-;-----------------------------------------------------------------------------
-%macro ASD8 0
-cglobal pixel_asd8, 5,5
- pxor m0, m0
- pxor m1, m1
-.loop:
-%if HIGH_BIT_DEPTH
- paddw m0, [r0]
- paddw m1, [r2]
- paddw m0, [r0+2*r1]
- paddw m1, [r2+2*r3]
- lea r0, [r0+4*r1]
- paddw m0, [r0]
- paddw m1, [r2+4*r3]
- lea r2, [r2+4*r3]
- paddw m0, [r0+2*r1]
- paddw m1, [r2+2*r3]
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
-%else
- movq m2, [r0]
- movq m3, [r2]
- movhps m2, [r0+r1]
- movhps m3, [r2+r3]
- lea r0, [r0+2*r1]
- psadbw m2, m1
- psadbw m3, m1
- movq m4, [r0]
- movq m5, [r2+2*r3]
- lea r2, [r2+2*r3]
- movhps m4, [r0+r1]
- movhps m5, [r2+r3]
- lea r0, [r0+2*r1]
- paddw m0, m2
- psubw m0, m3
- psadbw m4, m1
- psadbw m5, m1
- lea r2, [r2+2*r3]
- paddw m0, m4
- psubw m0, m5
-%endif
- sub r4d, 4
- jg .loop
-%if HIGH_BIT_DEPTH
- psubw m0, m1
- HADDW m0, m1
- ABSD m1, m0
-%else
- movhlps m1, m0
- paddw m0, m1
- ABSW m1, m0
-%endif
- movd eax, m1
- RET
-%endmacro
-
-INIT_XMM sse2
-ASD8
-INIT_XMM ssse3
-ASD8
-%if HIGH_BIT_DEPTH
-INIT_XMM xop
-ASD8
-%endif
-
-;=============================================================================
-; Successive Elimination ADS
-;=============================================================================
-
-%macro ADS_START 0
-%if UNIX64
- movsxd r5, r5d
-%else
- mov r5d, r5m
-%endif
- mov r0d, r5d
- lea r6, [r4+r5+(mmsize-1)]
- and r6, ~(mmsize-1)
- shl r2d, 1
-%endmacro
-
-%macro ADS_END 1 ; unroll_size
- add r1, 8*%1
- add r3, 8*%1
- add r6, 4*%1
- sub r0d, 4*%1
- jg .loop
- WIN64_RESTORE_XMM rsp
-%if mmsize==32
- vzeroupper
-%endif
- lea r6, [r4+r5+(mmsize-1)]
- and r6, ~(mmsize-1)
-%if cpuflag(ssse3)
- jmp ads_mvs_ssse3
-%else
- jmp ads_mvs_mmx
-%endif
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
-; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_ads4, 5,7
- mova m6, [r0]
- mova m4, [r0+8]
- pshufw m7, m6, 0
- pshufw m6, m6, q2222
- pshufw m5, m4, 0
- pshufw m4, m4, q2222
- ADS_START
-.loop:
- movu m0, [r1]
- movu m1, [r1+16]
- psubw m0, m7
- psubw m1, m6
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- movu m2, [r1+r2]
- movu m3, [r1+r2+16]
- psubw m2, m5
- psubw m3, m4
- paddw m0, m1
- ABSW m2, m2, m1
- ABSW m3, m3, m1
- paddw m0, m2
- paddw m0, m3
- pshufw m1, r6m, 0
- paddusw m0, [r3]
- psubusw m1, m0
- packsswb m1, m1
- movd [r6], m1
- ADS_END 1
-
-cglobal pixel_ads2, 5,7
- mova m6, [r0]
- pshufw m5, r6m, 0
- pshufw m7, m6, 0
- pshufw m6, m6, q2222
- ADS_START
-.loop:
- movu m0, [r1]
- movu m1, [r1+r2]
- psubw m0, m7
- psubw m1, m6
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- paddw m0, m1
- paddusw m0, [r3]
- mova m4, m5
- psubusw m4, m0
- packsswb m4, m4
- movd [r6], m4
- ADS_END 1
-
-cglobal pixel_ads1, 5,7
- pshufw m7, [r0], 0
- pshufw m6, r6m, 0
- ADS_START
-.loop:
- movu m0, [r1]
- movu m1, [r1+8]
- psubw m0, m7
- psubw m1, m7
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- paddusw m0, [r3]
- paddusw m1, [r3+8]
- mova m4, m6
- mova m5, m6
- psubusw m4, m0
- psubusw m5, m1
- packsswb m4, m5
- mova [r6], m4
- ADS_END 2
-
-%macro ADS_XMM 0
-%if mmsize==32
-cglobal pixel_ads4, 5,7,8
- vpbroadcastw m7, [r0+ 0]
- vpbroadcastw m6, [r0+ 4]
- vpbroadcastw m5, [r0+ 8]
- vpbroadcastw m4, [r0+12]
-%else
-cglobal pixel_ads4, 5,7,12
- mova m4, [r0]
- pshuflw m7, m4, q0000
- pshuflw m6, m4, q2222
- pshufhw m5, m4, q0000
- pshufhw m4, m4, q2222
- punpcklqdq m7, m7
- punpcklqdq m6, m6
- punpckhqdq m5, m5
- punpckhqdq m4, m4
-%endif
-%if ARCH_X86_64 && mmsize == 16
- movd m8, r6m
- SPLATW m8, m8
- ADS_START
- movu m10, [r1]
- movu m11, [r1+r2]
-.loop:
- psubw m0, m10, m7
- movu m10, [r1+16]
- psubw m1, m10, m6
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- psubw m2, m11, m5
- movu m11, [r1+r2+16]
- paddw m0, m1
- psubw m3, m11, m4
- movu m9, [r3]
- ABSW m2, m2, m1
- ABSW m3, m3, m1
- paddw m0, m2
- paddw m0, m3
- paddusw m0, m9
- psubusw m1, m8, m0
-%else
- ADS_START
-.loop:
- movu m0, [r1]
- movu m1, [r1+16]
- psubw m0, m7
- psubw m1, m6
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- movu m2, [r1+r2]
- movu m3, [r1+r2+16]
- psubw m2, m5
- psubw m3, m4
- paddw m0, m1
- ABSW m2, m2, m1
- ABSW m3, m3, m1
- paddw m0, m2
- paddw m0, m3
- movu m2, [r3]
-%if mmsize==32
- vpbroadcastw m1, r6m
-%else
- movd m1, r6m
- pshuflw m1, m1, 0
- punpcklqdq m1, m1
-%endif
- paddusw m0, m2
- psubusw m1, m0
-%endif ; ARCH
- packsswb m1, m1
-%if mmsize==32
- vpermq m1, m1, q3120
- mova [r6], xm1
-%else
- movh [r6], m1
-%endif
- ADS_END mmsize/8
-
-cglobal pixel_ads2, 5,7,8
-%if mmsize==32
- vpbroadcastw m7, [r0+0]
- vpbroadcastw m6, [r0+4]
- vpbroadcastw m5, r6m
-%else
- movq m6, [r0]
- movd m5, r6m
- pshuflw m7, m6, 0
- pshuflw m6, m6, q2222
- pshuflw m5, m5, 0
- punpcklqdq m7, m7
- punpcklqdq m6, m6
- punpcklqdq m5, m5
-%endif
- ADS_START
-.loop:
- movu m0, [r1]
- movu m1, [r1+r2]
- psubw m0, m7
- psubw m1, m6
- movu m4, [r3]
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- paddw m0, m1
- paddusw m0, m4
- psubusw m1, m5, m0
- packsswb m1, m1
-%if mmsize==32
- vpermq m1, m1, q3120
- mova [r6], xm1
-%else
- movh [r6], m1
-%endif
- ADS_END mmsize/8
-
-cglobal pixel_ads1, 5,7,8
-%if mmsize==32
- vpbroadcastw m7, [r0]
- vpbroadcastw m6, r6m
-%else
- movd m7, [r0]
- movd m6, r6m
- pshuflw m7, m7, 0
- pshuflw m6, m6, 0
- punpcklqdq m7, m7
- punpcklqdq m6, m6
-%endif
- ADS_START
-.loop:
- movu m0, [r1]
- movu m1, [r1+mmsize]
- psubw m0, m7
- psubw m1, m7
- movu m2, [r3]
- movu m3, [r3+mmsize]
- ABSW m0, m0, m4
- ABSW m1, m1, m5
- paddusw m0, m2
- paddusw m1, m3
- psubusw m4, m6, m0
- psubusw m5, m6, m1
- packsswb m4, m5
-%if mmsize==32
- vpermq m4, m4, q3120
-%endif
- mova [r6], m4
- ADS_END mmsize/4
-%endmacro
-
-INIT_XMM sse2
-ADS_XMM
-INIT_XMM ssse3
-ADS_XMM
-INIT_XMM avx
-ADS_XMM
-INIT_YMM avx2
-ADS_XMM
-
-; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
-; {
-; int nmv=0, i, j;
-; *(uint32_t*)(masks+width) = 0;
-; for( i=0; i<width; i+=8 )
-; {
-; uint64_t mask = *(uint64_t*)(masks+i);
-; if( !mask ) continue;
-; for( j=0; j<8; j++ )
-; if( mask & (255<<j*8) )
-; mvs[nmv++] = i+j;
-; }
-; return nmv;
-; }
-
-%macro TEST 1
- mov [r4+r0*2], r1w
- test r2d, 0xff<<(%1*8)
- setne r3b
- add r0d, r3d
- inc r1d
-%endmacro
-
-INIT_MMX mmx
-cglobal pixel_ads_mvs, 0,7,0
-ads_mvs_mmx:
- ; mvs = r4
- ; masks = r6
- ; width = r5
- ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
- xor r0d, r0d
- xor r1d, r1d
- mov [r6+r5], r0d
- jmp .loopi
-ALIGN 16
-.loopi0:
- add r1d, 8
- cmp r1d, r5d
- jge .end
-.loopi:
- mov r2, [r6+r1]
-%if ARCH_X86_64
- test r2, r2
-%else
- mov r3, r2
- add r3d, [r6+r1+4]
-%endif
- jz .loopi0
- xor r3d, r3d
- TEST 0
- TEST 1
- TEST 2
- TEST 3
-%if ARCH_X86_64
- shr r2, 32
-%else
- mov r2d, [r6+r1]
-%endif
- TEST 0
- TEST 1
- TEST 2
- TEST 3
- cmp r1d, r5d
- jl .loopi
-.end:
- movifnidn eax, r0d
- RET
-
-INIT_XMM ssse3
-cglobal pixel_ads_mvs, 0,7,0
-ads_mvs_ssse3:
- mova m3, [pw_8]
- mova m4, [pw_76543210]
- pxor m5, m5
- add r5, r6
- xor r0d, r0d ; nmv
- mov [r5], r0d
-%ifdef PIC
- lea r1, [$$]
- %define GLOBAL +r1-$$
-%else
- %define GLOBAL
-%endif
-.loop:
- movh m0, [r6]
- pcmpeqb m0, m5
- pmovmskb r2d, m0
- xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
- movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
- add r2d, r2d
- ; shuffle counters based on mv mask
- pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
- movu [r4+r0*2], m2
- add r0d, r3d
- paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
- add r6, 8
- cmp r6, r5
- jl .loop
- movifnidn eax, r0d
- RET
-
;-----------------------------------------------------------------
; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
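[Editorial sketch, not code from the patch: the removed ads_mvs assembly carries its own C reference in comments (the pixel_ads_mvs pseudocode in the hunk above). A compilable version of that pseudocode follows; the only behavioural change is widening the 255<<j*8 shift to 64 bits so it is well-defined in C for j >= 4.]

#include <stdint.h>

/* Compilable form of the pixel_ads_mvs pseudocode quoted in the removed asm.
 * Behaviour is taken from that comment, not re-derived from the assembly. */
static int ads_mvs_ref(int16_t *mvs, uint8_t *masks, int width)
{
    int nmv = 0;
    /* clear the block past the end, as the comment's masks+width store does */
    *(uint32_t*)(masks + width) = 0;
    for (int i = 0; i < width; i += 8)
    {
        uint64_t mask = *(uint64_t*)(masks + i);
        if (!mask)
            continue;
        for (int j = 0; j < 8; j++)
            if (mask & (255ULL << j * 8))   /* widened to 64-bit, otherwise UB */
                mvs[nmv++] = (int16_t)(i + j);
    }
    return nmv;
}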