[x265] [PATCH] asm: leading space nit
dtyx265 at gmail.com
dtyx265 at gmail.com
Sat Apr 18 19:02:36 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1429376539 25200
# Node ID 14b0bed44a7bc2f36b357a198104dd1cfaa4214c
# Parent 3ec6052eaf9c1c1e3a280fa6d3fb392902b2a849
asm: leading space nit
Added 4 leading spaces to asm instructions
diff -r 3ec6052eaf9c -r 14b0bed44a7b source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Fri Apr 17 14:02:26 2015 -0700
+++ b/source/common/x86/ipfilter8.asm Sat Apr 18 10:02:19 2015 -0700
@@ -344,76 +344,76 @@
;-----------------------------------------------------------------------------
INIT_XMM sse3
cglobal interp_4tap_horiz_pp_2x4, 4, 6, 6, src, srcstride, dst, dststride
-mov r4d, r4m
-mova m5, [pw_32]
-
-%ifdef PIC
-lea r5, [tabw_ChromaCoeff]
-movddup m4, [r5 + r4 * 8]
-%else
-movddup m4, [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-FILTER_H4_w2_2_sse2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
-FILTER_H4_w2_2_sse2
-
-RET
+ mov r4d, r4m
+ mova m5, [pw_32]
+
+%ifdef PIC
+ lea r5, [tabw_ChromaCoeff]
+ movddup m4, [r5 + r4 * 8]
+%else
+ movddup m4, [tabw_ChromaCoeff + r4 * 8]
+%endif
+
+ FILTER_H4_w2_2_sse2
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
+ FILTER_H4_w2_2_sse2
+
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse3
cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6, src, srcstride, dst, dststride
-mov r4d, r4m
-mova m5, [pw_32]
-
-%ifdef PIC
-lea r5, [tabw_ChromaCoeff]
-movddup m4, [r5 + r4 * 8]
-%else
-movddup m4, [tabw_ChromaCoeff + r4 * 8]
+ mov r4d, r4m
+ mova m5, [pw_32]
+
+%ifdef PIC
+ lea r5, [tabw_ChromaCoeff]
+ movddup m4, [r5 + r4 * 8]
+%else
+ movddup m4, [tabw_ChromaCoeff + r4 * 8]
%endif
%assign x 1
%rep 4
-FILTER_H4_w2_2_sse2
+ FILTER_H4_w2_2_sse2
%if x < 4
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endif
%assign x x+1
%endrep
-RET
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse3
cglobal interp_4tap_horiz_pp_2x16, 4, 6, 6, src, srcstride, dst, dststride
-mov r4d, r4m
-mova m5, [pw_32]
-
-%ifdef PIC
-lea r5, [tabw_ChromaCoeff]
-movddup m4, [r5 + r4 * 8]
-%else
-movddup m4, [tabw_ChromaCoeff + r4 * 8]
+ mov r4d, r4m
+ mova m5, [pw_32]
+
+%ifdef PIC
+ lea r5, [tabw_ChromaCoeff]
+ movddup m4, [r5 + r4 * 8]
+%else
+ movddup m4, [tabw_ChromaCoeff + r4 * 8]
%endif
%assign x 1
%rep 8
-FILTER_H4_w2_2_sse2
+ FILTER_H4_w2_2_sse2
%if x < 8
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endif
%assign x x+1
%endrep
-RET
+ RET
%macro FILTER_H4_w4_2_sse2 0
pxor m5, m5
@@ -461,122 +461,122 @@
;-----------------------------------------------------------------------------
INIT_XMM sse3
cglobal interp_4tap_horiz_pp_4x2, 4, 6, 8, src, srcstride, dst, dststride
-mov r4d, r4m
-mova m7, [pw_32]
-
-%ifdef PIC
-lea r5, [tabw_ChromaCoeff]
-movddup m4, [r5 + r4 * 8]
-%else
-movddup m4, [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-FILTER_H4_w4_2_sse2
-
-RET
+ mov r4d, r4m
+ mova m7, [pw_32]
+
+%ifdef PIC
+ lea r5, [tabw_ChromaCoeff]
+ movddup m4, [r5 + r4 * 8]
+%else
+ movddup m4, [tabw_ChromaCoeff + r4 * 8]
+%endif
+
+ FILTER_H4_w4_2_sse2
+
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse3
cglobal interp_4tap_horiz_pp_4x4, 4, 6, 8, src, srcstride, dst, dststride
-mov r4d, r4m
-mova m7, [pw_32]
-
-%ifdef PIC
-lea r5, [tabw_ChromaCoeff]
-movddup m4, [r5 + r4 * 8]
-%else
-movddup m4, [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-FILTER_H4_w4_2_sse2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
-FILTER_H4_w4_2_sse2
-
-RET
+ mov r4d, r4m
+ mova m7, [pw_32]
+
+%ifdef PIC
+ lea r5, [tabw_ChromaCoeff]
+ movddup m4, [r5 + r4 * 8]
+%else
+ movddup m4, [tabw_ChromaCoeff + r4 * 8]
+%endif
+
+ FILTER_H4_w4_2_sse2
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
+ FILTER_H4_w4_2_sse2
+
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse3
cglobal interp_4tap_horiz_pp_4x8, 4, 6, 8, src, srcstride, dst, dststride
-mov r4d, r4m
-mova m7, [pw_32]
-
-%ifdef PIC
-lea r5, [tabw_ChromaCoeff]
-movddup m4, [r5 + r4 * 8]
-%else
-movddup m4, [tabw_ChromaCoeff + r4 * 8]
+ mov r4d, r4m
+ mova m7, [pw_32]
+
+%ifdef PIC
+ lea r5, [tabw_ChromaCoeff]
+ movddup m4, [r5 + r4 * 8]
+%else
+ movddup m4, [tabw_ChromaCoeff + r4 * 8]
%endif
%assign x 1
%rep 4
-FILTER_H4_w4_2_sse2
+ FILTER_H4_w4_2_sse2
%if x < 4
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endif
%assign x x+1
%endrep
-RET
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse3
cglobal interp_4tap_horiz_pp_4x16, 4, 6, 8, src, srcstride, dst, dststride
-mov r4d, r4m
-mova m7, [pw_32]
-
-%ifdef PIC
-lea r5, [tabw_ChromaCoeff]
-movddup m4, [r5 + r4 * 8]
-%else
-movddup m4, [tabw_ChromaCoeff + r4 * 8]
+ mov r4d, r4m
+ mova m7, [pw_32]
+
+%ifdef PIC
+ lea r5, [tabw_ChromaCoeff]
+ movddup m4, [r5 + r4 * 8]
+%else
+ movddup m4, [tabw_ChromaCoeff + r4 * 8]
%endif
%assign x 1
%rep 8
-FILTER_H4_w4_2_sse2
+ FILTER_H4_w4_2_sse2
%if x < 8
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endif
%assign x x+1
%endrep
-RET
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse3
cglobal interp_4tap_horiz_pp_4x32, 4, 6, 8, src, srcstride, dst, dststride
-mov r4d, r4m
-mova m7, [pw_32]
-
-%ifdef PIC
-lea r5, [tabw_ChromaCoeff]
-movddup m4, [r5 + r4 * 8]
-%else
-movddup m4, [tabw_ChromaCoeff + r4 * 8]
+ mov r4d, r4m
+ mova m7, [pw_32]
+
+%ifdef PIC
+ lea r5, [tabw_ChromaCoeff]
+ movddup m4, [r5 + r4 * 8]
+%else
+ movddup m4, [tabw_ChromaCoeff + r4 * 8]
%endif
%assign x 1
%rep 16
-FILTER_H4_w4_2_sse2
+ FILTER_H4_w4_2_sse2
%if x < 16
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endif
%assign x x+1
%endrep
-RET
+ RET
%macro FILTER_H4_w2_2 3
movh %2, [srcq - 1]
@@ -605,26 +605,26 @@
%define t1 m1
%define t0 m0
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd coef2, [r5 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_512]
+ mova Tm0, [tab_Tm]
%rep 2
-FILTER_H4_w2_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
+ FILTER_H4_w2_2 t0, t1, t2
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endrep
-RET
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -637,26 +637,26 @@
%define t1 m1
%define t0 m0
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd coef2, [r5 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_512]
+ mova Tm0, [tab_Tm]
%rep 4
-FILTER_H4_w2_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
+ FILTER_H4_w2_2 t0, t1, t2
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endrep
-RET
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -669,29 +669,29 @@
%define t1 m1
%define t0 m0
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-
-mov r5d, 16/2
-
-.loop:
-FILTER_H4_w2_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
-dec r5d
-jnz .loop
-
-RET
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd coef2, [r5 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_512]
+ mova Tm0, [tab_Tm]
+
+ mov r5d, 16/2
+
+.loop:
+ FILTER_H4_w2_2 t0, t1, t2
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
+ dec r5d
+ jnz .loop
+
+ RET
%macro FILTER_H4_w4_2 3
movh %2, [srcq - 1]
@@ -719,22 +719,22 @@
%define t1 m1
%define t0 m0
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-
-FILTER_H4_w4_2 t0, t1, t2
-
-RET
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd coef2, [r5 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_512]
+ mova Tm0, [tab_Tm]
+
+ FILTER_H4_w4_2 t0, t1, t2
+
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -747,26 +747,26 @@
%define t1 m1
%define t0 m0
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd coef2, [r5 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_512]
+ mova Tm0, [tab_Tm]
%rep 2
-FILTER_H4_w4_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
+ FILTER_H4_w4_2 t0, t1, t2
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endrep
-RET
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -779,26 +779,26 @@
%define t1 m1
%define t0 m0
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd coef2, [r5 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_512]
+ mova Tm0, [tab_Tm]
%rep 4
-FILTER_H4_w4_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
+ FILTER_H4_w4_2 t0, t1, t2
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endrep
-RET
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -811,26 +811,26 @@
%define t1 m1
%define t0 m0
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd coef2, [r5 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_512]
+ mova Tm0, [tab_Tm]
%rep 8
-FILTER_H4_w4_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
+ FILTER_H4_w4_2 t0, t1, t2
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endrep
-RET
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -843,29 +843,29 @@
%define t1 m1
%define t0 m0
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-
-mov r5d, 32/2
-
-.loop:
-FILTER_H4_w4_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
-dec r5d
-jnz .loop
-
-RET
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd coef2, [r5 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_512]
+ mova Tm0, [tab_Tm]
+
+ mov r5d, 32/2
+
+.loop:
+ FILTER_H4_w4_2 t0, t1, t2
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
+ dec r5d
+ jnz .loop
+
+ RET
ALIGN 32
const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
@@ -1041,47 +1041,47 @@
%define t1 m1
%define t0 m0
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-mov r5d, %2
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-mova Tm1, [tab_Tm + 16]
-
-.loop:
-FILTER_H4_w%1 t0, t1, t2
-add srcq, srcstrideq
-add dstq, dststrideq
-
-dec r5d
-jnz .loop
-
-RET
-%endmacro
-
-
-IPFILTER_CHROMA 6, 8
-IPFILTER_CHROMA 8, 2
-IPFILTER_CHROMA 8, 4
-IPFILTER_CHROMA 8, 6
-IPFILTER_CHROMA 8, 8
-IPFILTER_CHROMA 8, 16
-IPFILTER_CHROMA 8, 32
-IPFILTER_CHROMA 12, 16
-
-IPFILTER_CHROMA 6, 16
-IPFILTER_CHROMA 8, 12
-IPFILTER_CHROMA 8, 64
-IPFILTER_CHROMA 12, 32
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd coef2, [r5 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ mov r5d, %2
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_512]
+ mova Tm0, [tab_Tm]
+ mova Tm1, [tab_Tm + 16]
+
+.loop:
+ FILTER_H4_w%1 t0, t1, t2
+ add srcq, srcstrideq
+ add dstq, dststrideq
+
+ dec r5d
+ jnz .loop
+
+ RET
+%endmacro
+
+
+ IPFILTER_CHROMA 6, 8
+ IPFILTER_CHROMA 8, 2
+ IPFILTER_CHROMA 8, 4
+ IPFILTER_CHROMA 8, 6
+ IPFILTER_CHROMA 8, 8
+ IPFILTER_CHROMA 8, 16
+ IPFILTER_CHROMA 8, 32
+ IPFILTER_CHROMA 12, 16
+
+ IPFILTER_CHROMA 6, 16
+ IPFILTER_CHROMA 8, 12
+ IPFILTER_CHROMA 8, 64
+ IPFILTER_CHROMA 12, 32
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -1097,55 +1097,55 @@
%define t1 m1
%define t0 m0
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-mov r5d, %2
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-mova Tm1, [tab_Tm + 16]
-
-.loop:
-FILTER_H4_w%1 t0, t1, t2, t3
-add srcq, srcstrideq
-add dstq, dststrideq
-
-dec r5d
-jnz .loop
-
-RET
-%endmacro
-
-IPFILTER_CHROMA_W 16, 4
-IPFILTER_CHROMA_W 16, 8
-IPFILTER_CHROMA_W 16, 12
-IPFILTER_CHROMA_W 16, 16
-IPFILTER_CHROMA_W 16, 32
-IPFILTER_CHROMA_W 32, 8
-IPFILTER_CHROMA_W 32, 16
-IPFILTER_CHROMA_W 32, 24
-IPFILTER_CHROMA_W 24, 32
-IPFILTER_CHROMA_W 32, 32
-
-IPFILTER_CHROMA_W 16, 24
-IPFILTER_CHROMA_W 16, 64
-IPFILTER_CHROMA_W 32, 48
-IPFILTER_CHROMA_W 24, 64
-IPFILTER_CHROMA_W 32, 64
-
-IPFILTER_CHROMA_W 64, 64
-IPFILTER_CHROMA_W 64, 32
-IPFILTER_CHROMA_W 64, 48
-IPFILTER_CHROMA_W 48, 64
-IPFILTER_CHROMA_W 64, 16
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd coef2, [r5 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ mov r5d, %2
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_512]
+ mova Tm0, [tab_Tm]
+ mova Tm1, [tab_Tm + 16]
+
+.loop:
+ FILTER_H4_w%1 t0, t1, t2, t3
+ add srcq, srcstrideq
+ add dstq, dststrideq
+
+ dec r5d
+ jnz .loop
+
+ RET
+%endmacro
+
+ IPFILTER_CHROMA_W 16, 4
+ IPFILTER_CHROMA_W 16, 8
+ IPFILTER_CHROMA_W 16, 12
+ IPFILTER_CHROMA_W 16, 16
+ IPFILTER_CHROMA_W 16, 32
+ IPFILTER_CHROMA_W 32, 8
+ IPFILTER_CHROMA_W 32, 16
+ IPFILTER_CHROMA_W 32, 24
+ IPFILTER_CHROMA_W 24, 32
+ IPFILTER_CHROMA_W 32, 32
+
+ IPFILTER_CHROMA_W 16, 24
+ IPFILTER_CHROMA_W 16, 64
+ IPFILTER_CHROMA_W 32, 48
+ IPFILTER_CHROMA_W 24, 64
+ IPFILTER_CHROMA_W 32, 64
+
+ IPFILTER_CHROMA_W 64, 64
+ IPFILTER_CHROMA_W 64, 32
+ IPFILTER_CHROMA_W 64, 48
+ IPFILTER_CHROMA_W 48, 64
+ IPFILTER_CHROMA_W 64, 16
%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst
@@ -1397,8 +1397,8 @@
%endif
%endmacro
-FILTER_HORIZ_LUMA_AVX2_4xN 8
-FILTER_HORIZ_LUMA_AVX2_4xN 16
+ FILTER_HORIZ_LUMA_AVX2_4xN 8
+ FILTER_HORIZ_LUMA_AVX2_4xN 16
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7
@@ -1548,9 +1548,9 @@
RET
%endmacro
-IPFILTER_LUMA_AVX2_8xN 8, 8
-IPFILTER_LUMA_AVX2_8xN 8, 16
-IPFILTER_LUMA_AVX2_8xN 8, 32
+ IPFILTER_LUMA_AVX2_8xN 8, 8
+ IPFILTER_LUMA_AVX2_8xN 8, 16
+ IPFILTER_LUMA_AVX2_8xN 8, 32
%macro IPFILTER_LUMA_AVX2 2
INIT_YMM avx2
@@ -2713,10 +2713,10 @@
%endif
%endmacro ; IPFILTER_LUMA_PS_8xN_AVX2
-IPFILTER_LUMA_PS_8xN_AVX2 4
-IPFILTER_LUMA_PS_8xN_AVX2 8
-IPFILTER_LUMA_PS_8xN_AVX2 16
-IPFILTER_LUMA_PS_8xN_AVX2 32
+ IPFILTER_LUMA_PS_8xN_AVX2 4
+ IPFILTER_LUMA_PS_8xN_AVX2 8
+ IPFILTER_LUMA_PS_8xN_AVX2 16
+ IPFILTER_LUMA_PS_8xN_AVX2 32
%macro IPFILTER_LUMA_PS_16x_AVX2 2
@@ -2778,17 +2778,17 @@
dec r9d
jnz .label
-RET
-%endif
-%endmacro
-
-
-IPFILTER_LUMA_PS_16x_AVX2 16 , 16
-IPFILTER_LUMA_PS_16x_AVX2 16 , 8
-IPFILTER_LUMA_PS_16x_AVX2 16 , 12
-IPFILTER_LUMA_PS_16x_AVX2 16 , 4
-IPFILTER_LUMA_PS_16x_AVX2 16 , 32
-IPFILTER_LUMA_PS_16x_AVX2 16 , 64
+ RET
+%endif
+%endmacro
+
+
+ IPFILTER_LUMA_PS_16x_AVX2 16 , 16
+ IPFILTER_LUMA_PS_16x_AVX2 16 , 8
+ IPFILTER_LUMA_PS_16x_AVX2 16 , 12
+ IPFILTER_LUMA_PS_16x_AVX2 16 , 4
+ IPFILTER_LUMA_PS_16x_AVX2 16 , 32
+ IPFILTER_LUMA_PS_16x_AVX2 16 , 64
;--------------------------------------------------------------------------------------------------------------
@@ -2839,27 +2839,27 @@
RET
%endmacro
-IPFILTER_LUMA_PP_W8 8, 4
-IPFILTER_LUMA_PP_W8 8, 8
-IPFILTER_LUMA_PP_W8 8, 16
-IPFILTER_LUMA_PP_W8 8, 32
-IPFILTER_LUMA_PP_W8 16, 4
-IPFILTER_LUMA_PP_W8 16, 8
-IPFILTER_LUMA_PP_W8 16, 12
-IPFILTER_LUMA_PP_W8 16, 16
-IPFILTER_LUMA_PP_W8 16, 32
-IPFILTER_LUMA_PP_W8 16, 64
-IPFILTER_LUMA_PP_W8 24, 32
-IPFILTER_LUMA_PP_W8 32, 8
-IPFILTER_LUMA_PP_W8 32, 16
-IPFILTER_LUMA_PP_W8 32, 24
-IPFILTER_LUMA_PP_W8 32, 32
-IPFILTER_LUMA_PP_W8 32, 64
-IPFILTER_LUMA_PP_W8 48, 64
-IPFILTER_LUMA_PP_W8 64, 16
-IPFILTER_LUMA_PP_W8 64, 32
-IPFILTER_LUMA_PP_W8 64, 48
-IPFILTER_LUMA_PP_W8 64, 64
+ IPFILTER_LUMA_PP_W8 8, 4
+ IPFILTER_LUMA_PP_W8 8, 8
+ IPFILTER_LUMA_PP_W8 8, 16
+ IPFILTER_LUMA_PP_W8 8, 32
+ IPFILTER_LUMA_PP_W8 16, 4
+ IPFILTER_LUMA_PP_W8 16, 8
+ IPFILTER_LUMA_PP_W8 16, 12
+ IPFILTER_LUMA_PP_W8 16, 16
+ IPFILTER_LUMA_PP_W8 16, 32
+ IPFILTER_LUMA_PP_W8 16, 64
+ IPFILTER_LUMA_PP_W8 24, 32
+ IPFILTER_LUMA_PP_W8 32, 8
+ IPFILTER_LUMA_PP_W8 32, 16
+ IPFILTER_LUMA_PP_W8 32, 24
+ IPFILTER_LUMA_PP_W8 32, 32
+ IPFILTER_LUMA_PP_W8 32, 64
+ IPFILTER_LUMA_PP_W8 48, 64
+ IPFILTER_LUMA_PP_W8 64, 16
+ IPFILTER_LUMA_PP_W8 64, 32
+ IPFILTER_LUMA_PP_W8 64, 48
+ IPFILTER_LUMA_PP_W8 64, 64
;----------------------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
@@ -3024,71 +3024,71 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_2x4, 4, 6, 8
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-lea r4, [r1 * 3]
-lea r5, [r0 + 4 * r1]
-pshufb m0, [tab_Cm]
-mova m1, [pw_512]
-
-movd m2, [r0]
-movd m3, [r0 + r1]
-movd m4, [r0 + 2 * r1]
-movd m5, [r0 + r4]
-
-punpcklbw m2, m3
-punpcklbw m6, m4, m5
-punpcklbw m2, m6
-
-pmaddubsw m2, m0
-
-movd m6, [r5]
-
-punpcklbw m3, m4
-punpcklbw m7, m5, m6
-punpcklbw m3, m7
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-pmulhrsw m2, m1
-
-movd m7, [r5 + r1]
-
-punpcklbw m4, m5
-punpcklbw m3, m6, m7
-punpcklbw m4, m3
-
-pmaddubsw m4, m0
-
-movd m3, [r5 + 2 * r1]
-
-punpcklbw m5, m6
-punpcklbw m7, m3
-punpcklbw m5, m7
-
-pmaddubsw m5, m0
-
-phaddw m4, m5
-
-pmulhrsw m4, m1
-packuswb m2, m4
-
-pextrw [r2], m2, 0
-pextrw [r2 + r3], m2, 2
-lea r2, [r2 + 2 * r3]
-pextrw [r2], m2, 4
-pextrw [r2 + r3], m2, 6
-
-RET
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+ lea r4, [r1 * 3]
+ lea r5, [r0 + 4 * r1]
+ pshufb m0, [tab_Cm]
+ mova m1, [pw_512]
+
+ movd m2, [r0]
+ movd m3, [r0 + r1]
+ movd m4, [r0 + 2 * r1]
+ movd m5, [r0 + r4]
+
+ punpcklbw m2, m3
+ punpcklbw m6, m4, m5
+ punpcklbw m2, m6
+
+ pmaddubsw m2, m0
+
+ movd m6, [r5]
+
+ punpcklbw m3, m4
+ punpcklbw m7, m5, m6
+ punpcklbw m3, m7
+
+ pmaddubsw m3, m0
+
+ phaddw m2, m3
+
+ pmulhrsw m2, m1
+
+ movd m7, [r5 + r1]
+
+ punpcklbw m4, m5
+ punpcklbw m3, m6, m7
+ punpcklbw m4, m3
+
+ pmaddubsw m4, m0
+
+ movd m3, [r5 + 2 * r1]
+
+ punpcklbw m5, m6
+ punpcklbw m7, m3
+ punpcklbw m5, m7
+
+ pmaddubsw m5, m0
+
+ phaddw m4, m5
+
+ pmulhrsw m4, m1
+ packuswb m2, m4
+
+ pextrw [r2], m2, 0
+ pextrw [r2 + r3], m2, 2
+ lea r2, [r2 + 2 * r3]
+ pextrw [r2], m2, 4
+ pextrw [r2 + r3], m2, 6
+
+ RET
%macro FILTER_VER_CHROMA_AVX2_2x4 1
INIT_YMM avx2
@@ -3141,8 +3141,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_2x4 pp
-FILTER_VER_CHROMA_AVX2_2x4 ps
+ FILTER_VER_CHROMA_AVX2_2x4 pp
+ FILTER_VER_CHROMA_AVX2_2x4 ps
%macro FILTER_VER_CHROMA_AVX2_2x8 1
INIT_YMM avx2
@@ -3213,8 +3213,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_2x8 pp
-FILTER_VER_CHROMA_AVX2_2x8 ps
+ FILTER_VER_CHROMA_AVX2_2x8 pp
+ FILTER_VER_CHROMA_AVX2_2x8 ps
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -3223,85 +3223,85 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m0, [tab_Cm]
-
-mova m1, [pw_512]
-
-mov r4d, %2
-lea r5, [3 * r1]
-
-.loop:
-movd m2, [r0]
-movd m3, [r0 + r1]
-movd m4, [r0 + 2 * r1]
-movd m5, [r0 + r5]
-
-punpcklbw m2, m3
-punpcklbw m6, m4, m5
-punpcklbw m2, m6
-
-pmaddubsw m2, m0
-
-lea r0, [r0 + 4 * r1]
-movd m6, [r0]
-
-punpcklbw m3, m4
-punpcklbw m7, m5, m6
-punpcklbw m3, m7
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-pmulhrsw m2, m1
-
-movd m7, [r0 + r1]
-
-punpcklbw m4, m5
-punpcklbw m3, m6, m7
-punpcklbw m4, m3
-
-pmaddubsw m4, m0
-
-movd m3, [r0 + 2 * r1]
-
-punpcklbw m5, m6
-punpcklbw m7, m3
-punpcklbw m5, m7
-
-pmaddubsw m5, m0
-
-phaddw m4, m5
-
-pmulhrsw m4, m1
-packuswb m2, m4
-
-pextrw [r2], m2, 0
-pextrw [r2 + r3], m2, 2
-lea r2, [r2 + 2 * r3]
-pextrw [r2], m2, 4
-pextrw [r2 + r3], m2, 6
-
-lea r2, [r2 + 2 * r3]
-
-sub r4, 4
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W2_H4 2, 8
-
-FILTER_V4_W2_H4 2, 16
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m0, [tab_Cm]
+
+ mova m1, [pw_512]
+
+ mov r4d, %2
+ lea r5, [3 * r1]
+
+.loop:
+ movd m2, [r0]
+ movd m3, [r0 + r1]
+ movd m4, [r0 + 2 * r1]
+ movd m5, [r0 + r5]
+
+ punpcklbw m2, m3
+ punpcklbw m6, m4, m5
+ punpcklbw m2, m6
+
+ pmaddubsw m2, m0
+
+ lea r0, [r0 + 4 * r1]
+ movd m6, [r0]
+
+ punpcklbw m3, m4
+ punpcklbw m7, m5, m6
+ punpcklbw m3, m7
+
+ pmaddubsw m3, m0
+
+ phaddw m2, m3
+
+ pmulhrsw m2, m1
+
+ movd m7, [r0 + r1]
+
+ punpcklbw m4, m5
+ punpcklbw m3, m6, m7
+ punpcklbw m4, m3
+
+ pmaddubsw m4, m0
+
+ movd m3, [r0 + 2 * r1]
+
+ punpcklbw m5, m6
+ punpcklbw m7, m3
+ punpcklbw m5, m7
+
+ pmaddubsw m5, m0
+
+ phaddw m4, m5
+
+ pmulhrsw m4, m1
+ packuswb m2, m4
+
+ pextrw [r2], m2, 0
+ pextrw [r2 + r3], m2, 2
+ lea r2, [r2 + 2 * r3]
+ pextrw [r2], m2, 4
+ pextrw [r2 + r3], m2, 6
+
+ lea r2, [r2 + 2 * r3]
+
+ sub r4, 4
+ jnz .loop
+ RET
+%endmacro
+
+ FILTER_V4_W2_H4 2, 8
+
+ FILTER_V4_W2_H4 2, 16
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -3309,46 +3309,46 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_4x2, 4, 6, 6
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m0, [tab_Cm]
-lea r5, [r0 + 2 * r1]
-
-movd m2, [r0]
-movd m3, [r0 + r1]
-movd m4, [r5]
-movd m5, [r5 + r1]
-
-punpcklbw m2, m3
-punpcklbw m1, m4, m5
-punpcklbw m2, m1
-
-pmaddubsw m2, m0
-
-movd m1, [r0 + 4 * r1]
-
-punpcklbw m3, m4
-punpcklbw m5, m1
-punpcklbw m3, m5
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-pmulhrsw m2, [pw_512]
-packuswb m2, m2
-movd [r2], m2
-pextrd [r2 + r3], m2, 1
-
-RET
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m0, [tab_Cm]
+ lea r5, [r0 + 2 * r1]
+
+ movd m2, [r0]
+ movd m3, [r0 + r1]
+ movd m4, [r5]
+ movd m5, [r5 + r1]
+
+ punpcklbw m2, m3
+ punpcklbw m1, m4, m5
+ punpcklbw m2, m1
+
+ pmaddubsw m2, m0
+
+ movd m1, [r0 + 4 * r1]
+
+ punpcklbw m3, m4
+ punpcklbw m5, m1
+ punpcklbw m3, m5
+
+ pmaddubsw m3, m0
+
+ phaddw m2, m3
+
+ pmulhrsw m2, [pw_512]
+ packuswb m2, m2
+ movd [r2], m2
+ pextrd [r2 + r3], m2, 1
+
+ RET
%macro FILTER_VER_CHROMA_AVX2_4x2 1
INIT_YMM avx2
@@ -3396,8 +3396,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_4x2 pp
-FILTER_VER_CHROMA_AVX2_4x2 ps
+ FILTER_VER_CHROMA_AVX2_4x2 pp
+ FILTER_VER_CHROMA_AVX2_4x2 ps
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -3405,71 +3405,71 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_4x4, 4, 6, 8
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m0, [tab_Cm]
-mova m1, [pw_512]
-lea r5, [r0 + 4 * r1]
-lea r4, [r1 * 3]
-
-movd m2, [r0]
-movd m3, [r0 + r1]
-movd m4, [r0 + 2 * r1]
-movd m5, [r0 + r4]
-
-punpcklbw m2, m3
-punpcklbw m6, m4, m5
-punpcklbw m2, m6
-
-pmaddubsw m2, m0
-
-movd m6, [r5]
-
-punpcklbw m3, m4
-punpcklbw m7, m5, m6
-punpcklbw m3, m7
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-pmulhrsw m2, m1
-
-movd m7, [r5 + r1]
-
-punpcklbw m4, m5
-punpcklbw m3, m6, m7
-punpcklbw m4, m3
-
-pmaddubsw m4, m0
-
-movd m3, [r5 + 2 * r1]
-
-punpcklbw m5, m6
-punpcklbw m7, m3
-punpcklbw m5, m7
-
-pmaddubsw m5, m0
-
-phaddw m4, m5
-
-pmulhrsw m4, m1
-
-packuswb m2, m4
-movd [r2], m2
-pextrd [r2 + r3], m2, 1
-lea r2, [r2 + 2 * r3]
-pextrd [r2], m2, 2
-pextrd [r2 + r3], m2, 3
-RET
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m0, [tab_Cm]
+ mova m1, [pw_512]
+ lea r5, [r0 + 4 * r1]
+ lea r4, [r1 * 3]
+
+ movd m2, [r0]
+ movd m3, [r0 + r1]
+ movd m4, [r0 + 2 * r1]
+ movd m5, [r0 + r4]
+
+ punpcklbw m2, m3
+ punpcklbw m6, m4, m5
+ punpcklbw m2, m6
+
+ pmaddubsw m2, m0
+
+ movd m6, [r5]
+
+ punpcklbw m3, m4
+ punpcklbw m7, m5, m6
+ punpcklbw m3, m7
+
+ pmaddubsw m3, m0
+
+ phaddw m2, m3
+
+ pmulhrsw m2, m1
+
+ movd m7, [r5 + r1]
+
+ punpcklbw m4, m5
+ punpcklbw m3, m6, m7
+ punpcklbw m4, m3
+
+ pmaddubsw m4, m0
+
+ movd m3, [r5 + 2 * r1]
+
+ punpcklbw m5, m6
+ punpcklbw m7, m3
+ punpcklbw m5, m7
+
+ pmaddubsw m5, m0
+
+ phaddw m4, m5
+
+ pmulhrsw m4, m1
+
+ packuswb m2, m4
+ movd [r2], m2
+ pextrd [r2 + r3], m2, 1
+ lea r2, [r2 + 2 * r3]
+ pextrd [r2], m2, 2
+ pextrd [r2 + r3], m2, 3
+ RET
%macro FILTER_VER_CHROMA_AVX2_4x4 1
INIT_YMM avx2
cglobal interp_4tap_vert_%1_4x4, 4, 6, 3
@@ -3527,8 +3527,8 @@
%endif
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_4x4 pp
-FILTER_VER_CHROMA_AVX2_4x4 ps
+ FILTER_VER_CHROMA_AVX2_4x4 pp
+ FILTER_VER_CHROMA_AVX2_4x4 ps
%macro FILTER_VER_CHROMA_AVX2_4x8 1
INIT_YMM avx2
@@ -3614,8 +3614,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_4x8 pp
-FILTER_VER_CHROMA_AVX2_4x8 ps
+ FILTER_VER_CHROMA_AVX2_4x8 pp
+ FILTER_VER_CHROMA_AVX2_4x8 ps
%macro FILTER_VER_CHROMA_AVX2_4x16 1
INIT_YMM avx2
@@ -3759,8 +3759,8 @@
%endif
%endmacro
-FILTER_VER_CHROMA_AVX2_4x16 pp
-FILTER_VER_CHROMA_AVX2_4x16 ps
+ FILTER_VER_CHROMA_AVX2_4x16 pp
+ FILTER_VER_CHROMA_AVX2_4x16 ps
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -3769,184 +3769,184 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m0, [tab_Cm]
-
-mova m1, [pw_512]
-
-mov r4d, %2
-
-lea r5, [3 * r1]
-
-.loop:
-movd m2, [r0]
-movd m3, [r0 + r1]
-movd m4, [r0 + 2 * r1]
-movd m5, [r0 + r5]
-
-punpcklbw m2, m3
-punpcklbw m6, m4, m5
-punpcklbw m2, m6
-
-pmaddubsw m2, m0
-
-lea r0, [r0 + 4 * r1]
-movd m6, [r0]
-
-punpcklbw m3, m4
-punpcklbw m7, m5, m6
-punpcklbw m3, m7
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-pmulhrsw m2, m1
-
-movd m7, [r0 + r1]
-
-punpcklbw m4, m5
-punpcklbw m3, m6, m7
-punpcklbw m4, m3
-
-pmaddubsw m4, m0
-
-movd m3, [r0 + 2 * r1]
-
-punpcklbw m5, m6
-punpcklbw m7, m3
-punpcklbw m5, m7
-
-pmaddubsw m5, m0
-
-phaddw m4, m5
-
-pmulhrsw m4, m1
-packuswb m2, m4
-movd [r2], m2
-pextrd [r2 + r3], m2, 1
-lea r2, [r2 + 2 * r3]
-pextrd [r2], m2, 2
-pextrd [r2 + r3], m2, 3
-
-lea r2, [r2 + 2 * r3]
-
-sub r4, 4
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W4_H4 4, 8
-FILTER_V4_W4_H4 4, 16
-
-FILTER_V4_W4_H4 4, 32
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m0, [tab_Cm]
+
+ mova m1, [pw_512]
+
+ mov r4d, %2
+
+ lea r5, [3 * r1]
+
+.loop:
+ movd m2, [r0]
+ movd m3, [r0 + r1]
+ movd m4, [r0 + 2 * r1]
+ movd m5, [r0 + r5]
+
+ punpcklbw m2, m3
+ punpcklbw m6, m4, m5
+ punpcklbw m2, m6
+
+ pmaddubsw m2, m0
+
+ lea r0, [r0 + 4 * r1]
+ movd m6, [r0]
+
+ punpcklbw m3, m4
+ punpcklbw m7, m5, m6
+ punpcklbw m3, m7
+
+ pmaddubsw m3, m0
+
+ phaddw m2, m3
+
+ pmulhrsw m2, m1
+
+ movd m7, [r0 + r1]
+
+ punpcklbw m4, m5
+ punpcklbw m3, m6, m7
+ punpcklbw m4, m3
+
+ pmaddubsw m4, m0
+
+ movd m3, [r0 + 2 * r1]
+
+ punpcklbw m5, m6
+ punpcklbw m7, m3
+ punpcklbw m5, m7
+
+ pmaddubsw m5, m0
+
+ phaddw m4, m5
+
+ pmulhrsw m4, m1
+ packuswb m2, m4
+ movd [r2], m2
+ pextrd [r2 + r3], m2, 1
+ lea r2, [r2 + 2 * r3]
+ pextrd [r2], m2, 2
+ pextrd [r2 + r3], m2, 3
+
+ lea r2, [r2 + 2 * r3]
+
+ sub r4, 4
+ jnz .loop
+ RET
+%endmacro
+
+ FILTER_V4_W4_H4 4, 8
+ FILTER_V4_W4_H4 4, 16
+
+ FILTER_V4_W4_H4 4, 32
%macro FILTER_V4_W8_H2 0
-punpcklbw m1, m2
-punpcklbw m7, m3, m0
-
-pmaddubsw m1, m6
-pmaddubsw m7, m5
-
-paddw m1, m7
-
-pmulhrsw m1, m4
-packuswb m1, m1
+ punpcklbw m1, m2
+ punpcklbw m7, m3, m0
+
+ pmaddubsw m1, m6
+ pmaddubsw m7, m5
+
+ paddw m1, m7
+
+ pmulhrsw m1, m4
+ packuswb m1, m1
%endmacro
%macro FILTER_V4_W8_H3 0
-punpcklbw m2, m3
-punpcklbw m7, m0, m1
-
-pmaddubsw m2, m6
-pmaddubsw m7, m5
-
-paddw m2, m7
-
-pmulhrsw m2, m4
-packuswb m2, m2
+ punpcklbw m2, m3
+ punpcklbw m7, m0, m1
+
+ pmaddubsw m2, m6
+ pmaddubsw m7, m5
+
+ paddw m2, m7
+
+ pmulhrsw m2, m4
+ packuswb m2, m2
%endmacro
%macro FILTER_V4_W8_H4 0
-punpcklbw m3, m0
-punpcklbw m7, m1, m2
-
-pmaddubsw m3, m6
-pmaddubsw m7, m5
-
-paddw m3, m7
-
-pmulhrsw m3, m4
-packuswb m3, m3
+ punpcklbw m3, m0
+ punpcklbw m7, m1, m2
+
+ pmaddubsw m3, m6
+ pmaddubsw m7, m5
+
+ paddw m3, m7
+
+ pmulhrsw m3, m4
+ packuswb m3, m3
%endmacro
%macro FILTER_V4_W8_H5 0
-punpcklbw m0, m1
-punpcklbw m7, m2, m3
-
-pmaddubsw m0, m6
-pmaddubsw m7, m5
-
-paddw m0, m7
-
-pmulhrsw m0, m4
-packuswb m0, m0
+ punpcklbw m0, m1
+ punpcklbw m7, m2, m3
+
+ pmaddubsw m0, m6
+ pmaddubsw m7, m5
+
+ paddw m0, m7
+
+ pmulhrsw m0, m4
+ packuswb m0, m0
%endmacro
%macro FILTER_V4_W8_8x2 2
-FILTER_V4_W8 %1, %2
-movq m0, [r0 + 4 * r1]
-
-FILTER_V4_W8_H2
-
-movh [r2 + r3], m1
+ FILTER_V4_W8 %1, %2
+ movq m0, [r0 + 4 * r1]
+
+ FILTER_V4_W8_H2
+
+ movh [r2 + r3], m1
%endmacro
%macro FILTER_V4_W8_8x4 2
-FILTER_V4_W8_8x2 %1, %2
+ FILTER_V4_W8_8x2 %1, %2
;8x3
-lea r6, [r0 + 4 * r1]
-movq m1, [r6 + r1]
-
-FILTER_V4_W8_H3
-
-movh [r2 + 2 * r3], m2
+ lea r6, [r0 + 4 * r1]
+ movq m1, [r6 + r1]
+
+ FILTER_V4_W8_H3
+
+ movh [r2 + 2 * r3], m2
;8x4
-movq m2, [r6 + 2 * r1]
-
-FILTER_V4_W8_H4
-
-lea r5, [r2 + 2 * r3]
-movh [r5 + r3], m3
+ movq m2, [r6 + 2 * r1]
+
+ FILTER_V4_W8_H4
+
+ lea r5, [r2 + 2 * r3]
+ movh [r5 + r3], m3
%endmacro
%macro FILTER_V4_W8_8x6 2
-FILTER_V4_W8_8x4 %1, %2
+ FILTER_V4_W8_8x4 %1, %2
;8x5
-lea r6, [r6 + 2 * r1]
-movq m3, [r6 + r1]
-
-FILTER_V4_W8_H5
-
-movh [r2 + 4 * r3], m0
+ lea r6, [r6 + 2 * r1]
+ movq m3, [r6 + r1]
+
+ FILTER_V4_W8_H5
+
+ movh [r2 + 4 * r3], m0
;8x6
-movq m0, [r0 + 8 * r1]
-
-FILTER_V4_W8_H2
-
-lea r5, [r2 + 4 * r3]
-movh [r5 + r3], m1
+ movq m0, [r0 + 8 * r1]
+
+ FILTER_V4_W8_H2
+
+ lea r5, [r2 + 4 * r3]
+ movh [r5 + r3], m1
%endmacro
;-----------------------------------------------------------------------------
@@ -3956,60 +3956,60 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
-mov r4d, r4m
-
-sub r0, r1
-movq m0, [r0]
-movq m1, [r0 + r1]
-movq m2, [r0 + 2 * r1]
-lea r5, [r0 + 2 * r1]
-movq m3, [r5 + r1]
-
-punpcklbw m0, m1
-punpcklbw m4, m2, m3
-
-%ifdef PIC
-lea r6, [tab_ChromaCoeff]
-movd m5, [r6 + r4 * 4]
-%else
-movd m5, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m6, m5, [tab_Vm]
-pmaddubsw m0, m6
-
-pshufb m5, [tab_Vm + 16]
-pmaddubsw m4, m5
-
-paddw m0, m4
-
-mova m4, [pw_512]
-
-pmulhrsw m0, m4
-packuswb m0, m0
-movh [r2], m0
+ mov r4d, r4m
+
+ sub r0, r1
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ movq m2, [r0 + 2 * r1]
+ lea r5, [r0 + 2 * r1]
+ movq m3, [r5 + r1]
+
+ punpcklbw m0, m1
+ punpcklbw m4, m2, m3
+
+%ifdef PIC
+ lea r6, [tab_ChromaCoeff]
+ movd m5, [r6 + r4 * 4]
+%else
+ movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m6, m5, [tab_Vm]
+ pmaddubsw m0, m6
+
+ pshufb m5, [tab_Vm + 16]
+ pmaddubsw m4, m5
+
+ paddw m0, m4
+
+ mova m4, [pw_512]
+
+ pmulhrsw m0, m4
+ packuswb m0, m0
+ movh [r2], m0
%endmacro
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
-FILTER_V4_W8_8x2 8, 2
-
-RET
+ FILTER_V4_W8_8x2 8, 2
+
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
-FILTER_V4_W8_8x4 8, 4
-
-RET
+ FILTER_V4_W8_8x4 8, 4
+
+ RET
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
-FILTER_V4_W8_8x6 8, 6
-
-RET
+ FILTER_V4_W8_8x6 8, 6
+
+ RET
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -4017,46 +4017,46 @@
INIT_XMM sse4
cglobal interp_4tap_vert_ps_4x2, 4, 6, 6
-mov r4d, r4m
-sub r0, r1
-add r3d, r3d
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m0, [tab_Cm]
-
-movd m2, [r0]
-movd m3, [r0 + r1]
-lea r5, [r0 + 2 * r1]
-movd m4, [r5]
-movd m5, [r5 + r1]
-
-punpcklbw m2, m3
-punpcklbw m1, m4, m5
-punpcklbw m2, m1
-
-pmaddubsw m2, m0
-
-movd m1, [r0 + 4 * r1]
-
-punpcklbw m3, m4
-punpcklbw m5, m1
-punpcklbw m3, m5
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-psubw m2, [pw_2000]
-movh [r2], m2
-movhps [r2 + r3], m2
-
-RET
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m0, [tab_Cm]
+
+ movd m2, [r0]
+ movd m3, [r0 + r1]
+ lea r5, [r0 + 2 * r1]
+ movd m4, [r5]
+ movd m5, [r5 + r1]
+
+ punpcklbw m2, m3
+ punpcklbw m1, m4, m5
+ punpcklbw m2, m1
+
+ pmaddubsw m2, m0
+
+ movd m1, [r0 + 4 * r1]
+
+ punpcklbw m3, m4
+ punpcklbw m5, m1
+ punpcklbw m3, m5
+
+ pmaddubsw m3, m0
+
+ phaddw m2, m3
+
+ psubw m2, [pw_2000]
+ movh [r2], m2
+ movhps [r2 + r3], m2
+
+ RET
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -4214,10 +4214,10 @@
RET
%endmacro
-FILTER_V_PS_W4_H4 4, 8
-FILTER_V_PS_W4_H4 4, 16
-
-FILTER_V_PS_W4_H4 4, 32
+ FILTER_V_PS_W4_H4 4, 8
+ FILTER_V_PS_W4_H4 4, 16
+
+ FILTER_V_PS_W4_H4 4, 32
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -4283,12 +4283,12 @@
RET
%endmacro
-FILTER_V_PS_W8_H8_H16_H2 8, 2
-FILTER_V_PS_W8_H8_H16_H2 8, 4
-FILTER_V_PS_W8_H8_H16_H2 8, 6
-
-FILTER_V_PS_W8_H8_H16_H2 8, 12
-FILTER_V_PS_W8_H8_H16_H2 8, 64
+ FILTER_V_PS_W8_H8_H16_H2 8, 2
+ FILTER_V_PS_W8_H8_H16_H2 8, 4
+ FILTER_V_PS_W8_H8_H16_H2 8, 6
+
+ FILTER_V_PS_W8_H8_H16_H2 8, 12
+ FILTER_V_PS_W8_H8_H16_H2 8, 64
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -4378,9 +4378,9 @@
RET
%endmacro
-FILTER_V_PS_W8_H8_H16_H32 8, 8
-FILTER_V_PS_W8_H8_H16_H32 8, 16
-FILTER_V_PS_W8_H8_H16_H32 8, 32
+ FILTER_V_PS_W8_H8_H16_H32 8, 8
+ FILTER_V_PS_W8_H8_H16_H32 8, 16
+ FILTER_V_PS_W8_H8_H16_H32 8, 32
;------------------------------------------------------------------------------------------------------------
;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -4474,8 +4474,8 @@
RET
%endmacro
-FILTER_V_PS_W6 6, 8
-FILTER_V_PS_W6 6, 16
+ FILTER_V_PS_W6 6, 8
+ FILTER_V_PS_W6 6, 16
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -4560,8 +4560,8 @@
RET
%endmacro
-FILTER_V_PS_W12 12, 16
-FILTER_V_PS_W12 12, 32
+ FILTER_V_PS_W12 12, 16
+ FILTER_V_PS_W12 12, 32
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -4645,14 +4645,14 @@
RET
%endmacro
-FILTER_V_PS_W16 16, 4
-FILTER_V_PS_W16 16, 8
-FILTER_V_PS_W16 16, 12
-FILTER_V_PS_W16 16, 16
-FILTER_V_PS_W16 16, 32
-
-FILTER_V_PS_W16 16, 24
-FILTER_V_PS_W16 16, 64
+ FILTER_V_PS_W16 16, 4
+ FILTER_V_PS_W16 16, 8
+ FILTER_V_PS_W16 16, 12
+ FILTER_V_PS_W16 16, 16
+ FILTER_V_PS_W16 16, 32
+
+ FILTER_V_PS_W16 16, 24
+ FILTER_V_PS_W16 16, 64
;--------------------------------------------------------------------------------------------------------------
;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -4768,9 +4768,9 @@
RET
%endmacro
-FILTER_V4_PS_W24 24, 32
-
-FILTER_V4_PS_W24 24, 64
+ FILTER_V4_PS_W24 24, 32
+
+ FILTER_V4_PS_W24 24, 64
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -4861,13 +4861,13 @@
RET
%endmacro
-FILTER_V_PS_W32 32, 8
-FILTER_V_PS_W32 32, 16
-FILTER_V_PS_W32 32, 24
-FILTER_V_PS_W32 32, 32
-
-FILTER_V_PS_W32 32, 48
-FILTER_V_PS_W32 32, 64
+ FILTER_V_PS_W32 32, 8
+ FILTER_V_PS_W32 32, 16
+ FILTER_V_PS_W32 32, 24
+ FILTER_V_PS_W32 32, 32
+
+ FILTER_V_PS_W32 32, 48
+ FILTER_V_PS_W32 32, 64
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -4876,95 +4876,95 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m5, [r5 + r4 * 4]
-%else
-movd m5, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m6, m5, [tab_Vm]
-pshufb m5, [tab_Vm + 16]
-mova m4, [pw_512]
-lea r5, [r1 * 3]
-
-mov r4d, %2
-
-.loop:
-movq m0, [r0]
-movq m1, [r0 + r1]
-movq m2, [r0 + 2 * r1]
-movq m3, [r0 + r5]
-
-punpcklbw m0, m1
-punpcklbw m1, m2
-punpcklbw m2, m3
-
-pmaddubsw m0, m6
-pmaddubsw m7, m2, m5
-
-paddw m0, m7
-
-pmulhrsw m0, m4
-packuswb m0, m0
-movh [r2], m0
-
-lea r0, [r0 + 4 * r1]
-movq m0, [r0]
-
-punpcklbw m3, m0
-
-pmaddubsw m1, m6
-pmaddubsw m7, m3, m5
-
-paddw m1, m7
-
-pmulhrsw m1, m4
-packuswb m1, m1
-movh [r2 + r3], m1
-
-movq m1, [r0 + r1]
-
-punpcklbw m0, m1
-
-pmaddubsw m2, m6
-pmaddubsw m0, m5
-
-paddw m2, m0
-
-pmulhrsw m2, m4
-
-movq m7, [r0 + 2 * r1]
-punpcklbw m1, m7
-
-pmaddubsw m3, m6
-pmaddubsw m1, m5
-
-paddw m3, m1
-
-pmulhrsw m3, m4
-packuswb m2, m3
-
-lea r2, [r2 + 2 * r3]
-movh [r2], m2
-movhps [r2 + r3], m2
-
-lea r2, [r2 + 2 * r3]
-
-sub r4, 4
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W8_H8_H16_H32 8, 8
-FILTER_V4_W8_H8_H16_H32 8, 16
-FILTER_V4_W8_H8_H16_H32 8, 32
-
-FILTER_V4_W8_H8_H16_H32 8, 12
-FILTER_V4_W8_H8_H16_H32 8, 64
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m5, [r5 + r4 * 4]
+%else
+ movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m6, m5, [tab_Vm]
+ pshufb m5, [tab_Vm + 16]
+ mova m4, [pw_512]
+ lea r5, [r1 * 3]
+
+ mov r4d, %2
+
+.loop:
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ movq m2, [r0 + 2 * r1]
+ movq m3, [r0 + r5]
+
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ punpcklbw m2, m3
+
+ pmaddubsw m0, m6
+ pmaddubsw m7, m2, m5
+
+ paddw m0, m7
+
+ pmulhrsw m0, m4
+ packuswb m0, m0
+ movh [r2], m0
+
+ lea r0, [r0 + 4 * r1]
+ movq m0, [r0]
+
+ punpcklbw m3, m0
+
+ pmaddubsw m1, m6
+ pmaddubsw m7, m3, m5
+
+ paddw m1, m7
+
+ pmulhrsw m1, m4
+ packuswb m1, m1
+ movh [r2 + r3], m1
+
+ movq m1, [r0 + r1]
+
+ punpcklbw m0, m1
+
+ pmaddubsw m2, m6
+ pmaddubsw m0, m5
+
+ paddw m2, m0
+
+ pmulhrsw m2, m4
+
+ movq m7, [r0 + 2 * r1]
+ punpcklbw m1, m7
+
+ pmaddubsw m3, m6
+ pmaddubsw m1, m5
+
+ paddw m3, m1
+
+ pmulhrsw m3, m4
+ packuswb m2, m3
+
+ lea r2, [r2 + 2 * r3]
+ movh [r2], m2
+ movhps [r2 + r3], m2
+
+ lea r2, [r2 + 2 * r3]
+
+ sub r4, 4
+ jnz .loop
+ RET
+%endmacro
+
+ FILTER_V4_W8_H8_H16_H32 8, 8
+ FILTER_V4_W8_H8_H16_H32 8, 16
+ FILTER_V4_W8_H8_H16_H32 8, 32
+
+ FILTER_V4_W8_H8_H16_H32 8, 12
+ FILTER_V4_W8_H8_H16_H32 8, 64
%macro PROCESS_CHROMA_AVX2_W8_8R 0
movq xm1, [r0] ; m1 = row 0
@@ -5070,8 +5070,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_8x8 pp
-FILTER_VER_CHROMA_AVX2_8x8 ps
+ FILTER_VER_CHROMA_AVX2_8x8 pp
+ FILTER_VER_CHROMA_AVX2_8x8 ps
%macro FILTER_VER_CHROMA_AVX2_8x6 1
INIT_YMM avx2
@@ -5159,8 +5159,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_8x6 pp
-FILTER_VER_CHROMA_AVX2_8x6 ps
+ FILTER_VER_CHROMA_AVX2_8x6 pp
+ FILTER_VER_CHROMA_AVX2_8x6 ps
%macro PROCESS_CHROMA_AVX2_W8_16R 1
movq xm1, [r0] ; m1 = row 0
@@ -5340,8 +5340,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_8x16 pp
-FILTER_VER_CHROMA_AVX2_8x16 ps
+ FILTER_VER_CHROMA_AVX2_8x16 pp
+ FILTER_VER_CHROMA_AVX2_8x16 ps
%macro FILTER_VER_CHROMA_AVX2_8x32 1
INIT_YMM avx2
@@ -5372,8 +5372,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_8x32 pp
-FILTER_VER_CHROMA_AVX2_8x32 ps
+ FILTER_VER_CHROMA_AVX2_8x32 pp
+ FILTER_VER_CHROMA_AVX2_8x32 ps
%macro PROCESS_CHROMA_AVX2_W8_4R 0
movq xm1, [r0] ; m1 = row 0
@@ -5444,8 +5444,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_8x4 pp
-FILTER_VER_CHROMA_AVX2_8x4 ps
+ FILTER_VER_CHROMA_AVX2_8x4 pp
+ FILTER_VER_CHROMA_AVX2_8x4 ps
%macro FILTER_VER_CHROMA_AVX2_8x2 1
INIT_YMM avx2
@@ -5493,8 +5493,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_8x2 pp
-FILTER_VER_CHROMA_AVX2_8x2 ps
+ FILTER_VER_CHROMA_AVX2_8x2 pp
+ FILTER_VER_CHROMA_AVX2_8x2 ps
%macro FILTER_VER_CHROMA_AVX2_6x8 1
INIT_YMM avx2
@@ -5573,8 +5573,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_6x8 pp
-FILTER_VER_CHROMA_AVX2_6x8 ps
+ FILTER_VER_CHROMA_AVX2_6x8 pp
+ FILTER_VER_CHROMA_AVX2_6x8 ps
;-----------------------------------------------------------------------------
;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -5583,96 +5583,96 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m5, [r5 + r4 * 4]
-%else
-movd m5, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m6, m5, [tab_Vm]
-pshufb m5, [tab_Vm + 16]
-mova m4, [pw_512]
-
-mov r4d, %2
-lea r5, [3 * r1]
-
-.loop:
-movq m0, [r0]
-movq m1, [r0 + r1]
-movq m2, [r0 + 2 * r1]
-movq m3, [r0 + r5]
-
-punpcklbw m0, m1
-punpcklbw m1, m2
-punpcklbw m2, m3
-
-pmaddubsw m0, m6
-pmaddubsw m7, m2, m5
-
-paddw m0, m7
-
-pmulhrsw m0, m4
-packuswb m0, m0
-movd [r2], m0
-pextrw [r2 + 4], m0, 2
-
-lea r0, [r0 + 4 * r1]
-
-movq m0, [r0]
-punpcklbw m3, m0
-
-pmaddubsw m1, m6
-pmaddubsw m7, m3, m5
-
-paddw m1, m7
-
-pmulhrsw m1, m4
-packuswb m1, m1
-movd [r2 + r3], m1
-pextrw [r2 + r3 + 4], m1, 2
-
-movq m1, [r0 + r1]
-punpcklbw m7, m0, m1
-
-pmaddubsw m2, m6
-pmaddubsw m7, m5
-
-paddw m2, m7
-
-pmulhrsw m2, m4
-packuswb m2, m2
-lea r2, [r2 + 2 * r3]
-movd [r2], m2
-pextrw [r2 + 4], m2, 2
-
-movq m2, [r0 + 2 * r1]
-punpcklbw m1, m2
-
-pmaddubsw m3, m6
-pmaddubsw m1, m5
-
-paddw m3, m1
-
-pmulhrsw m3, m4
-packuswb m3, m3
-
-movd [r2 + r3], m3
-pextrw [r2 + r3 + 4], m3, 2
-
-lea r2, [r2 + 2 * r3]
-
-sub r4, 4
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W6_H4 6, 8
-
-FILTER_V4_W6_H4 6, 16
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m5, [r5 + r4 * 4]
+%else
+ movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m6, m5, [tab_Vm]
+ pshufb m5, [tab_Vm + 16]
+ mova m4, [pw_512]
+
+ mov r4d, %2
+ lea r5, [3 * r1]
+
+.loop:
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ movq m2, [r0 + 2 * r1]
+ movq m3, [r0 + r5]
+
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ punpcklbw m2, m3
+
+ pmaddubsw m0, m6
+ pmaddubsw m7, m2, m5
+
+ paddw m0, m7
+
+ pmulhrsw m0, m4
+ packuswb m0, m0
+ movd [r2], m0
+ pextrw [r2 + 4], m0, 2
+
+ lea r0, [r0 + 4 * r1]
+
+ movq m0, [r0]
+ punpcklbw m3, m0
+
+ pmaddubsw m1, m6
+ pmaddubsw m7, m3, m5
+
+ paddw m1, m7
+
+ pmulhrsw m1, m4
+ packuswb m1, m1
+ movd [r2 + r3], m1
+ pextrw [r2 + r3 + 4], m1, 2
+
+ movq m1, [r0 + r1]
+ punpcklbw m7, m0, m1
+
+ pmaddubsw m2, m6
+ pmaddubsw m7, m5
+
+ paddw m2, m7
+
+ pmulhrsw m2, m4
+ packuswb m2, m2
+ lea r2, [r2 + 2 * r3]
+ movd [r2], m2
+ pextrw [r2 + 4], m2, 2
+
+ movq m2, [r0 + 2 * r1]
+ punpcklbw m1, m2
+
+ pmaddubsw m3, m6
+ pmaddubsw m1, m5
+
+ paddw m3, m1
+
+ pmulhrsw m3, m4
+ packuswb m3, m3
+
+ movd [r2 + r3], m3
+ pextrw [r2 + r3 + 4], m3, 2
+
+ lea r2, [r2 + 2 * r3]
+
+ sub r4, 4
+ jnz .loop
+ RET
+%endmacro
+
+ FILTER_V4_W6_H4 6, 8
+
+ FILTER_V4_W6_H4 6, 16
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -5681,88 +5681,88 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m1, m0, [tab_Vm]
-pshufb m0, [tab_Vm + 16]
-
-mov r4d, %2
-
-.loop:
-movu m2, [r0]
-movu m3, [r0 + r1]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r0, [r0 + 2 * r1]
-movu m5, [r0]
-movu m7, [r0 + r1]
-
-punpcklbw m6, m5, m7
-pmaddubsw m6, m0
-paddw m4, m6
-
-punpckhbw m6, m5, m7
-pmaddubsw m6, m0
-paddw m2, m6
-
-mova m6, [pw_512]
-
-pmulhrsw m4, m6
-pmulhrsw m2, m6
-
-packuswb m4, m2
-
-movh [r2], m4
-pextrd [r2 + 8], m4, 2
-
-punpcklbw m4, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m4, m1
-pmaddubsw m3, m1
-
-movu m5, [r0 + 2 * r1]
-
-punpcklbw m2, m7, m5
-punpckhbw m7, m5
-
-pmaddubsw m2, m0
-pmaddubsw m7, m0
-
-paddw m4, m2
-paddw m3, m7
-
-pmulhrsw m4, m6
-pmulhrsw m3, m6
-
-packuswb m4, m3
-
-movh [r2 + r3], m4
-pextrd [r2 + r3 + 8], m4, 2
-
-lea r2, [r2 + 2 * r3]
-
-sub r4, 2
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W12_H2 12, 16
-
-FILTER_V4_W12_H2 12, 32
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m1, m0, [tab_Vm]
+ pshufb m0, [tab_Vm + 16]
+
+ mov r4d, %2
+
+.loop:
+ movu m2, [r0]
+ movu m3, [r0 + r1]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ lea r0, [r0 + 2 * r1]
+ movu m5, [r0]
+ movu m7, [r0 + r1]
+
+ punpcklbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m4, m6
+
+ punpckhbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m2, m6
+
+ mova m6, [pw_512]
+
+ pmulhrsw m4, m6
+ pmulhrsw m2, m6
+
+ packuswb m4, m2
+
+ movh [r2], m4
+ pextrd [r2 + 8], m4, 2
+
+ punpcklbw m4, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m4, m1
+ pmaddubsw m3, m1
+
+ movu m5, [r0 + 2 * r1]
+
+ punpcklbw m2, m7, m5
+ punpckhbw m7, m5
+
+ pmaddubsw m2, m0
+ pmaddubsw m7, m0
+
+ paddw m4, m2
+ paddw m3, m7
+
+ pmulhrsw m4, m6
+ pmulhrsw m3, m6
+
+ packuswb m4, m3
+
+ movh [r2 + r3], m4
+ pextrd [r2 + r3 + 8], m4, 2
+
+ lea r2, [r2 + 2 * r3]
+
+ sub r4, 2
+ jnz .loop
+ RET
+%endmacro
+
+ FILTER_V4_W12_H2 12, 16
+
+ FILTER_V4_W12_H2 12, 32
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -5771,91 +5771,91 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m1, m0, [tab_Vm]
-pshufb m0, [tab_Vm + 16]
-
-mov r4d, %2/2
-
-.loop:
-movu m2, [r0]
-movu m3, [r0 + r1]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r0, [r0 + 2 * r1]
-movu m5, [r0]
-movu m6, [r0 + r1]
-
-punpckhbw m7, m5, m6
-pmaddubsw m7, m0
-paddw m2, m7
-
-punpcklbw m7, m5, m6
-pmaddubsw m7, m0
-paddw m4, m7
-
-mova m7, [pw_512]
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
-
-packuswb m4, m2
-
-movu [r2], m4
-
-punpcklbw m4, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m4, m1
-pmaddubsw m3, m1
-
-movu m5, [r0 + 2 * r1]
-
-punpcklbw m2, m6, m5
-punpckhbw m6, m5
-
-pmaddubsw m2, m0
-pmaddubsw m6, m0
-
-paddw m4, m2
-paddw m3, m6
-
-pmulhrsw m4, m7
-pmulhrsw m3, m7
-
-packuswb m4, m3
-
-movu [r2 + r3], m4
-
-lea r2, [r2 + 2 * r3]
-
-dec r4d
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W16_H2 16, 4
-FILTER_V4_W16_H2 16, 8
-FILTER_V4_W16_H2 16, 12
-FILTER_V4_W16_H2 16, 16
-FILTER_V4_W16_H2 16, 32
-
-FILTER_V4_W16_H2 16, 24
-FILTER_V4_W16_H2 16, 64
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m1, m0, [tab_Vm]
+ pshufb m0, [tab_Vm + 16]
+
+ mov r4d, %2/2
+
+.loop:
+ movu m2, [r0]
+ movu m3, [r0 + r1]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ lea r0, [r0 + 2 * r1]
+ movu m5, [r0]
+ movu m6, [r0 + r1]
+
+ punpckhbw m7, m5, m6
+ pmaddubsw m7, m0
+ paddw m2, m7
+
+ punpcklbw m7, m5, m6
+ pmaddubsw m7, m0
+ paddw m4, m7
+
+ mova m7, [pw_512]
+
+ pmulhrsw m4, m7
+ pmulhrsw m2, m7
+
+ packuswb m4, m2
+
+ movu [r2], m4
+
+ punpcklbw m4, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m4, m1
+ pmaddubsw m3, m1
+
+ movu m5, [r0 + 2 * r1]
+
+ punpcklbw m2, m6, m5
+ punpckhbw m6, m5
+
+ pmaddubsw m2, m0
+ pmaddubsw m6, m0
+
+ paddw m4, m2
+ paddw m3, m6
+
+ pmulhrsw m4, m7
+ pmulhrsw m3, m7
+
+ packuswb m4, m3
+
+ movu [r2 + r3], m4
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+ FILTER_V4_W16_H2 16, 4
+ FILTER_V4_W16_H2 16, 8
+ FILTER_V4_W16_H2 16, 12
+ FILTER_V4_W16_H2 16, 16
+ FILTER_V4_W16_H2 16, 32
+
+ FILTER_V4_W16_H2 16, 24
+ FILTER_V4_W16_H2 16, 64
%macro FILTER_VER_CHROMA_AVX2_16x16 1
INIT_YMM avx2
@@ -6115,8 +6115,8 @@
%endif
%endmacro
-FILTER_VER_CHROMA_AVX2_16x16 pp
-FILTER_VER_CHROMA_AVX2_16x16 ps
+ FILTER_VER_CHROMA_AVX2_16x16 pp
+ FILTER_VER_CHROMA_AVX2_16x16 ps
%macro FILTER_VER_CHROMA_AVX2_16x8 1
INIT_YMM avx2
cglobal interp_4tap_vert_%1_16x8, 4, 7, 7
@@ -6270,8 +6270,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_16x8 pp
-FILTER_VER_CHROMA_AVX2_16x8 ps
+ FILTER_VER_CHROMA_AVX2_16x8 pp
+ FILTER_VER_CHROMA_AVX2_16x8 ps
%macro FILTER_VER_CHROMA_AVX2_16x12 1
INIT_YMM avx2
@@ -6498,8 +6498,8 @@
%endif
%endmacro
-FILTER_VER_CHROMA_AVX2_16x12 pp
-FILTER_VER_CHROMA_AVX2_16x12 ps
+ FILTER_VER_CHROMA_AVX2_16x12 pp
+ FILTER_VER_CHROMA_AVX2_16x12 ps
%macro FILTER_VER_CHROMA_AVX2_16x32 1
INIT_YMM avx2
@@ -6791,8 +6791,8 @@
%endif
%endmacro
-FILTER_VER_CHROMA_AVX2_16x32 pp
-FILTER_VER_CHROMA_AVX2_16x32 ps
+ FILTER_VER_CHROMA_AVX2_16x32 pp
+ FILTER_VER_CHROMA_AVX2_16x32 ps
%macro FILTER_VER_CHROMA_AVX2_24x32 1
INIT_YMM avx2
@@ -7242,8 +7242,8 @@
%endif
%endmacro
-FILTER_VER_CHROMA_AVX2_24x32 pp
-FILTER_VER_CHROMA_AVX2_24x32 ps
+ FILTER_VER_CHROMA_AVX2_24x32 pp
+ FILTER_VER_CHROMA_AVX2_24x32 ps
%macro FILTER_VER_CHROMA_AVX2_16x4 1
INIT_YMM avx2
@@ -7340,8 +7340,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_16x4 pp
-FILTER_VER_CHROMA_AVX2_16x4 ps
+ FILTER_VER_CHROMA_AVX2_16x4 pp
+ FILTER_VER_CHROMA_AVX2_16x4 ps
%macro FILTER_VER_CHROMA_AVX2_12x16 1
INIT_YMM avx2
@@ -7654,8 +7654,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_AVX2_12x16 pp
-FILTER_VER_CHROMA_AVX2_12x16 ps
+ FILTER_VER_CHROMA_AVX2_12x16 pp
+ FILTER_VER_CHROMA_AVX2_12x16 ps
;-----------------------------------------------------------------------------
;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -7664,121 +7664,121 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m1, m0, [tab_Vm]
-pshufb m0, [tab_Vm + 16]
-
-mov r4d, %2
-
-.loop:
-movu m2, [r0]
-movu m3, [r0 + r1]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r5, [r0 + 2 * r1]
-movu m5, [r5]
-movu m7, [r5 + r1]
-
-punpcklbw m6, m5, m7
-pmaddubsw m6, m0
-paddw m4, m6
-
-punpckhbw m6, m5, m7
-pmaddubsw m6, m0
-paddw m2, m6
-
-mova m6, [pw_512]
-
-pmulhrsw m4, m6
-pmulhrsw m2, m6
-
-packuswb m4, m2
-
-movu [r2], m4
-
-punpcklbw m4, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m4, m1
-pmaddubsw m3, m1
-
-movu m2, [r5 + 2 * r1]
-
-punpcklbw m5, m7, m2
-punpckhbw m7, m2
-
-pmaddubsw m5, m0
-pmaddubsw m7, m0
-
-paddw m4, m5
-paddw m3, m7
-
-pmulhrsw m4, m6
-pmulhrsw m3, m6
-
-packuswb m4, m3
-
-movu [r2 + r3], m4
-
-movq m2, [r0 + 16]
-movq m3, [r0 + r1 + 16]
-movq m4, [r5 + 16]
-movq m5, [r5 + r1 + 16]
-
-punpcklbw m2, m3
-punpcklbw m4, m5
-
-pmaddubsw m2, m1
-pmaddubsw m4, m0
-
-paddw m2, m4
-
-pmulhrsw m2, m6
-
-movq m3, [r0 + r1 + 16]
-movq m4, [r5 + 16]
-movq m5, [r5 + r1 + 16]
-movq m7, [r5 + 2 * r1 + 16]
-
-punpcklbw m3, m4
-punpcklbw m5, m7
-
-pmaddubsw m3, m1
-pmaddubsw m5, m0
-
-paddw m3, m5
-
-pmulhrsw m3, m6
-packuswb m2, m3
-
-movh [r2 + 16], m2
-movhps [r2 + r3 + 16], m2
-
-mov r0, r5
-lea r2, [r2 + 2 * r3]
-
-sub r4, 2
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W24 24, 32
-
-FILTER_V4_W24 24, 64
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m1, m0, [tab_Vm]
+ pshufb m0, [tab_Vm + 16]
+
+ mov r4d, %2
+
+.loop:
+ movu m2, [r0]
+ movu m3, [r0 + r1]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ lea r5, [r0 + 2 * r1]
+ movu m5, [r5]
+ movu m7, [r5 + r1]
+
+ punpcklbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m4, m6
+
+ punpckhbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m2, m6
+
+ mova m6, [pw_512]
+
+ pmulhrsw m4, m6
+ pmulhrsw m2, m6
+
+ packuswb m4, m2
+
+ movu [r2], m4
+
+ punpcklbw m4, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m4, m1
+ pmaddubsw m3, m1
+
+ movu m2, [r5 + 2 * r1]
+
+ punpcklbw m5, m7, m2
+ punpckhbw m7, m2
+
+ pmaddubsw m5, m0
+ pmaddubsw m7, m0
+
+ paddw m4, m5
+ paddw m3, m7
+
+ pmulhrsw m4, m6
+ pmulhrsw m3, m6
+
+ packuswb m4, m3
+
+ movu [r2 + r3], m4
+
+ movq m2, [r0 + 16]
+ movq m3, [r0 + r1 + 16]
+ movq m4, [r5 + 16]
+ movq m5, [r5 + r1 + 16]
+
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+
+ pmaddubsw m2, m1
+ pmaddubsw m4, m0
+
+ paddw m2, m4
+
+ pmulhrsw m2, m6
+
+ movq m3, [r0 + r1 + 16]
+ movq m4, [r5 + 16]
+ movq m5, [r5 + r1 + 16]
+ movq m7, [r5 + 2 * r1 + 16]
+
+ punpcklbw m3, m4
+ punpcklbw m5, m7
+
+ pmaddubsw m3, m1
+ pmaddubsw m5, m0
+
+ paddw m3, m5
+
+ pmulhrsw m3, m6
+ packuswb m2, m3
+
+ movh [r2 + 16], m2
+ movhps [r2 + r3 + 16], m2
+
+ mov r0, r5
+ lea r2, [r2 + 2 * r3]
+
+ sub r4, 2
+ jnz .loop
+ RET
+%endmacro
+
+ FILTER_V4_W24 24, 32
+
+ FILTER_V4_W24 24, 64
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -7787,96 +7787,96 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m1, m0, [tab_Vm]
-pshufb m0, [tab_Vm + 16]
-
-mova m7, [pw_512]
-
-mov r4d, %2
-
-.loop:
-movu m2, [r0]
-movu m3, [r0 + r1]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r5, [r0 + 2 * r1]
-movu m3, [r5]
-movu m5, [r5 + r1]
-
-punpcklbw m6, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m6, m0
-pmaddubsw m3, m0
-
-paddw m4, m6
-paddw m2, m3
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
-
-packuswb m4, m2
-
-movu [r2], m4
-
-movu m2, [r0 + 16]
-movu m3, [r0 + r1 + 16]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-movu m3, [r5 + 16]
-movu m5, [r5 + r1 + 16]
-
-punpcklbw m6, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m6, m0
-pmaddubsw m3, m0
-
-paddw m4, m6
-paddw m2, m3
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
-
-packuswb m4, m2
-
-movu [r2 + 16], m4
-
-lea r0, [r0 + r1]
-lea r2, [r2 + r3]
-
-dec r4
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W32 32, 8
-FILTER_V4_W32 32, 16
-FILTER_V4_W32 32, 24
-FILTER_V4_W32 32, 32
-
-FILTER_V4_W32 32, 48
-FILTER_V4_W32 32, 64
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m1, m0, [tab_Vm]
+ pshufb m0, [tab_Vm + 16]
+
+ mova m7, [pw_512]
+
+ mov r4d, %2
+
+.loop:
+ movu m2, [r0]
+ movu m3, [r0 + r1]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ lea r5, [r0 + 2 * r1]
+ movu m3, [r5]
+ movu m5, [r5 + r1]
+
+ punpcklbw m6, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m6, m0
+ pmaddubsw m3, m0
+
+ paddw m4, m6
+ paddw m2, m3
+
+ pmulhrsw m4, m7
+ pmulhrsw m2, m7
+
+ packuswb m4, m2
+
+ movu [r2], m4
+
+ movu m2, [r0 + 16]
+ movu m3, [r0 + r1 + 16]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ movu m3, [r5 + 16]
+ movu m5, [r5 + r1 + 16]
+
+ punpcklbw m6, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m6, m0
+ pmaddubsw m3, m0
+
+ paddw m4, m6
+ paddw m2, m3
+
+ pmulhrsw m4, m7
+ pmulhrsw m2, m7
+
+ packuswb m4, m2
+
+ movu [r2 + 16], m4
+
+ lea r0, [r0 + r1]
+ lea r2, [r2 + r3]
+
+ dec r4
+ jnz .loop
+ RET
+%endmacro
+
+ FILTER_V4_W32 32, 8
+ FILTER_V4_W32 32, 16
+ FILTER_V4_W32 32, 24
+ FILTER_V4_W32 32, 32
+
+ FILTER_V4_W32 32, 48
+ FILTER_V4_W32 32, 64
%macro FILTER_VER_CHROMA_AVX2_32xN 2
INIT_YMM avx2
@@ -8010,14 +8010,14 @@
%endif
%endmacro
-FILTER_VER_CHROMA_AVX2_32xN pp, 32
-FILTER_VER_CHROMA_AVX2_32xN pp, 24
-FILTER_VER_CHROMA_AVX2_32xN pp, 16
-FILTER_VER_CHROMA_AVX2_32xN pp, 8
-FILTER_VER_CHROMA_AVX2_32xN ps, 32
-FILTER_VER_CHROMA_AVX2_32xN ps, 24
-FILTER_VER_CHROMA_AVX2_32xN ps, 16
-FILTER_VER_CHROMA_AVX2_32xN ps, 8
+ FILTER_VER_CHROMA_AVX2_32xN pp, 32
+ FILTER_VER_CHROMA_AVX2_32xN pp, 24
+ FILTER_VER_CHROMA_AVX2_32xN pp, 16
+ FILTER_VER_CHROMA_AVX2_32xN pp, 8
+ FILTER_VER_CHROMA_AVX2_32xN ps, 32
+ FILTER_VER_CHROMA_AVX2_32xN ps, 24
+ FILTER_VER_CHROMA_AVX2_32xN ps, 16
+ FILTER_VER_CHROMA_AVX2_32xN ps, 8
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -8026,99 +8026,99 @@
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m1, m0, [tab_Vm]
-pshufb m0, [tab_Vm + 16]
-
-mov r4d, %2/2
-
-.loop:
-
-mov r6d, %1/16
+ mov r4d, r4m
+ sub r0, r1
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m1, m0, [tab_Vm]
+ pshufb m0, [tab_Vm + 16]
+
+ mov r4d, %2/2
+
+.loop:
+
+ mov r6d, %1/16
.loopW:
-movu m2, [r0]
-movu m3, [r0 + r1]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r5, [r0 + 2 * r1]
-movu m5, [r5]
-movu m6, [r5 + r1]
-
-punpckhbw m7, m5, m6
-pmaddubsw m7, m0
-paddw m2, m7
-
-punpcklbw m7, m5, m6
-pmaddubsw m7, m0
-paddw m4, m7
-
-mova m7, [pw_512]
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
-
-packuswb m4, m2
-
-movu [r2], m4
-
-punpcklbw m4, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m4, m1
-pmaddubsw m3, m1
-
-movu m5, [r5 + 2 * r1]
-
-punpcklbw m2, m6, m5
-punpckhbw m6, m5
-
-pmaddubsw m2, m0
-pmaddubsw m6, m0
-
-paddw m4, m2
-paddw m3, m6
-
-pmulhrsw m4, m7
-pmulhrsw m3, m7
-
-packuswb m4, m3
-
-movu [r2 + r3], m4
-
-add r0, 16
-add r2, 16
-dec r6d
-jnz .loopW
-
-lea r0, [r0 + r1 * 2 - %1]
-lea r2, [r2 + r3 * 2 - %1]
-
-dec r4d
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W16n_H2 64, 64
-FILTER_V4_W16n_H2 64, 32
-FILTER_V4_W16n_H2 64, 48
-FILTER_V4_W16n_H2 48, 64
-FILTER_V4_W16n_H2 64, 16
+ movu m2, [r0]
+ movu m3, [r0 + r1]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ lea r5, [r0 + 2 * r1]
+ movu m5, [r5]
+ movu m6, [r5 + r1]
+
+ punpckhbw m7, m5, m6
+ pmaddubsw m7, m0
+ paddw m2, m7
+
+ punpcklbw m7, m5, m6
+ pmaddubsw m7, m0
+ paddw m4, m7
+
+ mova m7, [pw_512]
+
+ pmulhrsw m4, m7
+ pmulhrsw m2, m7
+
+ packuswb m4, m2
+
+ movu [r2], m4
+
+ punpcklbw m4, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m4, m1
+ pmaddubsw m3, m1
+
+ movu m5, [r5 + 2 * r1]
+
+ punpcklbw m2, m6, m5
+ punpckhbw m6, m5
+
+ pmaddubsw m2, m0
+ pmaddubsw m6, m0
+
+ paddw m4, m2
+ paddw m3, m6
+
+ pmulhrsw m4, m7
+ pmulhrsw m3, m7
+
+ packuswb m4, m3
+
+ movu [r2 + r3], m4
+
+ add r0, 16
+ add r2, 16
+ dec r6d
+ jnz .loopW
+
+ lea r0, [r0 + r1 * 2 - %1]
+ lea r2, [r2 + r3 * 2 - %1]
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+ FILTER_V4_W16n_H2 64, 64
+ FILTER_V4_W16n_H2 64, 32
+ FILTER_V4_W16n_H2 64, 48
+ FILTER_V4_W16n_H2 48, 64
+ FILTER_V4_W16n_H2 64, 16
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -8147,9 +8147,9 @@
%endrep
RET
%endmacro
-P2S_H_2xN 4
-P2S_H_2xN 8
-P2S_H_2xN 16
+ P2S_H_2xN 4
+ P2S_H_2xN 8
+ P2S_H_2xN 16
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -8190,10 +8190,10 @@
%endrep
RET
%endmacro
-P2S_H_4xN 4
-P2S_H_4xN 8
-P2S_H_4xN 16
-P2S_H_4xN 32
+ P2S_H_4xN 4
+ P2S_H_4xN 8
+ P2S_H_4xN 16
+ P2S_H_4xN 32
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -8246,8 +8246,8 @@
jnz .loop
RET
%endmacro
-P2S_H_6xN 8
-P2S_H_6xN 16
+ P2S_H_6xN 8
+ P2S_H_6xN 16
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -8296,12 +8296,12 @@
jnz .loop
RET
%endmacro
-P2S_H_8xN 8
-P2S_H_8xN 4
-P2S_H_8xN 16
-P2S_H_8xN 32
-P2S_H_8xN 12
-P2S_H_8xN 64
+ P2S_H_8xN 8
+ P2S_H_8xN 4
+ P2S_H_8xN 16
+ P2S_H_8xN 32
+ P2S_H_8xN 12
+ P2S_H_8xN 64
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -8423,13 +8423,13 @@
jnz .loop
RET
%endmacro
-P2S_H_16xN 16
-P2S_H_16xN 4
-P2S_H_16xN 8
-P2S_H_16xN 12
-P2S_H_16xN 32
-P2S_H_16xN 64
-P2S_H_16xN 24
+ P2S_H_16xN 16
+ P2S_H_16xN 4
+ P2S_H_16xN 8
+ P2S_H_16xN 12
+ P2S_H_16xN 32
+ P2S_H_16xN 64
+ P2S_H_16xN 24
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -8547,12 +8547,12 @@
jnz .loop
RET
%endmacro
-P2S_H_32xN 32
-P2S_H_32xN 8
-P2S_H_32xN 16
-P2S_H_32xN 24
-P2S_H_32xN 64
-P2S_H_32xN 48
+ P2S_H_32xN 32
+ P2S_H_32xN 8
+ P2S_H_32xN 16
+ P2S_H_32xN 24
+ P2S_H_32xN 64
+ P2S_H_32xN 48
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -8615,12 +8615,12 @@
jnz .loop
RET
%endmacro
-P2S_H_32xN_avx2 32
-P2S_H_32xN_avx2 8
-P2S_H_32xN_avx2 16
-P2S_H_32xN_avx2 24
-P2S_H_32xN_avx2 64
-P2S_H_32xN_avx2 48
+ P2S_H_32xN_avx2 32
+ P2S_H_32xN_avx2 8
+ P2S_H_32xN_avx2 16
+ P2S_H_32xN_avx2 24
+ P2S_H_32xN_avx2 64
+ P2S_H_32xN_avx2 48
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -8830,10 +8830,10 @@
jnz .loop
RET
%endmacro
-P2S_H_64xN 64
-P2S_H_64xN 16
-P2S_H_64xN 32
-P2S_H_64xN 48
+ P2S_H_64xN 64
+ P2S_H_64xN 16
+ P2S_H_64xN 32
+ P2S_H_64xN 48
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -8932,10 +8932,10 @@
jnz .loop
RET
%endmacro
-P2S_H_64xN_avx2 64
-P2S_H_64xN_avx2 16
-P2S_H_64xN_avx2 32
-P2S_H_64xN_avx2 48
+ P2S_H_64xN_avx2 64
+ P2S_H_64xN_avx2 16
+ P2S_H_64xN_avx2 32
+ P2S_H_64xN_avx2 48
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel src, intptr_t srcStride, int16_t dst, int16_t dstStride)
@@ -8997,8 +8997,8 @@
jnz .loop
RET
%endmacro
-P2S_H_12xN 16
-P2S_H_12xN 32
+ P2S_H_12xN 16
+ P2S_H_12xN 32
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -9079,8 +9079,8 @@
jnz .loop
RET
%endmacro
-P2S_H_24xN 32
-P2S_H_24xN 64
+ P2S_H_24xN 32
+ P2S_H_24xN 64
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -9147,8 +9147,8 @@
jnz .loop
RET
%endmacro
-P2S_H_24xN_avx2 32
-P2S_H_24xN_avx2 64
+ P2S_H_24xN_avx2 32
+ P2S_H_24xN_avx2 64
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -9799,36 +9799,36 @@
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 4, pp
+ FILTER_VER_LUMA_4xN 4, 4, pp
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 8, pp
-FILTER_VER_LUMA_AVX2_4xN 4, 8, pp
+ FILTER_VER_LUMA_4xN 4, 8, pp
+ FILTER_VER_LUMA_AVX2_4xN 4, 8, pp
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 16, pp
-FILTER_VER_LUMA_AVX2_4xN 4, 16, pp
+ FILTER_VER_LUMA_4xN 4, 16, pp
+ FILTER_VER_LUMA_AVX2_4xN 4, 16, pp
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 4, ps
+ FILTER_VER_LUMA_4xN 4, 4, ps
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 8, ps
-FILTER_VER_LUMA_AVX2_4xN 4, 8, ps
+ FILTER_VER_LUMA_4xN 4, 8, ps
+ FILTER_VER_LUMA_AVX2_4xN 4, 8, ps
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 16, ps
-FILTER_VER_LUMA_AVX2_4xN 4, 16, ps
+ FILTER_VER_LUMA_4xN 4, 16, ps
+ FILTER_VER_LUMA_AVX2_4xN 4, 16, ps
%macro PROCESS_LUMA_AVX2_W8_8R 0
movq xm1, [r0] ; m1 = row 0
@@ -10199,50 +10199,50 @@
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 4, pp
-FILTER_VER_LUMA_AVX2_8x4 pp
+ FILTER_VER_LUMA_8xN 8, 4, pp
+ FILTER_VER_LUMA_AVX2_8x4 pp
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 8, pp
-FILTER_VER_LUMA_AVX2_8x8 pp
+ FILTER_VER_LUMA_8xN 8, 8, pp
+ FILTER_VER_LUMA_AVX2_8x8 pp
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 16, pp
-FILTER_VER_LUMA_AVX2_8xN 8, 16, pp
+ FILTER_VER_LUMA_8xN 8, 16, pp
+ FILTER_VER_LUMA_AVX2_8xN 8, 16, pp
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 32, pp
-FILTER_VER_LUMA_AVX2_8xN 8, 32, pp
+ FILTER_VER_LUMA_8xN 8, 32, pp
+ FILTER_VER_LUMA_AVX2_8xN 8, 32, pp
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 4, ps
-FILTER_VER_LUMA_AVX2_8x4 ps
+ FILTER_VER_LUMA_8xN 8, 4, ps
+ FILTER_VER_LUMA_AVX2_8x4 ps
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 8, ps
-FILTER_VER_LUMA_AVX2_8x8 ps
+ FILTER_VER_LUMA_8xN 8, 8, ps
+ FILTER_VER_LUMA_AVX2_8x8 ps
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 16, ps
-FILTER_VER_LUMA_AVX2_8xN 8, 16, ps
+ FILTER_VER_LUMA_8xN 8, 16, ps
+ FILTER_VER_LUMA_AVX2_8xN 8, 16, ps
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 32, ps
-FILTER_VER_LUMA_AVX2_8xN 8, 32, ps
+ FILTER_VER_LUMA_8xN 8, 32, ps
+ FILTER_VER_LUMA_AVX2_8xN 8, 32, ps
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -10351,12 +10351,12 @@
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_12xN 12, 16, pp
+ FILTER_VER_LUMA_12xN 12, 16, pp
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_12xN 12, 16, ps
+ FILTER_VER_LUMA_12xN 12, 16, ps
%macro FILTER_VER_LUMA_AVX2_12x16 1
INIT_YMM avx2
@@ -10747,8 +10747,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_12x16 pp
-FILTER_VER_LUMA_AVX2_12x16 ps
+ FILTER_VER_LUMA_AVX2_12x16 pp
+ FILTER_VER_LUMA_AVX2_12x16 ps
%macro FILTER_VER_LUMA_AVX2_16x16 1
INIT_YMM avx2
@@ -11091,8 +11091,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_16x16 pp
-FILTER_VER_LUMA_AVX2_16x16 ps
+ FILTER_VER_LUMA_AVX2_16x16 pp
+ FILTER_VER_LUMA_AVX2_16x16 ps
%macro FILTER_VER_LUMA_AVX2_16x12 1
INIT_YMM avx2
@@ -11366,8 +11366,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_16x12 pp
-FILTER_VER_LUMA_AVX2_16x12 ps
+ FILTER_VER_LUMA_AVX2_16x12 pp
+ FILTER_VER_LUMA_AVX2_16x12 ps
%macro FILTER_VER_LUMA_AVX2_16x8 1
INIT_YMM avx2
@@ -11562,8 +11562,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_16x8 pp
-FILTER_VER_LUMA_AVX2_16x8 ps
+ FILTER_VER_LUMA_AVX2_16x8 pp
+ FILTER_VER_LUMA_AVX2_16x8 ps
%macro FILTER_VER_LUMA_AVX2_16x4 1
INIT_YMM avx2
@@ -11687,8 +11687,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_16x4 pp
-FILTER_VER_LUMA_AVX2_16x4 ps
+ FILTER_VER_LUMA_AVX2_16x4 pp
+ FILTER_VER_LUMA_AVX2_16x4 ps
%macro FILTER_VER_LUMA_AVX2_16xN 3
INIT_YMM avx2
%if ARCH_X86_64 == 1
@@ -12039,10 +12039,10 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_16xN 16, 32, pp
-FILTER_VER_LUMA_AVX2_16xN 16, 64, pp
-FILTER_VER_LUMA_AVX2_16xN 16, 32, ps
-FILTER_VER_LUMA_AVX2_16xN 16, 64, ps
+ FILTER_VER_LUMA_AVX2_16xN 16, 32, pp
+ FILTER_VER_LUMA_AVX2_16xN 16, 64, pp
+ FILTER_VER_LUMA_AVX2_16xN 16, 32, ps
+ FILTER_VER_LUMA_AVX2_16xN 16, 64, ps
%macro PROCESS_LUMA_AVX2_W16_16R 1
movu xm0, [r0] ; m0 = row 0
@@ -12770,8 +12770,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_24x32 pp
-FILTER_VER_LUMA_AVX2_24x32 ps
+ FILTER_VER_LUMA_AVX2_24x32 pp
+ FILTER_VER_LUMA_AVX2_24x32 ps
%macro FILTER_VER_LUMA_AVX2_32xN 3
INIT_YMM avx2
@@ -12821,10 +12821,10 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_32xN 32, 32, pp
-FILTER_VER_LUMA_AVX2_32xN 32, 64, pp
-FILTER_VER_LUMA_AVX2_32xN 32, 32, ps
-FILTER_VER_LUMA_AVX2_32xN 32, 64, ps
+ FILTER_VER_LUMA_AVX2_32xN 32, 32, pp
+ FILTER_VER_LUMA_AVX2_32xN 32, 64, pp
+ FILTER_VER_LUMA_AVX2_32xN 32, 32, ps
+ FILTER_VER_LUMA_AVX2_32xN 32, 64, ps
%macro FILTER_VER_LUMA_AVX2_32x16 1
INIT_YMM avx2
@@ -12864,8 +12864,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_32x16 pp
-FILTER_VER_LUMA_AVX2_32x16 ps
+ FILTER_VER_LUMA_AVX2_32x16 pp
+ FILTER_VER_LUMA_AVX2_32x16 ps
%macro FILTER_VER_LUMA_AVX2_32x24 1
INIT_YMM avx2
@@ -12924,8 +12924,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_32x24 pp
-FILTER_VER_LUMA_AVX2_32x24 ps
+ FILTER_VER_LUMA_AVX2_32x24 pp
+ FILTER_VER_LUMA_AVX2_32x24 ps
%macro FILTER_VER_LUMA_AVX2_32x8 1
INIT_YMM avx2
@@ -12967,8 +12967,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_32x8 pp
-FILTER_VER_LUMA_AVX2_32x8 ps
+ FILTER_VER_LUMA_AVX2_32x8 pp
+ FILTER_VER_LUMA_AVX2_32x8 ps
%macro FILTER_VER_LUMA_AVX2_48x64 1
INIT_YMM avx2
@@ -13026,8 +13026,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_48x64 pp
-FILTER_VER_LUMA_AVX2_48x64 ps
+ FILTER_VER_LUMA_AVX2_48x64 pp
+ FILTER_VER_LUMA_AVX2_48x64 ps
%macro FILTER_VER_LUMA_AVX2_64xN 3
INIT_YMM avx2
@@ -13085,12 +13085,12 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_64xN 64, 32, pp
-FILTER_VER_LUMA_AVX2_64xN 64, 48, pp
-FILTER_VER_LUMA_AVX2_64xN 64, 64, pp
-FILTER_VER_LUMA_AVX2_64xN 64, 32, ps
-FILTER_VER_LUMA_AVX2_64xN 64, 48, ps
-FILTER_VER_LUMA_AVX2_64xN 64, 64, ps
+ FILTER_VER_LUMA_AVX2_64xN 64, 32, pp
+ FILTER_VER_LUMA_AVX2_64xN 64, 48, pp
+ FILTER_VER_LUMA_AVX2_64xN 64, 64, pp
+ FILTER_VER_LUMA_AVX2_64xN 64, 32, ps
+ FILTER_VER_LUMA_AVX2_64xN 64, 48, ps
+ FILTER_VER_LUMA_AVX2_64xN 64, 64, ps
%macro FILTER_VER_LUMA_AVX2_64x16 1
INIT_YMM avx2
@@ -13136,8 +13136,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_64x16 pp
-FILTER_VER_LUMA_AVX2_64x16 ps
+ FILTER_VER_LUMA_AVX2_64x16 pp
+ FILTER_VER_LUMA_AVX2_64x16 ps
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -13220,41 +13220,41 @@
RET
%endmacro
-FILTER_VER_LUMA 16, 4, pp
-FILTER_VER_LUMA 16, 8, pp
-FILTER_VER_LUMA 16, 12, pp
-FILTER_VER_LUMA 16, 16, pp
-FILTER_VER_LUMA 16, 32, pp
-FILTER_VER_LUMA 16, 64, pp
-FILTER_VER_LUMA 24, 32, pp
-FILTER_VER_LUMA 32, 8, pp
-FILTER_VER_LUMA 32, 16, pp
-FILTER_VER_LUMA 32, 24, pp
-FILTER_VER_LUMA 32, 32, pp
-FILTER_VER_LUMA 32, 64, pp
-FILTER_VER_LUMA 48, 64, pp
-FILTER_VER_LUMA 64, 16, pp
-FILTER_VER_LUMA 64, 32, pp
-FILTER_VER_LUMA 64, 48, pp
-FILTER_VER_LUMA 64, 64, pp
-
-FILTER_VER_LUMA 16, 4, ps
-FILTER_VER_LUMA 16, 8, ps
-FILTER_VER_LUMA 16, 12, ps
-FILTER_VER_LUMA 16, 16, ps
-FILTER_VER_LUMA 16, 32, ps
-FILTER_VER_LUMA 16, 64, ps
-FILTER_VER_LUMA 24, 32, ps
-FILTER_VER_LUMA 32, 8, ps
-FILTER_VER_LUMA 32, 16, ps
-FILTER_VER_LUMA 32, 24, ps
-FILTER_VER_LUMA 32, 32, ps
-FILTER_VER_LUMA 32, 64, ps
-FILTER_VER_LUMA 48, 64, ps
-FILTER_VER_LUMA 64, 16, ps
-FILTER_VER_LUMA 64, 32, ps
-FILTER_VER_LUMA 64, 48, ps
-FILTER_VER_LUMA 64, 64, ps
+ FILTER_VER_LUMA 16, 4, pp
+ FILTER_VER_LUMA 16, 8, pp
+ FILTER_VER_LUMA 16, 12, pp
+ FILTER_VER_LUMA 16, 16, pp
+ FILTER_VER_LUMA 16, 32, pp
+ FILTER_VER_LUMA 16, 64, pp
+ FILTER_VER_LUMA 24, 32, pp
+ FILTER_VER_LUMA 32, 8, pp
+ FILTER_VER_LUMA 32, 16, pp
+ FILTER_VER_LUMA 32, 24, pp
+ FILTER_VER_LUMA 32, 32, pp
+ FILTER_VER_LUMA 32, 64, pp
+ FILTER_VER_LUMA 48, 64, pp
+ FILTER_VER_LUMA 64, 16, pp
+ FILTER_VER_LUMA 64, 32, pp
+ FILTER_VER_LUMA 64, 48, pp
+ FILTER_VER_LUMA 64, 64, pp
+
+ FILTER_VER_LUMA 16, 4, ps
+ FILTER_VER_LUMA 16, 8, ps
+ FILTER_VER_LUMA 16, 12, ps
+ FILTER_VER_LUMA 16, 16, ps
+ FILTER_VER_LUMA 16, 32, ps
+ FILTER_VER_LUMA 16, 64, ps
+ FILTER_VER_LUMA 24, 32, ps
+ FILTER_VER_LUMA 32, 8, ps
+ FILTER_VER_LUMA 32, 16, ps
+ FILTER_VER_LUMA 32, 24, ps
+ FILTER_VER_LUMA 32, 32, ps
+ FILTER_VER_LUMA 32, 64, ps
+ FILTER_VER_LUMA 48, 64, ps
+ FILTER_VER_LUMA 64, 16, ps
+ FILTER_VER_LUMA 64, 32, ps
+ FILTER_VER_LUMA 64, 48, ps
+ FILTER_VER_LUMA 64, 64, ps
%macro PROCESS_LUMA_SP_W4_4R 0
movq m0, [r0]
@@ -13670,10 +13670,10 @@
RET
%endmacro
-FILTER_VER_CHROMA_SP_W2_4R 2, 4
-FILTER_VER_CHROMA_SP_W2_4R 2, 8
-
-FILTER_VER_CHROMA_SP_W2_4R 2, 16
+ FILTER_VER_CHROMA_SP_W2_4R 2, 4
+ FILTER_VER_CHROMA_SP_W2_4R 2, 8
+
+ FILTER_VER_CHROMA_SP_W2_4R 2, 16
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -13802,9 +13802,9 @@
RET
%endmacro
-FILTER_VER_CHROMA_SP_W6_H4 6, 8
-
-FILTER_VER_CHROMA_SP_W6_H4 6, 16
+ FILTER_VER_CHROMA_SP_W6_H4 6, 8
+
+ FILTER_VER_CHROMA_SP_W6_H4 6, 16
%macro PROCESS_CHROMA_SP_W8_2R 0
movu m1, [r0]
@@ -13888,15 +13888,15 @@
RET
%endmacro
-FILTER_VER_CHROMA_SP_W8_H2 8, 2
-FILTER_VER_CHROMA_SP_W8_H2 8, 4
-FILTER_VER_CHROMA_SP_W8_H2 8, 6
-FILTER_VER_CHROMA_SP_W8_H2 8, 8
-FILTER_VER_CHROMA_SP_W8_H2 8, 16
-FILTER_VER_CHROMA_SP_W8_H2 8, 32
-
-FILTER_VER_CHROMA_SP_W8_H2 8, 12
-FILTER_VER_CHROMA_SP_W8_H2 8, 64
+ FILTER_VER_CHROMA_SP_W8_H2 8, 2
+ FILTER_VER_CHROMA_SP_W8_H2 8, 4
+ FILTER_VER_CHROMA_SP_W8_H2 8, 6
+ FILTER_VER_CHROMA_SP_W8_H2 8, 8
+ FILTER_VER_CHROMA_SP_W8_H2 8, 16
+ FILTER_VER_CHROMA_SP_W8_H2 8, 32
+
+ FILTER_VER_CHROMA_SP_W8_H2 8, 12
+ FILTER_VER_CHROMA_SP_W8_H2 8, 64
;-----------------------------------------------------------------------------------------------------------------------------
@@ -13948,10 +13948,10 @@
RET
%endmacro
-FILTER_HORIZ_CHROMA_2xN 2, 4
-FILTER_HORIZ_CHROMA_2xN 2, 8
-
-FILTER_HORIZ_CHROMA_2xN 2, 16
+ FILTER_HORIZ_CHROMA_2xN 2, 4
+ FILTER_HORIZ_CHROMA_2xN 2, 8
+
+ FILTER_HORIZ_CHROMA_2xN 2, 16
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
@@ -14001,12 +14001,12 @@
RET
%endmacro
-FILTER_HORIZ_CHROMA_4xN 4, 2
-FILTER_HORIZ_CHROMA_4xN 4, 4
-FILTER_HORIZ_CHROMA_4xN 4, 8
-FILTER_HORIZ_CHROMA_4xN 4, 16
-
-FILTER_HORIZ_CHROMA_4xN 4, 32
+ FILTER_HORIZ_CHROMA_4xN 4, 2
+ FILTER_HORIZ_CHROMA_4xN 4, 4
+ FILTER_HORIZ_CHROMA_4xN 4, 8
+ FILTER_HORIZ_CHROMA_4xN 4, 16
+
+ FILTER_HORIZ_CHROMA_4xN 4, 32
%macro PROCESS_CHROMA_W6 3
movu %1, [srcq]
@@ -14084,11 +14084,11 @@
RET
%endmacro
-FILTER_HORIZ_CHROMA 6, 8
-FILTER_HORIZ_CHROMA 12, 16
-
-FILTER_HORIZ_CHROMA 6, 16
-FILTER_HORIZ_CHROMA 12, 32
+ FILTER_HORIZ_CHROMA 6, 8
+ FILTER_HORIZ_CHROMA 12, 16
+
+ FILTER_HORIZ_CHROMA 6, 16
+ FILTER_HORIZ_CHROMA 12, 32
%macro PROCESS_CHROMA_W8 3
movu %1, [srcq]
@@ -14147,15 +14147,15 @@
RET
%endmacro
-FILTER_HORIZ_CHROMA_8xN 8, 2
-FILTER_HORIZ_CHROMA_8xN 8, 4
-FILTER_HORIZ_CHROMA_8xN 8, 6
-FILTER_HORIZ_CHROMA_8xN 8, 8
-FILTER_HORIZ_CHROMA_8xN 8, 16
-FILTER_HORIZ_CHROMA_8xN 8, 32
-
-FILTER_HORIZ_CHROMA_8xN 8, 12
-FILTER_HORIZ_CHROMA_8xN 8, 64
+ FILTER_HORIZ_CHROMA_8xN 8, 2
+ FILTER_HORIZ_CHROMA_8xN 8, 4
+ FILTER_HORIZ_CHROMA_8xN 8, 6
+ FILTER_HORIZ_CHROMA_8xN 8, 8
+ FILTER_HORIZ_CHROMA_8xN 8, 16
+ FILTER_HORIZ_CHROMA_8xN 8, 32
+
+ FILTER_HORIZ_CHROMA_8xN 8, 12
+ FILTER_HORIZ_CHROMA_8xN 8, 64
%macro PROCESS_CHROMA_W16 4
movu %1, [srcq]
@@ -14317,28 +14317,28 @@
RET
%endmacro
-FILTER_HORIZ_CHROMA_WxN 16, 4
-FILTER_HORIZ_CHROMA_WxN 16, 8
-FILTER_HORIZ_CHROMA_WxN 16, 12
-FILTER_HORIZ_CHROMA_WxN 16, 16
-FILTER_HORIZ_CHROMA_WxN 16, 32
-FILTER_HORIZ_CHROMA_WxN 24, 32
-FILTER_HORIZ_CHROMA_WxN 32, 8
-FILTER_HORIZ_CHROMA_WxN 32, 16
-FILTER_HORIZ_CHROMA_WxN 32, 24
-FILTER_HORIZ_CHROMA_WxN 32, 32
-
-FILTER_HORIZ_CHROMA_WxN 16, 24
-FILTER_HORIZ_CHROMA_WxN 16, 64
-FILTER_HORIZ_CHROMA_WxN 24, 64
-FILTER_HORIZ_CHROMA_WxN 32, 48
-FILTER_HORIZ_CHROMA_WxN 32, 64
-
-FILTER_HORIZ_CHROMA_WxN 64, 64
-FILTER_HORIZ_CHROMA_WxN 64, 32
-FILTER_HORIZ_CHROMA_WxN 64, 48
-FILTER_HORIZ_CHROMA_WxN 48, 64
-FILTER_HORIZ_CHROMA_WxN 64, 16
+ FILTER_HORIZ_CHROMA_WxN 16, 4
+ FILTER_HORIZ_CHROMA_WxN 16, 8
+ FILTER_HORIZ_CHROMA_WxN 16, 12
+ FILTER_HORIZ_CHROMA_WxN 16, 16
+ FILTER_HORIZ_CHROMA_WxN 16, 32
+ FILTER_HORIZ_CHROMA_WxN 24, 32
+ FILTER_HORIZ_CHROMA_WxN 32, 8
+ FILTER_HORIZ_CHROMA_WxN 32, 16
+ FILTER_HORIZ_CHROMA_WxN 32, 24
+ FILTER_HORIZ_CHROMA_WxN 32, 32
+
+ FILTER_HORIZ_CHROMA_WxN 16, 24
+ FILTER_HORIZ_CHROMA_WxN 16, 64
+ FILTER_HORIZ_CHROMA_WxN 24, 64
+ FILTER_HORIZ_CHROMA_WxN 32, 48
+ FILTER_HORIZ_CHROMA_WxN 32, 64
+
+ FILTER_HORIZ_CHROMA_WxN 64, 64
+ FILTER_HORIZ_CHROMA_WxN 64, 32
+ FILTER_HORIZ_CHROMA_WxN 64, 48
+ FILTER_HORIZ_CHROMA_WxN 48, 64
+ FILTER_HORIZ_CHROMA_WxN 64, 16
;---------------------------------------------------------------------------------------------------------------
@@ -14434,11 +14434,11 @@
RET
%endmacro
-FILTER_V_PS_W16n 64, 64
-FILTER_V_PS_W16n 64, 32
-FILTER_V_PS_W16n 64, 48
-FILTER_V_PS_W16n 48, 64
-FILTER_V_PS_W16n 64, 16
+ FILTER_V_PS_W16n 64, 64
+ FILTER_V_PS_W16n 64, 32
+ FILTER_V_PS_W16n 64, 48
+ FILTER_V_PS_W16n 48, 64
+ FILTER_V_PS_W16n 64, 16
;------------------------------------------------------------------------------------------------------------
@@ -14596,12 +14596,12 @@
dec r4d
jnz .loop
-RET
-%endmacro
-
-FILTER_V_PS_W2 2, 8
-
-FILTER_V_PS_W2 2, 16
+ RET
+%endmacro
+
+ FILTER_V_PS_W2 2, 8
+
+ FILTER_V_PS_W2 2, 16
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -14762,8 +14762,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_4x4 sp
-FILTER_VER_CHROMA_S_AVX2_4x4 ss
+ FILTER_VER_CHROMA_S_AVX2_4x4 sp
+ FILTER_VER_CHROMA_S_AVX2_4x4 ss
%macro FILTER_VER_CHROMA_S_AVX2_4x8 1
INIT_YMM avx2
@@ -14874,8 +14874,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_4x8 sp
-FILTER_VER_CHROMA_S_AVX2_4x8 ss
+ FILTER_VER_CHROMA_S_AVX2_4x8 sp
+ FILTER_VER_CHROMA_S_AVX2_4x8 ss
%macro PROCESS_CHROMA_AVX2_W4_16R 1
movq xm0, [r0]
@@ -15069,8 +15069,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_4x16 sp
-FILTER_VER_CHROMA_S_AVX2_4x16 ss
+ FILTER_VER_CHROMA_S_AVX2_4x16 sp
+ FILTER_VER_CHROMA_S_AVX2_4x16 ss
%macro FILTER_VER_CHROMA_S_AVX2_4x2 1
INIT_YMM avx2
@@ -15126,8 +15126,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_4x2 sp
-FILTER_VER_CHROMA_S_AVX2_4x2 ss
+ FILTER_VER_CHROMA_S_AVX2_4x2 sp
+ FILTER_VER_CHROMA_S_AVX2_4x2 ss
%macro FILTER_VER_CHROMA_S_AVX2_2x4 1
INIT_YMM avx2
@@ -15196,8 +15196,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_2x4 sp
-FILTER_VER_CHROMA_S_AVX2_2x4 ss
+ FILTER_VER_CHROMA_S_AVX2_2x4 sp
+ FILTER_VER_CHROMA_S_AVX2_2x4 ss
%macro FILTER_VER_CHROMA_S_AVX2_8x8 1
INIT_YMM avx2
@@ -15375,8 +15375,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_8x8 sp
-FILTER_VER_CHROMA_S_AVX2_8x8 ss
+ FILTER_VER_CHROMA_S_AVX2_8x8 sp
+ FILTER_VER_CHROMA_S_AVX2_8x8 ss
%macro PROCESS_CHROMA_S_AVX2_W8_16R 1
movu xm0, [r0] ; m0 = row 0
@@ -15691,10 +15691,10 @@
%endif
%endmacro
-FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 16
-FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 32
-FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 16
-FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 32
+ FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 16
+ FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 32
+ FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 16
+ FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 32
%macro FILTER_VER_CHROMA_S_AVX2_NxN 3
INIT_YMM avx2
@@ -15743,12 +15743,12 @@
%endif
%endmacro
-FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, sp
-FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, sp
-FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, sp
-FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, ss
-FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, ss
-FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, ss
+ FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, sp
+ FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, sp
+ FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, sp
+ FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, ss
+ FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, ss
+ FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, ss
%macro PROCESS_CHROMA_S_AVX2_W8_4R 1
movu xm0, [r0] ; m0 = row 0
@@ -15857,8 +15857,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_8x4 sp
-FILTER_VER_CHROMA_S_AVX2_8x4 ss
+ FILTER_VER_CHROMA_S_AVX2_8x4 sp
+ FILTER_VER_CHROMA_S_AVX2_8x4 ss
%macro FILTER_VER_CHROMA_S_AVX2_12x16 1
INIT_YMM avx2
@@ -15896,8 +15896,8 @@
%endif
%endmacro
-FILTER_VER_CHROMA_S_AVX2_12x16 sp
-FILTER_VER_CHROMA_S_AVX2_12x16 ss
+ FILTER_VER_CHROMA_S_AVX2_12x16 sp
+ FILTER_VER_CHROMA_S_AVX2_12x16 ss
%macro FILTER_VER_CHROMA_S_AVX2_16x12 1
INIT_YMM avx2
@@ -16150,8 +16150,8 @@
%endif
%endmacro
-FILTER_VER_CHROMA_S_AVX2_16x12 sp
-FILTER_VER_CHROMA_S_AVX2_16x12 ss
+ FILTER_VER_CHROMA_S_AVX2_16x12 sp
+ FILTER_VER_CHROMA_S_AVX2_16x12 ss
%macro FILTER_VER_CHROMA_S_AVX2_16x4 1
INIT_YMM avx2
@@ -16196,8 +16196,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_16x4 sp
-FILTER_VER_CHROMA_S_AVX2_16x4 ss
+ FILTER_VER_CHROMA_S_AVX2_16x4 sp
+ FILTER_VER_CHROMA_S_AVX2_16x4 ss
%macro PROCESS_CHROMA_S_AVX2_W8_8R 1
movu xm0, [r0] ; m0 = row 0
@@ -16387,10 +16387,10 @@
%endif
%endmacro
-FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 32
-FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 16
-FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 32
-FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 16
+ FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 32
+ FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 16
+ FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 32
+ FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 16
%macro FILTER_VER_CHROMA_S_AVX2_8x2 1
INIT_YMM avx2
@@ -16462,8 +16462,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_8x2 sp
-FILTER_VER_CHROMA_S_AVX2_8x2 ss
+ FILTER_VER_CHROMA_S_AVX2_8x2 sp
+ FILTER_VER_CHROMA_S_AVX2_8x2 ss
%macro FILTER_VER_CHROMA_S_AVX2_8x6 1
INIT_YMM avx2
@@ -16605,8 +16605,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_8x6 sp
-FILTER_VER_CHROMA_S_AVX2_8x6 ss
+ FILTER_VER_CHROMA_S_AVX2_8x6 sp
+ FILTER_VER_CHROMA_S_AVX2_8x6 ss
%macro FILTER_VER_CHROMA_S_AVX2_8xN 2
INIT_YMM avx2
@@ -16927,10 +16927,10 @@
%endif
%endmacro
-FILTER_VER_CHROMA_S_AVX2_8xN sp, 16
-FILTER_VER_CHROMA_S_AVX2_8xN sp, 32
-FILTER_VER_CHROMA_S_AVX2_8xN ss, 16
-FILTER_VER_CHROMA_S_AVX2_8xN ss, 32
+ FILTER_VER_CHROMA_S_AVX2_8xN sp, 16
+ FILTER_VER_CHROMA_S_AVX2_8xN sp, 32
+ FILTER_VER_CHROMA_S_AVX2_8xN ss, 16
+ FILTER_VER_CHROMA_S_AVX2_8xN ss, 32
%macro FILTER_VER_CHROMA_S_AVX2_32x24 1
INIT_YMM avx2
@@ -16988,8 +16988,8 @@
%endif
%endmacro
-FILTER_VER_CHROMA_S_AVX2_32x24 sp
-FILTER_VER_CHROMA_S_AVX2_32x24 ss
+ FILTER_VER_CHROMA_S_AVX2_32x24 sp
+ FILTER_VER_CHROMA_S_AVX2_32x24 ss
%macro FILTER_VER_CHROMA_S_AVX2_2x8 1
INIT_YMM avx2
@@ -17087,8 +17087,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_2x8 sp
-FILTER_VER_CHROMA_S_AVX2_2x8 ss
+ FILTER_VER_CHROMA_S_AVX2_2x8 sp
+ FILTER_VER_CHROMA_S_AVX2_2x8 ss
%macro FILTER_VER_CHROMA_S_AVX2_6x8 1
INIT_YMM avx2
@@ -17275,8 +17275,8 @@
RET
%endmacro
-FILTER_VER_CHROMA_S_AVX2_6x8 sp
-FILTER_VER_CHROMA_S_AVX2_6x8 ss
+ FILTER_VER_CHROMA_S_AVX2_6x8 sp
+ FILTER_VER_CHROMA_S_AVX2_6x8 ss
;---------------------------------------------------------------------------------------------------------------------
; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -17321,10 +17321,10 @@
RET
%endmacro
-FILTER_VER_CHROMA_SS_W2_4R 2, 4
-FILTER_VER_CHROMA_SS_W2_4R 2, 8
-
-FILTER_VER_CHROMA_SS_W2_4R 2, 16
+ FILTER_VER_CHROMA_SS_W2_4R 2, 4
+ FILTER_VER_CHROMA_SS_W2_4R 2, 8
+
+ FILTER_VER_CHROMA_SS_W2_4R 2, 16
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -17437,9 +17437,9 @@
RET
%endmacro
-FILTER_VER_CHROMA_SS_W6_H4 6, 8
-
-FILTER_VER_CHROMA_SS_W6_H4 6, 16
+ FILTER_VER_CHROMA_SS_W6_H4 6, 8
+
+ FILTER_VER_CHROMA_SS_W6_H4 6, 16
;----------------------------------------------------------------------------------------------------------------
@@ -17484,15 +17484,15 @@
RET
%endmacro
-FILTER_VER_CHROMA_SS_W8_H2 8, 2
-FILTER_VER_CHROMA_SS_W8_H2 8, 4
-FILTER_VER_CHROMA_SS_W8_H2 8, 6
-FILTER_VER_CHROMA_SS_W8_H2 8, 8
-FILTER_VER_CHROMA_SS_W8_H2 8, 16
-FILTER_VER_CHROMA_SS_W8_H2 8, 32
-
-FILTER_VER_CHROMA_SS_W8_H2 8, 12
-FILTER_VER_CHROMA_SS_W8_H2 8, 64
+ FILTER_VER_CHROMA_SS_W8_H2 8, 2
+ FILTER_VER_CHROMA_SS_W8_H2 8, 4
+ FILTER_VER_CHROMA_SS_W8_H2 8, 6
+ FILTER_VER_CHROMA_SS_W8_H2 8, 8
+ FILTER_VER_CHROMA_SS_W8_H2 8, 16
+ FILTER_VER_CHROMA_SS_W8_H2 8, 32
+
+ FILTER_VER_CHROMA_SS_W8_H2 8, 12
+ FILTER_VER_CHROMA_SS_W8_H2 8, 64
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
@@ -17732,8 +17732,8 @@
RET
%endmacro
-FILTER_VER_LUMA_AVX2_4x4 sp
-FILTER_VER_LUMA_AVX2_4x4 ss
+ FILTER_VER_LUMA_AVX2_4x4 sp
+ FILTER_VER_LUMA_AVX2_4x4 ss
%macro FILTER_VER_LUMA_AVX2_4x8 1
INIT_YMM avx2
@@ -17878,8 +17878,8 @@
RET
%endmacro
-FILTER_VER_LUMA_AVX2_4x8 sp
-FILTER_VER_LUMA_AVX2_4x8 ss
+ FILTER_VER_LUMA_AVX2_4x8 sp
+ FILTER_VER_LUMA_AVX2_4x8 ss
%macro PROCESS_LUMA_AVX2_W4_16R 1
movq xm0, [r0]
@@ -18123,8 +18123,8 @@
RET
%endmacro
-FILTER_VER_LUMA_AVX2_4x16 sp
-FILTER_VER_LUMA_AVX2_4x16 ss
+ FILTER_VER_LUMA_AVX2_4x16 sp
+ FILTER_VER_LUMA_AVX2_4x16 ss
%macro FILTER_VER_LUMA_S_AVX2_8x8 1
INIT_YMM avx2
@@ -18346,8 +18346,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_S_AVX2_8x8 sp
-FILTER_VER_LUMA_S_AVX2_8x8 ss
+ FILTER_VER_LUMA_S_AVX2_8x8 sp
+ FILTER_VER_LUMA_S_AVX2_8x8 ss
%macro FILTER_VER_LUMA_S_AVX2_8xN 2
INIT_YMM avx2
@@ -18736,10 +18736,10 @@
%endif
%endmacro
-FILTER_VER_LUMA_S_AVX2_8xN sp, 16
-FILTER_VER_LUMA_S_AVX2_8xN sp, 32
-FILTER_VER_LUMA_S_AVX2_8xN ss, 16
-FILTER_VER_LUMA_S_AVX2_8xN ss, 32
+ FILTER_VER_LUMA_S_AVX2_8xN sp, 16
+ FILTER_VER_LUMA_S_AVX2_8xN sp, 32
+ FILTER_VER_LUMA_S_AVX2_8xN ss, 16
+ FILTER_VER_LUMA_S_AVX2_8xN ss, 32
%macro PROCESS_LUMA_S_AVX2_W8_4R 1
movu xm0, [r0] ; m0 = row 0
@@ -18882,8 +18882,8 @@
RET
%endmacro
-FILTER_VER_LUMA_S_AVX2_8x4 sp
-FILTER_VER_LUMA_S_AVX2_8x4 ss
+ FILTER_VER_LUMA_S_AVX2_8x4 sp
+ FILTER_VER_LUMA_S_AVX2_8x4 ss
%macro PROCESS_LUMA_AVX2_W8_16R 1
movu xm0, [r0] ; m0 = row 0
@@ -19278,12 +19278,12 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_Nx16 sp, 16
-FILTER_VER_LUMA_AVX2_Nx16 sp, 32
-FILTER_VER_LUMA_AVX2_Nx16 sp, 64
-FILTER_VER_LUMA_AVX2_Nx16 ss, 16
-FILTER_VER_LUMA_AVX2_Nx16 ss, 32
-FILTER_VER_LUMA_AVX2_Nx16 ss, 64
+ FILTER_VER_LUMA_AVX2_Nx16 sp, 16
+ FILTER_VER_LUMA_AVX2_Nx16 sp, 32
+ FILTER_VER_LUMA_AVX2_Nx16 sp, 64
+ FILTER_VER_LUMA_AVX2_Nx16 ss, 16
+ FILTER_VER_LUMA_AVX2_Nx16 ss, 32
+ FILTER_VER_LUMA_AVX2_Nx16 ss, 64
%macro FILTER_VER_LUMA_AVX2_NxN 3
INIT_YMM avx2
@@ -19337,24 +19337,24 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_NxN 16, 32, sp
-FILTER_VER_LUMA_AVX2_NxN 16, 64, sp
-FILTER_VER_LUMA_AVX2_NxN 24, 32, sp
-FILTER_VER_LUMA_AVX2_NxN 32, 32, sp
-FILTER_VER_LUMA_AVX2_NxN 32, 64, sp
-FILTER_VER_LUMA_AVX2_NxN 48, 64, sp
-FILTER_VER_LUMA_AVX2_NxN 64, 32, sp
-FILTER_VER_LUMA_AVX2_NxN 64, 48, sp
-FILTER_VER_LUMA_AVX2_NxN 64, 64, sp
-FILTER_VER_LUMA_AVX2_NxN 16, 32, ss
-FILTER_VER_LUMA_AVX2_NxN 16, 64, ss
-FILTER_VER_LUMA_AVX2_NxN 24, 32, ss
-FILTER_VER_LUMA_AVX2_NxN 32, 32, ss
-FILTER_VER_LUMA_AVX2_NxN 32, 64, ss
-FILTER_VER_LUMA_AVX2_NxN 48, 64, ss
-FILTER_VER_LUMA_AVX2_NxN 64, 32, ss
-FILTER_VER_LUMA_AVX2_NxN 64, 48, ss
-FILTER_VER_LUMA_AVX2_NxN 64, 64, ss
+ FILTER_VER_LUMA_AVX2_NxN 16, 32, sp
+ FILTER_VER_LUMA_AVX2_NxN 16, 64, sp
+ FILTER_VER_LUMA_AVX2_NxN 24, 32, sp
+ FILTER_VER_LUMA_AVX2_NxN 32, 32, sp
+ FILTER_VER_LUMA_AVX2_NxN 32, 64, sp
+ FILTER_VER_LUMA_AVX2_NxN 48, 64, sp
+ FILTER_VER_LUMA_AVX2_NxN 64, 32, sp
+ FILTER_VER_LUMA_AVX2_NxN 64, 48, sp
+ FILTER_VER_LUMA_AVX2_NxN 64, 64, sp
+ FILTER_VER_LUMA_AVX2_NxN 16, 32, ss
+ FILTER_VER_LUMA_AVX2_NxN 16, 64, ss
+ FILTER_VER_LUMA_AVX2_NxN 24, 32, ss
+ FILTER_VER_LUMA_AVX2_NxN 32, 32, ss
+ FILTER_VER_LUMA_AVX2_NxN 32, 64, ss
+ FILTER_VER_LUMA_AVX2_NxN 48, 64, ss
+ FILTER_VER_LUMA_AVX2_NxN 64, 32, ss
+ FILTER_VER_LUMA_AVX2_NxN 64, 48, ss
+ FILTER_VER_LUMA_AVX2_NxN 64, 64, ss
%macro FILTER_VER_LUMA_S_AVX2_12x16 1
INIT_YMM avx2
@@ -19392,8 +19392,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_S_AVX2_12x16 sp
-FILTER_VER_LUMA_S_AVX2_12x16 ss
+ FILTER_VER_LUMA_S_AVX2_12x16 sp
+ FILTER_VER_LUMA_S_AVX2_12x16 ss
%macro FILTER_VER_LUMA_S_AVX2_16x12 1
INIT_YMM avx2
@@ -19706,8 +19706,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_S_AVX2_16x12 sp
-FILTER_VER_LUMA_S_AVX2_16x12 ss
+ FILTER_VER_LUMA_S_AVX2_16x12 sp
+ FILTER_VER_LUMA_S_AVX2_16x12 ss
%macro FILTER_VER_LUMA_S_AVX2_16x4 1
INIT_YMM avx2
@@ -19754,8 +19754,8 @@
RET
%endmacro
-FILTER_VER_LUMA_S_AVX2_16x4 sp
-FILTER_VER_LUMA_S_AVX2_16x4 ss
+ FILTER_VER_LUMA_S_AVX2_16x4 sp
+ FILTER_VER_LUMA_S_AVX2_16x4 ss
%macro PROCESS_LUMA_S_AVX2_W8_8R 1
movu xm0, [r0] ; m0 = row 0
@@ -19991,10 +19991,10 @@
%endif
%endmacro
-FILTER_VER_LUMA_AVX2_Nx8 sp, 32
-FILTER_VER_LUMA_AVX2_Nx8 sp, 16
-FILTER_VER_LUMA_AVX2_Nx8 ss, 32
-FILTER_VER_LUMA_AVX2_Nx8 ss, 16
+ FILTER_VER_LUMA_AVX2_Nx8 sp, 32
+ FILTER_VER_LUMA_AVX2_Nx8 sp, 16
+ FILTER_VER_LUMA_AVX2_Nx8 ss, 32
+ FILTER_VER_LUMA_AVX2_Nx8 ss, 16
%macro FILTER_VER_LUMA_S_AVX2_32x24 1
INIT_YMM avx2
@@ -20054,8 +20054,8 @@
%endif
%endmacro
-FILTER_VER_LUMA_S_AVX2_32x24 sp
-FILTER_VER_LUMA_S_AVX2_32x24 ss
+ FILTER_VER_LUMA_S_AVX2_32x24 sp
+ FILTER_VER_LUMA_S_AVX2_32x24 ss
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_32x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
@@ -20122,7 +20122,7 @@
add r0, r1
dec r6d
jnz .loop
- RET
+ RET
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_16x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
@@ -20175,7 +20175,7 @@
add r0, r1
dec r6d
jnz .loop
- RET
+ RET
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_16xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
@@ -20311,11 +20311,11 @@
RET
%endmacro
-IPFILTER_CHROMA_PS_32xN_AVX2 32 , 16
-IPFILTER_CHROMA_PS_32xN_AVX2 32 , 24
-IPFILTER_CHROMA_PS_32xN_AVX2 32 , 8
-IPFILTER_CHROMA_PS_32xN_AVX2 32 , 64
-IPFILTER_CHROMA_PS_32xN_AVX2 32 , 48
+ IPFILTER_CHROMA_PS_32xN_AVX2 32 , 16
+ IPFILTER_CHROMA_PS_32xN_AVX2 32 , 24
+ IPFILTER_CHROMA_PS_32xN_AVX2 32 , 8
+ IPFILTER_CHROMA_PS_32xN_AVX2 32 , 64
+ IPFILTER_CHROMA_PS_32xN_AVX2 32 , 48
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-----------------------------------------------------------------------------------------------------------------------------
@@ -20398,7 +20398,7 @@
lea r2, [r2 + r3 * 2]
movhps [r2], xm3
.end
- RET
+ RET
cglobal interp_4tap_horiz_ps_4x2, 4,7,5
mov r4d, r4m
@@ -20467,7 +20467,7 @@
lea r2, [r2 + r3 * 2]
movhps [r2], xm3
.end
- RET
+ RET
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_4xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
@@ -20558,7 +20558,7 @@
lea r2, [r2 + r3 * 2]
movhps [r2], xm3
.end
-RET
+ RET
%endmacro
IPFILTER_CHROMA_PS_4xN_AVX2 4 , 8
@@ -20635,7 +20635,7 @@
vpermq m3, m3, 11011000b
movu [r2], xm3
.end
- RET
+ RET
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_4x2, 4,6,4
@@ -20730,11 +20730,11 @@
RET
%endmacro
-IPFILTER_CHROMA_PP_32xN_AVX2 32, 16
-IPFILTER_CHROMA_PP_32xN_AVX2 32, 24
-IPFILTER_CHROMA_PP_32xN_AVX2 32, 8
-IPFILTER_CHROMA_PP_32xN_AVX2 32, 64
-IPFILTER_CHROMA_PP_32xN_AVX2 32, 48
+ IPFILTER_CHROMA_PP_32xN_AVX2 32, 16
+ IPFILTER_CHROMA_PP_32xN_AVX2 32, 24
+ IPFILTER_CHROMA_PP_32xN_AVX2 32, 8
+ IPFILTER_CHROMA_PP_32xN_AVX2 32, 64
+ IPFILTER_CHROMA_PP_32xN_AVX2 32, 48
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_pp_8xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
@@ -20808,11 +20808,11 @@
RET
%endmacro
-IPFILTER_CHROMA_PP_8xN_AVX2 8 , 16
-IPFILTER_CHROMA_PP_8xN_AVX2 8 , 32
-IPFILTER_CHROMA_PP_8xN_AVX2 8 , 4
-IPFILTER_CHROMA_PP_8xN_AVX2 8 , 64
-IPFILTER_CHROMA_PP_8xN_AVX2 8 , 12
+ IPFILTER_CHROMA_PP_8xN_AVX2 8 , 16
+ IPFILTER_CHROMA_PP_8xN_AVX2 8 , 32
+ IPFILTER_CHROMA_PP_8xN_AVX2 8 , 4
+ IPFILTER_CHROMA_PP_8xN_AVX2 8 , 64
+ IPFILTER_CHROMA_PP_8xN_AVX2 8 , 12
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
@@ -20874,8 +20874,8 @@
RET
%endmacro
-IPFILTER_CHROMA_PP_4xN_AVX2 4 , 8
-IPFILTER_CHROMA_PP_4xN_AVX2 4 , 16
+ IPFILTER_CHROMA_PP_4xN_AVX2 4 , 8
+ IPFILTER_CHROMA_PP_4xN_AVX2 4 , 16
%macro IPFILTER_LUMA_PS_32xN_AVX2 2
INIT_YMM avx2
@@ -20972,11 +20972,11 @@
RET
%endmacro
-IPFILTER_LUMA_PS_32xN_AVX2 32 , 32
-IPFILTER_LUMA_PS_32xN_AVX2 32 , 16
-IPFILTER_LUMA_PS_32xN_AVX2 32 , 24
-IPFILTER_LUMA_PS_32xN_AVX2 32 , 8
-IPFILTER_LUMA_PS_32xN_AVX2 32 , 64
+ IPFILTER_LUMA_PS_32xN_AVX2 32 , 32
+ IPFILTER_LUMA_PS_32xN_AVX2 32 , 16
+ IPFILTER_LUMA_PS_32xN_AVX2 32 , 24
+ IPFILTER_LUMA_PS_32xN_AVX2 32 , 8
+ IPFILTER_LUMA_PS_32xN_AVX2 32 , 64
INIT_YMM avx2
cglobal interp_8tap_horiz_ps_48x64, 4, 7, 8
@@ -21301,12 +21301,12 @@
RET
%endmacro
-IPFILTER_CHROMA_PP_16xN_AVX2 16 , 8
-IPFILTER_CHROMA_PP_16xN_AVX2 16 , 32
-IPFILTER_CHROMA_PP_16xN_AVX2 16 , 12
-IPFILTER_CHROMA_PP_16xN_AVX2 16 , 4
-IPFILTER_CHROMA_PP_16xN_AVX2 16 , 64
-IPFILTER_CHROMA_PP_16xN_AVX2 16 , 24
+ IPFILTER_CHROMA_PP_16xN_AVX2 16 , 8
+ IPFILTER_CHROMA_PP_16xN_AVX2 16 , 32
+ IPFILTER_CHROMA_PP_16xN_AVX2 16 , 12
+ IPFILTER_CHROMA_PP_16xN_AVX2 16 , 4
+ IPFILTER_CHROMA_PP_16xN_AVX2 16 , 64
+ IPFILTER_CHROMA_PP_16xN_AVX2 16 , 24
%macro IPFILTER_LUMA_PS_64xN_AVX2 1
INIT_YMM avx2
@@ -21444,10 +21444,10 @@
RET
%endmacro
-IPFILTER_LUMA_PS_64xN_AVX2 64
-IPFILTER_LUMA_PS_64xN_AVX2 48
-IPFILTER_LUMA_PS_64xN_AVX2 32
-IPFILTER_LUMA_PS_64xN_AVX2 16
+ IPFILTER_LUMA_PS_64xN_AVX2 64
+ IPFILTER_LUMA_PS_64xN_AVX2 48
+ IPFILTER_LUMA_PS_64xN_AVX2 32
+ IPFILTER_LUMA_PS_64xN_AVX2 16
;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_8xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
@@ -21518,7 +21518,7 @@
vpermq m3, m3, 11011000b
movu [r2], xm3
.end
- RET
+ RET
%endmacro
IPFILTER_CHROMA_PS_8xN_AVX2 2
@@ -22171,7 +22171,7 @@
add r0, 16
dec r9d
jnz .loopW
-RET
+ RET
%endif
INIT_YMM avx2
@@ -22467,7 +22467,7 @@
RET
%endmacro
-IPFILTER_CHROMA_PP_64xN_AVX2 64
-IPFILTER_CHROMA_PP_64xN_AVX2 32
-IPFILTER_CHROMA_PP_64xN_AVX2 48
-IPFILTER_CHROMA_PP_64xN_AVX2 16
+ IPFILTER_CHROMA_PP_64xN_AVX2 64
+ IPFILTER_CHROMA_PP_64xN_AVX2 32
+ IPFILTER_CHROMA_PP_64xN_AVX2 48
+ IPFILTER_CHROMA_PP_64xN_AVX2 16
More information about the x265-devel
mailing list