[x265] [PATCH 1 of 4] asm: fix Main12 assembly errors and disable faulty functions; assembly now works up to AVX
Min Chen
chenm003 at 163.com
Wed Jul 22 01:20:02 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1437514211 25200
# Node ID ab2c34d6ad913369fd8feb84aee10030ffaa0df5
# Parent 46152345eb6ff261fd90272f7a0712300d6324c0
asm: fix Main12 assembly errors and disable faulty functions; assembly now works up to AVX
---
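Notes (after the fold, not part of the commit): the hard-coded shifts and
round constants replaced throughout ipfilter16.asm all derive from two fixed
precisions, so a small scalar sketch makes the new INTERP_* defines easy to
verify. The names IF_FILTER_PREC / IF_INTERNAL_PREC / IF_INTERNAL_OFFS and
the derive() helper below are illustrative assumptions, not x265 API; only
the numeric results are taken from the %define blocks in this patch. The
ADDAVG_FACTOR/ADDAVG_ROUND pairs added to mc-a.asm (256/128, 1024/512,
4096/2048 for 8/10/12 bit) follow the same bit-depth parameterization.

    /* sketch: derive the per-bit-depth interpolation shifts and offsets */
    #include <stdio.h>

    enum { IF_FILTER_PREC = 6, IF_INTERNAL_PREC = 14, IF_INTERNAL_OFFS = 8192 };

    static void derive(int bitDepth)
    {
        int headRoom = IF_INTERNAL_PREC - bitDepth;
        int shiftPS  = IF_FILTER_PREC - headRoom;   /* 2 at 10-bit, 4 at 12-bit  */
        int shiftSP  = IF_FILTER_PREC + headRoom;   /* 10 at 10-bit, 8 at 12-bit */
        int offsPS   = -(IF_INTERNAL_OFFS << shiftPS);   /* pd_n32768 / pd_n131072 */
        int offsSP   = (1 << (shiftSP - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
                                                         /* pd_524800 / pd_524416  */
        printf("%d-bit: PS shift %d offset %d, SP shift %d offset %d\n",
               bitDepth, shiftPS, offsPS, shiftSP, offsSP);
    }

    int main(void)
    {
        derive(10);
        derive(12);
        return 0;
    }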
source/common/x86/asm-primitives.cpp | 12 +
source/common/x86/const-a.asm | 1 +
source/common/x86/intrapred16.asm | 42 +-
source/common/x86/ipfilter16.asm | 1199 +++++++++++++++++-----------------
source/common/x86/loopfilter.asm | 48 +-
source/common/x86/mc-a.asm | 176 +++---
source/common/x86/pixel-util8.asm | 24 +-
7 files changed, 756 insertions(+), 746 deletions(-)
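The intrapred16.asm hunks below are the core Main12 fix: psraw becomes psrlw
and the word-wise phaddw reduction becomes pmaddwd/phaddd, because with
12-bit input the 16-bit word intermediates (see the new "dynamic range"
comments) can occupy all 16 bits. A minimal sketch of that failure mode in
scalar C follows; the values are chosen here to hit the sign bit and are not
taken from the encoder:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* 16 Main12 neighbours of 4095 summed into one 16-bit lane:
         * 16 * 4095 = 65520 still fits unsigned, but the sign bit is set */
        uint16_t sum = 16 * 4095;

        uint16_t logical = (uint16_t)(sum >> 2);   /* psrlw: 16380, correct */

        int16_t s;
        memcpy(&s, &sum, sizeof(s));               /* SIMD lanes are just bits */
        int16_t arithmetic = (int16_t)(s >> 2);    /* psraw: -4, sign smeared */

        printf("psrlw-style %u, psraw-style %d\n", logical, arithmetic);

        /* one more 16-bit add would need 17 bits and wrap, hence the switch
         * to pmaddwd: widen to 32-bit dwords before accumulating further */
        uint32_t widened = (uint32_t)sum + sum;
        printf("after widening: %u\n", widened);
        return 0;
    }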
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 21 14:30:11 2015 -0700
@@ -1043,7 +1043,9 @@
// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
ALL_LUMA_PU(satd, pixel_satd, ssse3);
+#if X265_DEPTH <= 10
ASSIGN_SA8D(ssse3);
+#endif
INTRA_ANG_SSSE3(ssse3);
p.dst4x4 = PFX(dst4_ssse3);
@@ -1126,14 +1128,18 @@
// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4); fails tests
ALL_LUMA_PU(satd, pixel_satd, sse4);
+#if X265_DEPTH <= 10
ASSIGN_SA8D(sse4);
+#endif
p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
+#if X265_DEPTH <= 10
ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
+#endif
ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
INTRA_ANG_SSE4_COMMON(sse4);
INTRA_ANG_SSE4_HIGH(sse4);
@@ -1147,7 +1153,9 @@
// TODO: check POPCNT flag!
ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
+#if X265_DEPTH <= 10
ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
+#endif
ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
@@ -1184,7 +1192,9 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
+#if X265_DEPTH <= 10
ASSIGN_SA8D(avx);
+#endif
p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
@@ -1292,7 +1302,9 @@
{
//p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_xop); this one is broken
ALL_LUMA_PU(satd, pixel_satd, xop);
+#if X265_DEPTH <= 10
ASSIGN_SA8D(xop);
+#endif
LUMA_VAR(xop);
p.frameInitLowres = PFX(frame_init_lowres_core_xop);
}
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/const-a.asm Tue Jul 21 14:30:11 2015 -0700
@@ -79,6 +79,7 @@
const pw_512, times 16 dw 512
const pw_1023, times 16 dw 1023
const pw_1024, times 16 dw 1024
+const pw_2048, times 16 dw 2048
const pw_4096, times 16 dw 4096
const pw_8192, times 8 dw 8192
const pw_00ff, times 16 dw 0x00ff
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/intrapred16.asm Tue Jul 21 14:30:11 2015 -0700
@@ -1748,7 +1748,7 @@
; filter top
movu m1, [r2]
paddw m1, m0
- psraw m1, 2
+ psrlw m1, 2
movh [r0], m1 ; overwrite top-left pixel, we will update it later
; filter top-left
@@ -1763,7 +1763,7 @@
lea r0, [r0 + r1 * 2]
movu m1, [r3 + 2]
paddw m1, m0
- psraw m1, 2
+ psrlw m1, 2
movd r3d, m1
mov [r0], r3w
shr r3d, 16
@@ -1872,7 +1872,7 @@
; filter top
movu m0, [r2]
paddw m0, m1
- psraw m0, 2
+ psrlw m0, 2
movu [r6], m0
; filter top-left
@@ -1887,7 +1887,7 @@
add r6, r1
movu m0, [r3 + 2]
paddw m0, m1
- psraw m0, 2
+ psrlw m0, 2
pextrw [r6], m0, 0
pextrw [r6 + r1], m0, 1
pextrw [r6 + r1 * 2], m0, 2
@@ -1913,13 +1913,13 @@
movu m2, [r2]
movu m3, [r2 + 16]
- paddw m0, m1
+ paddw m0, m1 ; dynamic range 13 bits
paddw m2, m3
- paddw m0, m2
- movhlps m1, m0
- paddw m0, m1
- phaddw m0, m0
+ paddw m0, m2 ; dynamic range 14 bits
+ movhlps m1, m0 ; dynamic range 15 bits
+ paddw m0, m1 ; dynamic range 16 bits
pmaddwd m0, [pw_1]
+ phaddd m0, m0
movd r5d, m0
add r5d, 16
@@ -1983,11 +1983,11 @@
; filter top
movu m2, [r2]
paddw m2, m1
- psraw m2, 2
+ psrlw m2, 2
movu [r6], m2
movu m3, [r2 + 16]
paddw m3, m1
- psraw m3, 2
+ psrlw m3, 2
movu [r6 + 16], m3
; filter top-left
@@ -2002,7 +2002,7 @@
add r6, r1
movu m2, [r3 + 2]
paddw m2, m1
- psraw m2, 2
+ psrlw m2, 2
pextrw [r6], m2, 0
pextrw [r6 + r1], m2, 1
@@ -2019,7 +2019,7 @@
lea r6, [r6 + r1 * 2]
movu m3, [r3 + 18]
paddw m3, m1
- psraw m3, 2
+ psrlw m3, 2
pextrw [r6], m3, 0
pextrw [r6 + r1], m3, 1
@@ -2046,21 +2046,21 @@
movu m1, [r3 + 16]
movu m2, [r3 + 32]
movu m3, [r3 + 48]
- paddw m0, m1
+ paddw m0, m1 ; dynamic range 13 bits
paddw m2, m3
- paddw m0, m2
+ paddw m0, m2 ; dynamic range 14 bits
movu m1, [r2]
movu m3, [r2 + 16]
movu m4, [r2 + 32]
movu m5, [r2 + 48]
- paddw m1, m3
+ paddw m1, m3 ; dynamic range 13 bits
paddw m4, m5
- paddw m1, m4
- paddw m0, m1
+ paddw m1, m4 ; dynamic range 14 bits
+ paddw m0, m1 ; dynamic range 15 bits
+ pmaddwd m0, [pw_1]
movhlps m1, m0
- paddw m0, m1
- phaddw m0, m0
- pmaddwd m0, [pw_1]
+ paddd m0, m1
+ phaddd m0, m0
paddd m0, [pd_32] ; sum = sum + 32
psrld m0, 6 ; sum = sum / 64
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/ipfilter16.asm Tue Jul 21 14:30:11 2015 -0700
@@ -26,6 +26,25 @@
%include "x86inc.asm"
%include "x86util.asm"
+
+%define INTERP_OFFSET_PP pd_32
+%define INTERP_SHIFT_PP 6
+
+%if BIT_DEPTH == 10
+ %define INTERP_SHIFT_PS 2
+ %define INTERP_OFFSET_PS pd_n32768
+ %define INTERP_SHIFT_SP 10
+ %define INTERP_OFFSET_SP pd_524800
+%elif BIT_DEPTH == 12
+ %define INTERP_SHIFT_PS 4
+ %define INTERP_OFFSET_PS pd_n131072
+ %define INTERP_SHIFT_SP 8
+ %define INTERP_OFFSET_SP pd_524416
+%else
+ %error Unsupported bit depth!
+%endif
+
+
SECTION_RODATA 32
tab_c_32: times 8 dd 32
@@ -145,21 +164,9 @@
const pb_shuf, db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
-%if BIT_DEPTH == 10
- %define INTERP_OFFSET_PS pd_n32768
- %define INTERP_SHIFT_PS 2
- %define INTERP_OFFSET_SP pd_524800
- %define INTERP_SHIFT_SP 10
-%elif BIT_DEPTH == 12
- %define INTERP_OFFSET_PS pd_n131072
- %define INTERP_SHIFT_PS 4
- %define INTERP_OFFSET_SP pd_524416
- %define INTERP_SHIFT_SP 8
-%else
- %error Unsupport bit depth!
-%endif
SECTION .text
+cextern pd_8
cextern pd_32
cextern pw_pixel_max
cextern pd_524416
@@ -503,7 +510,7 @@
%endif
%ifidn %1,pp
- mova m7, [pd_32]
+ mova m7, [INTERP_OFFSET_PP]
%define SHIFT 6
%elifidn %1,ps
mova m7, [INTERP_OFFSET_PS]
@@ -1176,7 +1183,6 @@
%macro FILTER_HOR_LUMA_W4 3
INIT_XMM sse4
cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
-
mov r4d, r4m
sub r0, 6
shl r4d, 4
@@ -1229,7 +1235,7 @@
packusdw m4, m4
CLIPW m4, m6, m7
%else
- psrad m4, 2
+ psrad m4, INTERP_SHIFT_PS
packssdw m4, m4
%endif
@@ -1287,7 +1293,7 @@
mov r4d, %2
%ifidn %3, ps
cmp r5m, byte 0
- je .loopH
+ je .loopH
lea r6, [r1 + 2 * r1]
sub r0, r6
add r4d, 7
@@ -1329,8 +1335,8 @@
packusdw m4, m5
CLIPW m4, m7, [pw_pixel_max]
%else
- psrad m4, 2
- psrad m5, 2
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m4, m5
%endif
@@ -1340,7 +1346,7 @@
add r2, r3
dec r4d
- jnz .loopH
+ jnz .loopH
RET
%endmacro
@@ -1380,7 +1386,7 @@
mova m0, [tab_LumaCoeff + r4]
%endif
%ifidn %3, pp
- mova m1, [pd_32]
+ mova m1, [INTERP_OFFSET_PP]
%else
mova m1, [INTERP_OFFSET_PS]
%endif
@@ -1425,14 +1431,14 @@
phaddd m5, m6
paddd m5, m1
%ifidn %3, pp
- psrad m4, 6
- psrad m5, 6
+ psrad m4, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m4, m5
pxor m5, m5
CLIPW m4, m5, [pw_pixel_max]
%else
- psrad m4, 2
- psrad m5, 2
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m4, m5
%endif
@@ -1453,12 +1459,12 @@
phaddd m4, m5
paddd m4, m1
%ifidn %3, pp
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
pxor m5, m5
CLIPW m4, m5, [pw_pixel_max]
%else
- psrad m4, 2
+ psrad m4, INTERP_SHIFT_PS
packssdw m4, m4
%endif
@@ -1550,14 +1556,14 @@
phaddd m5, m6
paddd m5, m1
%ifidn %3, pp
- psrad m4, 6
- psrad m5, 6
+ psrad m4, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m4, m5
pxor m5, m5
CLIPW m4, m5, [pw_pixel_max]
%else
- psrad m4, 2
- psrad m5, 2
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m4, m5
%endif
movu [r2 + x], m4
@@ -1591,14 +1597,14 @@
phaddd m5, m6
paddd m5, m1
%ifidn %3, pp
- psrad m4, 6
- psrad m5, 6
+ psrad m4, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m4, m5
pxor m5, m5
CLIPW m4, m5, [pw_pixel_max]
%else
- psrad m4, 2
- psrad m5, 2
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m4, m5
%endif
movu [r2 + 16 + x], m4
@@ -1743,14 +1749,14 @@
phaddd m5, m6
paddd m5, m1
%ifidn %3, pp
- psrad m4, 6
- psrad m5, 6
+ psrad m4, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m4, m5
pxor m5, m5
CLIPW m4, m5, [pw_pixel_max]
%else
- psrad m4, 2
- psrad m5, 2
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m4, m5
%endif
movu [r2], m4
@@ -1784,14 +1790,14 @@
phaddd m5, m6
paddd m5, m1
%ifidn %3, pp
- psrad m4, 6
- psrad m5, 6
+ psrad m4, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m4, m5
pxor m5, m5
CLIPW m4, m5, [pw_pixel_max]
%else
- psrad m4, 2
- psrad m5, 2
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m4, m5
%endif
movu [r2 + 16], m4
@@ -1825,14 +1831,14 @@
phaddd m5, m6
paddd m5, m1
%ifidn %3, pp
- psrad m4, 6
- psrad m5, 6
+ psrad m4, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m4, m5
pxor m5, m5
CLIPW m4, m5, [pw_pixel_max]
%else
- psrad m4, 2
- psrad m5, 2
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m4, m5
%endif
movu [r2 + 32], m4
@@ -1865,11 +1871,11 @@
phaddd m3, m4
paddd m3, m1
%ifidn %1, pp
- psrad m3, 6
+ psrad m3, INTERP_SHIFT_PP
packusdw m3, m3
CLIPW m3, m7, m6
%else
- psrad m3, 2
+ psrad m3, INTERP_SHIFT_PS
packssdw m3, m3
%endif
movd [r2], m3
@@ -1895,13 +1901,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m7, m6
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2], m3
@@ -1950,7 +1956,7 @@
phaddd m4, m4
vpermq m4, m4, q3120
paddd m4, m6
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -1969,7 +1975,7 @@
phaddd m4, m4
vpermq m4, m4, q3120
paddd m4, m6
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2036,7 +2042,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2064,7 +2070,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2132,7 +2138,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2160,7 +2166,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2232,7 +2238,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2260,7 +2266,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2335,7 +2341,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2363,7 +2369,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2425,7 +2431,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2453,7 +2459,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2481,7 +2487,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2545,7 +2551,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2573,7 +2579,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2601,7 +2607,7 @@
phaddd m4, m5
vpermq m4, m4, q3120
paddd m4, m7
- psrad m4, 6
+ psrad m4, INTERP_SHIFT_PP
packusdw m4, m4
vpermq m4, m4, q2020
@@ -2644,32 +2650,32 @@
mova m1, [INTERP_OFFSET_PS]
cmp r5m, byte 0
je .skip
- sub r0, r1
- movu m3, [r0]
- pshufb m3, m3, m2
- pmaddwd m3, m0
-
- %if %1 == 4
- movu m4, [r0 + 4]
- pshufb m4, m4, m2
- pmaddwd m4, m0
- phaddd m3, m4
- %else
- phaddd m3, m3
- %endif
-
- paddd m3, m1
- psrad m3, INTERP_SHIFT_PS
- packssdw m3, m3
-
- %if %1 == 2
- movd [r2], m3
- %else
- movh [r2], m3
- %endif
-
- add r0, r1
- add r2, r3
+ sub r0, r1
+ movu m3, [r0]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+
+ %if %1 == 4
+ movu m4, [r0 + 4]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ %else
+ phaddd m3, m3
+ %endif
+
+ paddd m3, m1
+ psrad m3, INTERP_SHIFT_PS
+ packssdw m3, m3
+
+ %if %1 == 2
+ movd [r2], m3
+ %else
+ movh [r2], m3
+ %endif
+
+ add r0, r1
+ add r2, r3
FILTER_W%1_2 %3
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
@@ -2689,7 +2695,6 @@
lea r2, [r2 + 2 * r3]
FILTER_W%1_2 %3
%endrep
-
RET
%endmacro
@@ -2729,13 +2734,13 @@
phaddd m4, m4
paddd m4, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m4, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m4, INTERP_SHIFT_PP
packusdw m3, m4
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m4, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m4, INTERP_SHIFT_PS
packssdw m3, m4
%endif
movh [r2], m3
@@ -2769,13 +2774,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2], m3
@@ -2809,13 +2814,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2], m3
@@ -2831,11 +2836,11 @@
paddd m3, m1
%ifidn %1, pp
- psrad m3, 6
+ psrad m3, INTERP_SHIFT_PP
packusdw m3, m3
CLIPW m3, m6, m7
%else
- psrad m3, 2
+ psrad m3, INTERP_SHIFT_PS
packssdw m3, m3
%endif
movh [r2 + 16], m3
@@ -2868,13 +2873,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2], m3
@@ -2898,13 +2903,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2 + 16], m3
@@ -2938,13 +2943,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2], m3
@@ -2968,13 +2973,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2 + 16], m3
@@ -2998,13 +3003,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2 + 32], m3
@@ -3038,13 +3043,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2], m3
@@ -3068,13 +3073,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2 + 16], m3
@@ -3098,13 +3103,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2 + 32], m3
@@ -3128,13 +3133,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2 + 48], m3
@@ -3168,13 +3173,13 @@
phaddd m5, m4
paddd m5, m1
%ifidn %1, pp
- psrad m3, 6
- psrad m5, 6
+ psrad m3, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
packusdw m3, m5
CLIPW m3, m6, m7
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2 + %2], m3
@@ -3408,7 +3413,7 @@
pmaddwd m4, m0
phaddd m3, m4
paddd m3, m2
- psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
+ psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
packusdw m3, m3
vpermq m3, m3, q2020
@@ -3426,7 +3431,7 @@
pmaddwd m4, m0
phaddd m3, m4
paddd m3, m2
- psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
+ psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
packusdw m3, m3
vpermq m3, m3, q2020
@@ -3474,7 +3479,7 @@
pmaddwd m4, m0
phaddd m3, m4
paddd m3, m2
- psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
+ psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
packusdw m3, m3
vpermq m3, m3,q2020
@@ -3491,7 +3496,7 @@
pmaddwd m4, m0
phaddd m3, m4
paddd m3, m2
- psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
+ psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
packusdw m3, m3
vpermq m3, m3,q2020
@@ -4089,7 +4094,7 @@
%ifnidn %3, ps
mova m7, [pw_pixel_max]
%ifidn %3, pp
- mova m6, [tab_c_32]
+ mova m6, [INTERP_OFFSET_PP]
%else
mova m6, [INTERP_OFFSET_SP]
%endif
@@ -4129,10 +4134,10 @@
paddd m2, m6
paddd m3, m6
%ifidn %3, pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
%else
psrad m0, INTERP_SHIFT_SP
psrad m1, INTERP_SHIFT_SP
@@ -4344,9 +4349,9 @@
pxor m7, m7
mova m6, [pw_pixel_max]
%ifidn %2, pp
- mova m5, [tab_c_32]
+ mova m5, [INTERP_OFFSET_PP]
%else
- mova m5, [tab_c_524800]
+ mova m5, [INTERP_OFFSET_SP]
%endif
%else
mova m5, [INTERP_OFFSET_PS]
@@ -4362,18 +4367,18 @@
%elifidn %2, ps
paddd m0, m5
paddd m2, m5
- psrad m0, 2
- psrad m2, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
packssdw m0, m2
%else
paddd m0, m5
paddd m2, m5
%ifidn %2, pp
- psrad m0, 6
- psrad m2, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
%else
- psrad m0, 10
- psrad m2, 10
+ psrad m0, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
%endif
packusdw m0, m2
CLIPW m0, m7, m6
@@ -4389,7 +4394,6 @@
dec r4d
jnz .loopH
-
RET
%endmacro
@@ -4417,7 +4421,6 @@
%macro FILTER_VER_CHROMA_W4 3
INIT_XMM sse4
cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3
-
add r1d, r1d
add r3d, r3d
sub r0, r1
@@ -4439,9 +4442,9 @@
pxor m6, m6
mova m5, [pw_pixel_max]
%ifidn %2, pp
- mova m4, [tab_c_32]
+ mova m4, [INTERP_OFFSET_PP]
%else
- mova m4, [tab_c_524800]
+ mova m4, [INTERP_OFFSET_SP]
%endif
%else
mova m4, [INTERP_OFFSET_PS]
@@ -4479,18 +4482,18 @@
%elifidn %2, ps
paddd m0, m4
paddd m1, m4
- psrad m0, 2
- psrad m1, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
packssdw m0, m1
%else
paddd m0, m4
paddd m1, m4
%ifidn %2, pp
- psrad m0, 6
- psrad m1, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
%else
- psrad m0, 10
- psrad m1, 10
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
%endif
packusdw m0, m1
CLIPW m0, m6, m5
@@ -4504,7 +4507,6 @@
dec r4d
jnz .loop
%endif
-
RET
%endmacro
@@ -4524,7 +4526,6 @@
%macro FILTER_VER_CHROMA_W6 3
INIT_XMM sse4
cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3
-
add r1d, r1d
add r3d, r3d
sub r0, r1
@@ -4543,9 +4544,9 @@
%ifnidn %2, ps
mova m7, [pw_pixel_max]
%ifidn %2, pp
- mova m6, [tab_c_32]
+ mova m6, [INTERP_OFFSET_PP]
%else
- mova m6, [tab_c_524800]
+ mova m6, [INTERP_OFFSET_SP]
%endif
%else
mova m6, [INTERP_OFFSET_PS]
@@ -4568,10 +4569,10 @@
paddd m1, m6
paddd m2, m6
paddd m3, m6
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
packssdw m0, m1
packssdw m2, m3
@@ -4581,15 +4582,15 @@
paddd m2, m6
paddd m3, m6
%ifidn %2, pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
%else
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
%endif
packssdw m0, m1
packssdw m2, m3
@@ -4616,18 +4617,18 @@
%elifidn %2, ps
paddd m0, m6
paddd m2, m6
- psrad m0, 2
- psrad m2, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
packssdw m0, m2
%else
paddd m0, m6
paddd m2, m6
%ifidn %2, pp
- psrad m0, 6
- psrad m2, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
%else
- psrad m0, 10
- psrad m2, 10
+ psrad m0, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
%endif
packusdw m0, m2
CLIPW m0, m5, m7
@@ -4644,7 +4645,6 @@
dec r4d
jnz .loopH
-
RET
%endmacro
@@ -4712,7 +4712,7 @@
mov r4d, %2/2
%ifidn %3, pp
- mova m7, [tab_c_32]
+ mova m7, [INTERP_OFFSET_PP]
%elifidn %3, sp
mova m7, [INTERP_OFFSET_SP]
%elifidn %3, ps
@@ -4748,10 +4748,10 @@
paddd m2, m7
paddd m3, m7
%ifidn %3, pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
%else
psrad m0, INTERP_SHIFT_SP
psrad m1, INTERP_SHIFT_SP
@@ -4772,7 +4772,6 @@
dec r4d
jnz .loopH
-
RET
%endmacro
@@ -4868,9 +4867,9 @@
mov r6d, %1/4
%ifidn %2,pp
- vbroadcasti128 m8, [pd_32]
+ vbroadcasti128 m8, [INTERP_OFFSET_PP]
%elifidn %2, sp
- mova m8, [pd_524800]
+ mova m8, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m8, [INTERP_OFFSET_PS]
%endif
@@ -4934,20 +4933,20 @@
paddd m2, m8
paddd m3, m8
%ifidn %2,pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
%elifidn %2, sp
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
-%else
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
+%else
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
%endif
%endif
@@ -5012,9 +5011,9 @@
mov r4d, %1/2
%ifidn %2, pp
- mova m7, [tab_c_32]
+ mova m7, [INTERP_OFFSET_PP]
%elifidn %2, sp
- mova m7, [pd_524800]
+ mova m7, [INTERP_OFFSET_SP]
%elifidn %2, ps
mova m7, [INTERP_OFFSET_PS]
%endif
@@ -5034,10 +5033,10 @@
paddd m1, m7
paddd m2, m7
paddd m3, m7
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
packssdw m0, m1
packssdw m2, m3
@@ -5047,15 +5046,15 @@
paddd m2, m7
paddd m3, m7
%ifidn %2, pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
-%else
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
+%else
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
%endif
packssdw m0, m1
packssdw m2, m3
@@ -5184,9 +5183,9 @@
mov r4d, %1/2
%ifidn %2, pp
- mova m7, [tab_c_32]
+ mova m7, [INTERP_OFFSET_PP]
%elifidn %2, sp
- mova m7, [pd_524800]
+ mova m7, [INTERP_OFFSET_SP]
%elifidn %2, ps
mova m7, [INTERP_OFFSET_PS]
%endif
@@ -5213,18 +5212,18 @@
paddd m1, m7
paddd m2, m7
paddd m3, m7
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
paddd m8, m7
paddd m9, m7
paddd m10, m7
paddd m11, m7
- psrad m8, 2
- psrad m9, 2
- psrad m10, 2
- psrad m11, 2
+ psrad m8, INTERP_SHIFT_PS
+ psrad m9, INTERP_SHIFT_PS
+ psrad m10, INTERP_SHIFT_PS
+ psrad m11, INTERP_SHIFT_PS
packssdw m0, m1
packssdw m2, m3
@@ -5240,23 +5239,23 @@
paddd m10, m7
paddd m11, m7
%ifidn %2, pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
- psrad m8, 6
- psrad m9, 6
- psrad m10, 6
- psrad m11, 6
-%else
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
- psrad m8, 10
- psrad m9, 10
- psrad m10, 10
- psrad m11, 10
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
+ psrad m8, INTERP_SHIFT_PP
+ psrad m9, INTERP_SHIFT_PP
+ psrad m10, INTERP_SHIFT_PP
+ psrad m11, INTERP_SHIFT_PP
+%else
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
+ psrad m8, INTERP_SHIFT_SP
+ psrad m9, INTERP_SHIFT_SP
+ psrad m10, INTERP_SHIFT_SP
+ psrad m11, INTERP_SHIFT_SP
%endif
packssdw m0, m1
packssdw m2, m3
@@ -5326,9 +5325,9 @@
mov r4d, %1/2
%ifidn %2, pp
- mova m7, [tab_c_32]
+ mova m7, [INTERP_OFFSET_PP]
%elifidn %2, sp
- mova m7, [pd_524800]
+ mova m7, [INTERP_OFFSET_SP]
%elifidn %2, ps
mova m7, [INTERP_OFFSET_PS]
%endif
@@ -5380,10 +5379,10 @@
paddd m1, m7
paddd m2, m7
paddd m3, m7
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
packssdw m0, m1
packssdw m2, m3
@@ -5393,15 +5392,15 @@
paddd m2, m7
paddd m3, m7
%ifidn %2, pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
-%else
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
+%else
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
%endif
packssdw m0, m1
packssdw m2, m3
@@ -5457,9 +5456,9 @@
mov r4d, %1/2
%ifidn %2, pp
- mova m7, [tab_c_32]
+ mova m7, [INTERP_OFFSET_PP]
%elifidn %2, sp
- mova m7, [pd_524800]
+ mova m7, [INTERP_OFFSET_SP]
%elifidn %2, ps
mova m7, [INTERP_OFFSET_PS]
%endif
@@ -5479,10 +5478,10 @@
paddd m1, m7
paddd m2, m7
paddd m3, m7
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
packssdw m0, m1
packssdw m2, m3
@@ -5492,15 +5491,15 @@
paddd m2, m7
paddd m3, m7
%ifidn %2, pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
-%else
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
+%else
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
%endif
packssdw m0, m1
packssdw m2, m3
@@ -5610,9 +5609,9 @@
mov r4d, %1/2
%ifidn %2, pp
- mova m7, [tab_c_32]
+ mova m7, [INTERP_OFFSET_PP]
%elifidn %2, sp
- mova m7, [pd_524800]
+ mova m7, [INTERP_OFFSET_SP]
%elifidn %2, ps
mova m7, [INTERP_OFFSET_PS]
%endif
@@ -5639,18 +5638,18 @@
paddd m1, m7
paddd m2, m7
paddd m3, m7
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
paddd m8, m7
paddd m9, m7
paddd m10, m7
paddd m11, m7
- psrad m8, 2
- psrad m9, 2
- psrad m10, 2
- psrad m11, 2
+ psrad m8, INTERP_SHIFT_PS
+ psrad m9, INTERP_SHIFT_PS
+ psrad m10, INTERP_SHIFT_PS
+ psrad m11, INTERP_SHIFT_PS
packssdw m0, m1
packssdw m2, m3
@@ -5666,23 +5665,23 @@
paddd m10, m7
paddd m11, m7
%ifidn %2, pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
- psrad m8, 6
- psrad m9, 6
- psrad m10, 6
- psrad m11, 6
-%else
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
- psrad m8, 10
- psrad m9, 10
- psrad m10, 10
- psrad m11, 10
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
+ psrad m8, INTERP_SHIFT_PP
+ psrad m9, INTERP_SHIFT_PP
+ psrad m10, INTERP_SHIFT_PP
+ psrad m11, INTERP_SHIFT_PP
+%else
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
+ psrad m8, INTERP_SHIFT_SP
+ psrad m9, INTERP_SHIFT_SP
+ psrad m10, INTERP_SHIFT_SP
+ psrad m11, INTERP_SHIFT_SP
%endif
packssdw m0, m1
packssdw m2, m3
@@ -5733,9 +5732,9 @@
mov r4d, 32
%ifidn %1, pp
- mova m7, [tab_c_32]
+ mova m7, [INTERP_OFFSET_PP]
%elifidn %1, sp
- mova m7, [pd_524800]
+ mova m7, [INTERP_OFFSET_SP]
%elifidn %1, ps
mova m7, [INTERP_OFFSET_PS]
%endif
@@ -5787,10 +5786,10 @@
paddd m1, m7
paddd m2, m7
paddd m3, m7
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
packssdw m0, m1
packssdw m2, m3
@@ -5800,15 +5799,15 @@
paddd m2, m7
paddd m3, m7
%ifidn %1, pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
-%else
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
+%else
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
%endif
packssdw m0, m1
packssdw m2, m3
@@ -5827,6 +5826,7 @@
jnz .loopH
RET
%endmacro
+
FILTER_VER_CHROMA_W16_48x64_avx2 pp, 8
FILTER_VER_CHROMA_W16_48x64_avx2 ps, 8
FILTER_VER_CHROMA_W16_48x64_avx2 ss, 7
@@ -5834,7 +5834,6 @@
INIT_XMM sse2
cglobal chroma_p2s, 3, 7, 3
-
; load width and height
mov r3d, r3m
mov r4d, r4m
@@ -5850,11 +5849,11 @@
lea r6, [r0 + r5 * 2]
movu m0, [r6]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
paddw m0, m2
movu m1, [r6 + r1]
- psllw m1, 4
+ psllw m1, (14 - BIT_DEPTH)
paddw m1, m2
add r5d, 8
@@ -5887,7 +5886,6 @@
sub r4d, 2
jnz .loopH
-
RET
%macro PROCESS_LUMA_VER_W4_4R 0
@@ -5975,7 +5973,7 @@
lea r6, [tab_LumaCoeffV + r4]
%endif
- mova m7, [pd_32]
+ mova m7, [INTERP_OFFSET_PP]
mov dword [rsp], %2/4
.loopH:
@@ -5988,10 +5986,10 @@
paddd m2, m7
paddd m3, m7
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
packssdw m0, m1
packssdw m2, m3
@@ -6017,7 +6015,6 @@
dec dword [rsp]
jnz .loopH
-
RET
%endmacro
@@ -6126,14 +6123,14 @@
paddd m0, m6
paddd m2, m6
%ifidn %1,pp
- psrad m0, 6
- psrad m2, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m0, 10
- psrad m2, 10
-%else
- psrad m0, 2
- psrad m2, 2
+ psrad m0, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+%else
+ psrad m0, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
%endif
%endif
@@ -6294,20 +6291,20 @@
paddd m2, m11
paddd m3, m11
%ifidn %1,pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
-%else
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
+%else
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
%endif
%endif
@@ -6365,20 +6362,20 @@
paddd m6, m11
paddd m7, m11
%ifidn %1,pp
- psrad m4, 6
- psrad m5, 6
- psrad m6, 6
- psrad m7, 6
+ psrad m4, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
+ psrad m6, INTERP_SHIFT_PP
+ psrad m7, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m4, 10
- psrad m5, 10
- psrad m6, 10
- psrad m7, 10
-%else
- psrad m4, 2
- psrad m5, 2
- psrad m6, 2
- psrad m7, 2
+ psrad m4, INTERP_SHIFT_SP
+ psrad m5, INTERP_SHIFT_SP
+ psrad m6, INTERP_SHIFT_SP
+ psrad m7, INTERP_SHIFT_SP
+%else
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
+ psrad m6, INTERP_SHIFT_PS
+ psrad m7, INTERP_SHIFT_PS
%endif
%endif
@@ -6538,26 +6535,26 @@
paddd m4, m14
paddd m5, m14
%ifidn %1,pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
- psrad m4, 6
- psrad m5, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
+ psrad m4, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
- psrad m4, 10
- psrad m5, 10
-%else
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
- psrad m4, 2
- psrad m5, 2
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
+ psrad m4, INTERP_SHIFT_SP
+ psrad m5, INTERP_SHIFT_SP
+%else
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
%endif
%endif
@@ -6620,14 +6617,14 @@
paddd m6, m14
paddd m7, m14
%ifidn %1,pp
- psrad m6, 6
- psrad m7, 6
+ psrad m6, INTERP_SHIFT_PP
+ psrad m7, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m6, 10
- psrad m7, 10
-%else
- psrad m6, 2
- psrad m7, 2
+ psrad m6, INTERP_SHIFT_SP
+ psrad m7, INTERP_SHIFT_SP
+%else
+ psrad m6, INTERP_SHIFT_PS
+ psrad m7, INTERP_SHIFT_PS
%endif
%endif
@@ -6734,32 +6731,32 @@
paddd m0, m14
paddd m1, m14
%ifidn %1,pp
- psrad m8, 6
- psrad m9, 6
- psrad m10, 6
- psrad m11, 6
- psrad m12, 6
- psrad m13, 6
- psrad m0, 6
- psrad m1, 6
+ psrad m8, INTERP_SHIFT_PP
+ psrad m9, INTERP_SHIFT_PP
+ psrad m10, INTERP_SHIFT_PP
+ psrad m11, INTERP_SHIFT_PP
+ psrad m12, INTERP_SHIFT_PP
+ psrad m13, INTERP_SHIFT_PP
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m8, 10
- psrad m9, 10
- psrad m10, 10
- psrad m11, 10
- psrad m12, 10
- psrad m13, 10
- psrad m0, 10
- psrad m1, 10
-%else
- psrad m8, 2
- psrad m9, 2
- psrad m10, 2
- psrad m11, 2
- psrad m12, 2
- psrad m13, 2
- psrad m0, 2
- psrad m1, 2
+ psrad m8, INTERP_SHIFT_SP
+ psrad m9, INTERP_SHIFT_SP
+ psrad m10, INTERP_SHIFT_SP
+ psrad m11, INTERP_SHIFT_SP
+ psrad m12, INTERP_SHIFT_SP
+ psrad m13, INTERP_SHIFT_SP
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+%else
+ psrad m8, INTERP_SHIFT_PS
+ psrad m9, INTERP_SHIFT_PS
+ psrad m10, INTERP_SHIFT_PS
+ psrad m11, INTERP_SHIFT_PS
+ psrad m12, INTERP_SHIFT_PS
+ psrad m13, INTERP_SHIFT_PS
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
%endif
%endif
@@ -6819,7 +6816,7 @@
%ifidn %1,pp
vbroadcasti128 m14, [pd_32]
%elifidn %1, sp
- mova m14, [pd_524800]
+ mova m14, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m14, [INTERP_OFFSET_PS]
%endif
@@ -6870,7 +6867,7 @@
%ifidn %3,pp
vbroadcasti128 m14, [pd_32]
%elifidn %3, sp
- mova m14, [pd_524800]
+ mova m14, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m14, [INTERP_OFFSET_PS]
%endif
@@ -6953,7 +6950,7 @@
%ifidn %1,pp
vbroadcasti128 m14, [pd_32]
%elifidn %1, sp
- mova m14, [pd_524800]
+ mova m14, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m14, [INTERP_OFFSET_PS]
%endif
@@ -7089,26 +7086,26 @@
paddd m4, m14
paddd m5, m14
%ifidn %1,pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
- psrad m4, 6
- psrad m5, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
+ psrad m4, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
- psrad m4, 10
- psrad m5, 10
-%else
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
- psrad m4, 2
- psrad m5, 2
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
+ psrad m4, INTERP_SHIFT_SP
+ psrad m5, INTERP_SHIFT_SP
+%else
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
%endif
%endif
@@ -7171,14 +7168,14 @@
paddd m6, m14
paddd m7, m14
%ifidn %1,pp
- psrad m6, 6
- psrad m7, 6
+ psrad m6, INTERP_SHIFT_PP
+ psrad m7, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m6, 10
- psrad m7, 10
-%else
- psrad m6, 2
- psrad m7, 2
+ psrad m6, INTERP_SHIFT_SP
+ psrad m7, INTERP_SHIFT_SP
+%else
+ psrad m6, INTERP_SHIFT_PS
+ psrad m7, INTERP_SHIFT_PS
%endif
%endif
@@ -7285,32 +7282,32 @@
paddd m0, m14
paddd m1, m14
%ifidn %1,pp
- psrad m8, 6
- psrad m9, 6
- psrad m10, 6
- psrad m11, 6
- psrad m12, 6
- psrad m13, 6
- psrad m0, 6
- psrad m1, 6
+ psrad m8, INTERP_SHIFT_PP
+ psrad m9, INTERP_SHIFT_PP
+ psrad m10, INTERP_SHIFT_PP
+ psrad m11, INTERP_SHIFT_PP
+ psrad m12, INTERP_SHIFT_PP
+ psrad m13, INTERP_SHIFT_PP
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m8, 10
- psrad m9, 10
- psrad m10, 10
- psrad m11, 10
- psrad m12, 10
- psrad m13, 10
- psrad m0, 10
- psrad m1, 10
-%else
- psrad m8, 2
- psrad m9, 2
- psrad m10, 2
- psrad m11, 2
- psrad m12, 2
- psrad m13, 2
- psrad m0, 2
- psrad m1, 2
+ psrad m8, INTERP_SHIFT_SP
+ psrad m9, INTERP_SHIFT_SP
+ psrad m10, INTERP_SHIFT_SP
+ psrad m11, INTERP_SHIFT_SP
+ psrad m12, INTERP_SHIFT_SP
+ psrad m13, INTERP_SHIFT_SP
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+%else
+ psrad m8, INTERP_SHIFT_PS
+ psrad m9, INTERP_SHIFT_PS
+ psrad m10, INTERP_SHIFT_PS
+ psrad m11, INTERP_SHIFT_PS
+ psrad m12, INTERP_SHIFT_PS
+ psrad m13, INTERP_SHIFT_PS
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
%endif
%endif
@@ -7485,26 +7482,26 @@
paddd m4, m11
paddd m5, m11
%ifidn %1,pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
- psrad m4, 6
- psrad m5, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
+ psrad m4, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
- psrad m4, 10
- psrad m5, 10
-%else
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
- psrad m4, 2
- psrad m5, 2
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
+ psrad m4, INTERP_SHIFT_SP
+ psrad m5, INTERP_SHIFT_SP
+%else
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
%endif
%endif
@@ -7556,14 +7553,14 @@
paddd m6, m11
paddd m7, m11
%ifidn %1,pp
- psrad m6, 6
- psrad m7, 6
+ psrad m6, INTERP_SHIFT_PP
+ psrad m7, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m6, 10
- psrad m7, 10
-%else
- psrad m6, 2
- psrad m7, 2
+ psrad m6, INTERP_SHIFT_SP
+ psrad m7, INTERP_SHIFT_SP
+%else
+ psrad m6, INTERP_SHIFT_PS
+ psrad m7, INTERP_SHIFT_PS
%endif
%endif
@@ -7600,7 +7597,7 @@
%ifidn %1,pp
vbroadcasti128 m11, [pd_32]
%elifidn %1, sp
- mova m11, [pd_524800]
+ mova m11, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m11, [INTERP_OFFSET_PS]
%endif
@@ -7647,7 +7644,7 @@
%ifidn %1,pp
vbroadcasti128 m14, [pd_32]
%elifidn %1, sp
- mova m14, [pd_524800]
+ mova m14, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m14, [INTERP_OFFSET_PS]
%endif
@@ -7765,20 +7762,20 @@
paddd m2, m7
paddd m3, m7
%ifidn %1,pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
-%else
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
+%else
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
%endif
%endif
@@ -7801,7 +7798,7 @@
%macro FILTER_VER_LUMA_AVX2_16x4 1
INIT_YMM avx2
-cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize
+cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0-gprsize
mov r4d, r4m
shl r4d, 7
add r1d, r1d
@@ -7819,7 +7816,7 @@
%ifidn %1,pp
vbroadcasti128 m7, [pd_32]
%elifidn %1, sp
- mova m7, [pd_524800]
+ mova m7, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m7, [INTERP_OFFSET_PS]
%endif
@@ -7864,7 +7861,7 @@
%ifidn %1,pp
vbroadcasti128 m7, [pd_32]
%elifidn %1, sp
- mova m7, [pd_524800]
+ mova m7, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m7, [INTERP_OFFSET_PS]
%endif
@@ -7904,7 +7901,7 @@
%ifidn %1,pp
vbroadcasti128 m14, [pd_32]
%elifidn %1, sp
- mova m14, [pd_524800]
+ mova m14, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m14, [INTERP_OFFSET_PS]
%endif
@@ -8014,20 +8011,20 @@
paddd m2, m14
paddd m3, m14
%ifidn %1,pp
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
-%else
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
+%else
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
%endif
%endif
@@ -8105,20 +8102,20 @@
paddd m6, m14
paddd m7, m14
%ifidn %1,pp
- psrad m4, 6
- psrad m5, 6
- psrad m6, 6
- psrad m7, 6
+ psrad m4, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
+ psrad m6, INTERP_SHIFT_PP
+ psrad m7, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m4, 10
- psrad m5, 10
- psrad m6, 10
- psrad m7, 10
-%else
- psrad m4, 2
- psrad m5, 2
- psrad m6, 2
- psrad m7, 2
+ psrad m4, INTERP_SHIFT_SP
+ psrad m5, INTERP_SHIFT_SP
+ psrad m6, INTERP_SHIFT_SP
+ psrad m7, INTERP_SHIFT_SP
+%else
+ psrad m4, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
+ psrad m6, INTERP_SHIFT_PS
+ psrad m7, INTERP_SHIFT_PS
%endif
%endif
@@ -8182,20 +8179,20 @@
paddd m10, m14
paddd m11, m14
%ifidn %1,pp
- psrad m8, 6
- psrad m9, 6
- psrad m10, 6
- psrad m11, 6
+ psrad m8, INTERP_SHIFT_PP
+ psrad m9, INTERP_SHIFT_PP
+ psrad m10, INTERP_SHIFT_PP
+ psrad m11, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m8, 10
- psrad m9, 10
- psrad m10, 10
- psrad m11, 10
-%else
- psrad m8, 2
- psrad m9, 2
- psrad m10, 2
- psrad m11, 2
+ psrad m8, INTERP_SHIFT_SP
+ psrad m9, INTERP_SHIFT_SP
+ psrad m10, INTERP_SHIFT_SP
+ psrad m11, INTERP_SHIFT_SP
+%else
+ psrad m8, INTERP_SHIFT_PS
+ psrad m9, INTERP_SHIFT_PS
+ psrad m10, INTERP_SHIFT_PS
+ psrad m11, INTERP_SHIFT_PS
%endif
%endif
@@ -8251,7 +8248,7 @@
%ifidn %1,pp
vbroadcasti128 m7, [pd_32]
%elifidn %1, sp
- mova m7, [pd_524800]
+ mova m7, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m7, [INTERP_OFFSET_PS]
%endif
@@ -8315,14 +8312,14 @@
paddd m0, m7
paddd m2, m7
%ifidn %1,pp
- psrad m0, 6
- psrad m2, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m0, 10
- psrad m2, 10
-%else
- psrad m0, 2
- psrad m2, 2
+ psrad m0, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+%else
+ psrad m0, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
%endif
%endif
@@ -8366,14 +8363,14 @@
paddd m4, m7
paddd m1, m7
%ifidn %1,pp
- psrad m4, 6
- psrad m1, 6
+ psrad m4, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m4, 10
- psrad m1, 10
-%else
- psrad m4, 2
- psrad m1, 2
+ psrad m4, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+%else
+ psrad m4, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
%endif
%endif
@@ -8458,14 +8455,14 @@
paddd m0, m7
paddd m2, m7
%ifidn %1,pp
- psrad m0, 6
- psrad m2, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m2, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m0, 10
- psrad m2, 10
-%else
- psrad m0, 2
- psrad m2, 2
+ psrad m0, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+%else
+ psrad m0, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
%endif
%endif
@@ -8516,14 +8513,14 @@
paddd m4, m7
paddd m1, m7
%ifidn %1,pp
- psrad m4, 6
- psrad m1, 6
+ psrad m4, INTERP_SHIFT_PP
+ psrad m1, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m4, 10
- psrad m1, 10
-%else
- psrad m4, 2
- psrad m1, 2
+ psrad m4, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+%else
+ psrad m4, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
%endif
%endif
@@ -8574,14 +8571,14 @@
paddd m6, m7
paddd m5, m7
%ifidn %1,pp
- psrad m6, 6
- psrad m5, 6
+ psrad m6, INTERP_SHIFT_PP
+ psrad m5, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m6, 10
- psrad m5, 10
-%else
- psrad m6, 2
- psrad m5, 2
+ psrad m6, INTERP_SHIFT_SP
+ psrad m5, INTERP_SHIFT_SP
+%else
+ psrad m6, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
%endif
%endif
@@ -8625,14 +8622,14 @@
paddd m0, m7
paddd m3, m7
%ifidn %1,pp
- psrad m0, 6
- psrad m3, 6
+ psrad m0, INTERP_SHIFT_PP
+ psrad m3, INTERP_SHIFT_PP
%elifidn %1, sp
- psrad m0, 10
- psrad m3, 10
-%else
- psrad m0, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
+%else
+ psrad m0, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
%endif
%endif
@@ -8671,7 +8668,7 @@
%ifidn %1,pp
vbroadcasti128 m7, [pd_32]
%elifidn %1, sp
- mova m7, [pd_524800]
+ mova m7, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m7, [INTERP_OFFSET_PS]
%endif
@@ -8706,7 +8703,7 @@
%ifidn %1,pp
vbroadcasti128 m14, [pd_32]
%elifidn %1, sp
- mova m14, [pd_524800]
+ mova m14, [INTERP_OFFSET_SP]
%else
vbroadcasti128 m14, [INTERP_OFFSET_PS]
%endif
@@ -8758,10 +8755,10 @@
paddd m2, m7
paddd m3, m7
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
packssdw m0, m1
packssdw m2, m3
@@ -8784,7 +8781,6 @@
dec dword [rsp]
jnz .loopH
-
RET
%endmacro
@@ -8837,7 +8833,7 @@
lea r6, [tab_LumaCoeffV + r4]
%endif
- mova m7, [tab_c_524800]
+ mova m7, [INTERP_OFFSET_SP]
mov dword [rsp], %2/4
.loopH:
@@ -8850,10 +8846,10 @@
paddd m2, m7
paddd m3, m7
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
packssdw m0, m1
packssdw m2, m3
@@ -8879,7 +8875,6 @@
dec dword [rsp]
jnz .loopH
-
RET
%endmacro
@@ -8963,7 +8958,6 @@
dec dword [rsp]
jnz .loopH
-
RET
%endmacro
@@ -9011,7 +9005,7 @@
%rep %1/4
movd m0, [r0]
movhps m0, [r0 + r1]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m1
movd [r2 + r3 * 0], m0
@@ -9019,7 +9013,7 @@
movd m0, [r0 + r1 * 2]
movhps m0, [r0 + r4]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m1
movd [r2 + r3 * 2], m0
@@ -10293,14 +10287,13 @@
mov r4d, r4m
add r1d, r1d
add r3d, r3d
-%ifdef PIC
-
+
+%ifdef PIC
lea r6, [tab_LumaCoeff]
- lea r4 , [r4 * 8]
+ lea r4, [r4 * 8]
vbroadcasti128 m0, [r6 + r4 * 2]
-
-%else
- lea r4 , [r4 * 8]
+%else
+ lea r4, [r4 * 8]
vbroadcasti128 m0, [tab_LumaCoeff + r4 * 2]
%endif
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/loopfilter.asm Tue Jul 21 14:30:11 2015 -0700
@@ -39,7 +39,7 @@
cextern pb_128
cextern pb_2
cextern pw_2
-cextern pw_1023
+cextern pw_pixel_max
cextern pb_movemask
cextern pw_1
cextern hmul_16p
@@ -81,7 +81,7 @@
palignr m2, m3, m5, 15
por m2, m0
- mova m4, [pw_1023]
+ mova m4, [pw_pixel_max]
psignb m2, [pb_128] ; m2 = signLeft
pxor m0, m0
palignr m0, m3, 15
@@ -127,7 +127,7 @@
palignr m2, m3, m5, 15
por m2, m0
- mova m4, [pw_1023]
+ mova m4, [pw_pixel_max]
psignb m2, [pb_128] ; m2 = signLeft
pxor m0, m0
palignr m0, m3, 15
@@ -249,7 +249,7 @@
neg r1b
movd xm1, r1d
vinserti128 m0, m0, xm1, 1
- mova m5, [pw_1023]
+ mova m5, [pw_pixel_max]
mov r1d, r4m
add r1d, r1d
shr r2d, 4
@@ -402,8 +402,8 @@
pmaxsw m7, m0
pmaxsw m5, m0
- pminsw m7, [pw_1023]
- pminsw m5, [pw_1023]
+ pminsw m7, [pw_pixel_max]
+ pminsw m5, [pw_pixel_max]
movu [r0], m7
movu [r0 + 16], m5
@@ -468,7 +468,7 @@
mov r4d, r4m
mova m4, [pb_2]
shr r4d, 4
- mova m0, [pw_1023]
+ mova m0, [pw_pixel_max]
.loop
movu m5, [r0]
movu m3, [r0 + r3]
@@ -559,7 +559,7 @@
add r3d, r3d
mov r4d, r4m
pxor m0, m0 ; m0 = 0
- mova m6, [pw_1023]
+ mova m6, [pw_pixel_max]
mov r5d, r4d
shr r4d, 4
mov r6, r0
@@ -736,7 +736,7 @@
cglobal saoCuOrgE1_2Rows, 4,5,8
add r3d, r3d
mov r4d, r4m
- mova m4, [pw_1023]
+ mova m4, [pw_pixel_max]
vbroadcasti128 m6, [r2] ; m6 = m_iOffsetEo
shr r4d, 4
.loop
@@ -884,8 +884,8 @@
paddw m5, m4
pmaxsw m7, m0
pmaxsw m5, m0
- pminsw m7, [pw_1023]
- pminsw m5, [pw_1023]
+ pminsw m7, [pw_pixel_max]
+ pminsw m5, [pw_pixel_max]
movu [r0], m7
movu [r0 + 16], m5
@@ -960,7 +960,7 @@
movq xm4, [r0 + r4 * 2]
movhps xm4, [r1 + r4]
vbroadcasti128 m5, [r3]
- mova m6, [pw_1023]
+ mova m6, [pw_pixel_max]
.loop
movu m1, [r0]
movu m3, [r0 + r5 + 2]
@@ -1086,8 +1086,8 @@
paddw m7, m6
pmaxsw m1, m0
pmaxsw m7, m0
- pminsw m1, [pw_1023]
- pminsw m7, [pw_1023]
+ pminsw m1, [pw_pixel_max]
+ pminsw m7, [pw_pixel_max]
movu [r0], m1
movu [r0 + 32], m7
@@ -1212,8 +1212,8 @@
paddw m5, m4
pmaxsw m7, m0
pmaxsw m5, m0
- pminsw m7, [pw_1023]
- pminsw m5, [pw_1023]
+ pminsw m7, [pw_pixel_max]
+ pminsw m5, [pw_pixel_max]
movu [r0], m7
movu [r0 + 16], m5
@@ -1333,7 +1333,7 @@
paddw m1, m3
pxor m0, m0
pmaxsw m1, m0
- pminsw m1, [pw_1023]
+ pminsw m1, [pw_pixel_max]
movu [r0], m1
psubb xm0, xm2
@@ -1461,8 +1461,8 @@
pxor m0, m0
pmaxsw m1, m0
pmaxsw m7, m0
- pminsw m1, [pw_1023]
- pminsw m7, [pw_1023]
+ pminsw m1, [pw_pixel_max]
+ pminsw m7, [pw_pixel_max]
movu [r0], m1
movu [r0 + 32], m7
@@ -1565,8 +1565,8 @@
.loopW
movu m2, [r0 + r6]
movu m5, [r0 + r6 + 16]
- psrlw m0, m2, 5
- psrlw m6, m5, 5
+ psrlw m0, m2, (BIT_DEPTH - 5)
+ psrlw m6, m5, (BIT_DEPTH - 5)
packuswb m0, m6
pand m0, [pb_31] ; m0 = [index]
@@ -1584,8 +1584,8 @@
paddw m5, m6
pmaxsw m2, m7
pmaxsw m5, m7
- pminsw m2, [pw_1023]
- pminsw m5, [pw_1023]
+ pminsw m2, [pw_pixel_max]
+ pminsw m5, [pw_pixel_max]
movu [r0 + r6], m2
movu [r0 + r6 + 16], m5
@@ -1656,7 +1656,7 @@
sub r1d, r2d
sub r1d, r2d
shr r2d, 4
- mova m7, [pw_1023]
+ mova m7, [pw_pixel_max]
mov r6d, r3d
shr r3d, 1
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/mc-a.asm Tue Jul 21 14:30:11 2015 -0700
@@ -32,6 +32,19 @@
%include "x86inc.asm"
%include "x86util.asm"
+%if BIT_DEPTH==8
+ %define ADDAVG_FACTOR 256
+ %define ADDAVG_ROUND 128
+%elif BIT_DEPTH==10
+ %define ADDAVG_FACTOR 1024
+ %define ADDAVG_ROUND 512
+%elif BIT_DEPTH==12
+ %define ADDAVG_FACTOR 4096
+ %define ADDAVG_ROUND 2048
+%else
+ %error Unsupported bit depth!
+%endif
+
SECTION_RODATA 32
ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
@@ -54,6 +67,8 @@
cextern pw_512
cextern pw_1023
cextern pw_1024
+cextern pw_2048
+cextern pw_4096
cextern pw_00ff
cextern pw_pixel_max
cextern pd_32
@@ -92,23 +107,24 @@
punpcklqdq m1, m2
punpcklqdq m3, m5
paddw m1, m3
- pmulhrsw m1, [pw_1024]
- paddw m1, [pw_512]
+ pmulhrsw m1, [pw_ %+ ADDAVG_FACTOR]
+ paddw m1, [pw_ %+ ADDAVG_ROUND]
pxor m0, m0
pmaxsw m1, m0
- pminsw m1, [pw_1023]
+ pminsw m1, [pw_pixel_max]
movd [r2], m1
pextrd [r2 + r5], m1, 1
lea r2, [r2 + 2 * r5]
pextrd [r2], m1, 2
pextrd [r2 + r5], m1, 3
-
RET
+
+
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m0, [pw_512]
+ mova m0, [pw_ %+ ADDAVG_ROUND]
pxor m7, m7
add r3, r3
add r4, r4
@@ -136,11 +152,11 @@
punpcklqdq m1, m2
punpcklqdq m3, m5
paddw m1, m3
- pmulhrsw m1, [pw_1024]
+ pmulhrsw m1, [pw_ %+ ADDAVG_FACTOR]
paddw m1, m0
pmaxsw m1, m7
- pminsw m1, [pw_1023]
+ pminsw m1, [pw_pixel_max]
movd [r2], m1
pextrd [r2 + r5], m1, 1
lea r2, [r2 + 2 * r5]
@@ -156,8 +172,8 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m6, [pw_1023]
- mova m7, [pw_1024]
+ mova m6, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
mov r6d, 16/4
add r3, r3
add r4, r4
@@ -183,7 +199,7 @@
punpcklqdq m3, m5
paddw m1, m3
pmulhrsw m1, m7
- paddw m1, [pw_512]
+ paddw m1, [pw_ %+ ADDAVG_ROUND]
pxor m0, m0
pmaxsw m1, m0
pminsw m1, m6
@@ -213,21 +229,21 @@
punpcklqdq m0, m1
punpcklqdq m2, m3
paddw m0, m2
- pmulhrsw m0, [pw_1024]
- paddw m0, [pw_512]
+ pmulhrsw m0, [pw_ %+ ADDAVG_FACTOR]
+ paddw m0, [pw_ %+ ADDAVG_ROUND]
pxor m6, m6
pmaxsw m0, m6
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
movh [r2], m0
movhps [r2 + r5], m0
RET
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
add r3, r3
add r4, r4
@@ -264,9 +280,9 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
mov r6d, 16/2
add r3, r3
@@ -300,9 +316,9 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
add r3, r3
add r4, r4
@@ -331,9 +347,9 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
add r3, r3
add r4, r4
@@ -370,9 +386,9 @@
%macro ADDAVG_W4_H4 1
INIT_XMM sse4
cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
add r3, r3
add r4, r4
@@ -420,9 +436,9 @@
%macro ADDAVG_W8_H4 1
INIT_XMM sse4
cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
add r3, r3
add r4, r4
@@ -470,9 +486,9 @@
%macro ADDAVG_W12_H4 1
INIT_XMM sse4
cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
add r3, r3
add r4, r4
@@ -532,9 +548,9 @@
%macro ADDAVG_W16_H4 1
INIT_XMM sse4
cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
add r3, r3
add r4, r4
@@ -601,9 +617,9 @@
%macro ADDAVG_W24_H2 2
INIT_XMM sse4
cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
add r3, r3
add r4, r4
@@ -683,9 +699,9 @@
%macro ADDAVG_W32_H2 1
INIT_XMM sse4
cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
add r3, r3
add r4, r4
@@ -787,9 +803,9 @@
%macro ADDAVG_W48_H2 1
INIT_XMM sse4
cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
add r3, r3
add r4, r4
@@ -921,9 +937,9 @@
%macro ADDAVG_W64_H1 1
INIT_XMM sse4
cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m7, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m7, [pw_ %+ ADDAVG_FACTOR]
pxor m6, m6
add r3, r3
add r4, r4
@@ -1029,19 +1045,19 @@
paddw m0, m1
pxor m1, m1
- pmulhrsw m0, [pw_1024]
- paddw m0, [pw_512]
+ pmulhrsw m0, [pw_ %+ ADDAVG_FACTOR]
+ paddw m0, [pw_ %+ ADDAVG_ROUND]
pmaxsw m0, m1
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
vextracti128 xm1, m0, 1
movu [r2], xm0
movu [r2 + r5 * 2], xm1
RET
cglobal addAvg_8x6, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m3, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m3, [pw_ %+ ADDAVG_FACTOR]
pxor m1, m1
add r3d, r3d
add r4d, r4d
@@ -1100,9 +1116,9 @@
%macro ADDAVG_W8_H4_AVX2 1
cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m3, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m3, [pw_ %+ ADDAVG_FACTOR]
pxor m1, m1
add r3d, r3d
add r4d, r4d
@@ -1159,9 +1175,9 @@
ADDAVG_W8_H4_AVX2 64
cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m3, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m3, [pw_ %+ ADDAVG_FACTOR]
pxor m1, m1
add r3, r3
add r4, r4
@@ -1201,8 +1217,8 @@
RET
cglobal addAvg_12x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
paddw m3, m4, m4
pxor m1, m1
add r3, r3
@@ -1244,9 +1260,9 @@
%macro ADDAVG_W16_H4_AVX2 1
cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m3, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m3, [pw_ %+ ADDAVG_FACTOR]
pxor m2, m2
add r3, r3
add r4, r4
@@ -1291,9 +1307,9 @@
ADDAVG_W16_H4_AVX2 64
cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m3, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m3, [pw_ %+ ADDAVG_FACTOR]
pxor m1, m1
add r3, r3
add r4, r4
@@ -1347,8 +1363,8 @@
RET
cglobal addAvg_24x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
paddw m3, m4, m4
pxor m1, m1
add r3, r3
@@ -1404,9 +1420,9 @@
%macro ADDAVG_W32_H2_AVX2 1
cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m3, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m3, [pw_ %+ ADDAVG_FACTOR]
pxor m2, m2
add r3, r3
add r4, r4
@@ -1468,9 +1484,9 @@
ADDAVG_W32_H2_AVX2 64
cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m3, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m3, [pw_ %+ ADDAVG_FACTOR]
pxor m2, m2
add r3, r3
add r4, r4
@@ -1543,9 +1559,9 @@
%macro ADDAVG_W64_H1_AVX2 1
cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_512]
- mova m5, [pw_1023]
- mova m3, [pw_1024]
+ mova m4, [pw_ %+ ADDAVG_ROUND]
+ mova m5, [pw_pixel_max]
+ mova m3, [pw_ %+ ADDAVG_FACTOR]
pxor m2, m2
add r3d, r3d
add r4d, r4d
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/pixel-util8.asm Tue Jul 21 14:30:11 2015 -0700
@@ -879,8 +879,8 @@
%if HIGH_BIT_DEPTH
cmp r3d, 32767
jle .skip
- shr r3d, 2
- sub r4d, 2
+ shr r3d, (BIT_DEPTH - 8)
+ sub r4d, (BIT_DEPTH - 8)
.skip:
%endif
movd m0, r4d ; m0 = shift
@@ -1273,13 +1273,7 @@
INIT_XMM sse4
cglobal weight_pp, 4,7,7
%define correction (14 - BIT_DEPTH)
-%if BIT_DEPTH == 10
- mova m6, [pw_1023]
-%elif BIT_DEPTH == 12
- mova m6, [pw_3fff]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+ mova m6, [pw_pixel_max]
mov r6d, r6m
mov r4d, r4m
mov r5d, r5m
@@ -1423,7 +1417,7 @@
movd xm1, r7m
vpbroadcastd m2, r8m
mova m5, [pw_1]
- mova m6, [pw_1023]
+ mova m6, [pw_pixel_max]
add r2d, r2d
add r3d, r3d
sub r2d, r3d
@@ -1516,13 +1510,7 @@
%if HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal weight_sp, 6,7,8
-%if BIT_DEPTH == 10
- mova m1, [pw_1023]
-%elif BIT_DEPTH == 12
- mova m1, [pw_3fff]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+ mova m1, [pw_pixel_max]
mova m2, [pw_1]
mov r6d, r7m
shl r6d, 16
@@ -1681,7 +1669,7 @@
%if HIGH_BIT_DEPTH
INIT_YMM avx2
cglobal weight_sp, 6,7,9
- mova m1, [pw_1023]
+ mova m1, [pw_pixel_max]
mova m2, [pw_1]
mov r6d, r7m
shl r6d, 16