[x264-devel] [Git][videolan/x264][master] 4 commits: loongarch: Update loongson_asm.S version to 0.4.0
Anton Mitrofanov (@BugMaster)
gitlab at videolan.org
Thu Mar 21 07:22:09 UTC 2024
Anton Mitrofanov pushed to branch master at VideoLAN / x264
Commits:
982d3240 by Xiwei Gu at 2024-03-21T09:17:09+08:00
loongarch: Update loongson_asm.S version to 0.4.0
- - - - -
5a61afdb by Xiwei Gu at 2024-03-21T09:18:00+08:00
loongarch: Add checkasm_call
- - - - -
16262286 by Xiwei Gu at 2024-03-21T09:18:32+08:00
loongarch: Fixed pixel_sa8d_16x16_lasx
Save and restore FPR
- - - - -
7ed753b1 by Xiwei Gu at 2024-03-21T09:18:50+08:00
loongarch: Enhance ultrafast encoding performance
Using the following command, ultrafast encoding
has improved from 182fps to 189fps:
./x264 --preset ultrafast -o out.mkv yuv_1920x1080.yuv
- - - - -
8 changed files:
- Makefile
- common/loongarch/loongson_asm.S
- common/loongarch/pixel-a.S
- common/loongarch/quant-a.S
- common/loongarch/quant.h
- common/quant.c
- + tools/checkasm-loongarch.S
- tools/checkasm.c
Changes:
=====================================
Makefile
=====================================
@@ -229,6 +229,7 @@ ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-10.o)
endif
+OBJCHK += tools/checkasm-loongarch.o
endif
endif
=====================================
common/loongarch/loongson_asm.S
=====================================
@@ -1,5 +1,5 @@
/*********************************************************************
- * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Copyright (c) 2022-2024 Loongson Technology Corporation Limited
* Contributed by Xiwei Gu <guxiwei-hf at loongson.cn>
* Shiyou Yin <yinshiyou-hf at loongson.cn>
*
@@ -31,12 +31,19 @@
*/
#define LML_VERSION_MAJOR 0
-#define LML_VERSION_MINOR 2
-#define LML_VERSION_MICRO 2
+#define LML_VERSION_MINOR 4
+#define LML_VERSION_MICRO 0
#define ASM_PREF
#define DEFAULT_ALIGN 5
+/*
+ *============================================================================
+ * macros for specific project, set them as needed.
+ * Following LoongML macros for your reference.
+ *============================================================================
+ */
+
.macro function name, align=DEFAULT_ALIGN
.macro endfunc
jirl $r0, $r1, 0x0
@@ -99,6 +106,39 @@ ASM_PREF\name: ;
#define sp $sp
#define ra $ra
+#define fa0 $fa0
+#define fa1 $fa1
+#define fa2 $fa2
+#define fa3 $fa3
+#define fa4 $fa4
+#define fa5 $fa5
+#define fa6 $fa6
+#define fa7 $fa7
+#define ft0 $ft0
+#define ft1 $ft1
+#define ft2 $ft2
+#define ft3 $ft3
+#define ft4 $ft4
+#define ft5 $ft5
+#define ft6 $ft6
+#define ft7 $ft7
+#define ft8 $ft8
+#define ft9 $ft9
+#define ft10 $ft10
+#define ft11 $ft11
+#define ft12 $ft12
+#define ft13 $ft13
+#define ft14 $ft14
+#define ft15 $ft15
+#define fs0 $fs0
+#define fs1 $fs1
+#define fs2 $fs2
+#define fs3 $fs3
+#define fs4 $fs4
+#define fs5 $fs5
+#define fs6 $fs6
+#define fs7 $fs7
+
#define f0 $f0
#define f1 $f1
#define f2 $f2
@@ -272,18 +312,17 @@ ASM_PREF\name: ;
.endm
/*
- * Description : Range each element of vector
+ * Description : Range element vj[i] to vk[i] ~ va[i]
* clip: vj > vk ? vj : vk && vj < va ? vj : va
- * clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
*/
.macro vclip.h vd, vj, vk, va
vmax.h \vd, \vj, \vk
vmin.h \vd, \vd, \va
.endm
-.macro vclip255.w vd, vj
- vmaxi.w \vd, \vj, 0
- vsat.wu \vd, \vd, 7
+.macro vclip.w vd, vj, vk, va
+ vmax.w \vd, \vj, \vk
+ vmin.w \vd, \vd, \va
.endm
.macro xvclip.h xd, xj, xk, xa
@@ -291,6 +330,25 @@ ASM_PREF\name: ;
xvmin.h \xd, \xd, \xa
.endm
+.macro xvclip.w xd, xj, xk, xa
+ xvmax.w \xd, \xj, \xk
+ xvmin.w \xd, \xd, \xa
+.endm
+
+/*
+ * Description : Range element vj[i] to 0 ~ 255
+ * clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
+ */
+.macro vclip255.h vd, vj
+ vmaxi.h \vd, \vj, 0
+ vsat.hu \vd, \vd, 7
+.endm
+
+.macro vclip255.w vd, vj
+ vmaxi.w \vd, \vj, 0
+ vsat.wu \vd, \vd, 7
+.endm
+
.macro xvclip255.h xd, xj
xvmaxi.h \xd, \xj, 0
xvsat.hu \xd, \xd, 7
=====================================
common/loongarch/pixel-a.S
=====================================
@@ -1438,6 +1438,9 @@ endfunc_x264
* const Pixel *pix2, intptr_t i_pix2)
*/
function_x264 pixel_sa8d_16x16_lasx
+ addi.d sp, sp, -8
+ fst.d f24, sp, 0
+
slli.d t2, a1, 1
slli.d t3, a3, 1
add.d t4, a1, t2
@@ -1753,6 +1756,9 @@ function_x264 pixel_sa8d_16x16_lasx
add.d t4, t4, t5
addi.d t4, t4, 2
srli.d a0, t4, 2
+
+ fld.d f24, sp, 0
+ addi.d sp, sp, 8
endfunc_x264
/*
=====================================
common/loongarch/quant-a.S
=====================================
@@ -984,3 +984,248 @@ function_x264 decimate_score64_lsx
jirl $r0, $r1, 0x0
.END_SCORE_64_LSX:
endfunc_x264
+
+/*
+ * int coeff_level_run16( dctcoef *dct, x264_run_level_t *runlevel )
+ */
+function_x264 coeff_level_run16_lasx
+ addi.w t0, zero, 15
+
+ xvld xr0, a0, 0
+ xvldi xr2, 1
+
+ xvssrlni.bu.h xr0, xr0, 0
+ xvpermi.d xr1, xr0, 0xd8
+ xvsle.bu xr3, xr2, xr1
+ xvsrlni.b.h xr3, xr3, 4
+ xvpickve2gr.du t8, xr3, 0
+ clz.d t1, t8
+
+ srai.w t1, t1, 2
+ sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
+ st.w t0, a1, 0x00 // Store runlevel->last
+ addi.d t3, a1, 23
+ nor t2, zero, zero
+ addi.d t2, t2, -15
+ and t3, t3, t2 // runlevel->level
+ xor t4, t4, t4 // mask
+ xor t5, t5, t5 // total: number of non-zero elements
+ addi.w t6, zero, 1 // const 1
+.LOOP_COEFF_LEVEL_RUN16_LASX:
+ slli.w t7, t0, 1
+ ldx.h t2, a0, t7
+ st.h t2, t3, 0
+ addi.d t3, t3, 2
+
+ addi.w t5, t5, 1
+ sll.w t2, t6, t0
+ or t4, t4, t2
+ bge zero, t4, .END_COEFF_LEVEL_RUN16_LASX
+
+ addi.w t0, t0, -1
+ slli.w t1, t1, 2
+ addi.w t1, t1, 4
+ sll.d t8, t8, t1
+ clz.d t1, t8
+ srai.w t1, t1, 2
+ sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
+ bge t0, zero, .LOOP_COEFF_LEVEL_RUN16_LASX
+.END_COEFF_LEVEL_RUN16_LASX:
+ st.w t4, a1, 4
+ move a0, t5
+endfunc_x264
+
+function_x264 coeff_level_run15_lasx
+ addi.w t0, zero, 15
+
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+ xvldi xr3, 1
+
+ vinsgr2vr.h vr1, zero, 7
+ xvpermi.q xr1, xr0, 0x20
+
+ xvssrlni.bu.h xr1, xr1, 0
+ xvpermi.d xr2, xr1, 0xd8
+ xvsle.bu xr4, xr3, xr2
+ xvsrlni.b.h xr4, xr4, 4
+ xvpickve2gr.du t8, xr4, 0
+ clz.d t1, t8
+
+ srai.w t1, t1, 2
+ sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
+ st.w t0, a1, 0x00 // Store runlevel->last
+ addi.d t3, a1, 23
+ nor t2, zero, zero
+ addi.d t2, t2, -15
+ and t3, t3, t2 // runlevel->level
+ xor t4, t4, t4 // mask
+ xor t5, t5, t5 // total: number of non-zero elements
+ addi.w t6, zero, 1 // const 1
+.LOOP_COEFF_LEVEL_RUN15_LASX:
+ slli.w t7, t0, 1
+ ldx.h t2, a0, t7
+ st.h t2, t3, 0
+ addi.d t3, t3, 2
+
+ addi.w t5, t5, 1
+ sll.w t2, t6, t0
+ or t4, t4, t2
+ bge zero, t4, .END_COEFF_LEVEL_RUN15_LASX
+
+ addi.w t0, t0, -1
+ slli.w t1, t1, 2
+ addi.w t1, t1, 4
+ sll.d t8, t8, t1
+ clz.d t1, t8
+ srai.w t1, t1, 2
+ sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
+ bge t0, zero, .LOOP_COEFF_LEVEL_RUN15_LASX
+.END_COEFF_LEVEL_RUN15_LASX:
+ st.w t4, a1, 4
+ move a0, t5
+endfunc_x264
+
+function_x264 coeff_level_run16_lsx
+ addi.w t0, zero, 15
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+ vldi vr2, 1
+
+ vssrlni.bu.h vr0, vr0, 0
+ vssrlni.bu.h vr1, vr1, 0
+ vpermi.w vr1, vr0, 0x44
+ vsle.bu vr3, vr2, vr1
+ vsrlni.b.h vr3, vr3, 4
+ vpickve2gr.du t8, vr3, 0
+ clz.d t1, t8
+
+ srai.w t1, t1, 2
+ sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
+ st.w t0, a1, 0x00 // Store runlevel->last
+ addi.d t3, a1, 23
+ nor t2, zero, zero
+ addi.d t2, t2, -15
+ and t3, t3, t2 // runlevel->level
+ xor t4, t4, t4 // mask
+ xor t5, t5, t5 // total: number of non-zero elements
+ addi.w t6, zero, 1 // const 1
+.LOOP_COEFF_LEVEL_RUN16_LSX:
+ slli.w t7, t0, 1
+ ldx.h t2, a0, t7
+ st.h t2, t3, 0
+ addi.d t3, t3, 2
+
+ addi.w t5, t5, 1
+ sll.w t2, t6, t0
+ or t4, t4, t2
+ bge zero, t4, .END_COEFF_LEVEL_RUN16_LSX
+
+ addi.w t0, t0, -1
+ slli.w t1, t1, 2
+ addi.w t1, t1, 4
+ sll.d t8, t8, t1
+ clz.d t1, t8
+ srai.w t1, t1, 2
+ sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
+ bge t0, zero, .LOOP_COEFF_LEVEL_RUN16_LSX
+.END_COEFF_LEVEL_RUN16_LSX:
+ st.w t4, a1, 4
+ move a0, t5
+endfunc_x264
+
+function_x264 coeff_level_run15_lsx
+ addi.w t0, zero, 15
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+ vldi vr2, 1
+ vinsgr2vr.h vr1, zero, 7
+
+ vssrlni.bu.h vr0, vr0, 0
+ vssrlni.bu.h vr1, vr1, 0
+ vpermi.w vr1, vr0, 0x44
+ vsle.bu vr3, vr2, vr1
+ vsrlni.b.h vr3, vr3, 4
+ vpickve2gr.du t8, vr3, 0
+ clz.d t1, t8
+
+ srai.w t1, t1, 2
+ sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
+ st.w t0, a1, 0x00 // Store runlevel->last
+ addi.d t3, a1, 23
+ nor t2, zero, zero
+ addi.d t2, t2, -15
+ and t3, t3, t2 // runlevel->level
+ xor t4, t4, t4 // mask
+ xor t5, t5, t5 // total: number of non-zero elements
+ addi.w t6, zero, 1 // const 1
+.LOOP_COEFF_LEVEL_RUN15_LSX:
+ slli.w t7, t0, 1
+ ldx.h t2, a0, t7
+ st.h t2, t3, 0
+ addi.d t3, t3, 2
+
+ addi.w t5, t5, 1
+ sll.w t2, t6, t0
+ or t4, t4, t2
+ bge zero, t4, .END_COEFF_LEVEL_RUN15_LSX
+
+ addi.w t0, t0, -1
+ slli.w t1, t1, 2
+ addi.w t1, t1, 4
+ sll.d t8, t8, t1
+ clz.d t1, t8
+ srai.w t1, t1, 2
+ sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
+ bge t0, zero, .LOOP_COEFF_LEVEL_RUN15_LSX
+.END_COEFF_LEVEL_RUN15_LSX:
+ st.w t4, a1, 4
+ move a0, t5
+endfunc_x264
+
+function_x264 coeff_level_run8_lsx
+ addi.w t0, zero, 15
+ vld vr0, a0, 0
+ vxor.v vr1, vr1, vr1
+ vldi vr2, 1
+
+ vssrlni.bu.h vr0, vr0, 0
+ vpermi.w vr1, vr0, 0x44
+ vsle.bu vr3, vr2, vr1
+ vsrlni.b.h vr3, vr3, 4
+ vpickve2gr.du t8, vr3, 0
+ clz.d t1, t8
+
+ srai.w t1, t1, 2
+ sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
+ st.w t0, a1, 0x00 // Store runlevel->last
+ addi.d t3, a1, 23
+ nor t2, zero, zero
+ addi.d t2, t2, -15
+ and t3, t3, t2 // runlevel->level
+ xor t4, t4, t4 // mask
+ xor t5, t5, t5 // total: number of non-zero elements
+ addi.w t6, zero, 1 // const 1
+.LOOP_COEFF_LEVEL_RUN8_LSX:
+ slli.w t7, t0, 1
+ ldx.h t2, a0, t7
+ st.h t2, t3, 0
+ addi.d t3, t3, 2
+
+ addi.w t5, t5, 1
+ sll.w t2, t6, t0
+ or t4, t4, t2
+ bge zero, t4, .END_COEFF_LEVEL_RUN8_LSX
+
+ addi.w t0, t0, -1
+ slli.w t1, t1, 2
+ addi.w t1, t1, 4
+ sll.d t8, t8, t1
+ clz.d t1, t8
+ srai.w t1, t1, 2
+ sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
+ bge t0, zero, .LOOP_COEFF_LEVEL_RUN8_LSX
+.END_COEFF_LEVEL_RUN8_LSX:
+ st.w t4, a1, 4
+ move a0, t5
+endfunc_x264
=====================================
common/loongarch/quant.h
=====================================
@@ -81,4 +81,16 @@ void x264_dequant_8x8_lasx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
#define x264_dequant_4x4_dc_lasx x264_template(dequant_4x4_dc_lasx)
void x264_dequant_4x4_dc_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+#define x264_coeff_level_run16_lasx x264_template(coeff_level_run16_lasx)
+int x264_coeff_level_run16_lasx( dctcoef *, x264_run_level_t * );
+#define x264_coeff_level_run15_lasx x264_template(coeff_level_run15_lasx)
+int x264_coeff_level_run15_lasx( dctcoef *, x264_run_level_t * );
+
+#define x264_coeff_level_run16_lsx x264_template(coeff_level_run16_lsx)
+int x264_coeff_level_run16_lsx( dctcoef *, x264_run_level_t * );
+#define x264_coeff_level_run15_lsx x264_template(coeff_level_run15_lsx)
+int x264_coeff_level_run15_lsx( dctcoef *, x264_run_level_t * );
+#define x264_coeff_level_run8_lsx x264_template(coeff_level_run8_lsx)
+int x264_coeff_level_run8_lsx( dctcoef *, x264_run_level_t * );
+
#endif/* X264_LOONGARCH_QUANT_H */
=====================================
common/quant.c
=====================================
@@ -848,11 +848,17 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_lsx;
pf->dequant_8x8 = x264_dequant_8x8_lsx;
pf->dequant_4x4_dc = x264_dequant_4x4_dc_lsx;
- pf->coeff_last4 = x264_coeff_last4_lsx;
- pf->coeff_last8 = x264_coeff_last8_lsx;
+ pf->decimate_score15 = x264_decimate_score15_lsx;
+ pf->decimate_score16 = x264_decimate_score16_lsx;
+ pf->decimate_score64 = x264_decimate_score64_lsx;
+ pf->coeff_last4 = x264_coeff_last4_lsx;
+ pf->coeff_last8 = x264_coeff_last8_lsx;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lsx;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lsx;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lsx;
+ pf->coeff_level_run8 = x264_coeff_level_run8_lsx;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lsx;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lsx;
}
if( cpu&X264_CPU_LASX )
{
@@ -863,6 +869,8 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lasx;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lasx;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lasx;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lasx;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lasx;
}
#endif
=====================================
tools/checkasm-loongarch.S
=====================================
@@ -0,0 +1,210 @@
+/****************************************************************************
+ * checkasm-loongarch.S: assembly check tool
+ *****************************************************************************
+ * Copyright (C) 2024 x264 project
+ *
+ * Authors: Xiwei Gu <guxiwei-hf at loongson.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "../common/loongarch/loongson_asm.S"
+
+const register_init, align=3
+.quad 0x21f86d66c8ca00ce
+.quad 0x75b6ba21077c48ad
+.quad 0xed56bb2dcb3c7736
+.quad 0x8bda43d3fd1a7e06
+.quad 0xb64a9c9e5d318408
+.quad 0xdf9a54b303f1d3a3
+.quad 0x4a75479abd64e097
+.quad 0x249214109d5d1c88
+.quad 0x1a1b2550a612b48c
+.quad 0x79445c159ce79064
+.quad 0x2eed899d5a28ddcd
+.quad 0x86b2536fcd8cf636
+.quad 0xb0856806085e7943
+.quad 0x3f2bf84fc0fcca4e
+.quad 0xacbd382dcf5b8de2
+.quad 0xd229e1f5b281303f
+.quad 0x71aeaff20b095fd9
+endconst
+
+const error_message
+.asciz "failed to preserve register"
+endconst
+
+.text
+
+// max number of args used by any x264 asm function.
+#define MAX_ARGS 15
+
+#define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15)
+
+// Fill dirty data at stack space
+function x264_checkasm_stack_clobber
+ move t0, sp
+ addi.d t1, zero, CLOBBER_STACK
+1:
+ st.d a0, sp, 0x00
+ st.d a1, sp, -0x08
+ addi.d sp, sp, -0x10
+ addi.d t1, t1, -0x10
+ blt zero,t1, 1b
+ move sp, t0
+endfunc
+
+#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15)
+
+function x264_checkasm_call
+ // Saved s0 - s8, fs0 - fs7
+ move t4, sp
+ addi.d sp, sp, -136
+ st.d s0, sp, 0
+ st.d s1, sp, 8
+ st.d s2, sp, 16
+ st.d s3, sp, 24
+ st.d s4, sp, 32
+ st.d s5, sp, 40
+ st.d s6, sp, 48
+ st.d s7, sp, 56
+ st.d s8, sp, 64
+ fst.d fs0, sp, 72
+ fst.d fs1, sp, 80
+ fst.d fs2, sp, 88
+ fst.d fs3, sp, 96
+ fst.d fs4, sp, 104
+ fst.d fs5, sp, 112
+ fst.d fs6, sp, 120
+ fst.d fs7, sp, 128
+
+ la.local t1, register_init
+ ld.d s0, t1, 0
+ ld.d s1, t1, 8
+ ld.d s2, t1, 16
+ ld.d s3, t1, 24
+ ld.d s4, t1, 32
+ ld.d s5, t1, 40
+ ld.d s6, t1, 48
+ ld.d s7, t1, 56
+ ld.d s8, t1, 64
+ fld.d fs0, t1, 72
+ fld.d fs1, t1, 80
+ fld.d fs2, t1, 88
+ fld.d fs3, t1, 96
+ fld.d fs4, t1, 104
+ fld.d fs5, t1, 112
+ fld.d fs6, t1, 120
+ fld.d fs7, t1, 128
+
+ addi.d sp, sp, -16
+ st.d a1, sp, 0 // ok
+ st.d ra, sp, 8 // Ret address
+
+ addi.d sp, sp, -ARG_STACK
+
+ addi.d t0, zero, 8*8
+ xor t1, t1, t1
+.rept MAX_ARGS - 8
+ // Skip the first 8 args, that are loaded into registers
+ ldx.d t2, t4, t0
+ stx.d t2, sp, t1
+ addi.d t0, t0, 8
+ addi.d t1, t1, 8
+.endr
+ move t3, a0 // Func
+ ld.d a0, t4, 0
+ ld.d a1, t4, 8
+ ld.d a2, t4, 16
+ ld.d a3, t4, 24
+ ld.d a4, t4, 32
+ ld.d a5, t4, 40
+ ld.d a6, t4, 48
+ ld.d a7, t4, 56
+
+ jirl ra, t3, 0
+
+ addi.d sp, sp, ARG_STACK
+ ld.d t2, sp, 0 // ok
+ ld.d ra, sp, 8 // Ret address
+ addi.d sp, sp, 16
+
+ la.local t1, register_init
+ xor t3, t3, t3
+
+.macro check_reg_gr reg1
+ ld.d t0, t1, 0
+ xor t0, $s\reg1, t0
+ or t3, t3, t0
+ addi.d t1, t1, 8
+.endm
+ check_reg_gr 0
+ check_reg_gr 1
+ check_reg_gr 2
+ check_reg_gr 3
+ check_reg_gr 4
+ check_reg_gr 5
+ check_reg_gr 6
+ check_reg_gr 7
+ check_reg_gr 8
+
+.macro check_reg_fr reg1
+ ld.d t0, t1, 0
+ movfr2gr.d t4,$fs\reg1
+ xor t0, t0, t4
+ or t3, t3, t0
+ addi.d t1, t1, 8
+.endm
+ check_reg_fr 0
+ check_reg_fr 1
+ check_reg_fr 2
+ check_reg_fr 3
+ check_reg_fr 4
+ check_reg_fr 5
+ check_reg_fr 6
+ check_reg_fr 7
+
+ beqz t3, 0f
+
+ st.d zero,t2, 0x00 // Set OK to 0
+ la.local a0, error_message
+ addi.d sp, sp, -8
+ st.d ra, sp, 0
+ bl puts
+ ld.d ra, sp, 0
+ addi.d sp, sp, 8
+0:
+ ld.d s0, sp, 0
+ ld.d s1, sp, 8
+ ld.d s2, sp, 16
+ ld.d s3, sp, 24
+ ld.d s4, sp, 32
+ ld.d s5, sp, 40
+ ld.d s6, sp, 48
+ ld.d s7, sp, 56
+ ld.d s8, sp, 64
+ fld.d fs0, sp, 72
+ fld.d fs1, sp, 80
+ fld.d fs2, sp, 88
+ fld.d fs3, sp, 96
+ fld.d fs4, sp, 104
+ fld.d fs5, sp, 112
+ fld.d fs6, sp, 120
+ fld.d fs7, sp, 128
+ addi.d sp, sp, 136
+endfunc
=====================================
tools/checkasm.c
=====================================
@@ -274,6 +274,10 @@ intptr_t x264_checkasm_call_noneon( intptr_t (*func)(), int *ok, ... );
intptr_t (*x264_checkasm_call)( intptr_t (*func)(), int *ok, ... ) = x264_checkasm_call_noneon;
#endif
+#if ARCH_LOONGARCH
+intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
+#endif
+
#define call_c1(func,...) func(__VA_ARGS__)
#if HAVE_MMX && ARCH_X86_64
@@ -300,6 +304,12 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... );
x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, 0, 0, __VA_ARGS__ ); })
#elif HAVE_MMX || HAVE_ARMV6
#define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ )
+#elif ARCH_LOONGARCH && HAVE_LSX
+void x264_checkasm_stack_clobber( uint64_t clobber, ... );
+#define call_a1(func,...) ({ \
+ uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
+ x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+8 */ \
+ x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, 0, 0, __VA_ARGS__ ); })
#else
#define call_a1 call_c1
#endif
View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/585e01997f0c7e6d72c8ca466406d955c07de912...7ed753b10a61d0be95f683289dfb925b800b0676
--
View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/585e01997f0c7e6d72c8ca466406d955c07de912...7ed753b10a61d0be95f683289dfb925b800b0676
You're receiving this email because of your account on code.videolan.org.
VideoLAN code repository instance
More information about the x264-devel
mailing list