[x264-devel] Recent changes ported to AMD64
Josef Zlomek
josef.zlomek at xeris.cz
Wed May 11 11:15:50 CEST 2005
Hi,
Recently, two changes to mc-a2.asm for i386 were committed to svn.
The attached patch ports these changes to amd64,
making the i386 and amd64 code equivalent again.
The resulting code does not look exactly the same, though: because
amd64 has more registers, we do not have to spill as many values
to memory and can instead keep them in the additional registers.
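For illustration, a minimal sketch of the difference (saved_stride is
a made-up label, not code from the patch):

    ; i386: only 8 general-purpose registers, so a value such as
    ; src_stride has to be spilled to memory and reloaded
    mov [saved_stride], ecx    ; spill (saved_stride: made-up label)
    ; ... other work clobbers ecx ...
    mov ecx, [saved_stride]    ; reload before the next use

    ; amd64: r8-r15 are also available, so the value simply
    ; stays in a register for the whole function
    mov r13, rcx               ; keep src_stride in r13 throughout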
Josef Zlomek
Index: common/amd64/mc-a2.asm
===================================================================
--- common/amd64/mc-a2.asm (revision 222)
+++ common/amd64/mc-a2.asm (working copy)
@@ -49,12 +49,8 @@
mmx_dw_5:
times 4 dw -5
-SECTION .data
+%assign tbuffer 0
-buffer:
- dq 0
-
-
;=============================================================================
; Macros
;=============================================================================
@@ -133,117 +129,95 @@
x264_center_filter_mmxext :
push rbp
- mov rbp, rsp
push rbx
push r12
push r13
push r14
push r15
+ mov rbp, rsp
- mov r12, r8 ; src
movsxd r13, r9d ; src_stride
+ mov r12, r8 ; src
+ sub r12, r13
+ sub r12, r13 ; tsrc = src - 2 * src_stride
+
+ ; use 24 instead of 18 (used in i386/mc-a2.asm) to keep rsp aligned
+ lea rax, [r13 + r13 + 24 + tbuffer]
+ sub rsp, rax
+
mov r10, rdx ; dst2
movsxd r11, ecx ; dst2_stride
mov r8, rdi ; dst1
movsxd r9, esi ; dst1_stride
- movsxd r14, dword [rbp + 16] ; width
- movsxd r15, dword [rbp + 24] ; height
+ movsxd r14, dword [rbp + 56] ; width
+ movsxd r15, dword [rbp + 64] ; height
- mov rsi, r12 ; src
+ mov rcx, r13 ; src_stride
+ lea rbx, [r13 + r13 * 2] ; 3 * src_stride
+ lea rdx, [r13 + r13 * 4] ; 5 * src_stride
- mov rcx, r13 ; src_stride
-
- sub rsp, rcx
- sub rsp, rcx ; rsp is now at the beginning of the buffer
- mov [buffer], rsp ; buffer
-
- ;sub rsi, 2
- sub rsi, rcx
- sub rsi, rcx ; rsi - 2 - 2 * stride
- mov r12, rsi
-
- ;sub rdi, 2
-
- lea rbx, [rcx+rcx*2] ; 3 * src_stride
- lea rdx, [rcx+rcx*4] ; 5 * src_stride
-
pxor mm0, mm0 ; 0 ---> mm0
movq mm7, [mmx_dd_one] ; for rounding
- mov rbp, r15
-
loopcy:
- dec rbp
- mov rax, r14
- mov rdi, r8
- mov rsp, [buffer]
- mov rsi, r12
+ xor rax, rax
+ mov rsi, r12 ; tsrc
FILT_ALL rsi
pshufw mm2, mm1, 0
- movq [rsp], mm2
- movq [rsp+8],mm1
+ movq [rsp + tbuffer], mm2
+ movq [rsp + tbuffer + 8], mm1
paddw mm1, [mmx_dw_one]
psraw mm1, 5
- add rsp, 16
packuswb mm1, mm1
- movd [rdi], mm1
+ movd [r8], mm1 ; dst1[0] = mm1
- sub rax, 8
- add rdi, 4
+ add rax, 8
add rsi, 4
+ lea rdi, [r8 - 4] ; rdi = dst1 - 4
loopcx1:
- sub rax, 4
-
FILT_ALL rsi
- movq [rsp], mm1
+ movq [rsp + tbuffer + 2 * rax], mm1
paddw mm1, [mmx_dw_one]
psraw mm1, 5
packuswb mm1, mm1
- movd [rdi], mm1
+ movd [rdi + rax], mm1 ; dst1[rax - 4] = mm1
- add rsp, 8
add rsi, 4
- add rdi, 4
- test rax, rax
+ add rax, 4
+ cmp rax, r14 ; cmp rax, width
jnz loopcx1
FILT_ALL rsi
pshufw mm2, mm1, 7
- movq [rsp], mm1
- movq [rsp+8], mm2
+ movq [rsp + tbuffer + 2 * rax], mm1
+ movq [rsp + tbuffer + 2 * rax + 8], mm2
paddw mm1, [mmx_dw_one]
psraw mm1, 5
packuswb mm1, mm1
- movd [rdi], mm1
- add rsp, 8
+ movd [rdi + rax], mm1 ; dst1[rax - 4] = mm1
- add r12, rcx
+ add r12, r13 ; tsrc = tsrc + src_stride
- add r8, r9
+ add r8, r9 ; dst1 = dst1 + dst1_stride
- mov rax, r14
- mov rdi, r10
- mov rsp, [buffer]
- add rsp, 4
+ xor rax, rax
loopcx2:
- sub rax, 4
-
- movq mm2, [rsp + 2 * rax + 2]
- movq mm3, [rsp + 2 * rax + 4]
- movq mm4, [rsp + 2 * rax + 6]
- movq mm5, [rsp + 2 * rax + 8]
- movq mm1, [rsp + 2 * rax]
- movq mm6, [rsp + 2 * rax + 10]
+ movq mm2, [rsp + 2 * rax + 2 + 4 + tbuffer]
+ movq mm3, [rsp + 2 * rax + 4 + 4 + tbuffer]
+ movq mm4, [rsp + 2 * rax + 6 + 4 + tbuffer]
+ movq mm5, [rsp + 2 * rax + 8 + 4 + tbuffer]
+ movq mm1, [rsp + 2 * rax + 4 + tbuffer]
+ movq mm6, [rsp + 2 * rax + 10 + 4 + tbuffer]
paddw mm2, mm5
paddw mm3, mm4
paddw mm1, mm6
@@ -278,20 +252,19 @@
packssdw mm2, mm3
packuswb mm2, mm0
- movd [rdi + rax], mm2
+ movd [r10 + rax], mm2 ; dst2[rax] = mm2
- test rax, rax
+ add rax, 4
+ cmp rax, r14 ; cmp rax, width
jnz loopcx2
- add rdi, r11
- mov r10, rdi
+ add r10, r11 ; dst2 += dst2_stride
- test rbp, rbp
+ dec r15 ; height
+ test r15, r15
jnz loopcy
- mov rsp, [buffer]
- shl rcx, 1
- add rsp, rcx
+ mov rsp, rbp
pop r15
pop r14
@@ -314,6 +287,7 @@
x264_horizontal_filter_mmxext :
movsxd r10, esi ; dst_stride
movsxd r11, ecx ; src_stride
+ movsxd r8, r8d ; width
; mov rdi, rdi ; dst
mov rsi, rdx ; src
@@ -328,11 +302,11 @@
loophy:
dec rcx
- movsxd rax, r8d ; width
+ xor rax, rax
loophx:
- sub rax, 8
+ prefetchnta [rsi + rax + 48]
LOAD_4 mm1, mm2, mm3, mm4, [rsi + rax], [rsi + rax + 1], [rsi + rax + 2], [rsi + rax + 3], mm0
FILT_2 mm1, mm2
@@ -359,7 +333,8 @@
packuswb mm1, mm2
movq [rdi + rax], mm1
- test rax, rax
+ add rax, 8
+ cmp rax, r8 ; cmp rax, width
jnz loophx
add rsi, r11 ; src_pitch
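For reference, the new stack-buffer handling in x264_center_filter_mmxext
follows this general shape (a minimal sketch of the pattern, not the patch
code itself):

    push rbp
    push rbx                 ; ... push the remaining callee-saved registers ...
    mov rbp, rsp             ; rbp marks the stack top after the pushes
    sub rsp, rax             ; allocate the temporary buffer (size in rax)
    ; ... body addresses the buffer as [rsp + tbuffer] ...
    mov rsp, rbp             ; release the whole buffer at once
    pop rbx                  ; ... pop in reverse order ...
    pop rbp
    ret

Replacing the static buffer in SECTION .data with this per-call stack
buffer also makes the function reentrant.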
--
This is the x264-devel mailing-list
To unsubscribe, go to: http://developers.videolan.org/lists.html