<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="Generator" content="Microsoft Word 15 (filtered medium)">
<style><!--
/* Font Definitions */
@font-face
{font-family:SimSun;
panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
{font-family:"Cambria Math";
panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
{font-family:DengXian;
panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
{font-family:Aptos;
panose-1:2 11 0 4 2 2 2 2 2 4;}
@font-face
{font-family:"Apple Color Emoji";
panose-1:0 0 0 0 0 0 0 0 0 0;}
@font-face
{font-family:"\@DengXian";
panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
{font-family:"\@SimSun";
panose-1:2 1 6 0 3 1 1 1 1 1;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0in;
font-size:12.0pt;
font-family:"Aptos",sans-serif;}
.MsoChpDefault
{mso-style-type:export-only;
font-size:10.0pt;
mso-ligatures:none;}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
--></style>
</head>
<body lang="EN-US" link="#467886" vlink="#96607D" style="word-wrap:break-word">
<div class="WordSection1">
<p class="MsoNormal"><span style="font-size:11.0pt">Hi Chen,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt"><o:p> </o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt">Thanks for the feedback!<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt"><o:p> </o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt">The patch gives more visible performance uplift on Neoverse N2 and V3.<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt"><o:p> </o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt">Some benchmarking statistics:<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt">N2: 1.213x speedup<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt">V2: 1.053x speedup<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt">V3: 1.321x speedup<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt"><o:p> </o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt">Regards,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt">Li<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt"><o:p> </o:p></span></p>
<div id="mail-editor-reference-message-container">
<div>
<div>
<div style="border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in">
<p class="MsoNormal" style="margin-bottom:12.0pt"><b><span style="color:black">From:
</span></b><span style="color:black">x265-devel &lt;x265-devel-bounces@videolan.org&gt; on behalf of chen &lt;chenm003@163.com&gt;<br>
<b>Date: </b>Thursday, 2025. June 19. at 8:50<br>
<b>To: </b>Development for x265 &lt;x265-devel@videolan.org&gt;<br>
<b>Cc: </b>nd &lt;nd@arm.com&gt;<br>
<b>Subject: </b>Re: [x265] [PATCH] AArch64: Optimize standard bit-depth Neon transpose8x8<o:p></o:p></span></p>
</div>
<div>
<div id="spnEditorContent">
<p style="margin:0in"><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black">Hi Li,<o:p></o:p></span></p>
<p style="margin:0in"><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black"><o:p>&nbsp;</o:p></span></p>
<p style="margin:0in"><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black">The patch looks good, but the performance almost no different,<o:p></o:p></span></p>
<p style="margin:0in"><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black">we may keep origin version no change<o:p></o:p></span></p>
<p style="margin:0in"><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black"><o:p>&nbsp;</o:p></span></p>
<p style="margin:0in"><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black">Regards,<br>
Chen<o:p></o:p></span></p>
</div>
<p><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black">At 2025-06-18 17:21:36, "Li Zhang" &lt;Li.Zhang2@arm.com&gt; wrote:<o:p></o:p></span></p>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt" id="isReplyContent">
<div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:11.0pt;font-family:&quot;Arial&quot;,sans-serif;color:black">Hi,</span><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black"><o:p></o:p></span></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:11.0pt;font-family:&quot;Arial&quot;,sans-serif;color:black">&nbsp;</span><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black"><o:p></o:p></span></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:11.0pt;font-family:&quot;Arial&quot;,sans-serif;color:black">May I ask for some attention on this patch?
</span><span style="font-size:11.0pt;font-family:&quot;Apple Color Emoji&quot;;color:black"></span><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black"><o:p></o:p></span></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:11.0pt;font-family:&quot;Arial&quot;,sans-serif;color:black">&nbsp;</span><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black"><o:p></o:p></span></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:11.0pt;font-family:&quot;Arial&quot;,sans-serif;color:black">Thanks,</span><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black"><o:p></o:p></span></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:11.0pt;font-family:&quot;Arial&quot;,sans-serif;color:black">Li</span><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black"><o:p></o:p></span></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:11.0pt;font-family:&quot;Arial&quot;,sans-serif;color:black">&nbsp;</span><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black"><o:p></o:p></span></p>
<div id="mail-editor-reference-message-container">
<div>
<div>
<div style="border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in">
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:12.0pt"><b><span style="font-family:&quot;Arial&quot;,sans-serif;color:black">From:
</span></b><span style="font-family:&quot;Arial&quot;,sans-serif;color:black">x265-devel &lt;x265-devel-bounces@videolan.org&gt; on behalf of Li Zhang &lt;li.zhang2@arm.com&gt;<br>
<b>Date: </b>Tuesday, 2025. May 20. at 13:07<br>
<b>To: </b>x265-devel@videolan.org &lt;x265-devel@videolan.org&gt;<br>
<b>Cc: </b>nd &lt;nd@arm.com&gt;<br>
<b>Subject: </b>[x265] [PATCH] AArch64: Optimize standard bit-depth Neon transpose8x8</span><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black"><o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:12.0pt"><span style="font-size:11.0pt;font-family:&quot;Arial&quot;,sans-serif;color:black">Combine 64-bit input vectors into 128-bit vectors before starting the<br>
transpose to make use of the full vector bandwidth.<br>
---<br>
source/common/aarch64/arm64-utils.cpp | 105 ++++++++++++--------------<br>
1 file changed, 49 insertions(+), 56 deletions(-)<br>
<br>
diff --git a/source/common/aarch64/arm64-utils.cpp b/source/common/aarch64/arm64-utils.cpp<br>
index af93729f1..7293b2e72 100644<br>
--- a/source/common/aarch64/arm64-utils.cpp<br>
+++ b/source/common/aarch64/arm64-utils.cpp<br>
@@ -2,6 +2,7 @@<br>
#include "x265.h"<br>
#include "arm64-utils.h"<br>
#include &lt;arm_neon.h&gt;<br>
+#include "mem-neon.h"<br>
<br>
namespace X265_NS<br>
{<br>
@@ -10,65 +11,57 @@ namespace X265_NS<br>
<br>
void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)<br>
{<br>
- uint8x8_t a0 = vld1_u8(src + 0 * sstride);<br>
- uint8x8_t a1 = vld1_u8(src + 1 * sstride);<br>
- uint8x8_t a2 = vld1_u8(src + 2 * sstride);<br>
- uint8x8_t a3 = vld1_u8(src + 3 * sstride);<br>
- uint8x8_t a4 = vld1_u8(src + 4 * sstride);<br>
- uint8x8_t a5 = vld1_u8(src + 5 * sstride);<br>
- uint8x8_t a6 = vld1_u8(src + 6 * sstride);<br>
- uint8x8_t a7 = vld1_u8(src + 7 * sstride);<br>
-<br>
- uint32x2_t b0 = vtrn1_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));<br>
- uint32x2_t b1 = vtrn1_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));<br>
- uint32x2_t b2 = vtrn1_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));<br>
- uint32x2_t b3 = vtrn1_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));<br>
- uint32x2_t b4 = vtrn2_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));<br>
- uint32x2_t b5 = vtrn2_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));<br>
- uint32x2_t b6 = vtrn2_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));<br>
- uint32x2_t b7 = vtrn2_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));<br>
-<br>
- uint16x4_t c0 = vtrn1_u16(vreinterpret_u16_u32(b0),<br>
- vreinterpret_u16_u32(b2));<br>
- uint16x4_t c1 = vtrn1_u16(vreinterpret_u16_u32(b1),<br>
- vreinterpret_u16_u32(b3));<br>
- uint16x4_t c2 = vtrn2_u16(vreinterpret_u16_u32(b0),<br>
- vreinterpret_u16_u32(b2));<br>
- uint16x4_t c3 = vtrn2_u16(vreinterpret_u16_u32(b1),<br>
- vreinterpret_u16_u32(b3));<br>
- uint16x4_t c4 = vtrn1_u16(vreinterpret_u16_u32(b4),<br>
- vreinterpret_u16_u32(b6));<br>
- uint16x4_t c5 = vtrn1_u16(vreinterpret_u16_u32(b5),<br>
- vreinterpret_u16_u32(b7));<br>
- uint16x4_t c6 = vtrn2_u16(vreinterpret_u16_u32(b4),<br>
- vreinterpret_u16_u32(b6));<br>
- uint16x4_t c7 = vtrn2_u16(vreinterpret_u16_u32(b5),<br>
- vreinterpret_u16_u32(b7));<br>
-<br>
- uint8x8_t d0 = vtrn1_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1));<br>
- uint8x8_t d1 = vtrn2_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1));<br>
- uint8x8_t d2 = vtrn1_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3));<br>
- uint8x8_t d3 = vtrn2_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3));<br>
- uint8x8_t d4 = vtrn1_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5));<br>
- uint8x8_t d5 = vtrn2_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5));<br>
- uint8x8_t d6 = vtrn1_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7));<br>
- uint8x8_t d7 = vtrn2_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7));<br>
-<br>
- vst1_u8(dst + 0 * dstride, d0);<br>
- vst1_u8(dst + 1 * dstride, d1);<br>
- vst1_u8(dst + 2 * dstride, d2);<br>
- vst1_u8(dst + 3 * dstride, d3);<br>
- vst1_u8(dst + 4 * dstride, d4);<br>
- vst1_u8(dst + 5 * dstride, d5);<br>
- vst1_u8(dst + 6 * dstride, d6);<br>
- vst1_u8(dst + 7 * dstride, d7);<br>
+ // a0: 00 01 02 03 04 05 06 07<br>
+ // a1: 10 11 12 13 14 15 16 17<br>
+ // a2: 20 21 22 23 24 25 26 27<br>
+ // a3: 30 31 32 33 34 35 36 37<br>
+ // a4: 40 41 42 43 44 45 46 47<br>
+ // a5: 50 51 52 53 54 55 56 57<br>
+ // a6: 60 61 62 63 64 65 66 67<br>
+ // a7: 70 71 72 73 74 75 76 77<br>
+ uint8x8_t a[8];<br>
+ load_u8x8xn&lt;8&gt;(src, sstride, a);<br>
+<br>
+ // a04: 00 40 01 41 02 42 03 43 04 44 05 45 06 46 07 47<br>
+ // a15: 10 50 11 51 12 52 13 53 14 54 15 55 16 56 17 57<br>
+ // a26: 20 60 21 61 22 62 23 63 24 64 25 65 26 66 27 67<br>
+ // a37: 30 70 31 71 32 72 33 73 34 74 35 75 36 76 37 77<br>
+ // Combine with 0 vector will be optimized away by the compiler<br>
+ // as the load will zero the upper half of the register.<br>
+ uint8x16_t a04 = vzip1q_u8(vcombine_u8(a[0], vdup_n_u8(0)),<br>
+ vcombine_u8(a[4], vdup_n_u8(0)));<br>
+ uint8x16_t a15 = vzip1q_u8(vcombine_u8(a[1], vdup_n_u8(0)),<br>
+ vcombine_u8(a[5], vdup_n_u8(0)));<br>
+ uint8x16_t a26 = vzip1q_u8(vcombine_u8(a[2], vdup_n_u8(0)),<br>
+ vcombine_u8(a[6], vdup_n_u8(0)));<br>
+ uint8x16_t a37 = vzip1q_u8(vcombine_u8(a[3], vdup_n_u8(0)),<br>
+ vcombine_u8(a[7], vdup_n_u8(0)));<br>
+<br>
+ // a0246[0]: 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63<br>
+ // a0246[1]: 04 24 44 64 05 25 45 65 06 26 46 66 07 27 47 67<br>
+ // a1357[0]: 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73<br>
+ // a1357[1]: 14 34 54 74 15 35 55 75 16 36 56 76 17 37 57 77<br>
+ uint8x16x2_t a0246 = vzipq_u8(a04, a26);<br>
+ uint8x16x2_t a1357 = vzipq_u8(a15, a37);<br>
+<br>
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71<br>
+ // d0.val[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73<br>
+ // d1.val[0]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75<br>
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77<br>
+ uint8x16x2_t d0 = vzipq_u8(a0246.val[0],a1357.val[0]);<br>
+ uint8x16x2_t d1 = vzipq_u8(a0246.val[1],a1357.val[1]);<br>
+<br>
+ vst1_u8(dst + 0 * dstride, vget_low_u8(d0.val[0]));<br>
+ vst1_u8(dst + 1 * dstride, vget_high_u8(d0.val[0]));<br>
+ vst1_u8(dst + 2 * dstride, vget_low_u8(d0.val[1]));<br>
+ vst1_u8(dst + 3 * dstride, vget_high_u8(d0.val[1]));<br>
+ vst1_u8(dst + 4 * dstride, vget_low_u8(d1.val[0]));<br>
+ vst1_u8(dst + 5 * dstride, vget_high_u8(d1.val[0]));<br>
+ vst1_u8(dst + 6 * dstride, vget_low_u8(d1.val[1]));<br>
+ vst1_u8(dst + 7 * dstride, vget_high_u8(d1.val[1]));<br>
}<br>
<br>
<br>
-<br>
-<br>
-<br>
-<br>
void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)<br>
{<br>
uint8x16_t a0 = vld1q_u8(src + 0 * sstride);<br>
-- <br>
2.39.5 (Apple Git-154)</span><span style="font-size:10.5pt;font-family:&quot;Arial&quot;,sans-serif;color:black"><o:p></o:p></span></p>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</blockquote>
</div>
</div>
</div>
</div>
</div>
</body>
</html>