[vlc-devel] [PATCH 05/16] bitslice transform: rewrite

glenvt18 glenvt18 at gmail.com
Fri Jun 26 13:19:59 CEST 2015


What has been done:
1. New optimized matrix transpose routines.
2. New BS_SWAPxx target-specific macros can be defined for further speed-up.
3. Separate 32 bit and all other stream transforms.
4. Packet data are read/written in native endianness. Endianness is now
   handled by transpose routines. No need for endianness conversion anymore.
5. Use unaligned 32-bit memory access for ARM if supported by the target.
---
 configure.ac                       |  15 ++-
 src/Makefile.am                    |  27 ++---
 src/dvbcsa_bs_stream.c             |   4 +-
 src/dvbcsa_bs_transpose.c          | 112 ------------------
 src/dvbcsa_bs_transpose.h          |  71 ++++++++++++
 src/dvbcsa_bs_transpose128.c       | 209 ---------------------------------
 src/dvbcsa_bs_transpose32.c        | 185 -----------------------------
 src/dvbcsa_bs_transpose64.c        | 186 -----------------------------
 src/dvbcsa_bs_transpose_block.c    |  97 ++++++++++++++++
 src/dvbcsa_bs_transpose_stream.c   | 231 +++++++++++++++++++++++++++++++++++++
 src/dvbcsa_bs_transpose_stream32.c | 150 ++++++++++++++++++++++++
 src/dvbcsa_pv.h                    |  78 +++++++++++--
 12 files changed, 637 insertions(+), 728 deletions(-)
 delete mode 100644 src/dvbcsa_bs_transpose.c
 create mode 100644 src/dvbcsa_bs_transpose.h
 delete mode 100644 src/dvbcsa_bs_transpose128.c
 delete mode 100644 src/dvbcsa_bs_transpose32.c
 delete mode 100644 src/dvbcsa_bs_transpose64.c
 create mode 100644 src/dvbcsa_bs_transpose_block.c
 create mode 100644 src/dvbcsa_bs_transpose_stream.c
 create mode 100644 src/dvbcsa_bs_transpose_stream32.c

diff --git a/configure.ac b/configure.ac
index cefdf8a..4dd0726 100644
--- a/configure.ac
+++ b/configure.ac
@@ -32,23 +32,26 @@ AC_C_CONST
 AC_C_INLINE
 AC_CHECK_SIZEOF(long)
 
+AC_C_BIGENDIAN(
+ [AC_DEFINE(DVBCSA_ENDIAN_BIG, 1, [Target is big-endian])],
+ [AC_DEFINE(DVBCSA_ENDIAN_LITTLE, 1, [Target is little-endian])],
+ [AC_MSG_ERROR([unknown endianness])],
+ [AC_MSG_ERROR([universal endianness not supported])]
+)
+
 if test "$enable_mmx" = "yes" ; then
-     transpose_64=yes
      AC_DEFINE(DVBCSA_USE_MMX, 1, Using MMX bitslice.)
      GCC_CFLAGS="$GCC_CFLAGS -mmmx"
 
 elif test "$enable_sse2" = "yes" ; then
-     transpose_128=yes
      AC_DEFINE(DVBCSA_USE_SSE, 1, Using SSE2 bitslice.)
      GCC_CFLAGS="$GCC_CFLAGS -msse -msse2"
 
 elif test "$enable_altivec" = "yes" ; then
-     transpose_128=yes
      AC_DEFINE(DVBCSA_USE_ALTIVEC, 1, Using AltiVec bitslice.)
      GCC_CFLAGS="$GCC_CFLAGS -maltivec -mabi=altivec"
 
 elif test "$enable_neon" = "yes" ; then
-     transpose_128=yes
      AC_DEFINE(DVBCSA_USE_NEON, 1, Using NEON bitslice.)
      GCC_CFLAGS="$GCC_CFLAGS -mfpu=neon"
 
@@ -57,13 +60,11 @@ elif test "$enable_uint32" = "yes" ; then
      AC_DEFINE(DVBCSA_USE_UINT32, 1, Using 32 bits integer bitslice.)
 
 elif test "$enable_uint64" = "yes" ; then
-     transpose_64=yes
      AC_DEFINE(DVBCSA_USE_UINT64, 1, Using 64 bits integer bitslice.)
 
 else
      case $ac_cv_sizeof_long in
      	  8)
-	       transpose_64=yes
 	       AC_DEFINE(DVBCSA_USE_UINT64, 1, Using 64 bits integer bitslice.)
 	  ;;
 	  *)
@@ -73,8 +74,6 @@ else
      esac
 fi
 
-AM_CONDITIONAL(TRANSPOSE_128, test "$transpose_128" = "yes")
-AM_CONDITIONAL(TRANSPOSE_64, test "$transpose_64" = "yes")
 AM_CONDITIONAL(TRANSPOSE_32, test "$transpose_32" = "yes")
 
 if test "$GCC" = "yes" ; then
diff --git a/src/Makefile.am b/src/Makefile.am
index 3bad07a..996ce4d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -3,23 +3,20 @@ SUBDIRS = dvbcsa
 
 lib_LTLIBRARIES = libdvbcsa.la
 
-libdvbcsa_la_SOURCES = dvbcsa_algo.c dvbcsa_block.c dvbcsa_bs_algo.c	\
-	dvbcsa_bs_block.c dvbcsa_bs_key.c dvbcsa_bs_stream.c		\
-	dvbcsa_stream.c dvbcsa_bs.h dvbcsa_pv.h dvbcsa_bs_uint64.h	\
-	dvbcsa_bs_uint32.h dvbcsa_bs_mmx.h dvbcsa_bs_sse.h		\
-	dvbcsa_bs_altivec.h dvbcsa_bs_neon.h dvbcsa_bs_transpose.c dvbcsa_key.c	\
-	dvbcsa_bs_stream_kernel.inc dvbcsa_bs_stream_kernel.h
-
-if TRANSPOSE_128
-libdvbcsa_la_SOURCES += dvbcsa_bs_transpose128.c
-endif
-
-if TRANSPOSE_64
-libdvbcsa_la_SOURCES += dvbcsa_bs_transpose64.c
-endif
+libdvbcsa_la_SOURCES = \
+	dvbcsa_algo.c dvbcsa_block.c dvbcsa_stream.c dvbcsa_key.c \
+	dvbcsa_bs_algo.c dvbcsa_bs_block.c dvbcsa_bs_stream.c dvbcsa_bs_key.c \
+	dvbcsa_bs_transpose_block.c \
+	dvbcsa_bs_stream_kernel.inc dvbcsa_bs_stream_kernel.h \
+	dvbcsa_bs.h dvbcsa_pv.h dvbcsa_bs_transpose.h \
+	dvbcsa_bs_uint32.h dvbcsa_bs_uint64.h \
+	dvbcsa_bs_mmx.h dvbcsa_bs_sse.h \
+	dvbcsa_bs_altivec.h dvbcsa_bs_neon.h
 
 if TRANSPOSE_32
-libdvbcsa_la_SOURCES += dvbcsa_bs_transpose32.c
+libdvbcsa_la_SOURCES += dvbcsa_bs_transpose_stream32.c
+else
+libdvbcsa_la_SOURCES += dvbcsa_bs_transpose_stream.c
 endif
 
 libdvbcsa_la_LDFLAGS = -version-info 1:1:0 $(libtool_flags)
diff --git a/src/dvbcsa_bs_stream.c b/src/dvbcsa_bs_stream.c
index 688a70d..41c0099 100644
--- a/src/dvbcsa_bs_stream.c
+++ b/src/dvbcsa_bs_stream.c
@@ -80,9 +80,7 @@ dvbcsa_bs_stream_cipher_batch(const struct dvbcsa_bs_key_s *key,
   for (h = 8; h < maxlen; h += 8)
     {
       dvbcsa_bs_stream_cipher_kernel(&regs);
-      for (i = 0; i < 8; i++)
-          dvbcsa_bs_stream_transpose_out(pcks, h + i, regs.cb + i * 8);
-
+      dvbcsa_bs_stream_transpose_out(pcks, h, regs.cb);
     }
 
 }
diff --git a/src/dvbcsa_bs_transpose.c b/src/dvbcsa_bs_transpose.c
deleted file mode 100644
index 57d208e..0000000
--- a/src/dvbcsa_bs_transpose.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
-
-    This file is part of libdvbcsa.
-
-    libdvbcsa is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published
-    by the Free Software Foundation; either version 2 of the License,
-    or (at your option) any later version.
-
-    libdvbcsa is distributed in the hope that it will be useful, but
-    WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with libdvbcsa; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-    02111-1307 USA
-
-    Based on FFdecsa, Copyright (C) 2003-2004  fatih89r
-
-    (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
-
-*/
-
-#include "dvbcsa/dvbcsa.h"
-#include "dvbcsa_bs.h"
-
-/***********************************************************************
-	Block cipher transpose
- */
-
-void dvbcsa_bs_block_transpose_in (dvbcsa_bs_word_t *out,
-				   const struct dvbcsa_bs_batch_s *pcks,
-				   unsigned int offset)
-{
-  uint32_t		*ri = (uint32_t *) out;
-  unsigned int		j, i, k;
-
-  for (i = 0; pcks[i].data; i++)
-    if (offset < (pcks[i].len & (unsigned)~0x7))
-      {
-        ri[i                ] = dvbcsa_load_le32(pcks[i].data + offset);
-        ri[i + BS_BATCH_SIZE] = dvbcsa_load_le32(pcks[i].data + offset + 4);
-      }
-
-  for (j = 0; j < 64; j += 32)
-    for (i = 0; i < 16; i += 8)
-      for (k = 0; k < 8; k++)
-	{
-	  dvbcsa_bs_word_t *r = out + j + i + k;
-	  dvbcsa_bs_word_t t, b;
-
-	  t = r[0];
-	  b = r[16];
-	  r[0]  = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
-	  r[16] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
-	}
-
-  for (j = 0; j < 64; j += 16)
-    for (k = 0; k < 8; k++)
-      {
-	dvbcsa_bs_word_t *r = out + j + k;
-	dvbcsa_bs_word_t t, b;
-
-	t = r[0];
-	b = r[8];
-	r[0] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
-	r[8] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
-      }
-}
-
-void dvbcsa_bs_block_transpose_out (dvbcsa_bs_word_t *in,
-				    const struct dvbcsa_bs_batch_s *pcks,
-				    unsigned int offset)
-{
-  uint32_t		*ri = (uint32_t *) in;
-  unsigned int		j, i, k;
-
-  for (j = 0; j < 64; j += 16)
-    for (k = 0; k < 8; k++)
-      {
-	dvbcsa_bs_word_t *r = in + j + k;
-	dvbcsa_bs_word_t t, b;
-	
-	t = r[0];
-	b = r[8];
-	r[0] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
-	r[8] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
-      }
-
-  for (j = 0; j < 64; j += 32)
-    for (i = 0; i < 16; i += 8)
-      for (k = 0; k < 8; k++)
-	{
-	  dvbcsa_bs_word_t *r = in + j + i + k;
-	  dvbcsa_bs_word_t t, b;
-
-	  t = r[0];
-	  b = r[16];
-	  r[0]  = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
-	  r[16] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
-	}
-
-  for (i = 0; pcks[i].data; i++)
-    if (offset < (pcks[i].len & (unsigned)~0x7))
-      {
-	dvbcsa_store_le32(pcks[i].data + offset    , ri[i                ]);
-	dvbcsa_store_le32(pcks[i].data + offset + 4, ri[i + BS_BATCH_SIZE]);
-      }
-}
-
diff --git a/src/dvbcsa_bs_transpose.h b/src/dvbcsa_bs_transpose.h
new file mode 100644
index 0000000..56ecf47
--- /dev/null
+++ b/src/dvbcsa_bs_transpose.h
@@ -0,0 +1,71 @@
+#ifndef DVBCSA_BS_TRANSPOSE_H_
+#define DVBCSA_BS_TRANSPOSE_H_
+
+#include "dvbcsa_bs.h"
+
+/*
+    2x2 matrix transpose swap operation:
+
+    t = |a b  c d|  =>  |f b  h d|
+    b = |e f  g h|  =>  |e a  g c|
+
+    'a' and 'e' are MSB of dvbcsa_bs_word_t.
+    (little-endian transpose)
+
+    tmp = (b ^ (t>>j)) & m;
+    b = b ^ tmp;
+    t = t ^ (tmp<<j);
+*/
+
+#define BS_SWAP_BITS_LE(t, b, shift, mask) \
+    { \
+    dvbcsa_bs_word_t tmp; \
+    tmp = BS_AND(BS_XOR(BS_SHR(t, shift), b), mask); \
+    (b) = BS_XOR((b), tmp); \
+    (t) = BS_XOR((t), BS_SHL(tmp, shift)); \
+    }
+/* Per-width swap steps; each can be pre-defined by a target before this point. */
+#ifndef BS_SWAP32_LE
+#define BS_SWAP32_LE(t, b) BS_SWAP_BITS_LE(t, b, 32, BS_VAL64(00000000ffffffff))
+#endif
+
+#ifndef BS_SWAP16_LE
+#define BS_SWAP16_LE(t, b) BS_SWAP_BITS_LE(t, b, 16, BS_VAL32(0000ffff))
+#endif
+
+#ifndef BS_SWAP8_LE
+#define BS_SWAP8_LE(t, b) BS_SWAP_BITS_LE(t, b, 8, BS_VAL16(00ff))
+#endif
+
+#ifndef BS_SWAP4_LE
+#define BS_SWAP4_LE(t, b) BS_SWAP_BITS_LE(t, b, 4, BS_VAL8(0f))
+#endif
+
+#ifndef BS_SWAP2_LE
+#define BS_SWAP2_LE(t, b) BS_SWAP_BITS_LE(t, b, 2, BS_VAL8(33))
+#endif
+
+#ifndef BS_SWAP1_LE
+#define BS_SWAP1_LE(t, b) BS_SWAP_BITS_LE(t, b, 1, BS_VAL8(55))
+#endif
+/* Sub-byte swaps map to the LE form whatever the target byte order is. */
+#define BS_SWAP4(t, b)   BS_SWAP4_LE(t, b)
+#define BS_SWAP2(t, b)   BS_SWAP2_LE(t, b)
+#define BS_SWAP1(t, b)   BS_SWAP1_LE(t, b)
+
+#ifdef DVBCSA_ENDIAN_LITTLE
+
+#define BS_SWAP32(t, b)  BS_SWAP32_LE(t, b)
+#define BS_SWAP16(t, b)  BS_SWAP16_LE(t, b)
+#define BS_SWAP8(t, b)   BS_SWAP8_LE(t, b)
+
+#else
+/* DVBCSA_ENDIAN_LITTLE not defined: same swaps with the operands exchanged. */
+#define BS_SWAP32(t, b)  BS_SWAP32_LE(b, t)
+#define BS_SWAP16(t, b)  BS_SWAP16_LE(b, t)
+#define BS_SWAP8(t, b)   BS_SWAP8_LE(b, t)
+
+#endif
+
+#endif
+
diff --git a/src/dvbcsa_bs_transpose128.c b/src/dvbcsa_bs_transpose128.c
deleted file mode 100644
index 8a75d09..0000000
--- a/src/dvbcsa_bs_transpose128.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
-
-    This file is part of libdvbcsa.
-
-    libdvbcsa is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published
-    by the Free Software Foundation; either version 2 of the License,
-    or (at your option) any later version.
-
-    libdvbcsa is distributed in the hope that it will be useful, but
-    WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with libdvbcsa; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-    02111-1307 USA
-
-    Based on FFdecsa, Copyright (C) 2003-2004  fatih89r
-
-    (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
-
-*/
-
-#include "dvbcsa/dvbcsa.h"
-#include "dvbcsa_bs.h"
-
-/***********************************************************************
-	Stream cipher transpose
- */
-
-/* 64 rows of 64 bits transposition (bytes transp. - 8x8 rotate counterclockwise)*/
-
-void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
-{
-  int i, j;
-
-  for (i = 0; pcks->data; i++)
-    {
-      uint64_t t, b;
-
-      if (pcks->data)
-	{
-	  if (pcks->len >= 8)
-	    t = dvbcsa_load_le64(pcks->data);
-	  pcks++;
-	}
-
-      if (pcks->data)
-	{
-	  if (pcks->len >= 8)
-	    b = dvbcsa_load_le64(pcks->data);
-	  pcks++;
-	}
-
-      row[i] = BS_VAL(b, t);
-    }
-
-  for (i = 0; i < 32; i++)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      t = row[i];
-      b = row[32 + i];
-      row[i]      = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(BS_AND(b, BS_VAL64(00000000ffffffff)), 4));
-      row[32 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(BS_AND(t, BS_VAL64(ffffffff00000000)), 4));
-    }
-
-  for (j = 0; j < 64; j += 32)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 16; i++)
-	{
-	  t = row[j + i];
-	  b = row[j + 16 + i];
-	  row[j + i]      = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
-	  row[j + 16 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
-	}
-    }
-
-  for (j = 0; j < 64; j += 16)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 8; i++)
-	{
-	  t = row[j + i];
-	  b = row[j + 8 + i];
-	  row[j + i]     = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
-	  row[j + 8 + i] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
-	}
-    }
-
-  for (j = 0; j < 64; j += 8)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 4; i++)
-	{
-	  b = row[j + i];
-	  t = row[j + 4 + i];
-	  row[j + i]     = BS_OR(BS_AND(b, BS_VAL8(0f)), BS_SHL(BS_AND(t, BS_VAL8(0f)), 4));
-	  row[j + 4 + i] = BS_OR(BS_AND(t, BS_VAL8(f0)), BS_SHR(BS_AND(b, BS_VAL8(f0)), 4));
-	}
-    }
-
-  for (j = 0; j < 64; j += 4)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 2; i++)
-	{
-	  b = row[j + i];
-	  t = row[j + 2 + i];
-	  row[j + i]     = BS_OR(BS_AND(b, BS_VAL8(33)), BS_SHL(BS_AND(t, BS_VAL8(33)), 2));
-	  row[j + 2 + i] = BS_OR(BS_AND(t, BS_VAL8(cc)), BS_SHR(BS_AND(b, BS_VAL8(cc)), 2));
-	}
-    }
-
-  for (j = 0; j < 64; j += 2)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      b = row[j];
-      t = row[j + 1];
-      row[j]     = BS_OR(BS_AND(b, BS_VAL8(55)), BS_SHL(BS_AND(t, BS_VAL8(55)), 1));
-      row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(aa)), BS_SHR(BS_AND(b, BS_VAL8(aa)), 1));
-    }
-}
-
-/* 8 rows of 64 bits transposition (bytes transp. - 8x8 rotate clockwise)*/
-
-void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
-				      unsigned int index, dvbcsa_bs_word_t *row)
-{
-  int i, j;
-
-  for (i = 0; i < 4; i++)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      t = row[i];
-      b = row[4 + i];
-      row[i]     = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(BS_AND(b, BS_VAL64(00000000ffffffff)), 4));
-      row[4 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(BS_AND(t, BS_VAL64(ffffffff00000000)), 4));
-    }
-
-  for (j = 0; j < 8; j += 4)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 2; i++)
-	{
-	  t = row[j + i];
-	  b = row[j + 2 + i];
-	  row[j + i]     = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
-	  row[j + 2 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
-	}
-    }
-
-  for (j = 0; j < 8; j += 2)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      t = row[j];
-      b = row[j + 1];
-      row[j]     = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
-      row[j + 1] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
-    }
-
-  for (j = 0; j < 8; j++)
-    {
-      dvbcsa_bs_word_t t;
-
-      t = row[j];
-
-      t = BS_OR(       BS_AND(t, BS_VAL64(f0f0f0f00f0f0f0f)),
-	  BS_OR(BS_SHR(BS_AND(t, BS_VAL64(0f0f0f0f00000000)), 28),
-		BS_SHL(BS_AND(t, BS_VAL64(00000000f0f0f0f0)), 28)));
-
-      t = BS_OR(       BS_AND(t, BS_VAL32(        cccc3333)),
-	  BS_OR(BS_SHR(BS_AND(t, BS_VAL32(        33330000)), 14),
-		BS_SHL(BS_AND(t, BS_VAL32(        0000cccc)), 14)));
-
-      t = BS_OR(       BS_AND(t, BS_VAL16(            aa55)),
-          BS_OR(BS_SHR(BS_AND(t, BS_VAL16(            5500)), 7 ),
-		BS_SHL(BS_AND(t, BS_VAL16(            00aa)), 7 )));
-
-      for (i = 0; i < BS_BATCH_BYTES; i++)
-	{
-	  static const unsigned int p[16] =
-	    {
-	      0 , 8, 1, 9, 2, 10, 3, 11,
-	      4, 12, 5, 13, 6, 14, 7, 15
-	    };
-
-	  unsigned int k = j * BS_BATCH_BYTES + i;
-
-	  if (!pcks[k].data)
-	    return;
-
-	  if (index < pcks[k].len)
-	  pcks[k].data[index] ^= BS_EXTRACT8(t, p[i]);
-	}
-    }
-}
-
diff --git a/src/dvbcsa_bs_transpose32.c b/src/dvbcsa_bs_transpose32.c
deleted file mode 100644
index 2cfff7a..0000000
--- a/src/dvbcsa_bs_transpose32.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
-
-    This file is part of libdvbcsa.
-
-    libdvbcsa is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published
-    by the Free Software Foundation; either version 2 of the License,
-    or (at your option) any later version.
-
-    libdvbcsa is distributed in the hope that it will be useful, but
-    WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with libdvbcsa; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-    02111-1307 USA
-
-    Based on FFdecsa, Copyright (C) 2003-2004  fatih89r
-
-    (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
-
-*/
-
-#include "dvbcsa/dvbcsa.h"
-#include "dvbcsa_bs.h"
-
-/***********************************************************************
-	Stream cipher transpose
- */
-
-/* 64 rows of 32 bits transposition (bytes transp. - 8x8 rotate counterclockwise)*/
-
-void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
-{
-  int i, j;
-
-  for (i = 0; pcks[i].data; i++)
-    if (pcks[i].len >= 8)
-      {
-	row[i     ] = BS_VAL(dvbcsa_load_le32(pcks[i].data    ));
-	row[i + 32] = BS_VAL(dvbcsa_load_le32(pcks[i].data + 4));
-      }
-
-  for (j = 0; j < 64; j += 32)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 16; i++)
-	{
-	  t = row[j + i];
-	  b = row[j + 16 + i];
-	  row[j + i     ] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
-	  row[j + 16 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
-	}
-    }
-
-  for (j = 0; j < 64; j += 16)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 8; i++)
-	{
-	  t = row[j + i];
-	  b = row[j + 8 + i];
-	  row[j + i    ] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
-	  row[j + 8 + i] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
-	}
-    }
-
-  for (j = 0; j < 64; j += 8)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 4; i++)
-	{
-	  t = row[j + i];
-	  b = row[j + 4 + i];
-	  row[j + i    ] = BS_OR(BS_AND(b, BS_VAL8(0f)), BS_SHL(BS_AND(t, BS_VAL8(0f)), 4));
-	  row[j + 4 + i] = BS_OR(BS_AND(t, BS_VAL8(f0)), BS_SHR(BS_AND(b, BS_VAL8(f0)), 4));
-	}
-    }
-
-  for (j = 0; j < 64; j += 4)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 2; i++)
-	{
-	  t = row[j + i];
-	  b = row[j + 2 + i];
-	  row[j + i    ] = BS_OR(BS_AND(b, BS_VAL8(33)), BS_SHL(BS_AND(t, BS_VAL8(33)), 2));
-	  row[j + 2 + i] = BS_OR(BS_AND(t, BS_VAL8(cc)), BS_SHR(BS_AND(b, BS_VAL8(cc)), 2));
-	}
-    }
-
-  for (j = 0; j < 64; j += 2)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      t = row[j];
-      b = row[j + 1];
-      row[j    ] = BS_OR(BS_AND(b, BS_VAL8(55)), BS_SHL(BS_AND(t, BS_VAL8(55)), 1));
-      row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(aa)), BS_SHR(BS_AND(b, BS_VAL8(aa)), 1));
-    }
-}
-
-/* 64 rows of 32 bits transposition (bytes transp. - 8x8 rotate clockwise)*/
-
-void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
-				    unsigned int index, dvbcsa_bs_word_t *row)
-{
-  int i, j;
-
-  for (i = 0; i < 4; i++)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      t = row[i];
-      b = row[4 + i];
-      row[i    ] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
-      row[4 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
-    }
-
-  for (j = 0; j < 8; j += 4)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 2; i++)
-	{
-	  t = row[j + i];
-	  b = row[j + 2 + i];
-	  row[j + i    ] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
-	  row[j + i + 2] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
-	}
-    }
-
-  for (j = 0; j < 8; j += 2)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      t = row[j];
-      b = row[j + 1];
-      row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(0f)), BS_SHL(BS_AND(b, BS_VAL8(0f)), 4)); //(t & 0x0f0f0f0f) | ((b & 0x0f0f0f0f) << 4);
-      row[j    ] = BS_OR(BS_AND(b, BS_VAL8(f0)), BS_SHR(BS_AND(t, BS_VAL8(f0)), 4));//((t & 0xf0f0f0f0) >> 4) | (b & 0xf0f0f0f0);
-    }
-
-  for (j = 0; j < 8; j++)
-    {
-      dvbcsa_bs_word_t t;
-
-      t = row[j];
-
-      t = BS_OR(       BS_AND(t, BS_VAL32(cccc3333)),
-	  BS_OR(BS_SHR(BS_AND(t, BS_VAL32(33330000)), 14),
-		BS_SHL(BS_AND(t, BS_VAL32(0000cccc)), 14)));
-
-      t = BS_OR(       BS_AND(t, BS_VAL16(    aa55)),
-          BS_OR(BS_SHR(BS_AND(t, BS_VAL16(    5500)), 7 ),
-		BS_SHL(BS_AND(t, BS_VAL16(    00aa)), 7 )));
-
-      t = BS_OR(       BS_AND(t, BS_VAL8 (      81)),
-
-	  BS_OR(BS_SHR(BS_AND(t, BS_VAL8 (      10)), 3 ),
-	  BS_OR(BS_SHR(BS_AND(t, BS_VAL8 (      20)), 2 ),
-	  BS_OR(BS_SHR(BS_AND(t, BS_VAL8 (      40)), 1 ),
-
-	  BS_OR(BS_SHL(BS_AND(t, BS_VAL8 (      02)), 1 ),
-	  BS_OR(BS_SHL(BS_AND(t, BS_VAL8 (      04)), 2 ),
-	        BS_SHL(BS_AND(t, BS_VAL8 (      08)), 3 )))))));
-
-      for (i = 0; i < 4; i++)
-	{
-	  unsigned int k = j * 4 + i;
-
-	  if (!pcks[k].data)
-	    return;
-
-	  if (index < pcks[k].len)
-	    pcks[k].data[index] ^= BS_EXTRACT8(t, 3 - i);
-	}
-    }
-}
-
diff --git a/src/dvbcsa_bs_transpose64.c b/src/dvbcsa_bs_transpose64.c
deleted file mode 100644
index c75127b..0000000
--- a/src/dvbcsa_bs_transpose64.c
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
-
-    This file is part of libdvbcsa.
-
-    libdvbcsa is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published
-    by the Free Software Foundation; either version 2 of the License,
-    or (at your option) any later version.
-
-    libdvbcsa is distributed in the hope that it will be useful, but
-    WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with libdvbcsa; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-    02111-1307 USA
-
-    Based on FFdecsa, Copyright (C) 2003-2004  fatih89r
-
-    (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
-
-*/
-
-#include "dvbcsa/dvbcsa.h"
-#include "dvbcsa_bs.h"
-
-/***********************************************************************
-	Stream cipher transpose
- */
-
-/* 64 rows of 64 bits transposition (bytes transp. - 8x8 rotate counterclockwise)*/
-
-void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
-{
-  int i, j;
-
-  for (i = 0; pcks[i].data; i++)
-    if (pcks[i].len >= 8)
-      row[i] = BS_VAL(dvbcsa_load_le64(pcks[i].data));
-
-  for (i = 0; i < 32; i++)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      t = row[i];
-      b = row[32 + i];
-      row[i]      = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(b, 4));
-      row[32 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(t, 4));
-    }
-
-  for (j = 0; j < 64; j += 32)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 16; i++)
-	{
-	  t = row[j + i];
-	  b = row[j + 16 + i];
-	  row[j + i]      = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
-	  row[j + 16 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
-	}
-    }
-
-  for (j = 0; j < 64; j += 16)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 8; i++)
-	{
-	  t = row[j + i];
-	  b = row[j + 8 + i];
-	  row[j + i]     = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
-	  row[j + 8 + i] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
-	}
-    }
-
-  for (j = 0; j < 64; j += 8)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 4; i++)
-	{
-	  b = row[j + i];
-	  t = row[j + 4 + i];
-	  row[j + i]     = BS_OR(BS_AND(b, BS_VAL8(0f)), BS_SHL(BS_AND(t, BS_VAL8(0f)), 4));
-	  row[j + 4 + i] = BS_OR(BS_AND(t, BS_VAL8(f0)), BS_SHR(BS_AND(b, BS_VAL8(f0)), 4));
-	}
-    }
-
-  for (j = 0; j < 64; j += 4)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 2; i++)
-	{
-	  b = row[j + i];
-	  t = row[j + 2 + i];
-	  row[j + i]     = BS_OR(BS_AND(b, BS_VAL8(33)), BS_SHL(BS_AND(t, BS_VAL8(33)), 2));
-	  row[j + 2 + i] = BS_OR(BS_AND(t, BS_VAL8(cc)), BS_SHR(BS_AND(b, BS_VAL8(cc)), 2));
-	}
-    }
-
-  for (j = 0; j < 64; j += 2)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      b = row[j];
-      t = row[j + 1];
-      row[j]     = BS_OR(BS_AND(b, BS_VAL8(55)), BS_SHL(BS_AND(t, BS_VAL8(55)), 1));
-      row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(aa)), BS_SHR(BS_AND(b, BS_VAL8(aa)), 1));
-    }
-}
-
-/* 8 rows of 64 bits transposition (bytes transp. - 8x8 rotate clockwise)*/
-
-void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
-				      unsigned int index, dvbcsa_bs_word_t *row)
-{
-  int i, j;
-
-  for (i = 0; i < 4; i++)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      t = row[i];
-      b = row[4 + i];
-      row[i]     = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(b, 4));
-      row[4 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(t, 4));
-    }
-
-  for (j = 0; j < 8; j += 4)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      for (i = 0; i < 2; i++)
-	{
-	  t = row[j + i];
-	  b = row[j + 2 + i];
-	  row[j + i]     = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
-	  row[j + 2 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
-	}
-    }
-
-  for (j = 0; j < 8; j += 2)
-    {
-      dvbcsa_bs_word_t t, b;
-
-      t = row[j];
-      b = row[j + 1];
-      row[j]     = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
-      row[j + 1] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
-    }
-
-  for (j = 0; j < 8; j++)
-    {
-      dvbcsa_bs_word_t t;
-
-      t = row[j];
-
-      t = BS_OR(       BS_AND(t, BS_VAL64(f0f0f0f00f0f0f0f)),
-	  BS_OR(BS_SHR(BS_AND(t, BS_VAL64(0f0f0f0f00000000)), 28),
-		BS_SHL(BS_AND(t, BS_VAL64(00000000f0f0f0f0)), 28)));
-
-      t = BS_OR(       BS_AND(t, BS_VAL32(        cccc3333)),
-	  BS_OR(BS_SHR(BS_AND(t, BS_VAL32(        33330000)), 14),
-		BS_SHL(BS_AND(t, BS_VAL32(        0000cccc)), 14)));
-
-      t = BS_OR(       BS_AND(t, BS_VAL16(            aa55)),
-          BS_OR(BS_SHR(BS_AND(t, BS_VAL16(            5500)), 7 ),
-		BS_SHL(BS_AND(t, BS_VAL16(            00aa)), 7 )));
-
-      for (i = 0; i < BS_BATCH_BYTES; i++)
-	{
-	  unsigned int k = j * BS_BATCH_BYTES + i;
-
-	  if (!pcks[k].data)
-	    return;
-
-	  if (index < pcks[k].len)
-	  pcks[k].data[index] ^= BS_EXTRACT8(t, i);
-	}
-    }
-}
-
diff --git a/src/dvbcsa_bs_transpose_block.c b/src/dvbcsa_bs_transpose_block.c
new file mode 100644
index 0000000..5dc4472
--- /dev/null
+++ b/src/dvbcsa_bs_transpose_block.c
@@ -0,0 +1,97 @@
+/*
+
+    This file is part of libdvbcsa.
+
+    libdvbcsa is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published
+    by the Free Software Foundation; either version 2 of the License,
+    or (at your option) any later version.
+
+    libdvbcsa is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with libdvbcsa; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+    02111-1307 USA
+
+    Based on FFdecsa, Copyright (C) 2003-2004  fatih89r
+
+    (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
+
+*/
+
+#include "dvbcsa/dvbcsa.h"
+#include "dvbcsa_bs.h"
+#include "dvbcsa_bs_transpose.h"
+
+/*
+  Block cipher transpose: for each 32-word half of row[0..63], apply the
+  8-bit and then the 16-bit swap step to words 8 and 16 apart, in place.
+*/
+DVBCSA_INLINE static inline void 
+dvbcsa_bs_matrix_transpose_block(dvbcsa_bs_word_t *row)
+{
+  int j;
+
+  for (j = 0; j < 64; j += 32)
+    {
+      int i;
+      for (i = 0; i < 8; i++)
+        {
+          dvbcsa_bs_word_t a0, a1, a2, a3;
+
+          a0 = row[j + i + 0];
+          a1 = row[j + i + 8];
+          a2 = row[j + i + 16];
+          a3 = row[j + i + 24];
+          /* first pass pairs words 8 apart, second pass words 16 apart */
+          BS_SWAP8(a0, a1);
+          BS_SWAP8(a2, a3);
+          BS_SWAP16(a0, a2);
+          BS_SWAP16(a1, a3);
+
+          row[j + i + 0] = a0;
+          row[j + i + 8] = a1;
+          row[j + i + 16] = a2;
+          row[j + i + 24] = a3;
+        }
+    }
+}
+/* Gather the words at offset and offset + 4 of each packet into out, then transpose. */
+void dvbcsa_bs_block_transpose_in (dvbcsa_bs_word_t *out,
+                const struct dvbcsa_bs_batch_s *pcks,
+                unsigned int offset)
+{
+  uint32_t *ri = (uint32_t *)out;
+  unsigned int i;
+  /* pcks ends at the first NULL data pointer; packets shorter than offset + 8 are skipped */
+  for (i = 0; pcks[i].data; i++)
+    if (offset < (pcks[i].len & (unsigned)~0x7))
+      {
+        dvbcsa_copy_32((uint8_t *)(ri + i), pcks[i].data + offset);
+        dvbcsa_copy_32((uint8_t *)(ri + i + BS_BATCH_SIZE), pcks[i].data + offset + 4);
+      }
+
+  dvbcsa_bs_matrix_transpose_block(out);
+}
+/* Transpose in (in place), then scatter its words back to offset and offset + 4 of each packet. */
+void dvbcsa_bs_block_transpose_out (dvbcsa_bs_word_t *in,
+                const struct dvbcsa_bs_batch_s *pcks,
+                unsigned int offset)
+{
+  uint32_t *ri = (uint32_t *) in;
+  unsigned int i;
+
+  dvbcsa_bs_matrix_transpose_block(in);
+  /* same packet walk and length test as dvbcsa_bs_block_transpose_in, copying the other way */
+  for (i = 0; pcks[i].data; i++)
+    if (offset < (pcks[i].len & (unsigned)~0x7))
+      {
+        dvbcsa_copy_32(pcks[i].data + offset, (uint8_t *)(ri + i));
+        dvbcsa_copy_32(pcks[i].data + offset + 4, (uint8_t *)(ri + i + BS_BATCH_SIZE));
+      }
+}
+
diff --git a/src/dvbcsa_bs_transpose_stream.c b/src/dvbcsa_bs_transpose_stream.c
new file mode 100644
index 0000000..8fc1a2d
--- /dev/null
+++ b/src/dvbcsa_bs_transpose_stream.c
@@ -0,0 +1,231 @@
+/*
+
+    This file is part of libdvbcsa.
+
+    libdvbcsa is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published
+    by the Free Software Foundation; either version 2 of the License,
+    or (at your option) any later version.
+
+    libdvbcsa is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with libdvbcsa; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+    02111-1307 USA
+
+    Based on FFdecsa, Copyright (C) 2003-2004  fatih89r
+
+    (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
+
+*/
+
+#include "dvbcsa/dvbcsa.h"
+#include "dvbcsa_bs.h"
+#include "dvbcsa_bs_transpose.h"
+
+/*
+  Stream cipher transpose for dvbcsa_bs_word_t sizes = 64, 128, 256, ... bits.
+*/
+
+static void dvbcsa_bs_matrix_transpose_64x(dvbcsa_bs_word_t *row)  /* in-place pass over row[0..63]; both branches apply BS_SWAP1 .. BS_SWAP32, grouped differently */
+{
+  int j;
+
+#if defined(__i386__)
+
+  /* i386 is short of registers: each pass works on 4 words, not 8 as in the #else branch */
+
+  for (j = 0; j < 16; j++)  /* pass 1: words 16 apart (BS_SWAP16) and 32 apart (BS_SWAP32) */
+    {
+      dvbcsa_bs_word_t a0, a1, a2, a3;
+  
+      a0 = row[j + 0];
+      a1 = row[j + 16];
+      a2 = row[j + 32];
+      a3 = row[j + 48];
+
+      BS_SWAP16(a0, a1);
+      BS_SWAP16(a2, a3);
+      BS_SWAP32(a0, a2);
+      BS_SWAP32(a1, a3);
+
+      row[j + 0]  = a0;
+      row[j + 16] = a1;
+      row[j + 32] = a2;
+      row[j + 48] = a3;
+    }
+
+  for (j = 0; j < 64; j += 16)  /* pass 2: within each group of 16 words */
+    {
+      int i;
+      for (i = 0; i < 4; i++)  /* words 4 apart (BS_SWAP4) and 8 apart (BS_SWAP8) */
+        {
+          dvbcsa_bs_word_t a0, a1, a2, a3;
+
+          a0 = row[j + i + 0];
+          a1 = row[j + i + 4];
+          a2 = row[j + i + 8];
+          a3 = row[j + i + 12];
+
+          BS_SWAP4(a0, a1);
+          BS_SWAP4(a2, a3);
+          BS_SWAP8(a0, a2);
+          BS_SWAP8(a1, a3);
+
+          row[j + i + 0] = a0;
+          row[j + i + 4] = a1;
+          row[j + i + 8] = a2;
+          row[j + i + 12] = a3;
+        }
+    }
+
+  for (j = 0; j < 64; j += 4)  /* pass 3: adjacent words (BS_SWAP1) and words 2 apart (BS_SWAP2) */
+    {
+      dvbcsa_bs_word_t a0, a1, a2, a3;
+      
+      a0 = row[j + 0];
+      a1 = row[j + 1];
+      a2 = row[j + 2];
+      a3 = row[j + 3];
+
+      BS_SWAP1(a0, a1);
+      BS_SWAP1(a2, a3);
+      BS_SWAP2(a0, a2);
+      BS_SWAP2(a1, a3);
+
+      row[j + 0] = a0;
+      row[j + 1] = a1;
+      row[j + 2] = a2;
+      row[j + 3] = a3;
+    }
+
+#else
+
+  for (j = 0; j < 64; j += 8)  /* pass 1: 8 consecutive words per iteration */
+    {
+      dvbcsa_bs_word_t a0, a1, a2, a3, b0, b1, b2, b3;
+      
+      a0 = row[j + 0];
+      a1 = row[j + 1];
+      a2 = row[j + 2];
+      a3 = row[j + 3];
+
+      b0 = row[j + 4];
+      b1 = row[j + 5];
+      b2 = row[j + 6];
+      b3 = row[j + 7];
+
+      BS_SWAP1(a0, a1);  /* adjacent words */
+      BS_SWAP1(a2, a3);
+      BS_SWAP1(b0, b1);
+      BS_SWAP1(b2, b3);
+
+      BS_SWAP2(a0, a2);  /* words 2 apart */
+      BS_SWAP2(a1, a3);
+      BS_SWAP2(b0, b2);
+      BS_SWAP2(b1, b3);
+
+      BS_SWAP4(a0, b0);  /* words 4 apart */
+      BS_SWAP4(a1, b1);
+      BS_SWAP4(a2, b2);
+      BS_SWAP4(a3, b3);
+
+      row[j + 0] = a0;
+      row[j + 1] = a1;
+      row[j + 2] = a2;
+      row[j + 3] = a3;
+
+      row[j + 4] = b0;
+      row[j + 5] = b1;
+      row[j + 6] = b2;
+      row[j + 7] = b3;
+    }
+
+  for (j = 0; j < 8; j++)  /* pass 2: 8 words spaced 8 apart per iteration */
+    {
+      dvbcsa_bs_word_t a0, a1, a2, a3, b0, b1, b2, b3;
+
+      a0 = row[j + 0];
+      a1 = row[j + 8];
+      a2 = row[j + 16];
+      a3 = row[j + 24];
+
+      b0 = row[j + 32];
+      b1 = row[j + 40];
+      b2 = row[j + 48];
+      b3 = row[j + 56];
+
+      BS_SWAP8(a0, a1);  /* words 8 apart */
+      BS_SWAP8(a2, a3);
+      BS_SWAP8(b0, b1);
+      BS_SWAP8(b2, b3);
+
+      BS_SWAP16(a0, a2);  /* words 16 apart */
+      BS_SWAP16(a1, a3);
+      BS_SWAP16(b0, b2);
+      BS_SWAP16(b1, b3);
+
+      BS_SWAP32(a0, b0);  /* words 32 apart */
+      BS_SWAP32(a1, b1);
+      BS_SWAP32(a2, b2);
+      BS_SWAP32(a3, b3);
+
+      row[j + 0]  = a0;
+      row[j + 8]  = a1;
+      row[j + 16] = a2;
+      row[j + 24] = a3;
+                      
+      row[j + 32] = b0;
+      row[j + 40] = b1;
+      row[j + 48] = b2;
+      row[j + 56] = b3;
+    }
+
+#endif
+}
+
+
+/* Transposition of 64 rows of 64 bits (byte transposition: 8x8 rotate counterclockwise) */
+
+void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)  /* loads the first 8 bytes of each packet into row, then transposes row in place */
+{
+  uint64_t *p;  /* row addressed as consecutive 64-bit slots, one per packet */
+
+  for (p = (uint64_t *)row; pcks->data; p++, pcks++)  /* the entry whose data is NULL ends the list */
+    {
+      if (pcks->len >= 8)  /* shorter packets leave their slot unwritten */
+        dvbcsa_copy_64((uint8_t *)p, pcks->data);
+    }
+
+  dvbcsa_bs_matrix_transpose_64x(row);
+}
+
+/* Transposition of 8 rows of 64 bits (byte transposition: 8x8 rotate clockwise) */
+
+void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,  /* transposes row in place, then XORs 8 bytes of it into each packet */
+                unsigned int index, dvbcsa_bs_word_t *row)  /* index: byte offset inside each packet where the XOR starts */
+{
+  int i, j;
+  uint8_t *p;  /* current packet's 8 bytes inside row */
+
+  dvbcsa_bs_matrix_transpose_64x(row);
+
+  for (p = (uint8_t *)row; pcks->data; pcks++)  /* the entry whose data is NULL ends the list */
+    {
+      if (index + 8 <= pcks->len)  /* a full 8 bytes fit before the end of the packet */
+        {
+          dvbcsa_xor_64(pcks->data + index, p);
+        }
+      else
+        {
+          for (j = 0, i = index; i < pcks->len; i++, j++)  /* partial tail: XOR only the bytes that remain (none when index >= len) */
+            pcks->data[i] ^= p[j];
+        }
+      p += 8;  /* advances for every packet, including short ones */
+    }
+}
+
diff --git a/src/dvbcsa_bs_transpose_stream32.c b/src/dvbcsa_bs_transpose_stream32.c
new file mode 100644
index 0000000..fc2f847
--- /dev/null
+++ b/src/dvbcsa_bs_transpose_stream32.c
@@ -0,0 +1,150 @@
+/*
+
+    This file is part of libdvbcsa.
+
+    libdvbcsa is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published
+    by the Free Software Foundation; either version 2 of the License,
+    or (at your option) any later version.
+
+    libdvbcsa is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with libdvbcsa; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+    02111-1307 USA
+
+    Based on FFdecsa, Copyright (C) 2003-2004  fatih89r
+
+    (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
+
+*/
+
+#include "dvbcsa/dvbcsa.h"
+#include "dvbcsa_bs.h"
+#include "dvbcsa_bs_transpose.h"
+
+/*
+  Stream cipher transpose for dvbcsa_bs_word_t size = 32 bits.
+*/
+
+static void dvbcsa_bs_matrix_transpose_64x32(dvbcsa_bs_word_t *row)  /* in-place pass over row[0..63] using BS_SWAP16 down to BS_SWAP1 (no BS_SWAP32 step) */
+{
+  int j;
+
+  for (j = 0; j < 16; j++)  /* pass 1: words 16 apart, separately in row[0..31] and row[32..63] */
+    {
+      dvbcsa_bs_word_t a0, a1, a2, a3;
+
+      a0 = row[j + 0];
+      a1 = row[j + 16];
+      a2 = row[j + 32];
+      a3 = row[j + 48];
+
+      BS_SWAP16(a0, a1);
+      BS_SWAP16(a2, a3);
+  
+      row[j + 0]  = a0;
+      row[j + 16] = a1;
+      row[j + 32] = a2;
+      row[j + 48] = a3;
+    }
+
+  for (j = 0; j < 64; j += 16)  /* pass 2: within each group of 16 words */
+    {
+      int i;
+      for (i = 0; i < 4; i++)  /* words 4 apart (BS_SWAP4) and 8 apart (BS_SWAP8) */
+        {
+          dvbcsa_bs_word_t a0, a1, a2, a3;
+
+          a0 = row[j + i + 0];
+          a1 = row[j + i + 4];
+          a2 = row[j + i + 8];
+          a3 = row[j + i + 12];
+
+          BS_SWAP4(a0, a1);
+          BS_SWAP4(a2, a3);
+          BS_SWAP8(a0, a2);
+          BS_SWAP8(a1, a3);
+
+          row[j + i + 0] = a0;
+          row[j + i + 4] = a1;
+          row[j + i + 8] = a2;
+          row[j + i + 12] = a3;
+        }
+    }
+
+  for (j = 0; j < 64; j += 4)  /* pass 3: adjacent words (BS_SWAP1) and words 2 apart (BS_SWAP2) */
+    {
+      dvbcsa_bs_word_t a0, a1, a2, a3;
+
+      a0 = row[j + 0];
+      a1 = row[j + 1];
+      a2 = row[j + 2];
+      a3 = row[j + 3];
+
+      BS_SWAP1(a0, a1);
+      BS_SWAP1(a2, a3);
+      BS_SWAP2(a0, a2);
+      BS_SWAP2(a1, a3);
+
+      row[j + 0] = a0;
+      row[j + 1] = a1;
+      row[j + 2] = a2;
+      row[j + 3] = a3;
+    }
+}
+
+void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)  /* loads the first 8 bytes of each packet into row, then transposes row in place */
+{
+  int i;
+
+  for (i = 0; pcks[i].data; i++)  /* the entry whose data is NULL ends the list */
+    if (pcks[i].len >= 8)  /* shorter packets leave their two slots unwritten */
+      {
+        dvbcsa_copy_32((uint8_t *)(row + i), pcks[i].data);  /* bytes 0..3 -> row[i] */
+        dvbcsa_copy_32((uint8_t *)(row + i + 32), pcks[i].data + 4);  /* bytes 4..7 -> row[i + 32] */
+      }
+
+  dvbcsa_bs_matrix_transpose_64x32(row);
+}
+
+void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,  /* transposes row in place, then XORs up to 8 bytes of it into each packet */
+                unsigned int index, dvbcsa_bs_word_t *row)  /* index: byte offset inside each packet where the XOR starts */
+{
+  int i, j;
+  uint8_t *p1, *p2;  /* p1: source of bytes 0..3, p2: source of bytes 4..7 for the current packet */
+
+  dvbcsa_bs_matrix_transpose_64x32(row);
+
+  p1 = (uint8_t *)row;
+  p2 = (uint8_t *)(row + BS_BATCH_SIZE);  /* second half starts BS_BATCH_SIZE words into row */
+  for (; pcks->data; pcks++, p1 += 4, p2 += 4)  /* both pointers advance 4 bytes per packet, also after a continue */
+    {
+      if (index + 4 <= pcks->len)  /* first 4 bytes fit entirely */
+        {
+            dvbcsa_xor_32(pcks->data + index, p1);
+        }
+      else
+        {
+          for (j = 0, i = index; i < pcks->len; i++, j++)  /* partial tail: XOR only the bytes that remain */
+             pcks->data[i] ^= p1[j];
+          continue;  /* packet ended inside the first half: skip the second half */
+        }
+      if (index + 8 <= pcks->len)  /* second 4 bytes fit entirely */
+        {
+            dvbcsa_xor_32(pcks->data + index + 4, p2);
+        }
+      else
+        {
+          for (j = 0, i = index + 4; i < pcks->len; i++, j++)  /* partial tail of the second half */
+             pcks->data[i] ^= p2[j];
+          continue;  /* last statement of the loop body, so this changes nothing */
+        }
+    }
+
+}
+
diff --git a/src/dvbcsa_pv.h b/src/dvbcsa_pv.h
index d92bc98..d0ce8eb 100644
--- a/src/dvbcsa_pv.h
+++ b/src/dvbcsa_pv.h
@@ -83,12 +83,26 @@ void dvbcsa_stream_xor (const dvbcsa_cw_t cw, const dvbcsa_block_t iv,
 
 void dvbcsa_key_schedule_block(const dvbcsa_cw_t cw, uint8_t * kk);
 
+/* target support for 32 and 64 bit unaligned memory access */
+
+#if defined(__i386__) || defined(__x86_64__)
+#define DVBCSA_UNALIGNED_ACCESS_32 1
+#define DVBCSA_UNALIGNED_ACCESS_64 1
+#endif
+
+#if defined(__arm__) && defined(__ARM_FEATURE_UNALIGNED)
+/* only 32 bit unaligned access is allowed for armv6, armv7, armv8 */
+#define DVBCSA_UNALIGNED_ACCESS_32 1
+#endif
+
 DVBCSA_INLINE static inline void
 dvbcsa_xor_64 (uint8_t *b, const uint8_t *a)
 {
-#if defined(__i386__) || defined(__x86_64__)
-  /* target support non aligned memory access */
+#if defined(DVBCSA_UNALIGNED_ACCESS_64)
   *(uint64_t*)b ^= *(uint64_t*)a;
+#elif defined(DVBCSA_UNALIGNED_ACCESS_32)
+  ((uint32_t *)b)[0] ^= ((uint32_t *)a)[0];
+  ((uint32_t *)b)[1] ^= ((uint32_t *)a)[1];
 #else
   unsigned int i;
 
@@ -97,11 +111,55 @@ dvbcsa_xor_64 (uint8_t *b, const uint8_t *a)
 #endif
 }
 
+DVBCSA_INLINE static inline void
+dvbcsa_xor_32 (uint8_t *b, const uint8_t *a)  /* b[0..3] ^= a[0..3] */
+{
+#ifdef DVBCSA_UNALIGNED_ACCESS_32
+  /* target supports non aligned memory access */
+  *(uint32_t*)b ^= *(uint32_t*)a;  /* one 32-bit access through cast pointers; the cast drops const from a */
+#else
+  unsigned int i;
+
+  for (i = 0; i < 4; i++)  /* byte-wise fallback, no alignment requirement */
+    b[i] ^= a[i];
+#endif
+}
+
+DVBCSA_INLINE static inline void
+dvbcsa_copy_64 (uint8_t *b, const uint8_t *a)  /* copies 8 bytes from a to b */
+{
+#if defined(DVBCSA_UNALIGNED_ACCESS_64)
+  *(uint64_t*)b = *(uint64_t*)a;  /* one 64-bit access through cast pointers */
+#elif defined(DVBCSA_UNALIGNED_ACCESS_32)
+  ((uint32_t *)b)[0] = ((uint32_t *)a)[0];  /* two 32-bit accesses when only 32-bit unaligned access is available */
+  ((uint32_t *)b)[1] = ((uint32_t *)a)[1];
+#else
+  unsigned int i;
+
+  for (i = 0; i < 8; i++)  /* byte-wise fallback, no alignment requirement */
+    b[i] = a[i];
+#endif
+}
+
+DVBCSA_INLINE static inline void
+dvbcsa_copy_32 (uint8_t *b, const uint8_t *a)  /* copies 4 bytes from a to b */
+{
+#ifdef DVBCSA_UNALIGNED_ACCESS_32
+  /* target supports non aligned memory access */
+  *(uint32_t*)b = *(uint32_t*)a;  /* one 32-bit access through cast pointers; the cast drops const from a */
+#else
+  unsigned int i;
+
+  for (i = 0; i < 4; i++)  /* byte-wise fallback, no alignment requirement */
+    b[i] = a[i];
+#endif
+}
+
 DVBCSA_INLINE static inline uint32_t
 dvbcsa_load_le32(const uint8_t *p)
 {
-#if defined(__i386__) || defined(__x86_64__)
-  /* target support non aligned le memory access */
+#if defined(DVBCSA_UNALIGNED_ACCESS_32) && defined(DVBCSA_ENDIAN_LITTLE)
+  /* target supports non aligned le memory access */
   return *(uint32_t*)p;
 #else
   return ((uint32_t)p[3] << 24) |
@@ -114,8 +172,8 @@ dvbcsa_load_le32(const uint8_t *p)
 DVBCSA_INLINE static inline uint64_t
 dvbcsa_load_le64(const uint8_t *p)
 {
-#if defined(__i386__) || defined(__x86_64__)
-  /* target support non aligned le memory access */
+#if defined(DVBCSA_UNALIGNED_ACCESS_64) && defined(DVBCSA_ENDIAN_LITTLE)
+  /* target supports non aligned le memory access */
   return *(uint64_t*)p;
 #else
   return (uint64_t)( ((uint64_t)p[7] << 56) |
@@ -133,8 +191,8 @@ dvbcsa_load_le64(const uint8_t *p)
 DVBCSA_INLINE static inline void
 dvbcsa_store_le32(uint8_t *p, const uint32_t w)
 {
-#if defined(__i386__) || defined(__x86_64__)
-  /* target support non aligned le memory access */
+#if defined(DVBCSA_UNALIGNED_ACCESS_32) && defined(DVBCSA_ENDIAN_LITTLE)
+  /* target supports non aligned le memory access */
   *(uint32_t*)p = w;
 #else
   p[3] = (w >> 24);
@@ -147,8 +205,8 @@ dvbcsa_store_le32(uint8_t *p, const uint32_t w)
 DVBCSA_INLINE static inline void
 dvbcsa_store_le64(uint8_t *p, const uint64_t w)
 {
-#if defined(__i386__) || defined(__x86_64__)
-  /* target support non aligned le memory access */
+#if defined(DVBCSA_UNALIGNED_ACCESS_64) && defined(DVBCSA_ENDIAN_LITTLE)
+  /* target supports non aligned le memory access */
   *(uint64_t*)p = w;
 #else
   p[7] = (w >> 56);
-- 
1.9.1





More information about the vlc-devel mailing list