[vlc-devel] [PATCH 05/16] bitslice transform: rewrite
glenvt18
glenvt18 at gmail.com
Fri Jun 26 13:19:59 CEST 2015
What has been done:
1. New optimized matrix transpose routines.
2. New BS_SWAPxx target-specific macros can be defined for further speed-up.
3. Separate 32 bit and all other stream transforms.
4. Packet data are read/written in native endianness. Endianness is now
handled by transpose routines. No need for endianness conversion anymore.
5. Use unaligned 32-bit memory access for ARM if supported by the target.
---
configure.ac | 15 ++-
src/Makefile.am | 27 ++---
src/dvbcsa_bs_stream.c | 4 +-
src/dvbcsa_bs_transpose.c | 112 ------------------
src/dvbcsa_bs_transpose.h | 71 ++++++++++++
src/dvbcsa_bs_transpose128.c | 209 ---------------------------------
src/dvbcsa_bs_transpose32.c | 185 -----------------------------
src/dvbcsa_bs_transpose64.c | 186 -----------------------------
src/dvbcsa_bs_transpose_block.c | 97 ++++++++++++++++
src/dvbcsa_bs_transpose_stream.c | 231 +++++++++++++++++++++++++++++++++++++
src/dvbcsa_bs_transpose_stream32.c | 150 ++++++++++++++++++++++++
src/dvbcsa_pv.h | 78 +++++++++++--
12 files changed, 637 insertions(+), 728 deletions(-)
delete mode 100644 src/dvbcsa_bs_transpose.c
create mode 100644 src/dvbcsa_bs_transpose.h
delete mode 100644 src/dvbcsa_bs_transpose128.c
delete mode 100644 src/dvbcsa_bs_transpose32.c
delete mode 100644 src/dvbcsa_bs_transpose64.c
create mode 100644 src/dvbcsa_bs_transpose_block.c
create mode 100644 src/dvbcsa_bs_transpose_stream.c
create mode 100644 src/dvbcsa_bs_transpose_stream32.c
diff --git a/configure.ac b/configure.ac
index cefdf8a..4dd0726 100644
--- a/configure.ac
+++ b/configure.ac
@@ -32,23 +32,26 @@ AC_C_CONST
AC_C_INLINE
AC_CHECK_SIZEOF(long)
+AC_C_BIGENDIAN(
+ AC_DEFINE(DVBCSA_ENDIAN_BIG, 1, [Target is big-endian]),
+ AC_DEFINE(DVBCSA_ENDIAN_LITTLE, 1, [Target is little-endian]),
+ AC_MSG_ERROR(unknown endianess),
+ AC_MSG_ERROR(universial endianess not supported)
+)
+
if test "$enable_mmx" = "yes" ; then
- transpose_64=yes
AC_DEFINE(DVBCSA_USE_MMX, 1, Using MMX bitslice.)
GCC_CFLAGS="$GCC_CFLAGS -mmmx"
elif test "$enable_sse2" = "yes" ; then
- transpose_128=yes
AC_DEFINE(DVBCSA_USE_SSE, 1, Using SSE2 bitslice.)
GCC_CFLAGS="$GCC_CFLAGS -msse -msse2"
elif test "$enable_altivec" = "yes" ; then
- transpose_128=yes
AC_DEFINE(DVBCSA_USE_ALTIVEC, 1, Using AltiVec bitslice.)
GCC_CFLAGS="$GCC_CFLAGS -maltivec -mabi=altivec"
elif test "$enable_neon" = "yes" ; then
- transpose_128=yes
AC_DEFINE(DVBCSA_USE_NEON, 1, Using NEON bitslice.)
GCC_CFLAGS="$GCC_CFLAGS -mfpu=neon"
@@ -57,13 +60,11 @@ elif test "$enable_uint32" = "yes" ; then
AC_DEFINE(DVBCSA_USE_UINT32, 1, Using 32 bits integer bitslice.)
elif test "$enable_uint64" = "yes" ; then
- transpose_64=yes
AC_DEFINE(DVBCSA_USE_UINT64, 1, Using 64 bits integer bitslice.)
else
case $ac_cv_sizeof_long in
8)
- transpose_64=yes
AC_DEFINE(DVBCSA_USE_UINT64, 1, Using 64 bits integer bitslice.)
;;
*)
@@ -73,8 +74,6 @@ else
esac
fi
-AM_CONDITIONAL(TRANSPOSE_128, test "$transpose_128" = "yes")
-AM_CONDITIONAL(TRANSPOSE_64, test "$transpose_64" = "yes")
AM_CONDITIONAL(TRANSPOSE_32, test "$transpose_32" = "yes")
if test "$GCC" = "yes" ; then
diff --git a/src/Makefile.am b/src/Makefile.am
index 3bad07a..996ce4d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -3,23 +3,20 @@ SUBDIRS = dvbcsa
lib_LTLIBRARIES = libdvbcsa.la
-libdvbcsa_la_SOURCES = dvbcsa_algo.c dvbcsa_block.c dvbcsa_bs_algo.c \
- dvbcsa_bs_block.c dvbcsa_bs_key.c dvbcsa_bs_stream.c \
- dvbcsa_stream.c dvbcsa_bs.h dvbcsa_pv.h dvbcsa_bs_uint64.h \
- dvbcsa_bs_uint32.h dvbcsa_bs_mmx.h dvbcsa_bs_sse.h \
- dvbcsa_bs_altivec.h dvbcsa_bs_neon.h dvbcsa_bs_transpose.c dvbcsa_key.c \
- dvbcsa_bs_stream_kernel.inc dvbcsa_bs_stream_kernel.h
-
-if TRANSPOSE_128
-libdvbcsa_la_SOURCES += dvbcsa_bs_transpose128.c
-endif
-
-if TRANSPOSE_64
-libdvbcsa_la_SOURCES += dvbcsa_bs_transpose64.c
-endif
+libdvbcsa_la_SOURCES = \
+ dvbcsa_algo.c dvbcsa_block.c dvbcsa_stream.c dvbcsa_key.c \
+ dvbcsa_bs_algo.c dvbcsa_bs_block.c dvbcsa_bs_stream.c dvbcsa_bs_key.c \
+ dvbcsa_bs_transpose_block.c \
+ dvbcsa_bs_stream_kernel.inc dvbcsa_bs_stream_kernel.h \
+ dvbcsa_bs.h dvbcsa_pv.h dvbcsa_bs_transpose.h \
+ dvbcsa_bs_uint32.h dvbcsa_bs_uint64.h \
+ dvbcsa_bs_mmx.h dvbcsa_bs_sse.h \
+ dvbcsa_bs_altivec.h dvbcsa_bs_neon.h
if TRANSPOSE_32
-libdvbcsa_la_SOURCES += dvbcsa_bs_transpose32.c
+libdvbcsa_la_SOURCES += dvbcsa_bs_transpose_stream32.c
+else
+libdvbcsa_la_SOURCES += dvbcsa_bs_transpose_stream.c
endif
libdvbcsa_la_LDFLAGS = -version-info 1:1:0 $(libtool_flags)
diff --git a/src/dvbcsa_bs_stream.c b/src/dvbcsa_bs_stream.c
index 688a70d..41c0099 100644
--- a/src/dvbcsa_bs_stream.c
+++ b/src/dvbcsa_bs_stream.c
@@ -80,9 +80,7 @@ dvbcsa_bs_stream_cipher_batch(const struct dvbcsa_bs_key_s *key,
for (h = 8; h < maxlen; h += 8)
{
dvbcsa_bs_stream_cipher_kernel(&regs);
- for (i = 0; i < 8; i++)
- dvbcsa_bs_stream_transpose_out(pcks, h + i, regs.cb + i * 8);
-
+ dvbcsa_bs_stream_transpose_out(pcks, h, regs.cb);
}
}
diff --git a/src/dvbcsa_bs_transpose.c b/src/dvbcsa_bs_transpose.c
deleted file mode 100644
index 57d208e..0000000
--- a/src/dvbcsa_bs_transpose.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
-
- This file is part of libdvbcsa.
-
- libdvbcsa is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 2 of the License,
- or (at your option) any later version.
-
- libdvbcsa is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with libdvbcsa; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA
-
- Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
-
- (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
-
-*/
-
-#include "dvbcsa/dvbcsa.h"
-#include "dvbcsa_bs.h"
-
-/***********************************************************************
- Block cipher transpose
- */
-
-void dvbcsa_bs_block_transpose_in (dvbcsa_bs_word_t *out,
- const struct dvbcsa_bs_batch_s *pcks,
- unsigned int offset)
-{
- uint32_t *ri = (uint32_t *) out;
- unsigned int j, i, k;
-
- for (i = 0; pcks[i].data; i++)
- if (offset < (pcks[i].len & (unsigned)~0x7))
- {
- ri[i ] = dvbcsa_load_le32(pcks[i].data + offset);
- ri[i + BS_BATCH_SIZE] = dvbcsa_load_le32(pcks[i].data + offset + 4);
- }
-
- for (j = 0; j < 64; j += 32)
- for (i = 0; i < 16; i += 8)
- for (k = 0; k < 8; k++)
- {
- dvbcsa_bs_word_t *r = out + j + i + k;
- dvbcsa_bs_word_t t, b;
-
- t = r[0];
- b = r[16];
- r[0] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
- r[16] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
- }
-
- for (j = 0; j < 64; j += 16)
- for (k = 0; k < 8; k++)
- {
- dvbcsa_bs_word_t *r = out + j + k;
- dvbcsa_bs_word_t t, b;
-
- t = r[0];
- b = r[8];
- r[0] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
- r[8] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
- }
-}
-
-void dvbcsa_bs_block_transpose_out (dvbcsa_bs_word_t *in,
- const struct dvbcsa_bs_batch_s *pcks,
- unsigned int offset)
-{
- uint32_t *ri = (uint32_t *) in;
- unsigned int j, i, k;
-
- for (j = 0; j < 64; j += 16)
- for (k = 0; k < 8; k++)
- {
- dvbcsa_bs_word_t *r = in + j + k;
- dvbcsa_bs_word_t t, b;
-
- t = r[0];
- b = r[8];
- r[0] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
- r[8] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
- }
-
- for (j = 0; j < 64; j += 32)
- for (i = 0; i < 16; i += 8)
- for (k = 0; k < 8; k++)
- {
- dvbcsa_bs_word_t *r = in + j + i + k;
- dvbcsa_bs_word_t t, b;
-
- t = r[0];
- b = r[16];
- r[0] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
- r[16] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
- }
-
- for (i = 0; pcks[i].data; i++)
- if (offset < (pcks[i].len & (unsigned)~0x7))
- {
- dvbcsa_store_le32(pcks[i].data + offset , ri[i ]);
- dvbcsa_store_le32(pcks[i].data + offset + 4, ri[i + BS_BATCH_SIZE]);
- }
-}
-
diff --git a/src/dvbcsa_bs_transpose.h b/src/dvbcsa_bs_transpose.h
new file mode 100644
index 0000000..56ecf47
--- /dev/null
+++ b/src/dvbcsa_bs_transpose.h
@@ -0,0 +1,71 @@
+#ifndef DVBCSA_BS_TRANSPOSE_H_
+#define DVBCSA_BS_TRANSPOSE_H_
+
+#include "dvbcsa_bs.h"
+
+/*
+ 2x2 matrix transpose swap operation:
+
+ t = |a b c d| => |f b h d|
+ b = |e f g h| => |e a g c|
+
+ 'a' and 'e' are MSB of dvbcsa_bs_word_t.
+ (little-endian transpose)
+
+ tmp = (b ^ (t>>j)) & m;
+ b = b ^ tmp;
+ t = t ^ (tmp<<j);
+*/
+
+#define BS_SWAP_BITS_LE(t, b, shift, mask) \
+ { \
+ dvbcsa_bs_word_t tmp; \
+ tmp = BS_AND(BS_XOR(BS_SHR(t, shift), b), mask); \
+ (b) = BS_XOR((b), tmp); \
+ (t) = BS_XOR((t), BS_SHL(tmp, shift)); \
+ }
+
+#ifndef BS_SWAP32_LE
+#define BS_SWAP32_LE(t, b) BS_SWAP_BITS_LE(t, b, 32, BS_VAL64(00000000ffffffff))
+#endif
+
+#ifndef BS_SWAP16_LE
+#define BS_SWAP16_LE(t, b) BS_SWAP_BITS_LE(t, b, 16, BS_VAL32(0000ffff))
+#endif
+
+#ifndef BS_SWAP8_LE
+#define BS_SWAP8_LE(t, b) BS_SWAP_BITS_LE(t, b, 8, BS_VAL16(00ff))
+#endif
+
+#ifndef BS_SWAP4_LE
+#define BS_SWAP4_LE(t, b) BS_SWAP_BITS_LE(t, b, 4, BS_VAL8(0f))
+#endif
+
+#ifndef BS_SWAP2_LE
+#define BS_SWAP2_LE(t, b) BS_SWAP_BITS_LE(t, b, 2, BS_VAL8(33))
+#endif
+
+#ifndef BS_SWAP1_LE
+#define BS_SWAP1_LE(t, b) BS_SWAP_BITS_LE(t, b, 1, BS_VAL8(55))
+#endif
+
+#define BS_SWAP4(t, b) BS_SWAP4_LE(t, b)
+#define BS_SWAP2(t, b) BS_SWAP2_LE(t, b)
+#define BS_SWAP1(t, b) BS_SWAP1_LE(t, b)
+
+#ifdef DVBCSA_ENDIAN_LITTLE
+
+#define BS_SWAP32(t, b) BS_SWAP32_LE(t, b)
+#define BS_SWAP16(t, b) BS_SWAP16_LE(t, b)
+#define BS_SWAP8(t, b) BS_SWAP8_LE(t, b)
+
+#else
+
+#define BS_SWAP32(t, b) BS_SWAP32_LE(b, t)
+#define BS_SWAP16(t, b) BS_SWAP16_LE(b, t)
+#define BS_SWAP8(t, b) BS_SWAP8_LE(b, t)
+
+#endif
+
+#endif
+
diff --git a/src/dvbcsa_bs_transpose128.c b/src/dvbcsa_bs_transpose128.c
deleted file mode 100644
index 8a75d09..0000000
--- a/src/dvbcsa_bs_transpose128.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
-
- This file is part of libdvbcsa.
-
- libdvbcsa is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 2 of the License,
- or (at your option) any later version.
-
- libdvbcsa is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with libdvbcsa; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA
-
- Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
-
- (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
-
-*/
-
-#include "dvbcsa/dvbcsa.h"
-#include "dvbcsa_bs.h"
-
-/***********************************************************************
- Stream cipher transpose
- */
-
-/* 64 rows of 64 bits transposition (bytes transp. - 8x8 rotate counterclockwise)*/
-
-void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
-{
- int i, j;
-
- for (i = 0; pcks->data; i++)
- {
- uint64_t t, b;
-
- if (pcks->data)
- {
- if (pcks->len >= 8)
- t = dvbcsa_load_le64(pcks->data);
- pcks++;
- }
-
- if (pcks->data)
- {
- if (pcks->len >= 8)
- b = dvbcsa_load_le64(pcks->data);
- pcks++;
- }
-
- row[i] = BS_VAL(b, t);
- }
-
- for (i = 0; i < 32; i++)
- {
- dvbcsa_bs_word_t t, b;
-
- t = row[i];
- b = row[32 + i];
- row[i] = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(BS_AND(b, BS_VAL64(00000000ffffffff)), 4));
- row[32 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(BS_AND(t, BS_VAL64(ffffffff00000000)), 4));
- }
-
- for (j = 0; j < 64; j += 32)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 16; i++)
- {
- t = row[j + i];
- b = row[j + 16 + i];
- row[j + i] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
- row[j + 16 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
- }
- }
-
- for (j = 0; j < 64; j += 16)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 8; i++)
- {
- t = row[j + i];
- b = row[j + 8 + i];
- row[j + i] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
- row[j + 8 + i] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
- }
- }
-
- for (j = 0; j < 64; j += 8)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 4; i++)
- {
- b = row[j + i];
- t = row[j + 4 + i];
- row[j + i] = BS_OR(BS_AND(b, BS_VAL8(0f)), BS_SHL(BS_AND(t, BS_VAL8(0f)), 4));
- row[j + 4 + i] = BS_OR(BS_AND(t, BS_VAL8(f0)), BS_SHR(BS_AND(b, BS_VAL8(f0)), 4));
- }
- }
-
- for (j = 0; j < 64; j += 4)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 2; i++)
- {
- b = row[j + i];
- t = row[j + 2 + i];
- row[j + i] = BS_OR(BS_AND(b, BS_VAL8(33)), BS_SHL(BS_AND(t, BS_VAL8(33)), 2));
- row[j + 2 + i] = BS_OR(BS_AND(t, BS_VAL8(cc)), BS_SHR(BS_AND(b, BS_VAL8(cc)), 2));
- }
- }
-
- for (j = 0; j < 64; j += 2)
- {
- dvbcsa_bs_word_t t, b;
-
- b = row[j];
- t = row[j + 1];
- row[j] = BS_OR(BS_AND(b, BS_VAL8(55)), BS_SHL(BS_AND(t, BS_VAL8(55)), 1));
- row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(aa)), BS_SHR(BS_AND(b, BS_VAL8(aa)), 1));
- }
-}
-
-/* 8 rows of 64 bits transposition (bytes transp. - 8x8 rotate clockwise)*/
-
-void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
- unsigned int index, dvbcsa_bs_word_t *row)
-{
- int i, j;
-
- for (i = 0; i < 4; i++)
- {
- dvbcsa_bs_word_t t, b;
-
- t = row[i];
- b = row[4 + i];
- row[i] = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(BS_AND(b, BS_VAL64(00000000ffffffff)), 4));
- row[4 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(BS_AND(t, BS_VAL64(ffffffff00000000)), 4));
- }
-
- for (j = 0; j < 8; j += 4)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 2; i++)
- {
- t = row[j + i];
- b = row[j + 2 + i];
- row[j + i] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
- row[j + 2 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
- }
- }
-
- for (j = 0; j < 8; j += 2)
- {
- dvbcsa_bs_word_t t, b;
-
- t = row[j];
- b = row[j + 1];
- row[j] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
- row[j + 1] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
- }
-
- for (j = 0; j < 8; j++)
- {
- dvbcsa_bs_word_t t;
-
- t = row[j];
-
- t = BS_OR( BS_AND(t, BS_VAL64(f0f0f0f00f0f0f0f)),
- BS_OR(BS_SHR(BS_AND(t, BS_VAL64(0f0f0f0f00000000)), 28),
- BS_SHL(BS_AND(t, BS_VAL64(00000000f0f0f0f0)), 28)));
-
- t = BS_OR( BS_AND(t, BS_VAL32( cccc3333)),
- BS_OR(BS_SHR(BS_AND(t, BS_VAL32( 33330000)), 14),
- BS_SHL(BS_AND(t, BS_VAL32( 0000cccc)), 14)));
-
- t = BS_OR( BS_AND(t, BS_VAL16( aa55)),
- BS_OR(BS_SHR(BS_AND(t, BS_VAL16( 5500)), 7 ),
- BS_SHL(BS_AND(t, BS_VAL16( 00aa)), 7 )));
-
- for (i = 0; i < BS_BATCH_BYTES; i++)
- {
- static const unsigned int p[16] =
- {
- 0 , 8, 1, 9, 2, 10, 3, 11,
- 4, 12, 5, 13, 6, 14, 7, 15
- };
-
- unsigned int k = j * BS_BATCH_BYTES + i;
-
- if (!pcks[k].data)
- return;
-
- if (index < pcks[k].len)
- pcks[k].data[index] ^= BS_EXTRACT8(t, p[i]);
- }
- }
-}
-
diff --git a/src/dvbcsa_bs_transpose32.c b/src/dvbcsa_bs_transpose32.c
deleted file mode 100644
index 2cfff7a..0000000
--- a/src/dvbcsa_bs_transpose32.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
-
- This file is part of libdvbcsa.
-
- libdvbcsa is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 2 of the License,
- or (at your option) any later version.
-
- libdvbcsa is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with libdvbcsa; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA
-
- Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
-
- (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
-
-*/
-
-#include "dvbcsa/dvbcsa.h"
-#include "dvbcsa_bs.h"
-
-/***********************************************************************
- Stream cipher transpose
- */
-
-/* 64 rows of 32 bits transposition (bytes transp. - 8x8 rotate counterclockwise)*/
-
-void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
-{
- int i, j;
-
- for (i = 0; pcks[i].data; i++)
- if (pcks[i].len >= 8)
- {
- row[i ] = BS_VAL(dvbcsa_load_le32(pcks[i].data ));
- row[i + 32] = BS_VAL(dvbcsa_load_le32(pcks[i].data + 4));
- }
-
- for (j = 0; j < 64; j += 32)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 16; i++)
- {
- t = row[j + i];
- b = row[j + 16 + i];
- row[j + i ] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
- row[j + 16 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
- }
- }
-
- for (j = 0; j < 64; j += 16)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 8; i++)
- {
- t = row[j + i];
- b = row[j + 8 + i];
- row[j + i ] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
- row[j + 8 + i] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
- }
- }
-
- for (j = 0; j < 64; j += 8)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 4; i++)
- {
- t = row[j + i];
- b = row[j + 4 + i];
- row[j + i ] = BS_OR(BS_AND(b, BS_VAL8(0f)), BS_SHL(BS_AND(t, BS_VAL8(0f)), 4));
- row[j + 4 + i] = BS_OR(BS_AND(t, BS_VAL8(f0)), BS_SHR(BS_AND(b, BS_VAL8(f0)), 4));
- }
- }
-
- for (j = 0; j < 64; j += 4)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 2; i++)
- {
- t = row[j + i];
- b = row[j + 2 + i];
- row[j + i ] = BS_OR(BS_AND(b, BS_VAL8(33)), BS_SHL(BS_AND(t, BS_VAL8(33)), 2));
- row[j + 2 + i] = BS_OR(BS_AND(t, BS_VAL8(cc)), BS_SHR(BS_AND(b, BS_VAL8(cc)), 2));
- }
- }
-
- for (j = 0; j < 64; j += 2)
- {
- dvbcsa_bs_word_t t, b;
-
- t = row[j];
- b = row[j + 1];
- row[j ] = BS_OR(BS_AND(b, BS_VAL8(55)), BS_SHL(BS_AND(t, BS_VAL8(55)), 1));
- row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(aa)), BS_SHR(BS_AND(b, BS_VAL8(aa)), 1));
- }
-}
-
-/* 64 rows of 32 bits transposition (bytes transp. - 8x8 rotate clockwise)*/
-
-void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
- unsigned int index, dvbcsa_bs_word_t *row)
-{
- int i, j;
-
- for (i = 0; i < 4; i++)
- {
- dvbcsa_bs_word_t t, b;
-
- t = row[i];
- b = row[4 + i];
- row[i ] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
- row[4 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
- }
-
- for (j = 0; j < 8; j += 4)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 2; i++)
- {
- t = row[j + i];
- b = row[j + 2 + i];
- row[j + i ] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
- row[j + i + 2] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
- }
- }
-
- for (j = 0; j < 8; j += 2)
- {
- dvbcsa_bs_word_t t, b;
-
- t = row[j];
- b = row[j + 1];
- row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(0f)), BS_SHL(BS_AND(b, BS_VAL8(0f)), 4)); //(t & 0x0f0f0f0f) | ((b & 0x0f0f0f0f) << 4);
- row[j ] = BS_OR(BS_AND(b, BS_VAL8(f0)), BS_SHR(BS_AND(t, BS_VAL8(f0)), 4));//((t & 0xf0f0f0f0) >> 4) | (b & 0xf0f0f0f0);
- }
-
- for (j = 0; j < 8; j++)
- {
- dvbcsa_bs_word_t t;
-
- t = row[j];
-
- t = BS_OR( BS_AND(t, BS_VAL32(cccc3333)),
- BS_OR(BS_SHR(BS_AND(t, BS_VAL32(33330000)), 14),
- BS_SHL(BS_AND(t, BS_VAL32(0000cccc)), 14)));
-
- t = BS_OR( BS_AND(t, BS_VAL16( aa55)),
- BS_OR(BS_SHR(BS_AND(t, BS_VAL16( 5500)), 7 ),
- BS_SHL(BS_AND(t, BS_VAL16( 00aa)), 7 )));
-
- t = BS_OR( BS_AND(t, BS_VAL8 ( 81)),
-
- BS_OR(BS_SHR(BS_AND(t, BS_VAL8 ( 10)), 3 ),
- BS_OR(BS_SHR(BS_AND(t, BS_VAL8 ( 20)), 2 ),
- BS_OR(BS_SHR(BS_AND(t, BS_VAL8 ( 40)), 1 ),
-
- BS_OR(BS_SHL(BS_AND(t, BS_VAL8 ( 02)), 1 ),
- BS_OR(BS_SHL(BS_AND(t, BS_VAL8 ( 04)), 2 ),
- BS_SHL(BS_AND(t, BS_VAL8 ( 08)), 3 )))))));
-
- for (i = 0; i < 4; i++)
- {
- unsigned int k = j * 4 + i;
-
- if (!pcks[k].data)
- return;
-
- if (index < pcks[k].len)
- pcks[k].data[index] ^= BS_EXTRACT8(t, 3 - i);
- }
- }
-}
-
diff --git a/src/dvbcsa_bs_transpose64.c b/src/dvbcsa_bs_transpose64.c
deleted file mode 100644
index c75127b..0000000
--- a/src/dvbcsa_bs_transpose64.c
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
-
- This file is part of libdvbcsa.
-
- libdvbcsa is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 2 of the License,
- or (at your option) any later version.
-
- libdvbcsa is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with libdvbcsa; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA
-
- Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
-
- (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
-
-*/
-
-#include "dvbcsa/dvbcsa.h"
-#include "dvbcsa_bs.h"
-
-/***********************************************************************
- Stream cipher transpose
- */
-
-/* 64 rows of 64 bits transposition (bytes transp. - 8x8 rotate counterclockwise)*/
-
-void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
-{
- int i, j;
-
- for (i = 0; pcks[i].data; i++)
- if (pcks[i].len >= 8)
- row[i] = BS_VAL(dvbcsa_load_le64(pcks[i].data));
-
- for (i = 0; i < 32; i++)
- {
- dvbcsa_bs_word_t t, b;
-
- t = row[i];
- b = row[32 + i];
- row[i] = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(b, 4));
- row[32 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(t, 4));
- }
-
- for (j = 0; j < 64; j += 32)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 16; i++)
- {
- t = row[j + i];
- b = row[j + 16 + i];
- row[j + i] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
- row[j + 16 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
- }
- }
-
- for (j = 0; j < 64; j += 16)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 8; i++)
- {
- t = row[j + i];
- b = row[j + 8 + i];
- row[j + i] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
- row[j + 8 + i] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
- }
- }
-
- for (j = 0; j < 64; j += 8)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 4; i++)
- {
- b = row[j + i];
- t = row[j + 4 + i];
- row[j + i] = BS_OR(BS_AND(b, BS_VAL8(0f)), BS_SHL(BS_AND(t, BS_VAL8(0f)), 4));
- row[j + 4 + i] = BS_OR(BS_AND(t, BS_VAL8(f0)), BS_SHR(BS_AND(b, BS_VAL8(f0)), 4));
- }
- }
-
- for (j = 0; j < 64; j += 4)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 2; i++)
- {
- b = row[j + i];
- t = row[j + 2 + i];
- row[j + i] = BS_OR(BS_AND(b, BS_VAL8(33)), BS_SHL(BS_AND(t, BS_VAL8(33)), 2));
- row[j + 2 + i] = BS_OR(BS_AND(t, BS_VAL8(cc)), BS_SHR(BS_AND(b, BS_VAL8(cc)), 2));
- }
- }
-
- for (j = 0; j < 64; j += 2)
- {
- dvbcsa_bs_word_t t, b;
-
- b = row[j];
- t = row[j + 1];
- row[j] = BS_OR(BS_AND(b, BS_VAL8(55)), BS_SHL(BS_AND(t, BS_VAL8(55)), 1));
- row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(aa)), BS_SHR(BS_AND(b, BS_VAL8(aa)), 1));
- }
-}
-
-/* 8 rows of 64 bits transposition (bytes transp. - 8x8 rotate clockwise)*/
-
-void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
- unsigned int index, dvbcsa_bs_word_t *row)
-{
- int i, j;
-
- for (i = 0; i < 4; i++)
- {
- dvbcsa_bs_word_t t, b;
-
- t = row[i];
- b = row[4 + i];
- row[i] = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(b, 4));
- row[4 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(t, 4));
- }
-
- for (j = 0; j < 8; j += 4)
- {
- dvbcsa_bs_word_t t, b;
-
- for (i = 0; i < 2; i++)
- {
- t = row[j + i];
- b = row[j + 2 + i];
- row[j + i] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
- row[j + 2 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
- }
- }
-
- for (j = 0; j < 8; j += 2)
- {
- dvbcsa_bs_word_t t, b;
-
- t = row[j];
- b = row[j + 1];
- row[j] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
- row[j + 1] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
- }
-
- for (j = 0; j < 8; j++)
- {
- dvbcsa_bs_word_t t;
-
- t = row[j];
-
- t = BS_OR( BS_AND(t, BS_VAL64(f0f0f0f00f0f0f0f)),
- BS_OR(BS_SHR(BS_AND(t, BS_VAL64(0f0f0f0f00000000)), 28),
- BS_SHL(BS_AND(t, BS_VAL64(00000000f0f0f0f0)), 28)));
-
- t = BS_OR( BS_AND(t, BS_VAL32( cccc3333)),
- BS_OR(BS_SHR(BS_AND(t, BS_VAL32( 33330000)), 14),
- BS_SHL(BS_AND(t, BS_VAL32( 0000cccc)), 14)));
-
- t = BS_OR( BS_AND(t, BS_VAL16( aa55)),
- BS_OR(BS_SHR(BS_AND(t, BS_VAL16( 5500)), 7 ),
- BS_SHL(BS_AND(t, BS_VAL16( 00aa)), 7 )));
-
- for (i = 0; i < BS_BATCH_BYTES; i++)
- {
- unsigned int k = j * BS_BATCH_BYTES + i;
-
- if (!pcks[k].data)
- return;
-
- if (index < pcks[k].len)
- pcks[k].data[index] ^= BS_EXTRACT8(t, i);
- }
- }
-}
-
diff --git a/src/dvbcsa_bs_transpose_block.c b/src/dvbcsa_bs_transpose_block.c
new file mode 100644
index 0000000..5dc4472
--- /dev/null
+++ b/src/dvbcsa_bs_transpose_block.c
@@ -0,0 +1,97 @@
+/*
+
+ This file is part of libdvbcsa.
+
+ libdvbcsa is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2 of the License,
+ or (at your option) any later version.
+
+ libdvbcsa is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libdvbcsa; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA
+
+ Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
+
+ (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
+
+*/
+
+#include "dvbcsa/dvbcsa.h"
+#include "dvbcsa_bs.h"
+#include "dvbcsa_bs_transpose.h"
+
+/*
+ Block cipher transpose
+*/
+
+DVBCSA_INLINE static inline void
+dvbcsa_bs_matrix_transpose_block(dvbcsa_bs_word_t *row)
+{
+ int j;
+
+ for (j = 0; j < 64; j += 32)
+ {
+ int i;
+ for (i = 0; i < 8; i++)
+ {
+ dvbcsa_bs_word_t a0, a1, a2, a3;
+
+ a0 = row[j + i + 0];
+ a1 = row[j + i + 8];
+ a2 = row[j + i + 16];
+ a3 = row[j + i + 24];
+
+ BS_SWAP8(a0, a1);
+ BS_SWAP8(a2, a3);
+ BS_SWAP16(a0, a2);
+ BS_SWAP16(a1, a3);
+
+ row[j + i + 0] = a0;
+ row[j + i + 8] = a1;
+ row[j + i + 16] = a2;
+ row[j + i + 24] = a3;
+ }
+ }
+}
+
+void dvbcsa_bs_block_transpose_in (dvbcsa_bs_word_t *out,
+ const struct dvbcsa_bs_batch_s *pcks,
+ unsigned int offset)
+{
+ uint32_t *ri = (uint32_t *)out;
+ unsigned int i;
+
+ for (i = 0; pcks[i].data; i++)
+ if (offset < (pcks[i].len & (unsigned)~0x7))
+ {
+ dvbcsa_copy_32((uint8_t *)(ri + i), pcks[i].data + offset);
+ dvbcsa_copy_32((uint8_t *)(ri + i + BS_BATCH_SIZE), pcks[i].data + offset + 4);
+ }
+
+ dvbcsa_bs_matrix_transpose_block(out);
+}
+
+void dvbcsa_bs_block_transpose_out (dvbcsa_bs_word_t *in,
+ const struct dvbcsa_bs_batch_s *pcks,
+ unsigned int offset)
+{
+ uint32_t *ri = (uint32_t *) in;
+ unsigned int i;
+
+ dvbcsa_bs_matrix_transpose_block(in);
+
+ for (i = 0; pcks[i].data; i++)
+ if (offset < (pcks[i].len & (unsigned)~0x7))
+ {
+ dvbcsa_copy_32(pcks[i].data + offset, (uint8_t *)(ri + i));
+ dvbcsa_copy_32(pcks[i].data + offset + 4, (uint8_t *)(ri + i + BS_BATCH_SIZE));
+ }
+}
+
diff --git a/src/dvbcsa_bs_transpose_stream.c b/src/dvbcsa_bs_transpose_stream.c
new file mode 100644
index 0000000..8fc1a2d
--- /dev/null
+++ b/src/dvbcsa_bs_transpose_stream.c
@@ -0,0 +1,231 @@
+/*
+
+ This file is part of libdvbcsa.
+
+ libdvbcsa is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2 of the License,
+ or (at your option) any later version.
+
+ libdvbcsa is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libdvbcsa; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA
+
+ Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
+
+ (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
+
+*/
+
+#include "dvbcsa/dvbcsa.h"
+#include "dvbcsa_bs.h"
+#include "dvbcsa_bs_transpose.h"
+
+/*
+ Stream cipher transpose for dvbcsa_bs_word_t sizes = 64, 128, 256, ... bits.
+*/
+
+static void dvbcsa_bs_matrix_transpose_64x(dvbcsa_bs_word_t *row)
+{
+ int j;
+
+#if defined(__i386__)
+
+ /* short of registers */
+
+ for (j = 0; j < 16; j++)
+ {
+ dvbcsa_bs_word_t a0, a1, a2, a3;
+
+ a0 = row[j + 0];
+ a1 = row[j + 16];
+ a2 = row[j + 32];
+ a3 = row[j + 48];
+
+ BS_SWAP16(a0, a1);
+ BS_SWAP16(a2, a3);
+ BS_SWAP32(a0, a2);
+ BS_SWAP32(a1, a3);
+
+ row[j + 0] = a0;
+ row[j + 16] = a1;
+ row[j + 32] = a2;
+ row[j + 48] = a3;
+ }
+
+ for (j = 0; j < 64; j += 16)
+ {
+ int i;
+ for (i = 0; i < 4; i++)
+ {
+ dvbcsa_bs_word_t a0, a1, a2, a3;
+
+ a0 = row[j + i + 0];
+ a1 = row[j + i + 4];
+ a2 = row[j + i + 8];
+ a3 = row[j + i + 12];
+
+ BS_SWAP4(a0, a1);
+ BS_SWAP4(a2, a3);
+ BS_SWAP8(a0, a2);
+ BS_SWAP8(a1, a3);
+
+ row[j + i + 0] = a0;
+ row[j + i + 4] = a1;
+ row[j + i + 8] = a2;
+ row[j + i + 12] = a3;
+ }
+ }
+
+ for (j = 0; j < 64; j += 4)
+ {
+ dvbcsa_bs_word_t a0, a1, a2, a3;
+
+ a0 = row[j + 0];
+ a1 = row[j + 1];
+ a2 = row[j + 2];
+ a3 = row[j + 3];
+
+ BS_SWAP1(a0, a1);
+ BS_SWAP1(a2, a3);
+ BS_SWAP2(a0, a2);
+ BS_SWAP2(a1, a3);
+
+ row[j + 0] = a0;
+ row[j + 1] = a1;
+ row[j + 2] = a2;
+ row[j + 3] = a3;
+ }
+
+#else
+
+ for (j = 0; j < 64; j += 8)
+ {
+ dvbcsa_bs_word_t a0, a1, a2, a3, b0, b1, b2, b3;
+
+ a0 = row[j + 0];
+ a1 = row[j + 1];
+ a2 = row[j + 2];
+ a3 = row[j + 3];
+
+ b0 = row[j + 4];
+ b1 = row[j + 5];
+ b2 = row[j + 6];
+ b3 = row[j + 7];
+
+ BS_SWAP1(a0, a1);
+ BS_SWAP1(a2, a3);
+ BS_SWAP1(b0, b1);
+ BS_SWAP1(b2, b3);
+
+ BS_SWAP2(a0, a2);
+ BS_SWAP2(a1, a3);
+ BS_SWAP2(b0, b2);
+ BS_SWAP2(b1, b3);
+
+ BS_SWAP4(a0, b0);
+ BS_SWAP4(a1, b1);
+ BS_SWAP4(a2, b2);
+ BS_SWAP4(a3, b3);
+
+ row[j + 0] = a0;
+ row[j + 1] = a1;
+ row[j + 2] = a2;
+ row[j + 3] = a3;
+
+ row[j + 4] = b0;
+ row[j + 5] = b1;
+ row[j + 6] = b2;
+ row[j + 7] = b3;
+ }
+
+ for (j = 0; j < 8; j++)
+ {
+ dvbcsa_bs_word_t a0, a1, a2, a3, b0, b1, b2, b3;
+
+ a0 = row[j + 0];
+ a1 = row[j + 8];
+ a2 = row[j + 16];
+ a3 = row[j + 24];
+
+ b0 = row[j + 32];
+ b1 = row[j + 40];
+ b2 = row[j + 48];
+ b3 = row[j + 56];
+
+ BS_SWAP8(a0, a1);
+ BS_SWAP8(a2, a3);
+ BS_SWAP8(b0, b1);
+ BS_SWAP8(b2, b3);
+
+ BS_SWAP16(a0, a2);
+ BS_SWAP16(a1, a3);
+ BS_SWAP16(b0, b2);
+ BS_SWAP16(b1, b3);
+
+ BS_SWAP32(a0, b0);
+ BS_SWAP32(a1, b1);
+ BS_SWAP32(a2, b2);
+ BS_SWAP32(a3, b3);
+
+ row[j + 0] = a0;
+ row[j + 8] = a1;
+ row[j + 16] = a2;
+ row[j + 24] = a3;
+
+ row[j + 32] = b0;
+ row[j + 40] = b1;
+ row[j + 48] = b2;
+ row[j + 56] = b3;
+ }
+
+#endif
+}
+
+
+/* 64 rows of 64 bits transposition (bytes transp. - 8x8 rotate counterclockwise)*/
+
+void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
+{
+ uint64_t *p;
+
+ for (p = (uint64_t *)row; pcks->data; p++, pcks++)
+ {
+ if (pcks->len >= 8)
+ dvbcsa_copy_64((uint8_t *)p, pcks->data);
+ }
+
+ dvbcsa_bs_matrix_transpose_64x(row);
+}
+
+/* 64 rows transposition, then up to 8 resulting bytes are XORed into each packet at index */
+
+void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
+ unsigned int index, dvbcsa_bs_word_t *row)
+{
+ int i, j;
+ uint8_t *p;
+
+ dvbcsa_bs_matrix_transpose_64x(row);
+
+ for (p = (uint8_t *)row; pcks->data; pcks++)
+ {
+ if (index + 8 <= pcks->len)
+ {
+ dvbcsa_xor_64(pcks->data + index, p);
+ }
+ else
+ {
+ for (j = 0, i = index; i < pcks->len; i++, j++)
+ pcks->data[i] ^= p[j];
+ }
+ p += 8;
+ }
+}
+
diff --git a/src/dvbcsa_bs_transpose_stream32.c b/src/dvbcsa_bs_transpose_stream32.c
new file mode 100644
index 0000000..fc2f847
--- /dev/null
+++ b/src/dvbcsa_bs_transpose_stream32.c
@@ -0,0 +1,150 @@
+/*
+
+ This file is part of libdvbcsa.
+
+ libdvbcsa is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2 of the License,
+ or (at your option) any later version.
+
+ libdvbcsa is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libdvbcsa; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA
+
+ Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
+
+ (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
+
+*/
+
+#include "dvbcsa/dvbcsa.h"
+#include "dvbcsa_bs.h"
+#include "dvbcsa_bs_transpose.h"
+
+/*
+ Stream cipher transpose for dvbcsa_bs_word_t size = 32 bits.
+*/
+
+static void dvbcsa_bs_matrix_transpose_64x32(dvbcsa_bs_word_t *row)
+{
+ int j;
+ /* three in-place passes of BS_SWAPn steps over row[0..63]; NOTE(review): BS_SWAPn presumably come from dvbcsa_bs_transpose.h - confirm */
+ for (j = 0; j < 16; j++) /* pass 1: BS_SWAP16 on row pairs (j, j+16) and (j+32, j+48) */
+ {
+ dvbcsa_bs_word_t a0, a1, a2, a3;
+
+ a0 = row[j + 0];
+ a1 = row[j + 16];
+ a2 = row[j + 32];
+ a3 = row[j + 48];
+
+ BS_SWAP16(a0, a1);
+ BS_SWAP16(a2, a3);
+
+ row[j + 0] = a0;
+ row[j + 16] = a1;
+ row[j + 32] = a2;
+ row[j + 48] = a3;
+ }
+ /* pass 2: within each group of 16 rows, BS_SWAP4 pairs rows 4 apart, then BS_SWAP8 pairs rows 8 apart */
+ for (j = 0; j < 64; j += 16)
+ {
+ int i;
+ for (i = 0; i < 4; i++)
+ {
+ dvbcsa_bs_word_t a0, a1, a2, a3;
+
+ a0 = row[j + i + 0];
+ a1 = row[j + i + 4];
+ a2 = row[j + i + 8];
+ a3 = row[j + i + 12];
+
+ BS_SWAP4(a0, a1);
+ BS_SWAP4(a2, a3);
+ BS_SWAP8(a0, a2);
+ BS_SWAP8(a1, a3);
+
+ row[j + i + 0] = a0;
+ row[j + i + 4] = a1;
+ row[j + i + 8] = a2;
+ row[j + i + 12] = a3;
+ }
+ }
+ /* pass 3: within each group of 4 adjacent rows, BS_SWAP1 pairs rows 1 apart, then BS_SWAP2 pairs rows 2 apart */
+ for (j = 0; j < 64; j += 4)
+ {
+ dvbcsa_bs_word_t a0, a1, a2, a3;
+
+ a0 = row[j + 0];
+ a1 = row[j + 1];
+ a2 = row[j + 2];
+ a3 = row[j + 3];
+
+ BS_SWAP1(a0, a1);
+ BS_SWAP1(a2, a3);
+ BS_SWAP2(a0, a2);
+ BS_SWAP2(a1, a3);
+
+ row[j + 0] = a0;
+ row[j + 1] = a1;
+ row[j + 2] = a2;
+ row[j + 3] = a3;
+ }
+}
+
+void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
+{
+ int i;
+ /* packet i: bytes 0-3 are copied to row[i], bytes 4-7 to row[i + 32]; then row is transposed */
+ for (i = 0; pcks[i].data; i++) /* a NULL data pointer ends the batch */
+ if (pcks[i].len >= 8) /* shorter packets are skipped; their two words in row are not written */
+ {
+ dvbcsa_copy_32((uint8_t *)(row + i), pcks[i].data);
+ dvbcsa_copy_32((uint8_t *)(row + i + 32), pcks[i].data + 4);
+ }
+ /* NOTE(review): literal 32 above vs BS_BATCH_SIZE in dvbcsa_bs_stream_transpose_out - presumably equal here, confirm */
+ dvbcsa_bs_matrix_transpose_64x32(row);
+}
+
+void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
+ unsigned int index, dvbcsa_bs_word_t *row)
+{
+ int i, j;
+ uint8_t *p1, *p2;
+ /* transpose row, then XOR up to 8 bytes into each packet at offset index: 4 from p1, then 4 from p2 */
+ dvbcsa_bs_matrix_transpose_64x32(row);
+ /* p1 starts at row, p2 at row + BS_BATCH_SIZE; both advance 4 bytes per packet */
+ p1 = (uint8_t *)row;
+ p2 = (uint8_t *)(row + BS_BATCH_SIZE);
+ for (; pcks->data; pcks++, p1 += 4, p2 += 4) /* a NULL data pointer ends the batch */
+ {
+ if (index + 4 <= pcks->len) /* the first 4 bytes lie inside the packet */
+ {
+ dvbcsa_xor_32(pcks->data + index, p1);
+ }
+ else /* fewer than 4 bytes before len: XOR what remains from p1, then go to the next packet */
+ {
+ for (j = 0, i = index; i < pcks->len; i++, j++)
+ pcks->data[i] ^= p1[j];
+ continue;
+ }
+ if (index + 8 <= pcks->len) /* the second 4 bytes lie inside the packet */
+ {
+ dvbcsa_xor_32(pcks->data + index + 4, p2);
+ }
+ else /* XOR the bytes that remain before len from p2 */
+ {
+ for (j = 0, i = index + 4; i < pcks->len; i++, j++)
+ pcks->data[i] ^= p2[j];
+ continue; /* last statement of the loop body, so this has no effect */
+ }
+ }
+
+}
+
diff --git a/src/dvbcsa_pv.h b/src/dvbcsa_pv.h
index d92bc98..d0ce8eb 100644
--- a/src/dvbcsa_pv.h
+++ b/src/dvbcsa_pv.h
@@ -83,12 +83,26 @@ void dvbcsa_stream_xor (const dvbcsa_cw_t cw, const dvbcsa_block_t iv,
void dvbcsa_key_schedule_block(const dvbcsa_cw_t cw, uint8_t * kk);
+/* target support for 32 and 64 bit unaligned memory access */
+
+#if defined(__i386__) || defined(__x86_64__)
+#define DVBCSA_UNALIGNED_ACCESS_32 1
+#define DVBCSA_UNALIGNED_ACCESS_64 1
+#endif
+
+#if defined(__arm__) && defined(__ARM_FEATURE_UNALIGNED)
+/* only 32 bit unaligned access is allowed for armv6, armv7, armv8 */
+#define DVBCSA_UNALIGNED_ACCESS_32 1
+#endif
+
DVBCSA_INLINE static inline void
dvbcsa_xor_64 (uint8_t *b, const uint8_t *a)
{
-#if defined(__i386__) || defined(__x86_64__)
- /* target support non aligned memory access */
+#if defined(DVBCSA_UNALIGNED_ACCESS_64)
*(uint64_t*)b ^= *(uint64_t*)a;
+#elif defined(DVBCSA_UNALIGNED_ACCESS_32)
+ ((uint32_t *)b)[0] ^= ((uint32_t *)a)[0];
+ ((uint32_t *)b)[1] ^= ((uint32_t *)a)[1];
#else
unsigned int i;
@@ -97,11 +111,55 @@ dvbcsa_xor_64 (uint8_t *b, const uint8_t *a)
#endif
}
+DVBCSA_INLINE static inline void
+dvbcsa_xor_32 (uint8_t *b, const uint8_t *a)
+{
+#ifdef DVBCSA_UNALIGNED_ACCESS_32
+ /* b[0..3] ^= a[0..3] as one 32-bit XOR on targets with unaligned access; NOTE(review): the cast drops const from a */
+ *(uint32_t*)b ^= *(uint32_t*)a;
+#else
+ unsigned int i;
+ /* portable fallback: XOR the 4 bytes one at a time */
+ for (i = 0; i < 4; i++)
+ b[i] ^= a[i];
+#endif
+}
+
+DVBCSA_INLINE static inline void
+dvbcsa_copy_64 (uint8_t *b, const uint8_t *a)
+{
+#if defined(DVBCSA_UNALIGNED_ACCESS_64)
+ *(uint64_t*)b = *(uint64_t*)a; /* copy a[0..7] to b[0..7] with one unaligned 64-bit access */
+#elif defined(DVBCSA_UNALIGNED_ACCESS_32)
+ ((uint32_t *)b)[0] = ((uint32_t *)a)[0]; /* same copy as two unaligned 32-bit accesses */
+ ((uint32_t *)b)[1] = ((uint32_t *)a)[1];
+#else
+ unsigned int i;
+ /* portable fallback: copy the 8 bytes one at a time */
+ for (i = 0; i < 8; i++)
+ b[i] = a[i];
+#endif
+}
+
+DVBCSA_INLINE static inline void
+dvbcsa_copy_32 (uint8_t *b, const uint8_t *a)
+{
+#ifdef DVBCSA_UNALIGNED_ACCESS_32
+ /* copy a[0..3] to b[0..3] as one 32-bit access on targets with unaligned access; NOTE(review): the cast drops const from a */
+ *(uint32_t*)b = *(uint32_t*)a;
+#else
+ unsigned int i;
+ /* portable fallback: copy the 4 bytes one at a time */
+ for (i = 0; i < 4; i++)
+ b[i] = a[i];
+#endif
+}
+
DVBCSA_INLINE static inline uint32_t
dvbcsa_load_le32(const uint8_t *p)
{
-#if defined(__i386__) || defined(__x86_64__)
- /* target support non aligned le memory access */
+#if defined(DVBCSA_UNALIGNED_ACCESS_32) && defined(DVBCSA_ENDIAN_LITTLE)
+ /* target supports non aligned le memory access */
return *(uint32_t*)p;
#else
return ((uint32_t)p[3] << 24) |
@@ -114,8 +172,8 @@ dvbcsa_load_le32(const uint8_t *p)
DVBCSA_INLINE static inline uint64_t
dvbcsa_load_le64(const uint8_t *p)
{
-#if defined(__i386__) || defined(__x86_64__)
- /* target support non aligned le memory access */
+#if defined(DVBCSA_UNALIGNED_ACCESS_64) && defined(DVBCSA_ENDIAN_LITTLE)
+ /* target supports non aligned le memory access */
return *(uint64_t*)p;
#else
return (uint64_t)( ((uint64_t)p[7] << 56) |
@@ -133,8 +191,8 @@ dvbcsa_load_le64(const uint8_t *p)
DVBCSA_INLINE static inline void
dvbcsa_store_le32(uint8_t *p, const uint32_t w)
{
-#if defined(__i386__) || defined(__x86_64__)
- /* target support non aligned le memory access */
+#if defined(DVBCSA_UNALIGNED_ACCESS_32) && defined(DVBCSA_ENDIAN_LITTLE)
+ /* target supports non aligned le memory access */
*(uint32_t*)p = w;
#else
p[3] = (w >> 24);
@@ -147,8 +205,8 @@ dvbcsa_store_le32(uint8_t *p, const uint32_t w)
DVBCSA_INLINE static inline void
dvbcsa_store_le64(uint8_t *p, const uint64_t w)
{
-#if defined(__i386__) || defined(__x86_64__)
- /* target support non aligned le memory access */
+#if defined(DVBCSA_UNALIGNED_ACCESS_64) && defined(DVBCSA_ENDIAN_LITTLE)
+ /* target supports non aligned le memory access */
*(uint64_t*)p = w;
#else
p[7] = (w >> 56);
--
1.9.1
More information about the vlc-devel
mailing list