[FFmpeg-cvslog] Merge commit 'ea7ee4b4e381e0fa731458de0cbf740430eeb013'

Clément Bœsch git at videolan.org
Wed Apr 26 17:25:01 EEST 2017


ffmpeg | branch: master | Clément Bœsch <u at pkh.me> | Wed Apr 26 16:21:00 2017 +0200| [172b0e2e88832822632841e8e0d3794f974cbc93] | committer: Clément Bœsch

Merge commit 'ea7ee4b4e381e0fa731458de0cbf740430eeb013'

* commit 'ea7ee4b4e381e0fa731458de0cbf740430eeb013':
  ppc: Centralize compiler-specific altivec.h #include handling in one place

Merged-by: Clément Bœsch <u at pkh.me>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=172b0e2e88832822632841e8e0d3794f974cbc93
---

 libavcodec/ppc/audiodsp.c                  | 5 +----
 libavcodec/ppc/blockdsp.c                  | 7 +++----
 libavcodec/ppc/fdctdsp.c                   | 6 +++---
 libavcodec/ppc/fft_init.c                  | 1 -
 libavcodec/ppc/fft_vsx.c                   | 1 -
 libavcodec/ppc/fft_vsx.h                   | 1 -
 libavcodec/ppc/h264chroma_init.c           | 3 ++-
 libavcodec/ppc/h264chroma_template.c       | 1 -
 libavcodec/ppc/h264dsp.c                   | 1 -
 libavcodec/ppc/h264qpel.c                  | 4 +++-
 libavcodec/ppc/h264qpel_template.c         | 1 -
 libavcodec/ppc/hevcdsp.c                   | 5 +----
 libavcodec/ppc/hpeldsp_altivec.c           | 7 ++-----
 libavcodec/ppc/idctdsp.c                   | 9 ++++-----
 libavcodec/ppc/lossless_audiodsp_altivec.c | 6 ++----
 libavcodec/ppc/lossless_videodsp_altivec.c | 5 +----
 libavcodec/ppc/me_cmp.c                    | 5 +----
 libavcodec/ppc/mpegvideo_altivec.c         | 3 ++-
 libavcodec/ppc/mpegvideodsp.c              | 2 +-
 libavcodec/ppc/mpegvideoencdsp.c           | 6 ++----
 libavcodec/ppc/pixblockdsp.c               | 5 +----
 libavcodec/ppc/svq1enc_altivec.c           | 9 +++------
 libavcodec/ppc/vc1dsp_altivec.c            | 3 ++-
 libavcodec/ppc/vorbisdsp_altivec.c         | 6 +++---
 libavcodec/ppc/vp3dsp_altivec.c            | 3 ++-
 libavcodec/ppc/vp8dsp_altivec.c            | 4 +++-
 libpostproc/postprocess.c                  | 5 +----
 libswscale/swscale_internal.h              | 6 +-----
 28 files changed, 44 insertions(+), 76 deletions(-)

diff --git a/libavcodec/ppc/audiodsp.c b/libavcodec/ppc/audiodsp.c
index 4ee3da42d2..2e374737bd 100644
--- a/libavcodec/ppc/audiodsp.c
+++ b/libavcodec/ppc/audiodsp.c
@@ -24,15 +24,12 @@
  */
 
 #include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/audiodsp.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/blockdsp.c b/libavcodec/ppc/blockdsp.c
index 45c492ab3b..d89b77e088 100644
--- a/libavcodec/ppc/blockdsp.c
+++ b/libavcodec/ppc/blockdsp.c
@@ -21,16 +21,15 @@
  */
 
 #include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
+
 #include <string.h>
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/blockdsp.h"
 
 /* ***** WARNING ***** WARNING ***** WARNING ***** */
diff --git a/libavcodec/ppc/fdctdsp.c b/libavcodec/ppc/fdctdsp.c
index 6659046f98..4ab516c6b3 100644
--- a/libavcodec/ppc/fdctdsp.c
+++ b/libavcodec/ppc/fdctdsp.c
@@ -19,14 +19,14 @@
  */
 
 #include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/fdctdsp.h"
+
 #include "fdct.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/fft_init.c b/libavcodec/ppc/fft_init.c
index cbeaf98952..57d7c80ea4 100644
--- a/libavcodec/ppc/fft_init.c
+++ b/libavcodec/ppc/fft_init.c
@@ -23,7 +23,6 @@
 #include "config.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/fft.h"
 
diff --git a/libavcodec/ppc/fft_vsx.c b/libavcodec/ppc/fft_vsx.c
index e92975f74e..c365fa1380 100644
--- a/libavcodec/ppc/fft_vsx.c
+++ b/libavcodec/ppc/fft_vsx.c
@@ -25,7 +25,6 @@
 
 #include "config.h"
 #include "libavutil/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/fft.h"
 #include "libavcodec/fft-internal.h"
diff --git a/libavcodec/ppc/fft_vsx.h b/libavcodec/ppc/fft_vsx.h
index a85475d160..1e44031aa5 100644
--- a/libavcodec/ppc/fft_vsx.h
+++ b/libavcodec/ppc/fft_vsx.h
@@ -27,7 +27,6 @@
 
 #include "config.h"
 #include "libavutil/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/fft.h"
 #include "libavcodec/fft-internal.h"
diff --git a/libavcodec/ppc/h264chroma_init.c b/libavcodec/ppc/h264chroma_init.c
index 876efeca09..bd0d213bdc 100644
--- a/libavcodec/ppc/h264chroma_init.c
+++ b/libavcodec/ppc/h264chroma_init.c
@@ -19,12 +19,13 @@
  */
 
 #include "config.h"
+
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/h264chroma.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/h264chroma_template.c b/libavcodec/ppc/h264chroma_template.c
index d9b2a619e4..8f43e5dee1 100644
--- a/libavcodec/ppc/h264chroma_template.c
+++ b/libavcodec/ppc/h264chroma_template.c
@@ -19,7 +19,6 @@
  */
 
 #include "libavutil/mem.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 
 /* this code assumes that stride % 16 == 0 */
diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
index 22a8d4117b..e84a058d04 100644
--- a/libavcodec/ppc/h264dsp.c
+++ b/libavcodec/ppc/h264dsp.c
@@ -28,7 +28,6 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 
 #include "libavcodec/h264dec.h"
diff --git a/libavcodec/ppc/h264qpel.c b/libavcodec/ppc/h264qpel.c
index 575f504d32..bef421fa4f 100644
--- a/libavcodec/ppc/h264qpel.c
+++ b/libavcodec/ppc/h264qpel.c
@@ -19,13 +19,15 @@
  */
 
 #include "config.h"
+
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/h264qpel.h"
+
 #include "hpeldsp_altivec.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/h264qpel_template.c b/libavcodec/ppc/h264qpel_template.c
index 2f25e74840..304604c63d 100644
--- a/libavcodec/ppc/h264qpel_template.c
+++ b/libavcodec/ppc/h264qpel_template.c
@@ -25,7 +25,6 @@
 
 #include "libavutil/avassert.h"
 #include "libavutil/mem.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 
 #define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)ptr&0x0000000F));
diff --git a/libavcodec/ppc/hevcdsp.c b/libavcodec/ppc/hevcdsp.c
index 120362bebf..4b1037d792 100644
--- a/libavcodec/ppc/hevcdsp.c
+++ b/libavcodec/ppc/hevcdsp.c
@@ -20,13 +20,10 @@
  */
 
 #include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
 
+#include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 
 #include "libavcodec/hevcdsp.h"
diff --git a/libavcodec/ppc/hpeldsp_altivec.c b/libavcodec/ppc/hpeldsp_altivec.c
index 87a1f05b6a..4f19521860 100644
--- a/libavcodec/ppc/hpeldsp_altivec.c
+++ b/libavcodec/ppc/hpeldsp_altivec.c
@@ -22,16 +22,13 @@
 
 #include "config.h"
 
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/hpeldsp.h"
+
 #include "hpeldsp_altivec.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/idctdsp.c b/libavcodec/ppc/idctdsp.c
index f1b42470fb..29f625a01c 100644
--- a/libavcodec/ppc/idctdsp.c
+++ b/libavcodec/ppc/idctdsp.c
@@ -30,17 +30,16 @@
  * IDCT function itself was to factor out the partial transposition, and to
  * perform a full transpose at the end of the function. */
 
+#include "config.h"
+
 #include <stdlib.h>
 #include <string.h>
-#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/idctdsp.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/lossless_audiodsp_altivec.c b/libavcodec/ppc/lossless_audiodsp_altivec.c
index bdec25223d..298e6c38a0 100644
--- a/libavcodec/ppc/lossless_audiodsp_altivec.c
+++ b/libavcodec/ppc/lossless_audiodsp_altivec.c
@@ -19,14 +19,12 @@
  */
 
 #include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/lossless_audiodsp.h"
 
 #if HAVE_BIGENDIAN
diff --git a/libavcodec/ppc/lossless_videodsp_altivec.c b/libavcodec/ppc/lossless_videodsp_altivec.c
index 16dd99f8d7..980f85b166 100644
--- a/libavcodec/ppc/lossless_videodsp_altivec.c
+++ b/libavcodec/ppc/lossless_videodsp_altivec.c
@@ -21,15 +21,12 @@
  */
 
 #include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/lossless_videodsp.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/me_cmp.c b/libavcodec/ppc/me_cmp.c
index 9f75ed256a..17f9a4f016 100644
--- a/libavcodec/ppc/me_cmp.c
+++ b/libavcodec/ppc/me_cmp.c
@@ -21,15 +21,12 @@
  */
 
 #include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideo.h"
 #include "libavcodec/me_cmp.h"
diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c
index 1b6bda6c36..2c6ff9165b 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -25,11 +25,12 @@
 #include <stdio.h>
 
 #include "config.h"
+
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/mpegvideo.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/mpegvideodsp.c b/libavcodec/ppc/mpegvideodsp.c
index 021933255b..990a974a4e 100644
--- a/libavcodec/ppc/mpegvideodsp.c
+++ b/libavcodec/ppc/mpegvideodsp.c
@@ -23,8 +23,8 @@
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/mpegvideodsp.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/mpegvideoencdsp.c b/libavcodec/ppc/mpegvideoencdsp.c
index 3e6765ce15..b96487bf81 100644
--- a/libavcodec/ppc/mpegvideoencdsp.c
+++ b/libavcodec/ppc/mpegvideoencdsp.c
@@ -17,16 +17,14 @@
  */
 
 #include "config.h"
+
 #include <stdint.h>
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/mpegvideoencdsp.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/pixblockdsp.c b/libavcodec/ppc/pixblockdsp.c
index f5ac8509f0..01d14b4124 100644
--- a/libavcodec/ppc/pixblockdsp.c
+++ b/libavcodec/ppc/pixblockdsp.c
@@ -21,15 +21,12 @@
  */
 
 #include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/avcodec.h"
 #include "libavcodec/pixblockdsp.h"
 
diff --git a/libavcodec/ppc/svq1enc_altivec.c b/libavcodec/ppc/svq1enc_altivec.c
index 4e25e253f6..f63f086602 100644
--- a/libavcodec/ppc/svq1enc_altivec.c
+++ b/libavcodec/ppc/svq1enc_altivec.c
@@ -18,18 +18,15 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdint.h>
-
 #include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
+
+#include <stdint.h>
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/svq1enc.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/vc1dsp_altivec.c b/libavcodec/ppc/vc1dsp_altivec.c
index 83d537f0c1..bbadb2aaee 100644
--- a/libavcodec/ppc/vc1dsp_altivec.c
+++ b/libavcodec/ppc/vc1dsp_altivec.c
@@ -20,11 +20,12 @@
  */
 
 #include "config.h"
+
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/vc1dsp.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/vorbisdsp_altivec.c b/libavcodec/ppc/vorbisdsp_altivec.c
index d7557c815b..4dabf2dc7d 100644
--- a/libavcodec/ppc/vorbisdsp_altivec.c
+++ b/libavcodec/ppc/vorbisdsp_altivec.c
@@ -19,12 +19,12 @@
  */
 
 #include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
+
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/vorbisdsp.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c
index d2231d090a..a9a48d145b 100644
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@ -21,11 +21,12 @@
 #include <string.h>
 
 #include "config.h"
+
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/vp3dsp.h"
 
 #if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/vp8dsp_altivec.c b/libavcodec/ppc/vp8dsp_altivec.c
index 23e4ace7da..31201ed2d8 100644
--- a/libavcodec/ppc/vp8dsp_altivec.c
+++ b/libavcodec/ppc/vp8dsp_altivec.c
@@ -21,12 +21,14 @@
  */
 
 #include "config.h"
+
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
 #include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
+
 #include "libavcodec/vp8dsp.h"
+
 #include "hpeldsp_altivec.h"
 
 #if HAVE_ALTIVEC
diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c
index 1dc719cf93..6aa4ace337 100644
--- a/libpostproc/postprocess.c
+++ b/libpostproc/postprocess.c
@@ -89,6 +89,7 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
 #include "postprocess.h"
 #include "postprocess_internal.h"
 #include "libavutil/avstring.h"
+#include "libavutil/ppc/util_altivec.h"
 
 #include "libavutil/ffversion.h"
 const char postproc_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
@@ -110,10 +111,6 @@ const char *postproc_license(void)
     return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 }
 
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-
 #define GET_MODE_BUFFER_SIZE 500
 #define OPTIONS_ARRAY_SIZE 10
 #define BLOCK_SIZE 8
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 84d5bee5ff..0f51df95d7 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -22,11 +22,6 @@
 #define SWSCALE_SWSCALE_INTERNAL_H
 
 #include "config.h"
-
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-
 #include "version.h"
 
 #include "libavutil/avassert.h"
@@ -36,6 +31,7 @@
 #include "libavutil/log.h"
 #include "libavutil/pixfmt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/ppc/util_altivec.h"
 
 #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
 


======================================================================

diff --cc libavcodec/ppc/audiodsp.c
index 4ee3da42d2,371e0d1e2e..2e374737bd
--- a/libavcodec/ppc/audiodsp.c
+++ b/libavcodec/ppc/audiodsp.c
@@@ -31,11 -28,11 +28,11 @@@
  #include "libavutil/attributes.h"
  #include "libavutil/cpu.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
  #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/audiodsp.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  
  static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                             int order)
diff --cc libavcodec/ppc/fdctdsp.c
index 6659046f98,36d4b4e4ba..4ab516c6b3
--- a/libavcodec/ppc/fdctdsp.c
+++ b/libavcodec/ppc/fdctdsp.c
@@@ -26,10 -23,13 +23,13 @@@
  #include "libavutil/attributes.h"
  #include "libavutil/cpu.h"
  #include "libavutil/ppc/cpu.h"
+ #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/fdctdsp.h"
+ 
  #include "fdct.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  
  #define vs16(v)   ((vector signed short) (v))
  #define vs32(v)     ((vector signed int) (v))
diff --cc libavcodec/ppc/fft_init.c
index cbeaf98952,56eafb91be..57d7c80ea4
--- a/libavcodec/ppc/fft_init.c
+++ b/libavcodec/ppc/fft_init.c
@@@ -21,133 -17,13 +21,132 @@@
   */
  
  #include "config.h"
 -
  #include "libavutil/cpu.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
 -
 +#include "libavutil/ppc/util_altivec.h"
  #include "libavcodec/fft.h"
  
 +/**
 + * Do a complex FFT with the parameters defined in ff_fft_init().
 + * The input data must be permuted beforehand using the s->revtab table.
 + * No 1.0 / sqrt(n) normalization is done.
 + * AltiVec-enabled:
 + * This code assumes that the 'z' pointer is 16 bytes-aligned.
 + * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
 + */
 +
 +#if HAVE_VSX
 +#include "fft_vsx.h"
 +#else
 +void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
  void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
 +#endif
 +
 +#if HAVE_GNU_AS && HAVE_ALTIVEC
 +static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
 +{
 +    int j, k;
 +    int n = 1 << s->mdct_bits;
 +    int n4 = n >> 2;
 +    int n8 = n >> 3;
 +    int n32 = n >> 5;
 +    const uint16_t *revtabj = s->revtab;
 +    const uint16_t *revtabk = s->revtab+n4;
 +    const vec_f *tcos = (const vec_f*)(s->tcos+n8);
 +    const vec_f *tsin = (const vec_f*)(s->tsin+n8);
 +    const vec_f *pin = (const vec_f*)(input+n4);
 +    vec_f *pout = (vec_f*)(output+n4);
 +
 +    /* pre rotation */
 +    k = n32-1;
 +    do {
 +        vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
 +#define CMULA(p,o0,o1,o2,o3)\
 +        a = pin[ k*2+p];                       /* { z[k].re,    z[k].im,    z[k+1].re,  z[k+1].im  } */\
 +        b = pin[-k*2-p-1];                     /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
 +        re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re,    z[k+1].re,  z[-k-2].re, z[-k-1].re } */\
 +        im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im,  z[k].im    } */\
 +        cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
 +        sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
 +        r##p = im*cos - re*sin;\
 +        i##p = re*cos + im*sin;
 +#define STORE2(v,dst)\
 +        j = dst;\
 +        vec_ste(v, 0, output+j*2);\
 +        vec_ste(v, 4, output+j*2);
 +#define STORE8(p)\
 +        a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
 +        b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
 +        c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
 +        d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
 +        STORE2(a, revtabk[ p*2-4]);\
 +        STORE2(b, revtabk[ p*2-3]);\
 +        STORE2(c, revtabj[-p*2+2]);\
 +        STORE2(d, revtabj[-p*2+3]);
 +
 +        cos0 = tcos[k];
 +        sin0 = tsin[k];
 +        cos1 = tcos[-k-1];
 +        sin1 = tsin[-k-1];
 +        CMULA(0, 0,1,2,3);
 +        CMULA(1, 2,3,0,1);
 +        STORE8(0);
 +        STORE8(1);
 +        revtabj += 4;
 +        revtabk -= 4;
 +        k--;
 +    } while(k >= 0);
 +
 +#if HAVE_VSX
 +    ff_fft_calc_vsx(s, (FFTComplex*)output);
 +#else
 +    ff_fft_calc_altivec(s, (FFTComplex*)output);
 +#endif
 +
 +    /* post rotation + reordering */
 +    j = -n32;
 +    k = n32-1;
 +    do {
 +        vec_f cos,sin,re,im,a,b,c,d;
 +#define CMULB(d0,d1,o)\
 +        re = pout[o*2];\
 +        im = pout[o*2+1];\
 +        cos = tcos[o];\
 +        sin = tsin[o];\
 +        d0 = im*sin - re*cos;\
 +        d1 = re*sin + im*cos;
 +
 +        CMULB(a,b,j);
 +        CMULB(c,d,k);
 +        pout[2*j]   = vec_perm(a, d, vcprm(0,s3,1,s2));
 +        pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
 +        pout[2*k]   = vec_perm(c, b, vcprm(0,s3,1,s2));
 +        pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
 +        j++;
 +        k--;
 +    } while(k >= 0);
 +}
 +
 +static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
 +{
 +    int k;
 +    int n = 1 << s->mdct_bits;
 +    int n4 = n >> 2;
 +    int n16 = n >> 4;
 +    vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31};
 +    vec_u32 *p0 = (vec_u32*)(output+n4);
 +    vec_u32 *p1 = (vec_u32*)(output+n4*3);
 +
 +    imdct_half_altivec(s, output + n4, input);
 +
 +    for (k = 0; k < n16; k++) {
 +        vec_u32 a = p0[k] ^ sign;
 +        vec_u32 b = p1[-k-1];
 +        p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0));
 +        p1[k]    = vec_perm(b, b, vcprm(3,2,1,0));
 +    }
 +}
 +#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */
  
  av_cold void ff_fft_init_ppc(FFTContext *s)
  {
diff --cc libavcodec/ppc/fft_vsx.c
index e92975f74e,0000000000..c365fa1380
mode 100644,000000..100644
--- a/libavcodec/ppc/fft_vsx.c
+++ b/libavcodec/ppc/fft_vsx.c
@@@ -1,227 -1,0 +1,226 @@@
 +/*
 + * FFT  transform, optimized with VSX built-in functions
 + * Copyright (c) 2014 Rong Yan
 + *
 + * This algorithm (though not any of the implementation details) is
 + * based on libdjbfft by D. J. Bernstein.
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +
 +#include "config.h"
 +#include "libavutil/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
 +#include "libavutil/ppc/util_altivec.h"
 +#include "libavcodec/fft.h"
 +#include "libavcodec/fft-internal.h"
 +#include "fft_vsx.h"
 +
 +#if HAVE_VSX
 +
 +static void fft32_vsx_interleave(FFTComplex *z)
 +{
 +    fft16_vsx_interleave(z);
 +    fft8_vsx_interleave(z+16);
 +    fft8_vsx_interleave(z+24);
 +    pass_vsx_interleave(z,ff_cos_32,4);
 +}
 +
 +static void fft64_vsx_interleave(FFTComplex *z)
 +{
 +    fft32_vsx_interleave(z);
 +    fft16_vsx_interleave(z+32);
 +    fft16_vsx_interleave(z+48);
 +    pass_vsx_interleave(z,ff_cos_64, 8);
 +}
 +static void fft128_vsx_interleave(FFTComplex *z)
 +{
 +    fft64_vsx_interleave(z);
 +    fft32_vsx_interleave(z+64);
 +    fft32_vsx_interleave(z+96);
 +    pass_vsx_interleave(z,ff_cos_128,16);
 +}
 +static void fft256_vsx_interleave(FFTComplex *z)
 +{
 +    fft128_vsx_interleave(z);
 +    fft64_vsx_interleave(z+128);
 +    fft64_vsx_interleave(z+192);
 +    pass_vsx_interleave(z,ff_cos_256,32);
 +}
 +static void fft512_vsx_interleave(FFTComplex *z)
 +{
 +    fft256_vsx_interleave(z);
 +    fft128_vsx_interleave(z+256);
 +    fft128_vsx_interleave(z+384);
 +    pass_vsx_interleave(z,ff_cos_512,64);
 +}
 +static void fft1024_vsx_interleave(FFTComplex *z)
 +{
 +    fft512_vsx_interleave(z);
 +    fft256_vsx_interleave(z+512);
 +    fft256_vsx_interleave(z+768);
 +    pass_vsx_interleave(z,ff_cos_1024,128);
 +
 +}
 +static void fft2048_vsx_interleave(FFTComplex *z)
 +{
 +    fft1024_vsx_interleave(z);
 +    fft512_vsx_interleave(z+1024);
 +    fft512_vsx_interleave(z+1536);
 +    pass_vsx_interleave(z,ff_cos_2048,256);
 +}
 +static void fft4096_vsx_interleave(FFTComplex *z)
 +{
 +    fft2048_vsx_interleave(z);
 +    fft1024_vsx_interleave(z+2048);
 +    fft1024_vsx_interleave(z+3072);
 +    pass_vsx_interleave(z,ff_cos_4096, 512);
 +}
 +static void fft8192_vsx_interleave(FFTComplex *z)
 +{
 +    fft4096_vsx_interleave(z);
 +    fft2048_vsx_interleave(z+4096);
 +    fft2048_vsx_interleave(z+6144);
 +    pass_vsx_interleave(z,ff_cos_8192,1024);
 +}
 +static void fft16384_vsx_interleave(FFTComplex *z)
 +{
 +    fft8192_vsx_interleave(z);
 +    fft4096_vsx_interleave(z+8192);
 +    fft4096_vsx_interleave(z+12288);
 +    pass_vsx_interleave(z,ff_cos_16384,2048);
 +}
 +static void fft32768_vsx_interleave(FFTComplex *z)
 +{
 +    fft16384_vsx_interleave(z);
 +    fft8192_vsx_interleave(z+16384);
 +    fft8192_vsx_interleave(z+24576);
 +    pass_vsx_interleave(z,ff_cos_32768,4096);
 +}
 +static void fft65536_vsx_interleave(FFTComplex *z)
 +{
 +    fft32768_vsx_interleave(z);
 +    fft16384_vsx_interleave(z+32768);
 +    fft16384_vsx_interleave(z+49152);
 +    pass_vsx_interleave(z,ff_cos_65536,8192);
 +}
 +
 +static void fft32_vsx(FFTComplex *z)
 +{
 +    fft16_vsx(z);
 +    fft8_vsx(z+16);
 +    fft8_vsx(z+24);
 +    pass_vsx(z,ff_cos_32,4);
 +}
 +
 +static void fft64_vsx(FFTComplex *z)
 +{
 +    fft32_vsx(z);
 +    fft16_vsx(z+32);
 +    fft16_vsx(z+48);
 +    pass_vsx(z,ff_cos_64, 8);
 +}
 +static void fft128_vsx(FFTComplex *z)
 +{
 +    fft64_vsx(z);
 +    fft32_vsx(z+64);
 +    fft32_vsx(z+96);
 +    pass_vsx(z,ff_cos_128,16);
 +}
 +static void fft256_vsx(FFTComplex *z)
 +{
 +    fft128_vsx(z);
 +    fft64_vsx(z+128);
 +    fft64_vsx(z+192);
 +    pass_vsx(z,ff_cos_256,32);
 +}
 +static void fft512_vsx(FFTComplex *z)
 +{
 +    fft256_vsx(z);
 +    fft128_vsx(z+256);
 +    fft128_vsx(z+384);
 +    pass_vsx(z,ff_cos_512,64);
 +}
 +static void fft1024_vsx(FFTComplex *z)
 +{
 +    fft512_vsx(z);
 +    fft256_vsx(z+512);
 +    fft256_vsx(z+768);
 +    pass_vsx(z,ff_cos_1024,128);
 +
 +}
 +static void fft2048_vsx(FFTComplex *z)
 +{
 +    fft1024_vsx(z);
 +    fft512_vsx(z+1024);
 +    fft512_vsx(z+1536);
 +    pass_vsx(z,ff_cos_2048,256);
 +}
 +static void fft4096_vsx(FFTComplex *z)
 +{
 +    fft2048_vsx(z);
 +    fft1024_vsx(z+2048);
 +    fft1024_vsx(z+3072);
 +    pass_vsx(z,ff_cos_4096, 512);
 +}
 +static void fft8192_vsx(FFTComplex *z)
 +{
 +    fft4096_vsx(z);
 +    fft2048_vsx(z+4096);
 +    fft2048_vsx(z+6144);
 +    pass_vsx(z,ff_cos_8192,1024);
 +}
 +static void fft16384_vsx(FFTComplex *z)
 +{
 +    fft8192_vsx(z);
 +    fft4096_vsx(z+8192);
 +    fft4096_vsx(z+12288);
 +    pass_vsx(z,ff_cos_16384,2048);
 +}
 +static void fft32768_vsx(FFTComplex *z)
 +{
 +    fft16384_vsx(z);
 +    fft8192_vsx(z+16384);
 +    fft8192_vsx(z+24576);
 +    pass_vsx(z,ff_cos_32768,4096);
 +}
 +static void fft65536_vsx(FFTComplex *z)
 +{
 +    fft32768_vsx(z);
 +    fft16384_vsx(z+32768);
 +    fft16384_vsx(z+49152);
 +    pass_vsx(z,ff_cos_65536,8192);
 +}
 +
 +static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
 +    fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx,
 +    fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx,
 +};
 +static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
 +    fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave,
 +    fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave,
 +    fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave,
 +};
 +void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
 +{
 +     fft_dispatch_vsx_interleave[s->nbits-2](z);
 +}
 +void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
 +{
 +     fft_dispatch_vsx[s->nbits-2](z);
 +}
 +#endif /* HAVE_VSX */
diff --cc libavcodec/ppc/fft_vsx.h
index a85475d160,0000000000..1e44031aa5
mode 100644,000000..100644
--- a/libavcodec/ppc/fft_vsx.h
+++ b/libavcodec/ppc/fft_vsx.h
@@@ -1,830 -1,0 +1,829 @@@
 +#ifndef AVCODEC_PPC_FFT_VSX_H
 +#define AVCODEC_PPC_FFT_VSX_H
 +/*
 + * FFT  transform, optimized with VSX built-in functions
 + * Copyright (c) 2014 Rong Yan  Copyright (c) 2009 Loren Merritt
 + *
 + * This algorithm (though not any of the implementation details) is
 + * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +
 +#include "config.h"
 +#include "libavutil/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
 +#include "libavutil/ppc/util_altivec.h"
 +#include "libavcodec/fft.h"
 +#include "libavcodec/fft-internal.h"
 +
 +#if HAVE_VSX
 +
 +void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
 +void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);
 +
 +
 +#define byte_2complex (2*sizeof(FFTComplex)) /* byte offset of element 2 in an FFTComplex array */
 +#define byte_4complex (4*sizeof(FFTComplex)) /* byte offset of element 4 */
 +#define byte_6complex (6*sizeof(FFTComplex)) /* byte offset of element 6 */
 +#define byte_8complex (8*sizeof(FFTComplex)) /* byte offset of element 8 */
 +#define byte_10complex (10*sizeof(FFTComplex)) /* byte offset of element 10 */
 +#define byte_12complex (12*sizeof(FFTComplex)) /* byte offset of element 12 */
 +#define byte_14complex (14*sizeof(FFTComplex)) /* byte offset of element 14 */
 +
 +inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n) /* in-place butterfly pass over interleaved (re,im) data; n must be even and >= 4 */
 +{
 +    int o1 = n<<1; /* element offset 2n */
 +    int o2 = n<<2; /* element offset 4n */
 +    int o3 = o1+o2; /* element offset 6n */
 +    int i1, i2, i3; /* byte offsets of elements o1, o2 and o3 */
 +    FFTSample* out = (FFTSample*)z;
 +    const FFTSample *wim = wre+o1; /* second twiddle pointer: starts 2n samples past wre and is stepped backwards */
 +    vec_f vz0, vzo1, vzo2, vzo3;
 +    vec_f x0, x1, x2, x3;
 +    vec_f x4, x5, x6, x7;
 +    vec_f x8, x9, x10, x11;
 +    vec_f x12, x13, x14, x15;
 +    vec_f x16, x17, x18, x19;
 +    vec_f x20, x21, x22, x23;
 +    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
 +    vec_f y0, y1, y2, y3;
 +    vec_f y4, y5, y8, y9;
 +    vec_f y10, y13, y14, y15;
 +    vec_f y16, y17, y18, y19;
 +    vec_f y20, y21, y22, y23;
 +    vec_f wr1, wi1, wr0, wi0;
 +    vec_f wr2, wi2, wr3, wi3;
 +    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;
 +
 +    n = n-2; /* the first group (two vectors per quarter) is handled before the loop */
 +    i1 = o1*sizeof(FFTComplex);
 +    i2 = o2*sizeof(FFTComplex);
 +    i3 = o3*sizeof(FFTComplex);
 +    vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
 +    vzo2plus1 = vec_ld(i2+16, &(out[0])); /* next 16 bytes after vzo2 */
 +    vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
 +    vzo3plus1 = vec_ld(i3+16, &(out[0])); /* next 16 bytes after vzo3 */
 +    vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
 +    vz0plus1 = vec_ld(16, &(out[0])); /* next 16 bytes after vz0 */
 +    vzo1 = vec_ld(i1, &(out[0]));  // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
 +    vzo1plus1 = vec_ld(i1+16, &(out[0])); /* next 16 bytes after vzo1 */
 +
 +    x0 = vec_add(vzo2, vzo3);
 +    x1 = vec_sub(vzo2, vzo3);
 +    y0 = vec_add(vzo2plus1, vzo3plus1);
 +    y1 = vec_sub(vzo2plus1, vzo3plus1);
 +
 +    wr1 = vec_splats(wre[1]); /* wre[0] and wim[0] are not read for this first group */
 +    wi1 = vec_splats(wim[-1]);
 +    wi2 = vec_splats(wim[-2]);
 +    wi3 = vec_splats(wim[-3]);
 +    wr2 = vec_splats(wre[2]);
 +    wr3 = vec_splats(wre[3]);
 +
 +    x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
 +    x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
 +
 +    y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
 +    y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
 +    y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
 +    y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
 +
 +    ymulwi2 = vec_mul(y4, wi2);
 +    ymulwi3 = vec_mul(y5, wi3);
 +    x4 = vec_mul(x2, wr1);
 +    x5 = vec_mul(x3, wi1);
 +    y8 = vec_madd(y2, wr2, ymulwi2);
 +    y9 = vec_msub(y2, wr2, ymulwi2);
 +    x6 = vec_add(x4, x5);
 +    x7 = vec_sub(x4, x5);
 +    y13 = vec_madd(y3, wr3, ymulwi3);
 +    y14 = vec_msub(y3, wr3, ymulwi3);
 +
 +    x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
 +    y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
 +    y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
 +
 +    x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2)); /* first two lanes are taken from x0 unmultiplied (assuming vcprm's plain indices select from the first operand) */
 +    x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1)); /* likewise, first two lanes come from x1 unmultiplied */
 +
 +    y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
 +    y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
 +
 +    x11 = vec_add(vz0, x9);
 +    x12 = vec_sub(vz0, x9);
 +    x13 = vec_add(vzo1, x10);
 +    x14 = vec_sub(vzo1, x10);
 +
 +    y18 = vec_add(vz0plus1, y16);
 +    y19 = vec_sub(vz0plus1, y16);
 +    y20 = vec_add(vzo1plus1, y17);
 +    y21 = vec_sub(vzo1plus1, y17);
 +
 +    x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
 +    x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
 +    y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
 +    y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
 +
 +
 +    vec_st(x11, 0, &(out[0])); /* results overwrite the same eight locations that were loaded above */
 +    vec_st(y18, 16, &(out[0]));
 +    vec_st(x15, i1, &(out[0]));
 +    vec_st(y22, i1+16, &(out[0]));
 +    vec_st(x12, i2, &(out[0]));
 +    vec_st(y19, i2+16, &(out[0]));
 +    vec_st(x16, i3, &(out[0]));
 +    vec_st(y23, i3+16, &(out[0]));
 +
 +    do { /* body always runs at least once; each iteration handles the next 8 samples at each of the four offsets */
 +        out += 8;
 +        wre += 4;
 +        wim -= 4;
 +        wr0 = vec_splats(wre[0]); /* unlike the first group, all four twiddle pairs are used here */
 +        wr1 = vec_splats(wre[1]);
 +        wi0 = vec_splats(wim[0]);
 +        wi1 = vec_splats(wim[-1]);
 +
 +        wr2 = vec_splats(wre[2]);
 +        wr3 = vec_splats(wre[3]);
 +        wi2 = vec_splats(wim[-2]);
 +        wi3 = vec_splats(wim[-3]);
 +
 +        vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
 +        vzo2plus1 = vec_ld(i2+16, &(out[0]));
 +        vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
 +        vzo3plus1 = vec_ld(i3+16, &(out[0]));
 +        vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
 +        vz0plus1 = vec_ld(16, &(out[0]));
 +        vzo1 = vec_ld(i1, &(out[0])); // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
 +        vzo1plus1 = vec_ld(i1+16, &(out[0]));
 +
 +        x0 = vec_add(vzo2, vzo3);
 +        x1 = vec_sub(vzo2, vzo3);
 +
 +        y0 = vec_add(vzo2plus1, vzo3plus1);
 +        y1 = vec_sub(vzo2plus1, vzo3plus1);
 +
 +        x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
 +        x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
 +        x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
 +        x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
 +
 +        y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
 +        y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
 +        xmulwi0 = vec_mul(x4, wi0);
 +        xmulwi1 = vec_mul(x5, wi1);
 +
 +        y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
 +        y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
 +
 +        x8 = vec_madd(x2, wr0, xmulwi0);
 +        x9 = vec_msub(x2, wr0, xmulwi0);
 +        ymulwi2 = vec_mul(y4, wi2);
 +        ymulwi3 = vec_mul(y5, wi3);
 +
 +        x13 = vec_madd(x3, wr1, xmulwi1);
 +        x14 = vec_msub(x3, wr1, xmulwi1);
 +
 +        y8 = vec_madd(y2, wr2, ymulwi2);
 +        y9 = vec_msub(y2, wr2, ymulwi2);
 +        y13 = vec_madd(y3, wr3, ymulwi3);
 +        y14 = vec_msub(y3, wr3, ymulwi3);
 +
 +        x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
 +        x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));
 +
 +        y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
 +        y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
 +
 +        x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
 +        x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));
 +
 +        y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
 +        y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
 +
 +        x18 = vec_add(vz0, x16);
 +        x19 = vec_sub(vz0, x16);
 +        x20 = vec_add(vzo1, x17);
 +        x21 = vec_sub(vzo1, x17);
 +
 +        y18 = vec_add(vz0plus1, y16);
 +        y19 = vec_sub(vz0plus1, y16);
 +        y20 = vec_add(vzo1plus1, y17);
 +        y21 = vec_sub(vzo1plus1, y17);
 +
 +        x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
 +        x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));
 +
 +        y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
 +        y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
 +
 +        vec_st(x18, 0, &(out[0]));
 +        vec_st(y18, 16, &(out[0]));
 +        vec_st(x22, i1, &(out[0]));
 +        vec_st(y22, i1+16, &(out[0]));
 +        vec_st(x19, i2, &(out[0]));
 +        vec_st(y19, i2+16, &(out[0]));
 +        vec_st(x23, i3, &(out[0]));
 +        vec_st(y23, i3+16, &(out[0]));
 +    } while (n-=2); /* n is unsigned: the loop ends only when the counter lands exactly on 0 */
 +}
 +
 +inline static void fft2_vsx_interleave(FFTComplex *z) /* scalar 2-point butterfly, in place: z[0] <- z[0]+z[1], z[1] <- z[0]-z[1] */
 +{
 +    FFTSample r1, i1;
 +
 +    r1 = z[0].re - z[1].re; /* difference is saved before z[0].re is overwritten */
 +    z[0].re += z[1].re;
 +    z[1].re = r1;
 +
 +    i1 = z[0].im - z[1].im; /* same sequence for the imaginary parts */
 +    z[0].im += z[1].im;
 +    z[1].im = i1;
 + }
 +
 +inline static void fft4_vsx_interleave(FFTComplex *z) /* in-place 4-point transform: two permute + add/sub stages on two vectors */
 +{
 +    vec_f a, b, c, d;
 +    float* out=  (float*)z;
 +    a = vec_ld(0, &(out[0])); /* 16 bytes starting at z[0] */
 +    b = vec_ld(byte_2complex, &(out[0])); /* 16 bytes starting at z[2] */
 +
 +    c = vec_perm(a, b, vcprm(0,1,s2,s1)); /* first stage: rearrange, then sum and difference */
 +    d = vec_perm(a, b, vcprm(2,3,s0,s3));
 +    a = vec_add(c, d);
 +    b = vec_sub(c, d);
 +
 +    c = vec_perm(a, b, vcprm(0,1,s0,s1)); /* second stage: rearrange, then sum and difference */
 +    d = vec_perm(a, b, vcprm(2,3,s3,s2));
 +
 +    a = vec_add(c, d);
 +    b = vec_sub(c, d);
 +    vec_st(a, 0, &(out[0])); /* results overwrite the input */
 +    vec_st(b, byte_2complex, &(out[0]));
 +}
 +
 +inline static void fft8_vsx_interleave(FFTComplex *z) /* in-place 8-point transform on interleaved (re,im) data */
 +{
 +    vec_f vz0, vz1, vz2, vz3;
 +    vec_f x0, x1, x2, x3;
 +    vec_f x4, x5, x6, x7;
 +    vec_f x8, x9, x10, x11;
 +    vec_f x12, x13, x14, x15;
 +    vec_f x16, x17, x18, x19;
 +    vec_f x20, x21, x22, x23;
 +    vec_f x24, x25, x26, x27;
 +    vec_f x28, x29, x30, x31;
 +    vec_f x32, x33, x34;
 +
 +    float* out=  (float*)z;
 +    vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; /* sqrthalf in every lane; used only to scale x26 */
 +
 +    vz0 = vec_ld(0, &(out[0])); /* four 16-byte loads starting at z[0], z[2], z[4], z[6] */
 +    vz1 = vec_ld(byte_2complex, &(out[0]));
 +    vz2 = vec_ld(byte_4complex, &(out[0]));
 +    vz3 = vec_ld(byte_6complex, &(out[0]));
 +
 +    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
 +    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
 +    x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
 +    x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));
 +
 +    x4 = vec_add(x0, x1);
 +    x5 = vec_sub(x0, x1);
 +    x6 = vec_add(x2, x3);
 +    x7 = vec_sub(x2, x3);
 +
 +    x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
 +    x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
 +    x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
 +    x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));
 +
 +    x12 = vec_add(x8, x9);
 +    x13 = vec_sub(x8, x9);
 +    x14 = vec_add(x10, x11);
 +    x15 = vec_sub(x10, x11);
 +    x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
 +    x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
 +    x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
 +    x19 = vec_add(x16, x18); // z0.r  z2.r  z0.i  z2.i
 +    x20 = vec_sub(x16, x18); // z4.r  z6.r  z4.i  z6.i
 +
 +    x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
 +    x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
 +    x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
 +    x24 = vec_add(x22, x23);
 +    x25 = vec_sub(x22, x23);
 +    x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1); /* the only multiply in this routine: scale by sqrthalf */
 +
 +    x27 = vec_add(x21, x26); // z1.r  z7.r z1.i z3.i
 +    x28 = vec_sub(x21, x26); //z5.r  z3.r z5.i z7.i
 +
 +    x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r  z0.i  z1.r  z1.i
 +    x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r  z2.i  z7.r  z3.i
 +    x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r  z4.i  z5.r  z5.i
 +    x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r  z6.i  z3.r  z7.i
 +    x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));  // z2.r  z2.i  z3.r  z3.i
 +    x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r  z6.i  z7.r  z7.i
 +
 +    vec_st(x29, 0, &(out[0])); /* results overwrite the input, two elements per store */
 +    vec_st(x33, byte_2complex, &(out[0]));
 +    vec_st(x31, byte_4complex, &(out[0]));
 +    vec_st(x34, byte_6complex, &(out[0]));
 +}
 +
 +inline static void fft16_vsx_interleave(FFTComplex *z) /* in-place 16-point transform on interleaved (re,im) data */
 +{
 +    float* out=  (float*)z;
 +    vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; /* sqrthalf in every lane */
 +    vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]}; /* ff_cos_16[1] in every lane */
 +    vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]}; /* ff_cos_16[3] in every lane */
 +    vec_f vz0, vz1, vz2, vz3;
 +    vec_f vz4, vz5, vz6, vz7;
 +    vec_f x0, x1, x2, x3;
 +    vec_f x4, x5, x6, x7;
 +    vec_f x8, x9, x10, x11;
 +    vec_f x12, x13, x14, x15;
 +    vec_f x16, x17, x18, x19;
 +    vec_f x20, x21, x22, x23;
 +    vec_f x24, x25, x26, x27;
 +    vec_f x28, x29, x30, x31;
 +    vec_f x32, x33, x34, x35;
 +    vec_f x36, x37, x38, x39;
 +    vec_f x40, x41, x42, x43;
 +    vec_f x44, x45, x46, x47;
 +    vec_f x48, x49, x50, x51;
 +    vec_f x52, x53, x54, x55;
 +    vec_f x56, x57, x58, x59;
 +    vec_f x60, x61, x62, x63;
 +    vec_f x64, x65, x66, x67;
 +    vec_f x68, x69, x70, x71;
 +    vec_f x72, x73, x74, x75;
 +    vec_f x76, x77, x78, x79;
 +    vec_f x80, x81, x82, x83;
 +    vec_f x84, x85, x86;
 +
 +    vz0 = vec_ld(0, &(out[0])); /* eight 16-byte loads starting at z[0], z[2], ..., z[14] */
 +    vz1 = vec_ld(byte_2complex, &(out[0]));
 +    vz2 = vec_ld(byte_4complex, &(out[0]));
 +    vz3 = vec_ld(byte_6complex, &(out[0]));
 +    vz4 = vec_ld(byte_8complex, &(out[0]));
 +    vz5 = vec_ld(byte_10complex, &(out[0]));
 +    vz6 = vec_ld(byte_12complex, &(out[0]));
 +    vz7 = vec_ld(byte_14complex, &(out[0]));
 +
 +    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
 +    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
 +    x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
 +    x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
 +
 +    x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
 +    x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
 +    x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
 +    x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));
 +
 +    x8 = vec_add(x0, x1);
 +    x9 = vec_sub(x0, x1);
 +    x10 = vec_add(x2, x3);
 +    x11 = vec_sub(x2, x3);
 +
 +    x12 = vec_add(x4, x5);
 +    x13 = vec_sub(x4, x5);
 +    x14 = vec_add(x6, x7);
 +    x15 = vec_sub(x6, x7);
 +
 +    x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
 +    x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
 +    x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
 +    x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
 +    x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
 +    x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
 +    x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
 +    x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));
 +
 +    x24 = vec_add(x16, x17);
 +    x25 = vec_sub(x16, x17);
 +    x26 = vec_add(x18, x19);
 +    x27 = vec_sub(x18, x19);
 +    x28 = vec_add(x20, x21);
 +    x29 = vec_sub(x20, x21);
 +    x30 = vec_add(x22, x23);
 +    x31 = vec_sub(x22, x23);
 +
 +    x32 = vec_add(x24, x26);
 +    x33 = vec_sub(x24, x26);
 +    x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));
 +
 +    x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
 +    x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
 +    x37 = vec_add(x35, x36);
 +    x38 = vec_sub(x35, x36);
 +    x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));
 +
 +    x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
 +    x41 = vec_perm(x26,  x37, vcprm(2,3,s3,s2));
 +    x42 = vec_add(x40, x41);
 +    x43 = vec_sub(x40, x41);
 +    x44 = vec_mul(x42, vc0); /* scale by sqrthalf */
 +    x45 = vec_mul(x43, vc0); /* scale by sqrthalf */
 +
 +    x46 = vec_add(x34, x39);  // z0.r  z0.i  z4.r  z4.i
 +    x47 = vec_sub(x34, x39);  // z8.r  z8.i  z12.r  z12.i
 +
 +    x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
 +    x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
 +    x50 = vec_add(x48, x49);
 +    x51 = vec_sub(x48, x49);
 +    x52 = vec_mul(x50, vc1); /* the sum and the difference are each scaled by both ff_cos_16 factors */
 +    x53 = vec_mul(x50, vc2);
 +    x54 = vec_mul(x51, vc1);
 +    x55 = vec_mul(x51, vc2);
 +
 +    x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
 +    x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
 +    x58 = vec_add(x56, x57);
 +    x59 = vec_sub(x56, x57);
 +
 +    x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
 +    x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
 +    x62 = vec_add(x52, x61);
 +    x63 = vec_sub(x52, x61);
 +    x64 = vec_add(x60, x53);
 +    x65 = vec_sub(x60, x53);
 +    x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
 +    x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));
 +
 +    x68 = vec_add(x58, x66); // z1.r    z1.i  z3.r    z3.i
 +    x69 = vec_sub(x58, x66); // z9.r    z9.i  z11.r  z11.i
 +    x70 = vec_add(x59, x67); // z5.r    z5.i  z15.r  z15.i
 +    x71 = vec_sub(x59, x67); // z13.r  z13.i z7.r   z7.i
 +
 +    x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
 +    x73 = vec_add(x25, x72);
 +    x74 = vec_sub(x25, x72);
 +    x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
 +    x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
 +    x77 = vec_add(x75, x76); // z2.r   z2.i    z6.r    z6.i
 +    x78 = vec_sub(x75, x76); // z10.r  z10.i  z14.r  z14.i
 +
 +    x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r  z0.i  z1.r  z1.i
 +    x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r  z2.i  z3.r  z3.i
 +    x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r  z4.i  z5.r  z5.i
 +    x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r  z6.i  z7.r  z7.i
 +    vec_st(x79, 0, &(out[0])); /* first half of the result overwrites z[0..7] */
 +    vec_st(x80, byte_2complex, &(out[0]));
 +    vec_st(x81, byte_4complex, &(out[0]));
 +    vec_st(x82, byte_6complex, &(out[0]));
 +    x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r  z8.i  z9.r  z9.i
 +    x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r  z10.i  z11.r  z11.i
 +    x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r  z12.i  z13.r  z13.i
 +    x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r  z14.i  z15.r  z15.i
 +    vec_st(x83, byte_8complex, &(out[0])); /* second half of the result overwrites z[8..15] */
 +    vec_st(x84, byte_10complex, &(out[0]));
 +    vec_st(x85, byte_12complex, &(out[0]));
 +    vec_st(x86, byte_14complex, &(out[0]));
 +}
 +
 +inline static void fft4_vsx(FFTComplex *z) /* in-place 4-point transform; same first stage as fft4_vsx_interleave, different later permutes */
 +{
 +    vec_f a, b, c, d;
 +    float* out=  (float*)z;
 +    a = vec_ld(0, &(out[0])); /* 16 bytes starting at z[0] */
 +    b = vec_ld(byte_2complex, &(out[0])); /* 16 bytes starting at z[2] */
 +
 +    c = vec_perm(a, b, vcprm(0,1,s2,s1)); /* first stage: rearrange, then sum and difference */
 +    d = vec_perm(a, b, vcprm(2,3,s0,s3));
 +    a = vec_add(c, d);
 +    b = vec_sub(c, d);
 +
 +    c = vec_perm(a,b, vcprm(0,s0,1,s1)); /* second stage: rearrange, then sum and difference */
 +    d = vec_perm(a, b, vcprm(2,s3,3,s2));
 +
 +    a = vec_add(c, d);
 +    b = vec_sub(c, d);
 +
 +    c = vec_perm(a, b, vcprm(0,1,s0,s1)); /* final reorder before the store; NOTE(review): presumably yields the r/i vector layout that pass_vsx's comments describe - confirm */
 +    d = vec_perm(a, b, vcprm(2,3,s2,s3));
 +
 +    vec_st(c, 0, &(out[0])); /* results overwrite the input */
 +    vec_st(d, byte_2complex, &(out[0]));
 +    return;
 +}
 +
 +inline static void fft8_vsx(FFTComplex *z) /* in-place 8-point transform; counterpart of fft8_vsx_interleave with a different output ordering */
 +{
 +    vec_f vz0, vz1, vz2, vz3;
 +    vec_f vz4, vz5, vz6, vz7, vz8;
 +
 +    float* out=  (float*)z;
 +    vec_f vc0 = {0.0, 0.0, 0.0, 0.0}; /* zero addend, so the first vec_madd below acts as a plain multiply */
 +    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf}; /* per-lane signed sqrthalf factors */
 +    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; /* sqrthalf in every lane */
 +
 +    vz0 = vec_ld(0, &(out[0])); /* four 16-byte loads starting at z[0], z[2], z[4], z[6] */
 +    vz1 = vec_ld(byte_2complex, &(out[0]));
 +    vz2 = vec_ld(byte_4complex, &(out[0]));
 +    vz3 = vec_ld(byte_6complex, &(out[0]));
 +
 +    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
 +    vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
 +    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
 +    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
 +
 +    vz2 = vec_add(vz6, vz7);
 +    vz3 = vec_sub(vz6, vz7);
 +    vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1)); /* vz3 with its two halves swapped */
 +
 +    vz0 = vec_add(vz4, vz5);
 +    vz1 = vec_sub(vz4, vz5);
 +
 +    vz3 = vec_madd(vz3, vc1, vc0); /* vz3*vc1 (addend is zero) */
 +    vz3 = vec_madd(vz8, vc2, vz3); /* + swapped copy * sqrthalf */
 +
 +    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
 +    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
 +    vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
 +    vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
 +
 +    vz0 = vec_add(vz4, vz5);
 +    vz1 = vec_sub(vz4, vz5);
 +    vz2 = vec_add(vz6, vz7);
 +    vz3 = vec_sub(vz6, vz7);
 +
 +    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
 +    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
 +    vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
 +    vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
 +
 +
 +    vz2 = vec_sub(vz4, vz6);
 +    vz3 = vec_sub(vz5, vz7);
 +
 +    vz0 = vec_add(vz4, vz6);
 +    vz1 = vec_add(vz5, vz7);
 +
 +    vec_st(vz0, 0, &(out[0])); /* results overwrite the input */
 +    vec_st(vz1, byte_2complex, &(out[0]));
 +    vec_st(vz2, byte_4complex, &(out[0]));
 +    vec_st(vz3, byte_6complex, &(out[0]));
 +    return;
 +}
 +
 +inline static void fft16_vsx(FFTComplex *z) /* in-place 16-point transform: upper half first, then lower half, then a combining stage */
 +{
 +    float* out=  (float*)z;
 +    vec_f vc0 = {0.0, 0.0, 0.0, 0.0}; /* zero addend, so vec_madd with it acts as a plain multiply */
 +    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf}; /* per-lane signed sqrthalf factors */
 +    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; /* sqrthalf in every lane */
 +    vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343}; /* cos(k*pi/8) for k = 0..3, assuming sqrthalf is sqrt(1/2) */
 +    vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953}; /* sin(k*pi/8) for k = 0..3, same assumption */
 +    vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953}; /* lane-wise negation of vc4 */
 +
 +    vec_f vz0, vz1, vz2, vz3;
 +    vec_f vz4, vz5, vz6, vz7;
 +    vec_f vz8, vz9, vz10, vz11;
 +    vec_f vz12, vz13;
 +
 +    vz0 = vec_ld(byte_8complex, &(out[0])); /* upper half: loads starting at z[8], z[10], z[12], z[14] */
 +    vz1 = vec_ld(byte_10complex, &(out[0]));
 +    vz2 = vec_ld(byte_12complex, &(out[0]));
 +    vz3 = vec_ld(byte_14complex, &(out[0]));
 +
 +    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); /* from here to vz7 below: the fft4_vsx operation sequence applied to (vz0,vz1) and to (vz2,vz3) */
 +    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
 +    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
 +    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));
 +
 +    vz0 = vec_add(vz4, vz5);
 +    vz1= vec_sub(vz4, vz5);
 +    vz2 = vec_add(vz6, vz7);
 +    vz3 = vec_sub(vz6, vz7);
 +
 +    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
 +    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
 +    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
 +    vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));
 +
 +    vz0 = vec_add(vz4, vz5);
 +    vz1 = vec_sub(vz4, vz5);
 +    vz2 = vec_add(vz6, vz7);
 +    vz3 = vec_sub(vz6, vz7);
 +
 +    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1)); /* vz4..vz7 hold the upper-half results until the combining stage */
 +    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
 +
 +    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
 +    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
 +
 +    vz0 = vec_ld(0, &(out[0])); /* lower half: loads starting at z[0], z[2], z[4], z[6] */
 +    vz1 = vec_ld(byte_2complex, &(out[0]));
 +    vz2 = vec_ld(byte_4complex, &(out[0]));
 +    vz3 = vec_ld(byte_6complex, &(out[0]));
 +    vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1)); /* from here to vz1 = vec_add(vz9, vz11): the same operation sequence as the body of fft8_vsx */
 +    vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
 +    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
 +    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
 +
 +    vz2 = vec_add(vz10, vz11);
 +    vz3 = vec_sub(vz10, vz11);
 +    vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1)); /* vz3 with its two halves swapped */
 +    vz0 = vec_add(vz8, vz9);
 +    vz1 = vec_sub(vz8, vz9);
 +
 +    vz3 = vec_madd(vz3, vc1, vc0);
 +    vz3 = vec_madd(vz12, vc2, vz3);
 +    vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
 +    vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
 +    vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
 +    vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
 +
 +    vz0 = vec_add(vz8, vz9);
 +    vz1 = vec_sub(vz8, vz9);
 +    vz2 = vec_add(vz10, vz11);
 +    vz3 = vec_sub(vz10, vz11);
 +
 +    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
 +    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
 +    vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
 +    vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
 +
 +    vz2 = vec_sub(vz8, vz10);
 +    vz3 = vec_sub(vz9, vz11);
 +    vz0 = vec_add(vz8, vz10);
 +    vz1 = vec_add(vz9, vz11);
 +
 +    vz8 = vec_madd(vz4, vc3, vc0); /* combining stage: upper-half vectors times vc3 ... */
 +    vz9 = vec_madd(vz5, vc3, vc0);
 +    vz10 = vec_madd(vz6, vc3, vc0);
 +    vz11 = vec_madd(vz7, vc3, vc0);
 +
 +    vz8 = vec_madd(vz5, vc4, vz8); /* ... plus the partner vector times vc4 or vc5 */
 +    vz9 = vec_madd(vz4, vc5, vz9);
 +    vz10 = vec_madd(vz7, vc5, vz10);
 +    vz11 = vec_madd(vz6, vc4, vz11);
 +
 +    vz12 = vec_sub(vz10, vz8);
 +    vz10 = vec_add(vz10, vz8);
 +
 +    vz13 = vec_sub(vz9, vz11);
 +    vz11 = vec_add(vz9, vz11);
 +
 +    vz4 = vec_sub(vz0, vz10); /* final sums go to vz0..vz3, differences to vz4..vz7 */
 +    vz0 = vec_add(vz0, vz10);
 +
 +    vz7= vec_sub(vz3, vz12);
 +    vz3= vec_add(vz3, vz12);
 +
 +    vz5 = vec_sub(vz1, vz11);
 +    vz1 = vec_add(vz1, vz11);
 +
 +    vz6 = vec_sub(vz2, vz13);
 +    vz2 = vec_add(vz2, vz13);
 +
 +    vec_st(vz0, 0, &(out[0])); /* results overwrite the input: vz0..vz3 to the lower half, vz4..vz7 to the upper half */
 +    vec_st(vz1, byte_2complex, &(out[0]));
 +    vec_st(vz2, byte_4complex, &(out[0]));
 +    vec_st(vz3, byte_6complex, &(out[0]));
 +    vec_st(vz4, byte_8complex, &(out[0]));
 +    vec_st(vz5, byte_10complex, &(out[0]));
 +    vec_st(vz6, byte_12complex, &(out[0]));
 +    vec_st(vz7, byte_14complex, &(out[0]));
 +    return;
 +
 +}
 +inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n) /* in-place butterfly pass; counterpart of pass_vsx_interleave; n must be even and >= 4 */
 +{
 +    int o1 = n<<1; /* element offset 2n */
 +    int o2 = n<<2; /* element offset 4n */
 +    int o3 = o1+o2; /* element offset 6n */
 +    int i1, i2, i3; /* byte offsets of elements o1, o2 and o3 */
 +    FFTSample* out = (FFTSample*)z;
 +    const FFTSample *wim = wre+o1; /* second twiddle pointer: starts 2n samples past wre and is stepped backwards */
 +    vec_f v0, v1, v2, v3;
 +    vec_f v4, v5, v6, v7;
 +    vec_f v8, v9, v10, v11;
 +    vec_f v12, v13;
 +
 +    n = n-2; /* the first group is handled before the loop */
 +    i1 = o1*sizeof(FFTComplex);
 +    i2 = o2*sizeof(FFTComplex);
 +    i3 = o3*sizeof(FFTComplex);
 +
 +    v8 = vec_ld(0, &(wre[0])); /* wre[0..3] */
 +    v10 = vec_ld(0, &(wim[0]));
 +    v9 = vec_ld(0, &(wim[-4]));
 +    v9 = vec_perm(v9, v10, vcprm(s0,3,2,1)); /* wim[0], wim[-1], wim[-2], wim[-3] (assuming vcprm's sN selects lane N of the second operand) */
 +
 +    v4 = vec_ld(i2, &(out[0])); // r2
 +    v5 = vec_ld(i2+16, &(out[0])); // i2
 +    v6 = vec_ld(i3, &(out[0])); // r3
 +    v7 = vec_ld(i3+16, &(out[0])); // i3
 +    v10 = vec_mul(v4, v8); // r2*wre
 +    v11 = vec_mul(v5, v8); // i2*wre
 +    v12 = vec_mul(v6, v8); // r3*wre
 +    v13 = vec_mul(v7, v8); // i3*wre
 +
 +    v0 = vec_ld(0, &(out[0])); // r0
 +    v3 = vec_ld(i1+16, &(out[0])); // i1
 +    v10 = vec_madd(v5, v9, v10); // r2*wre + i2*wim
 +    v11 = vec_nmsub(v4, v9, v11); // i2*wre - r2*wim
 +    v12 = vec_nmsub(v7, v9, v12); // r3*wre - i3*wim
 +    v13 = vec_madd(v6, v9, v13); // i3*wre + r3*wim
 +
 +    v1 = vec_ld(16, &(out[0])); // i0
 +    v2 = vec_ld(i1, &(out[0])); // r1
 +    v8 = vec_sub(v12, v10);
 +    v12 = vec_add(v12, v10);
 +    v9 = vec_sub(v11, v13);
 +    v13 = vec_add(v11, v13);
 +    v4 = vec_sub(v0, v12);
 +    v0 = vec_add(v0, v12);
 +    v7 = vec_sub(v3, v8);
 +    v3 = vec_add(v3, v8);
 +
 +    vec_st(v0, 0, &(out[0])); // r0
 +    vec_st(v3, i1+16, &(out[0])); // i1
 +    vec_st(v4, i2, &(out[0])); // r2
 +    vec_st(v7, i3+16, &(out[0]));// i3
 +
 +    v5 = vec_sub(v1, v13);
 +    v1 = vec_add(v1, v13);
 +    v6 = vec_sub(v2, v9);
 +    v2 = vec_add(v2, v9);
 +
 +    vec_st(v1, 16, &(out[0])); // i0
 +    vec_st(v2, i1, &(out[0])); // r1
 +    vec_st(v5, i2+16, &(out[0])); // i2
 +    vec_st(v6, i3, &(out[0])); // r3
 +
 +    do { /* body always runs at least once; same sequence as above on the next 8 samples at each offset */
 +        out += 8;
 +        wre += 4;
 +        wim -= 4;
 +
 +        v8 = vec_ld(0, &(wre[0]));
 +        v10 = vec_ld(0, &(wim[0]));
 +        v9 = vec_ld(0, &(wim[-4]));
 +        v9 = vec_perm(v9, v10, vcprm(s0,3,2,1)); /* wim[0], wim[-1], wim[-2], wim[-3] (same vcprm assumption as above) */
 +
 +        v4 = vec_ld(i2, &(out[0])); // r2
 +        v5 = vec_ld(i2+16, &(out[0])); // i2
 +        v6 = vec_ld(i3, &(out[0])); // r3
 +        v7 = vec_ld(i3+16, &(out[0]));// i3
 +        v10 = vec_mul(v4, v8); // r2*wre
 +        v11 = vec_mul(v5, v8); // i2*wre
 +        v12 = vec_mul(v6, v8); // r3*wre
 +        v13 = vec_mul(v7, v8); // i3*wre
 +
 +        v0 = vec_ld(0, &(out[0])); // r0
 +        v3 = vec_ld(i1+16, &(out[0])); // i1
 +        v10 = vec_madd(v5, v9, v10); // r2*wre + i2*wim
 +        v11 = vec_nmsub(v4, v9, v11); // i2*wre - r2*wim
 +        v12 = vec_nmsub(v7, v9, v12); // r3*wre - i3*wim
 +        v13 = vec_madd(v6, v9, v13); // i3*wre + r3*wim
 +
 +        v1 = vec_ld(16, &(out[0])); // i0
 +        v2 = vec_ld(i1, &(out[0])); // r1
 +        v8 = vec_sub(v12, v10);
 +        v12 = vec_add(v12, v10);
 +        v9 = vec_sub(v11, v13);
 +        v13 = vec_add(v11, v13);
 +        v4 = vec_sub(v0, v12);
 +        v0 = vec_add(v0, v12);
 +        v7 = vec_sub(v3, v8);
 +        v3 = vec_add(v3, v8);
 +
 +        vec_st(v0, 0, &(out[0])); // r0
 +        vec_st(v3, i1+16, &(out[0])); // i1
 +        vec_st(v4, i2, &(out[0])); // r2
 +        vec_st(v7, i3+16, &(out[0])); // i3
 +
 +        v5 = vec_sub(v1, v13);
 +        v1 = vec_add(v1, v13);
 +        v6 = vec_sub(v2, v9);
 +        v2 = vec_add(v2, v9);
 +
 +        vec_st(v1, 16, &(out[0])); // i0
 +        vec_st(v2, i1, &(out[0])); // r1
 +        vec_st(v5, i2+16, &(out[0])); // i2
 +        vec_st(v6, i3, &(out[0])); // r3
 +    } while (n-=2); /* n is unsigned: the loop ends only when the counter lands exactly on 0 */
 +}
 +
 +#endif
 +
 +#endif /* AVCODEC_PPC_FFT_VSX_H */
diff --cc libavcodec/ppc/h264chroma_init.c
index 876efeca09,f8392c2ee2..bd0d213bdc
--- a/libavcodec/ppc/h264chroma_init.c
+++ b/libavcodec/ppc/h264chroma_init.c
@@@ -23,11 -24,11 +24,11 @@@
  #include "libavutil/cpu.h"
  #include "libavutil/intreadwrite.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
  #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/h264chroma.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
  #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
  
diff --cc libavcodec/ppc/h264chroma_template.c
index d9b2a619e4,daa7652128..8f43e5dee1
--- a/libavcodec/ppc/h264chroma_template.c
+++ b/libavcodec/ppc/h264chroma_template.c
@@@ -19,8 -19,6 +19,7 @@@
   */
  
  #include "libavutil/mem.h"
- #include "libavutil/ppc/types_altivec.h"
 +#include "libavutil/ppc/util_altivec.h"
  
  /* this code assume that stride % 16 == 0 */
  
diff --cc libavcodec/ppc/h264qpel.c
index 575f504d32,5da09bf46e..bef421fa4f
--- a/libavcodec/ppc/h264qpel.c
+++ b/libavcodec/ppc/h264qpel.c
@@@ -23,12 -24,13 +24,13 @@@
  #include "libavutil/cpu.h"
  #include "libavutil/intreadwrite.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
  #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/h264qpel.h"
+ 
  #include "hpeldsp_altivec.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  
  #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
  #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
diff --cc libavcodec/ppc/h264qpel_template.c
index 2f25e74840,6de063a719..304604c63d
--- a/libavcodec/ppc/h264qpel_template.c
+++ b/libavcodec/ppc/h264qpel_template.c
@@@ -18,87 -18,13 +18,86 @@@
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  
 -#include "libavutil/mem.h"
 +#include "config.h"
 +#if HAVE_UNISTD_H
 +#include <unistd.h>
 +#endif
  
 -#ifdef DEBUG
 -#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F));
 +#include "libavutil/avassert.h"
 +#include "libavutil/mem.h"
- #include "libavutil/ppc/types_altivec.h"
 +#include "libavutil/ppc/util_altivec.h"
 +
 +#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)ptr&0x0000000F));
 +
 +#if HAVE_BIGENDIAN /* big-endian load_alignment: vec_ld plus vec_perm; fills srcM2..srcP3, which the expansion site must declare */
 +#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
 +    vec_u8 srcR1 = vec_ld(-2, s);\
 +    vec_u8 srcR2 = vec_ld(14, s);\
 +    switch (ali) {\
 +    default: {\
 +        srcM2 = vec_perm(srcR1, srcR2, pm2);\
 +        srcM1 = vec_perm(srcR1, srcR2, pm1);\
 +        srcP0 = vec_perm(srcR1, srcR2, pp0);\
 +        srcP1 = vec_perm(srcR1, srcR2, pp1);\
 +        srcP2 = vec_perm(srcR1, srcR2, pp2);\
 +        srcP3 = vec_perm(srcR1, srcR2, pp3);\
 +    } break;\
 +    case 11: {\
 +        srcM2 = vec_perm(srcR1, srcR2, pm2);\
 +        srcM1 = vec_perm(srcR1, srcR2, pm1);\
 +        srcP0 = vec_perm(srcR1, srcR2, pp0);\
 +        srcP1 = vec_perm(srcR1, srcR2, pp1);\
 +        srcP2 = vec_perm(srcR1, srcR2, pp2);\
 +        srcP3 = srcR2;\
 +    } break;\
 +    case 12: {\
 +        vec_u8 srcR3 = vec_ld(30, s);\
 +        srcM2 = vec_perm(srcR1, srcR2, pm2);\
 +        srcM1 = vec_perm(srcR1, srcR2, pm1);\
 +        srcP0 = vec_perm(srcR1, srcR2, pp0);\
 +        srcP1 = vec_perm(srcR1, srcR2, pp1);\
 +        srcP2 = srcR2;\
 +        srcP3 = vec_perm(srcR2, srcR3, pp3);\
 +    } break;\
 +    case 13: {\
 +        vec_u8 srcR3 = vec_ld(30, s);\
 +        srcM2 = vec_perm(srcR1, srcR2, pm2);\
 +        srcM1 = vec_perm(srcR1, srcR2, pm1);\
 +        srcP0 = vec_perm(srcR1, srcR2, pp0);\
 +        srcP1 = srcR2;\
 +        srcP2 = vec_perm(srcR2, srcR3, pp2);\
 +        srcP3 = vec_perm(srcR2, srcR3, pp3);\
 +    } break;\
 +    case 14: {\
 +        vec_u8 srcR3 = vec_ld(30, s);\
 +        srcM2 = vec_perm(srcR1, srcR2, pm2);\
 +        srcM1 = vec_perm(srcR1, srcR2, pm1);\
 +        srcP0 = srcR2;\
 +        srcP1 = vec_perm(srcR2, srcR3, pp1);\
 +        srcP2 = vec_perm(srcR2, srcR3, pp2);\
 +        srcP3 = vec_perm(srcR2, srcR3, pp3);\
 +    } break;\
 +    case 15: {\
 +        vec_u8 srcR3 = vec_ld(30, s);\
 +        srcM2 = vec_perm(srcR1, srcR2, pm2);\
 +        srcM1 = srcR2;\
 +        srcP0 = vec_perm(srcR2, srcR3, pp0);\
 +        srcP1 = vec_perm(srcR2, srcR3, pp1);\
 +        srcP2 = vec_perm(srcR2, srcR3, pp2);\
 +        srcP3 = vec_perm(srcR2, srcR3, pp3);\
 +    } break;\
 +    }\
 + } /* ali 11..15: one output is srcR2 itself; for 12..15 the later outputs are built from srcR2 and a third load at offset 30 */
  #else /* not HAVE_BIGENDIAN */
 -#define ASSERT_ALIGNED(ptr) ;
 -#endif
 +#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
 +    srcM2 =  vec_vsx_ld(-2, s);\
 +    srcM1 = vec_vsx_ld(-1, s);\
 +    srcP0 = vec_vsx_ld(0, s);\
 +    srcP1 = vec_vsx_ld(1, s);\
 +    srcP2 = vec_vsx_ld(2, s);\
 +    srcP3 = vec_vsx_ld(3, s);\
 + } /* this variant does six vec_vsx_ld loads at offsets -2..3; ali and the permute arguments are unused */
 +#endif /* HAVE_BIGENDIAN */
  
  /* this code assume stride % 16 == 0 */
  #ifdef PREFIX_h264_qpel16_h_lowpass_altivec
diff --cc libavcodec/ppc/hpeldsp_altivec.c
index 87a1f05b6a,405b91841e..4f19521860
--- a/libavcodec/ppc/hpeldsp_altivec.c
+++ b/libavcodec/ppc/hpeldsp_altivec.c
@@@ -25,16 -25,13 +25,13 @@@
  #include "libavutil/attributes.h"
  #include "libavutil/cpu.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
  #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/hpeldsp.h"
+ 
  #include "hpeldsp_altivec.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  /* next one assumes that ((line_size % 16) == 0) */
  void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  {
diff --cc libavcodec/ppc/idctdsp.c
index f1b42470fb,dc22e15269..29f625a01c
--- a/libavcodec/ppc/idctdsp.c
+++ b/libavcodec/ppc/idctdsp.c
@@@ -40,10 -38,11 +38,11 @@@
  #include "libavutil/attributes.h"
  #include "libavutil/cpu.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
+ #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/idctdsp.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  
  #define IDCT_HALF                                       \
      /* 1st stage */                                     \
diff --cc libavcodec/ppc/lossless_audiodsp_altivec.c
index bdec25223d,0000000000..298e6c38a0
mode 100644,000000..100644
--- a/libavcodec/ppc/lossless_audiodsp_altivec.c
+++ b/libavcodec/ppc/lossless_audiodsp_altivec.c
@@@ -1,93 -1,0 +1,91 @@@
 +/*
 + * Copyright (c) 2007 Luca Barbato <lu_zero at gentoo.org>
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#include "config.h"
- #if HAVE_ALTIVEC_H
- #include <altivec.h>
- #endif
 +
 +#include "libavutil/attributes.h"
 +#include "libavutil/cpu.h"
 +#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
++#include "libavutil/ppc/util_altivec.h"
++
 +#include "libavcodec/lossless_audiodsp.h"
 +
 +/*
 + * GET_T(tt0, tt1, src, a, b): load two consecutive 16-byte vectors from src
 + * into tt0 and tt1.
 + *
 + * Big-endian: built from vec_ld + vec_perm. The macro uses a permute vector
 + * named `align` that must be in scope at the expansion site. It combines the
 + * value already held in b with vec_ld(16, src), and leaves vec_ld(32, src)
 + * in b when it finishes (the caller in this file preloads b with
 + * vec_ld(0, src)).
 + * Little-endian: two vec_vsx_ld loads at offsets 0 and 16; a and b are unused.
 + */
 +#if HAVE_BIGENDIAN
 +#define GET_T(tt0,tt1,src,a,b){       \
 +        a = vec_ld(16, src);          \
 +        tt0 = vec_perm(b, a, align);  \
 +        b = vec_ld(32, src);          \
 +        tt1 = vec_perm(a, b, align);  \
 + }
 +#else
 +#define GET_T(tt0,tt1,src,a,b){       \
 +        tt0 = vec_vsx_ld(0, src);     \
 +        tt1 = vec_vsx_ld(16, src);    \
 + }
 +#endif
 +
 +#if HAVE_ALTIVEC
 +/**
 + * Accumulate the products of v1 and v2 and, in the same pass, update v1 in
 + * place with v1 + v3 * mul.
 + *
 + * Each loop iteration handles 16 int16 elements (two vectors). The products
 + * that feed the returned sum are formed from the v1 values read before the
 + * in-place update of that iteration.
 + *
 + * @param v1    read and written through a vec_s16 pointer
 + *              (NOTE(review): presumably must be 16-byte aligned - confirm)
 + * @param v2    first input, loaded through GET_T
 + * @param v3    second input, loaded through GET_T
 + * @param order element count; it is shifted right by 4 and the do/while body
 + *              runs before the counter is tested, so a value below 16 does
 + *              not terminate after zero iterations
 + *              (NOTE(review): callers presumably pass a non-zero multiple
 + *              of 16 - confirm)
 + * @param mul   multiplier, replicated into all eight lanes
 + * @return the value stored from the splatted vec_sums() result
 + */
 +static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
 +                                                    const int16_t *v2,
 +                                                    const int16_t *v3,
 +                                                    int order, int mul)
 +{
 +    LOAD_ZERO;
 +    vec_s16 *pv1 = (vec_s16 *) v1;
 +    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
 +    register vec_s16 t0, t1, i0, i1, i4, i2, i3;
 +    register vec_s32 res = zero_s32v;
 +#if HAVE_BIGENDIAN
 +    /* The permute vector is derived from v2 only and is also used for the v3
 +     * loads below. NOTE(review): this looks like it relies on v2 and v3
 +     * sharing the same alignment - confirm. */
 +    register vec_u8 align = vec_lvsl(0, v2);
 +    i2 = vec_ld(0, v2);
 +    i3 = vec_ld(0, v3);
 +#endif
 +    int32_t ires;
 +
 +    order >>= 4;
 +    do {
 +        GET_T(t0,t1,v2,i1,i2);
 +        i0     = pv1[0];
 +        i1     = pv1[1];
 +        /* t0/t1 hold v2 here: accumulate v2 * v1 into res. */
 +        res    = vec_msum(t0, i0, res);
 +        res    = vec_msum(t1, i1, res);
 +        GET_T(t0,t1,v3,i4,i3);
 +        /* t0/t1 now hold v3: store v3 * mul + v1 back into v1. */
 +        pv1[0] = vec_mladd(t0, muls, i0);
 +        pv1[1] = vec_mladd(t1, muls, i1);
 +        pv1   += 2;
 +        v2    += 16;
 +        v3    += 16;
 +    } while (--order);
 +    /* Reduce across lanes, replicate the result and store one element. */
 +    res = vec_splat(vec_sums(res, zero_s32v), 3);
 +    vec_ste(res, 0, &ires);
 +
 +    return ires;
 +}
 +#endif /* HAVE_ALTIVEC */
 +
 +/**
 + * Hook the AltiVec routine into the lossless-audio DSP context.
 + *
 + * Without HAVE_ALTIVEC this is a no-op. With it, the function pointer is
 + * replaced only when PPC_ALTIVEC() accepts the flags returned by
 + * av_get_cpu_flags(); otherwise c is left untouched.
 + */
 +av_cold void ff_llauddsp_init_ppc(LLAudDSPContext *c)
 +{
 +#if HAVE_ALTIVEC
 +    const int cpu_flags = av_get_cpu_flags();
 +
 +    if (PPC_ALTIVEC(cpu_flags))
 +        c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
 +#endif /* HAVE_ALTIVEC */
 +}
diff --cc libavcodec/ppc/lossless_videodsp_altivec.c
index 16dd99f8d7,0000000000..980f85b166
mode 100644,000000..100644
--- a/libavcodec/ppc/lossless_videodsp_altivec.c
+++ b/libavcodec/ppc/lossless_videodsp_altivec.c
@@@ -1,62 -1,0 +1,59 @@@
 +/*
 + * Copyright (c) 2002 Brian Foley
 + * Copyright (c) 2002 Dieter Shirley
 + * Copyright (c) 2003-2004 Romain Dolbeau <romain at dolbeau.org>
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#include "config.h"
- #if HAVE_ALTIVEC_H
- #include <altivec.h>
- #endif
 +
 +#include "libavutil/attributes.h"
 +#include "libavutil/cpu.h"
 +#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
 +#include "libavutil/ppc/util_altivec.h"
++
 +#include "libavcodec/lossless_videodsp.h"
 +
 +#if HAVE_ALTIVEC
 +/**
 + * Add src to dst byte by byte (dst[i] += src[i], wrapping modulo 256).
 + *
 + * Whole groups of 16 bytes go through AltiVec loads, vec_add and stores;
 + * the remaining w % 16 bytes are handled by a scalar loop.
 + *
 + * @param dst destination, read and updated in place
 + * @param src bytes to add
 + * @param w   number of bytes to process
 + */
 +static void add_bytes_altivec(uint8_t *dst, uint8_t *src, ptrdiff_t w)
 +{
 +    register int i;
 +    register vector unsigned char vdst, vsrc;
 +
 +    /* dst and src are 16 bytes-aligned (guaranteed). */
 +    for (i = 0; i + 15 < w; i += 16) {
 +        vdst = vec_ld(i, (unsigned char *) dst);
 +        vsrc = vec_ld(i, (unsigned char *) src);
 +        vdst = vec_add(vsrc, vdst);
 +        vec_st(vdst, i, (unsigned char *) dst);
 +    }
 +    /* Tail when w is not a multiple of 16: accumulate exactly like the
 +     * vector loop above instead of overwriting dst with src. */
 +    for (; i < w; i++)
 +        dst[i] += src[i];
 +}
 +#endif /* HAVE_ALTIVEC */
 +
 +/**
 + * Hook the AltiVec add_bytes routine into the lossless-video DSP context.
 + *
 + * Without HAVE_ALTIVEC this is a no-op. With it, the function pointer is
 + * replaced only when PPC_ALTIVEC() accepts the flags returned by
 + * av_get_cpu_flags(); otherwise c is left untouched.
 + */
 +av_cold void ff_llviddsp_init_ppc(LLVidDSPContext *c)
 +{
 +#if HAVE_ALTIVEC
 +    const int cpu_flags = av_get_cpu_flags();
 +
 +    if (PPC_ALTIVEC(cpu_flags))
 +        c->add_bytes = add_bytes_altivec;
 +#endif /* HAVE_ALTIVEC */
 +}
diff --cc libavcodec/ppc/mpegvideo_altivec.c
index 1b6bda6c36,89e15a4a7f..2c6ff9165b
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@@ -28,11 -29,11 +29,11 @@@
  #include "libavutil/attributes.h"
  #include "libavutil/cpu.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
  #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/mpegvideo.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  
  /* AltiVec version of dct_unquantize_h263
     this code assumes `block' is 16 bytes-aligned */
diff --cc libavcodec/ppc/mpegvideodsp.c
index 021933255b,44ae126774..990a974a4e
--- a/libavcodec/ppc/mpegvideodsp.c
+++ b/libavcodec/ppc/mpegvideodsp.c
@@@ -23,11 -23,11 +23,11 @@@
  #include "libavutil/cpu.h"
  #include "libavutil/mem.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
  #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/mpegvideodsp.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  /* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
   * to preserve proper dst alignment. */
  static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
diff --cc libavcodec/ppc/mpegvideoencdsp.c
index 3e6765ce15,d11f05bf1e..b96487bf81
--- a/libavcodec/ppc/mpegvideoencdsp.c
+++ b/libavcodec/ppc/mpegvideoencdsp.c
@@@ -25,40 -23,12 +23,40 @@@
  #include "libavutil/attributes.h"
  #include "libavutil/cpu.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
  #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/mpegvideoencdsp.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  
 +#if HAVE_VSX
 +/**
 + * Sum of squared pixel values over 16 rows of 16 pixels (VSX build).
 + *
 + * @param pix       first pixel of the first row; rows are loaded with
 + *                  vec_vsx_ld
 + * @param line_size distance in bytes from one row to the next
 + * @return the accumulated sum of squares
 + */
 +static int pix_norm1_altivec(uint8_t *pix, int line_size)
 +{
 +    const vector signed int zero_s32 = (vector signed int) vec_splat_u32(0);
 +    vector unsigned int acc = (vector unsigned int) vec_splat_u32(0);
 +    vector signed int total;
 +    int rows_left = 16;
 +    int s = 0;
 +
 +    while (rows_left--) {
 +        /* Load the 16 pixels of the current row. */
 +        vector unsigned char row = vec_vsx_ld(0, pix);
 +
 +        /* Multiply every pixel by itself and add the products to acc. */
 +        acc  = vec_msum(row, row, acc);
 +        pix += line_size;
 +    }
 +
 +    /* Fold the partial sums together and store the result into s. */
 +    total = vec_sums((vector signed int) acc, zero_s32);
 +    total = vec_splat(total, 3);
 +    vec_ste(total, 0, &s);
 +    return s;
 +}
 +#else
  static int pix_norm1_altivec(uint8_t *pix, int line_size)
  {
      int i, s = 0;
diff --cc libavcodec/ppc/svq1enc_altivec.c
index 4e25e253f6,e155f885cd..f63f086602
--- a/libavcodec/ppc/svq1enc_altivec.c
+++ b/libavcodec/ppc/svq1enc_altivec.c
@@@ -28,11 -25,11 +25,11 @@@
  #include "libavutil/attributes.h"
  #include "libavutil/cpu.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
  #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/svq1enc.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                       int size)
  {
diff --cc libavcodec/ppc/vc1dsp_altivec.c
index 83d537f0c1,fc82502358..bbadb2aaee
--- a/libavcodec/ppc/vc1dsp_altivec.c
+++ b/libavcodec/ppc/vc1dsp_altivec.c
@@@ -23,11 -24,11 +24,11 @@@
  #include "libavutil/attributes.h"
  #include "libavutil/cpu.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
  #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/vc1dsp.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  
  // main steps of 8x8 transform
  #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
diff --cc libavcodec/ppc/vorbisdsp_altivec.c
index d7557c815b,52c29527ba..4dabf2dc7d
--- a/libavcodec/ppc/vorbisdsp_altivec.c
+++ b/libavcodec/ppc/vorbisdsp_altivec.c
@@@ -25,9 -23,11 +23,11 @@@
  #include "libavutil/attributes.h"
  #include "libavutil/cpu.h"
  #include "libavutil/ppc/cpu.h"
+ #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/vorbisdsp.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                              intptr_t blocksize)
  {
diff --cc libavcodec/ppc/vp3dsp_altivec.c
index d2231d090a,2b7cc9d503..a9a48d145b
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@@ -24,11 -25,11 +25,11 @@@
  #include "libavutil/attributes.h"
  #include "libavutil/cpu.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
  #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/vp3dsp.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  
  static const vec_s16 constants =
      {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
diff --cc libavcodec/ppc/vp8dsp_altivec.c
index 23e4ace7da,6857e6b6a2..31201ed2d8
--- a/libavcodec/ppc/vp8dsp_altivec.c
+++ b/libavcodec/ppc/vp8dsp_altivec.c
@@@ -24,12 -25,13 +25,13 @@@
  #include "libavutil/cpu.h"
  #include "libavutil/mem.h"
  #include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
  #include "libavutil/ppc/util_altivec.h"
+ 
  #include "libavcodec/vp8dsp.h"
+ 
  #include "hpeldsp_altivec.h"
  
 -#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 +#if HAVE_ALTIVEC
  #define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
  
  // h subpel filter uses msum to multiply+add 4 pixel taps at once
diff --cc libpostproc/postprocess.c
index 1dc719cf93,0000000000..6aa4ace337
mode 100644,000000..100644
--- a/libpostproc/postprocess.c
+++ b/libpostproc/postprocess.c
@@@ -1,1044 -1,0 +1,1041 @@@
 +/*
 + * Copyright (C) 2001-2003 Michael Niedermayer (michaelni at gmx.at)
 + *
 + * AltiVec optimizations (C) 2004 Romain Dolbeau <romain at dolbeau.org>
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License
 + * along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +/**
 + * @file
 + * postprocessing.
 + */
 +
 +/*
 +                        C       MMX     MMX2    3DNow   AltiVec
 +isVertDC                Ec      Ec                      Ec
 +isVertMinMaxOk          Ec      Ec                      Ec
 +doVertLowPass           E               e       e       Ec
 +doVertDefFilter         Ec      Ec      e       e       Ec
 +isHorizDC               Ec      Ec                      Ec
 +isHorizMinMaxOk         a       E                       Ec
 +doHorizLowPass          E               e       e       Ec
 +doHorizDefFilter        Ec      Ec      e       e       Ec
 +do_a_deblock            Ec      E       Ec      E
 +deRing                  E               e       e*      Ecp
 +Vertical RKAlgo1        E               a       a
 +Horizontal RKAlgo1                      a       a
 +Vertical X1#            a               E       E
 +Horizontal X1#          a               E       E
 +LinIpolDeinterlace      e               E       E*
 +CubicIpolDeinterlace    a               e       e*
 +LinBlendDeinterlace     e               E       E*
 +MedianDeinterlace#      E       Ec      Ec
 +TempDeNoiser#           E               e       e       Ec
 +
 +* I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
 +# more or less self-invented filters, so the exactness is not too meaningful
 +E = Exact implementation
 +e = almost exact implementation (slightly different rounding,...)
 +a = alternative / approximate impl
 +c = checked against the other implementations (-vo md5)
 +p = partially optimized, still some work to do
 +*/
 +
 +/*
 +TODO:
 +reduce the time wasted on the mem transfer
 +unroll stuff if instructions depend too much on the prior one
 +move YScale thing to the end instead of fixing QP
 +write a faster and higher quality deblocking filter :)
 +make the mainloop more flexible (variable number of blocks at once)
 +        (the if/else stuff per block is slowing things down)
 +compare the quality & speed of all filters
 +split this huge file
 +optimize c versions
 +try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
 +...
 +*/
 +
 +//Changelog: use git log
 +
 +#include "config.h"
 +#include "libavutil/avutil.h"
 +#include "libavutil/avassert.h"
 +#include "libavutil/intreadwrite.h"
 +#include <inttypes.h>
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +//#undef HAVE_MMXEXT_INLINE
 +//#define HAVE_AMD3DNOW_INLINE
 +//#undef HAVE_MMX_INLINE
 +//#undef ARCH_X86
 +//#define DEBUG_BRIGHTNESS
 +#include "postprocess.h"
 +#include "postprocess_internal.h"
 +#include "libavutil/avstring.h"
++#include "libavutil/ppc/util_altivec.h"
 +
 +#include "libavutil/ffversion.h"
 +const char postproc_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
 +
 +/**
 + * Return LIBPOSTPROC_VERSION_INT.
 + * Asserts first that LIBPOSTPROC_VERSION_MICRO is at least 100.
 + */
 +unsigned postproc_version(void)
 +{
 +    av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
 +    return LIBPOSTPROC_VERSION_INT;
 +}
 +
 +/**
 + * Return the FFMPEG_CONFIGURATION string this library was built with.
 + */
 +const char *postproc_configuration(void)
 +{
 +    return FFMPEG_CONFIGURATION;
 +}
 +
 +/**
 + * Return the license string (FFMPEG_LICENSE).
 + *
 + * The literal is the concatenation of LICENSE_PREFIX and FFMPEG_LICENSE; the
 + * returned pointer is advanced by the prefix length, so callers see only the
 + * license text. NOTE(review): presumably done so the prefixed string is
 + * present in the binary - confirm.
 + */
 +const char *postproc_license(void)
 +{
 +#define LICENSE_PREFIX "libpostproc license: "
 +    return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 +}
 +
- #if HAVE_ALTIVEC_H
- #include <altivec.h>
- #endif
- 
 +#define GET_MODE_BUFFER_SIZE 500
 +#define OPTIONS_ARRAY_SIZE 10
 +#define BLOCK_SIZE 8
 +#define TEMP_STRIDE 8
 +//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
 +
 +#if ARCH_X86 && HAVE_INLINE_ASM
 +DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
 +DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
 +DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
 +DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
 +DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
 +DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
 +DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
 +DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
 +#endif
 +
 +DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
 +
 +
 +/*
 + * Table of the available filters. Each row gives a short name, a long name,
 + * three integers and a filter mask constant, in the member order of
 + * struct PPFilter (declared outside this file section - see its declaration
 + * for the meaning of the integers). The table ends with an all-NULL/zero row.
 + */
 +static const struct PPFilter filters[]=
 +{
 +    {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
 +    {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
 +/*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
 +    {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
 +    {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
 +    {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
 +    {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
 +    {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
 +    {"dr", "dering",                1, 5, 6, DERING},
 +    {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
 +    {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
 +    {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
 +    {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
 +    {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
 +    {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
 +    {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
 +    {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
 +    {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
 +    {"be", "bitexact",              1, 0, 0, BITEXACT},
 +    {"vi", "visualize",             1, 0, 0, VISUALIZE},
 +    {NULL, NULL,0,0,0,0} //End Marker
 +};
 +
 +/*
 + * Flat list of string pairs terminated by NULL: a name followed by a filter
 + * string built from the short names in filters[].
 + * NOTE(review): presumably the name is an alias that gets replaced by the
 + * string following it when a mode string is parsed - confirm against the
 + * parser.
 + */
 +static const char * const replaceTable[]=
 +{
 +    "default",      "hb:a,vb:a,dr:a",
 +    "de",           "hb:a,vb:a,dr:a",
 +    "fast",         "h1:a,v1:a,dr:a",
 +    "fa",           "h1:a,v1:a,dr:a",
 +    "ac",           "ha:a:128:7,va:a,dr:a",
 +    NULL //End Marker
 +};
 +
 +/* The horizontal functions exist only in C because the MMX
 + * code is faster with vertical filters and transposing. */
 +
 +/**
 + * Flatness test for an 8x8 block, comparing horizontal neighbours.
 + *
 + * For each of the 8 lines, the 7 adjacent pixel pairs src[x], src[x+1] are
 + * examined and a pair is counted when its difference lies within
 + * -dcOffset..+dcOffset. The block is reported as flat when the count exceeds
 + * c->ppMode.flatnessThreshold.
 + *
 + * @return 1 if the block is mostly flat, 0 otherwise
 + */
 +static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
 +{
 +    const int dcOffset    = ((c->nonBQP * c->ppMode.baseDcDiff) >> 8) + 1;
 +    const int dcThreshold = dcOffset * 2 + 1;
 +    int flat = 0;
 +    int row, col;
 +
 +    for (row = 0; row < BLOCK_SIZE; row++, src += stride)
 +        for (col = 0; col < 7; col++)
 +            flat += ((unsigned)(src[col] - src[col + 1] + dcOffset)) < dcThreshold;
 +
 +    return flat > c->ppMode.flatnessThreshold;
 +}
 +
 +/**
 + * Flatness test for the 8x8 block that starts four lines below src,
 + * comparing vertical neighbours.
 + *
 + * For 7 consecutive lines, each of the 8 pixels is compared with the pixel
 + * one line further down; a pair is counted when its difference lies within
 + * -dcOffset..+dcOffset. The block is reported as flat when the count exceeds
 + * c->ppMode.flatnessThreshold.
 + *
 + * @return 1 if the block is mostly flat, 0 otherwise
 + */
 +static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
 +{
 +    const int dcOffset    = ((c->nonBQP * c->ppMode.baseDcDiff) >> 8) + 1;
 +    const int dcThreshold = dcOffset * 2 + 1;
 +    const uint8_t *line   = src + stride * 4; /* first line of the 8x8 block */
 +    int flat = 0;
 +    int row, col;
 +
 +    for (row = 0; row < BLOCK_SIZE - 1; row++, line += stride)
 +        for (col = 0; col < 8; col++)
 +            flat += ((unsigned)(line[col] - line[col + stride] + dcOffset)) < dcThreshold;
 +
 +    return flat > c->ppMode.flatnessThreshold;
 +}
 +
 +/**
 + * Sampled range check over 8 lines: on every line one pair of pixels five
 + * columns apart is tested, and the check fails as soon as a pair differs by
 + * more than 2*QP in either direction.
 + *
 + * @return 1 if all sampled pairs are within range, 0 otherwise
 + */
 +static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
 +{
 +    /* Column pairs compared on lines 0..3; the pattern repeats on 4..7. */
 +    static const uint8_t pairs[4][2] = { { 0, 5 }, { 2, 7 }, { 4, 1 }, { 6, 3 } };
 +    int line;
 +
 +    for (line = 0; line < 8; line++, src += stride) {
 +        const int a = pairs[line & 3][0];
 +        const int b = pairs[line & 3][1];
 +
 +        if ((unsigned)(src[a] - src[b] + 2*QP) > 4*QP)
 +            return 0;
 +    }
 +    return 1;
 +}
 +
 +/**
 + * Sampled range check on the 8x8 block that starts four lines below src:
 + * in every column one pair of pixels five lines apart is tested, and the
 + * check fails as soon as a pair differs by more than 2*QP in either
 + * direction.
 + *
 + * @return 1 if all sampled pairs are within range, 0 otherwise
 + */
 +static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
 +{
 +    const uint8_t *blk = src + stride*4;
 +    int x;
 +
 +    for (x = 0; x < BLOCK_SIZE; x += 4) {
 +        const uint8_t *p = blk + x;
 +
 +        if ((unsigned)(p[0 + 0*stride] - p[0 + 5*stride] + 2*QP) > 4*QP)
 +            return 0;
 +        if ((unsigned)(p[1 + 2*stride] - p[1 + 7*stride] + 2*QP) > 4*QP)
 +            return 0;
 +        if ((unsigned)(p[2 + 4*stride] - p[2 + 1*stride] + 2*QP) > 4*QP)
 +            return 0;
 +        if ((unsigned)(p[3 + 6*stride] - p[3 + 3*stride] + 2*QP) > 4*QP)
 +            return 0;
 +    }
 +    return 1;
 +}
 +
 +/**
 + * Classify a block for horizontal filtering.
 + *
 + * @return 2 when isHorizDC_C() reports the block as not flat; otherwise the
 + *         result of isHorizMinMaxOk_C() (1 or 0)
 + */
 +static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
 +{
 +    if (!isHorizDC_C(src, stride, c))
 +        return 2;
 +
 +    return isHorizMinMaxOk_C(src, stride, c->QP);
 +}
 +
 +/**
 + * Classify a block for vertical filtering.
 + *
 + * @return 2 when isVertDC_C() reports the block as not flat; otherwise the
 + *         result of isVertMinMaxOk_C() (1 or 0)
 + */
 +static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
 +{
 +    if (!isVertDC_C(src, stride, c))
 +        return 2;
 +
 +    return isVertMinMaxOk_C(src, stride, c->QP);
 +}
 +
 +/**
 + * Default horizontal deblocking filter over 8 lines of dst (C version).
 + *
 + * On each line a correction d is derived from the weighted differences
 + * around positions 3/4 ("middle") and their left and right neighbourhoods.
 + * Lines whose middle energy is not below 8*QP are left untouched. d is
 + * limited to the range between 0 and (dst[3] - dst[4])/2, then subtracted
 + * from dst[3] and added to dst[4].
 + */
 +static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
 +{
 +    int line;
 +
 +    for (line = 0; line < BLOCK_SIZE; line++, dst += stride) {
 +        const int middleEnergy = 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
 +        int leftEnergy, rightEnergy, q, d;
 +
 +        if (!(FFABS(middleEnergy) < 8*c->QP))
 +            continue;
 +
 +        q           = (dst[3] - dst[4])/2;
 +        leftEnergy  = 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
 +        rightEnergy = 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
 +
 +        d = FFABS(middleEnergy) - FFMIN(FFABS(leftEnergy), FFABS(rightEnergy));
 +        d = FFMAX(d, 0);
 +        d = (5*d + 32) >> 6;
 +        d *= FFSIGN(-middleEnergy);
 +
 +        /* Limit d to the range between 0 and q. */
 +        if (q > 0)
 +            d = FFMIN(FFMAX(d, 0), q);
 +        else
 +            d = FFMAX(FFMIN(d, 0), q);
 +
 +        dst[3] -= d;
 +        dst[4] += d;
 +    }
 +}
 +
 +/**
 + * Horizontal low pass filter on the 10x8 block (dst points to the middle
 + * 8x8 block), using the 9-tap filter (1,1,2,2,4,2,2,1,1)/16 (C version).
 + *
 + * On every line the neighbours dst[-1] and dst[8] are used as edge values
 + * only when they differ from dst[0] / dst[7] by less than QP; otherwise
 + * dst[0] / dst[7] themselves are used.
 + */
 +static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
 +{
 +    int line;
 +
 +    for (line = 0; line < BLOCK_SIZE; line++, dst += stride) {
 +        const int first = FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
 +        const int last  = FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
 +        int sums[10];
 +        int k;
 +
 +        /* Running window sums; all are computed before any pixel is written. */
 +        sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
 +        for (k = 1; k < 10; k++) {
 +            const int leaving  = k <= 4 ? first : dst[k - 5];
 +            const int entering = k <= 5 ? dst[k + 2] : last;
 +
 +            sums[k] = sums[k - 1] - leaving + entering;
 +        }
 +
 +        for (k = 0; k < 8; k++)
 +            dst[k] = (sums[k] + sums[k + 2] + 2*dst[k])>>4;
 +    }
 +}
 +
 +/**
 + * Experimental Filter 1 (Horizontal)
 + * will not damage linear gradients
 + * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 + * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
 + * The MMX2 version does correct clipping, the C version does not.
 + * not identical with the vertical one
 + *
 + * Processes 8 lines; on each line src[1]..src[6] are adjusted when the step
 + * between src[3] and src[4], reduced by the average of the neighbouring
 + * steps, is below QP.
 + *
 + * An earlier revision also built a function-local static lookup table here.
 + * Nothing ever read it, and its "already initialised" test never became
 + * true, so it was rebuilt (and static storage written) on every call. It
 + * has been removed; the pixel filtering is unchanged.
 + */
 +static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 +{
 +    int y;
 +
 +    for(y=0; y<BLOCK_SIZE; y++){
 +        int a= src[1] - src[2];
 +        int b= src[3] - src[4];
 +        int c= src[5] - src[6];
 +
 +        /* Size of the middle step beyond the average of its neighbours. */
 +        int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
 +
 +        if(d < QP){
 +            int v = d * FFSIGN(-b);
 +
 +            /* Spread the correction symmetrically around the 3/4 boundary. */
 +            src[1] +=v/8;
 +            src[2] +=v/4;
 +            src[3] +=3*v/8;
 +            src[4] -=3*v/8;
 +            src[5] -=v/4;
 +            src[6] -=v/8;
 +        }
 +        src+=stride;
 +    }
 +}
 +
 +/**
 + * accurate deblock filter
 + *
 + * Processes 8 lines. Within a line, consecutive samples are `step` elements
 + * apart; successive lines are `stride` elements apart. For every line the
 + * function counts how many of the 9 neighbouring sample pairs (positions
 + * -1..8 relative to the block start) differ by at most dcOffset:
 + *  - if the count exceeds c->ppMode.flatnessThreshold and the sampled
 + *    max-min range is below 2*QP, the 8 samples are low-pass filtered;
 + *  - if the count does not exceed the threshold, the two samples at
 + *    positions 3 and 4 are corrected as in the default filter.
 + *
 + * @param src    pointer four samples (4*step) before the 8x8 block
 + * @param step   distance between consecutive samples of one line
 + * @param stride distance between consecutive lines
 + * @param c      context providing QP, nonBQP and ppMode thresholds
 + * @param mode   only the VISUALIZE bit is examined here
 + */
 +static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
 +                                            int stride, const PPContext *c, int mode)
 +{
 +    int y;
 +    const int QP= c->QP;
 +    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 +    const int dcThreshold= dcOffset*2 + 1;
 +//START_TIMER
 +    src+= step*4; // src points to begin of the 8x8 Block
 +    for(y=0; y<8; y++){
 +        int numEq= 0;
 +
 +        /* Count neighbouring pairs whose difference is within +-dcOffset. */
 +        numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
 +        numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
 +        numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
 +        numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
 +        numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
 +        numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
 +        numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
 +        numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
 +        numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
 +        if(numEq > c->ppMode.flatnessThreshold){
 +            int min, max, x;
 +
 +            /* Find min/max of samples 0..7, comparing them two at a time. */
 +            if(src[0] > src[step]){
 +                max= src[0];
 +                min= src[step];
 +            }else{
 +                max= src[step];
 +                min= src[0];
 +            }
 +            for(x=2; x<8; x+=2){
 +                if(src[x*step] > src[(x+1)*step]){
 +                        if(src[x    *step] > max) max= src[ x   *step];
 +                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
 +                }else{
 +                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
 +                        if(src[ x   *step] < min) min= src[ x   *step];
 +                }
 +            }
 +            if(max-min < 2*QP){
 +                /* Edge values: use the outer neighbour only if it is within
 +                 * QP of the block's own edge sample. */
 +                const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
 +                const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
 +
 +                /* Running window sums, all taken from the unmodified samples. */
 +                int sums[10];
 +                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
 +                sums[1] = sums[0] - first       + src[3*step];
 +                sums[2] = sums[1] - first       + src[4*step];
 +                sums[3] = sums[2] - first       + src[5*step];
 +                sums[4] = sums[3] - first       + src[6*step];
 +                sums[5] = sums[4] - src[0*step] + src[7*step];
 +                sums[6] = sums[5] - src[1*step] + last;
 +                sums[7] = sums[6] - src[2*step] + last;
 +                sums[8] = sums[7] - src[3*step] + last;
 +                sums[9] = sums[8] - src[4*step] + last;
 +
 +                /* With VISUALIZE the samples are set to 128 first, so the
 +                 * 2*src[...] terms below use 128 instead of the original
 +                 * sample values. */
 +                if (mode & VISUALIZE) {
 +                    src[0*step] =
 +                    src[1*step] =
 +                    src[2*step] =
 +                    src[3*step] =
 +                    src[4*step] =
 +                    src[5*step] =
 +                    src[6*step] =
 +                    src[7*step] = 128;
 +                }
 +                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
 +                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
 +                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
 +                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
 +                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
 +                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
 +                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
 +                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
 +            }
 +        }else{
 +            const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
 +
 +            if(FFABS(middleEnergy) < 8*QP){
 +                const int q=(src[3*step] - src[4*step])/2;
 +                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
 +                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
 +
 +                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 +                d= FFMAX(d, 0);
 +
 +                d= (5*d + 32) >> 6;
 +                d*= FFSIGN(-middleEnergy);
 +
 +                /* Limit d to the range between 0 and q. */
 +                if(q>0){
 +                    d = FFMAX(d, 0);
 +                    d = FFMIN(d, q);
 +                }else{
 +                    d = FFMIN(d, 0);
 +                    d = FFMAX(d, q);
 +                }
 +
 +                /* With VISUALIZE a non-zero correction is replaced by a fixed
 +                 * +-32 change applied with clipping; d is then zeroed so the
 +                 * two updates below leave the samples as they are. */
 +                if ((mode & VISUALIZE) && d) {
 +                    d= (d < 0) ? 32 : -32;
 +                    src[3*step]= av_clip_uint8(src[3*step] - d);
 +                    src[4*step]= av_clip_uint8(src[4*step] + d);
 +                    d = 0;
 +                }
 +
 +                src[3*step]-= d;
 +                src[4*step]+= d;
 +            }
 +        }
 +
 +        src += stride;
 +    }
 +/*if(step==16){
 +    STOP_TIMER("step16")
 +}else{
 +    STOP_TIMER("stepX")
 +}*/
 +}
 +
 +/*
 + * Instantiate the CPU-specific variants of the filters: postprocess_template.c
 + * is included once for every TEMPLATE_PP_* macro defined below.
 + * Note: there are C, MMX, MMX2 (MMXEXT), 3DNow and SSE2 versions; there is no
 + * combined 3DNow+MMX2 one.
 + * NOTE(review): presumably each inclusion emits a suffixed set of functions
 + * (postProcess_C, postProcess_MMX, ... are referenced by postProcess()) and
 + * undefines its TEMPLATE_PP_* macro again -- confirm in the template.
 + */
 +//Plain C version:
 +//it is always compiled, because testing needs a bit-exact implementation
 +#define TEMPLATE_PP_C 1
 +#include "postprocess_template.c"
 +
 +#if HAVE_ALTIVEC
 +#   define TEMPLATE_PP_ALTIVEC 1
 +#   include "postprocess_altivec_template.c"
 +#   include "postprocess_template.c"
 +#endif
 +
 +#if ARCH_X86 && HAVE_INLINE_ASM
 +#    if CONFIG_RUNTIME_CPUDETECT
 +/* runtime CPU detection: build every x86 variant, postProcess() picks one */
 +#        define TEMPLATE_PP_MMX 1
 +#        include "postprocess_template.c"
 +#        define TEMPLATE_PP_MMXEXT 1
 +#        include "postprocess_template.c"
 +#        define TEMPLATE_PP_3DNOW 1
 +#        include "postprocess_template.c"
 +#        define TEMPLATE_PP_SSE2 1
 +#        include "postprocess_template.c"
 +#    else
 +/* no runtime detection: build only the first variant enabled at build time */
 +#        if HAVE_SSE2_INLINE
 +#            define TEMPLATE_PP_SSE2 1
 +#            include "postprocess_template.c"
 +#        elif HAVE_MMXEXT_INLINE
 +#            define TEMPLATE_PP_MMXEXT 1
 +#            include "postprocess_template.c"
 +#        elif HAVE_AMD3DNOW_INLINE
 +#            define TEMPLATE_PP_3DNOW 1
 +#            include "postprocess_template.c"
 +#        elif HAVE_MMX_INLINE
 +#            define TEMPLATE_PP_MMX 1
 +#            include "postprocess_template.c"
 +#        endif
 +#    endif
 +#endif
 +
 +/**
 + * Common signature of the per-CPU postProcess_* implementations; postProcess()
 + * stores the selected implementation in a variable of this type and calls it
 + * once per plane.
 + */
 +typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 +                      const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
 +
 +/**
 + * Filter one plane with the fastest available implementation.
 + *
 + * Copies *vm into the context, selects a postProcess_* variant and forwards
 + * all plane arguments to it. The C version is the default; it is also what
 + * is used when the BITEXACT bit is set in the luma mode.
 + *
 + * @param isColor passed through unchanged to the implementation
 + *                (pp_postprocess() passes 0 for luma and 1/2 for chroma)
 + * @param vm      postprocessing mode, really a PPMode
 + * @param vc      postprocessing context, really a PPContext
 + */
 +static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 +        const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
 +{
 +    pp_fn pp = postProcess_C;
 +    PPContext *c= (PPContext *)vc;
 +    PPMode *ppMode= (PPMode *)vm;
 +    c->ppMode= *ppMode; //FIXME
 +
 +    // an optimized variant is only considered when bit-exactness is not requested
 +    if (!(ppMode->lumMode & BITEXACT)) {
 +#if CONFIG_RUNTIME_CPUDETECT
 +#if ARCH_X86 && HAVE_INLINE_ASM
 +        // ordered per speed fastest first
 +        if      (c->cpuCaps & AV_CPU_FLAG_SSE2)     pp = postProcess_SSE2;
 +        else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT)   pp = postProcess_MMX2;
 +        else if (c->cpuCaps & AV_CPU_FLAG_3DNOW)    pp = postProcess_3DNow;
 +        else if (c->cpuCaps & AV_CPU_FLAG_MMX)      pp = postProcess_MMX;
 +#elif HAVE_ALTIVEC
 +        if      (c->cpuCaps & AV_CPU_FLAG_ALTIVEC)  pp = postProcess_altivec;
 +#endif
 +#else /* CONFIG_RUNTIME_CPUDETECT */
 +        // no runtime detection: the choice is fixed at build time and the
 +        // context's cpuCaps are not consulted
 +#if     HAVE_SSE2_INLINE
 +        pp = postProcess_SSE2;
 +#elif   HAVE_MMXEXT_INLINE
 +        pp = postProcess_MMX2;
 +#elif HAVE_AMD3DNOW_INLINE
 +        pp = postProcess_3DNow;
 +#elif HAVE_MMX_INLINE
 +        pp = postProcess_MMX;
 +#elif HAVE_ALTIVEC
 +        pp = postProcess_altivec;
 +#endif
 +#endif /* !CONFIG_RUNTIME_CPUDETECT */
 +    }
 +
 +    pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 +}
 +
 +/**
 + * Help text for the "-pp" command line option.
 + *
 + * pp_get_mode_by_name_and_quality() logs this text line by line when it is
 + * asked for the mode named "help".
 + */
 +const char pp_help[] =
 +"Available postprocessing filters:\n"
 +"Filters                        Options\n"
 +"short  long name       short   long option     Description\n"
 +"*      *               a       autoq           CPU power dependent enabler\n"
 +"                       c       chrom           chrominance filtering enabled\n"
 +"                       y       nochrom         chrominance filtering disabled\n"
 +"                       n       noluma          luma filtering disabled\n"
 +"hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
 +"       1. difference factor: default=32, higher -> more deblocking\n"
 +"       2. flatness threshold: default=39, lower -> more deblocking\n"
 +"                       the h & v deblocking filters share these\n"
 +"                       so you can't set different thresholds for h / v\n"
 +"vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
 +"ha     hadeblock       (2 threshold)           horizontal deblocking filter\n"
 +"va     vadeblock       (2 threshold)           vertical deblocking filter\n"
 +"h1     x1hdeblock                              experimental h deblock filter 1\n"
 +"v1     x1vdeblock                              experimental v deblock filter 1\n"
 +"dr     dering                                  deringing filter\n"
 +"al     autolevels                              automatic brightness / contrast\n"
 +"                       f        fullyrange     stretch luminance to (0..255)\n"
 +"lb     linblenddeint                           linear blend deinterlacer\n"
 +"li     linipoldeint                            linear interpolating deinterlace\n"
 +"ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
 +"md     mediandeint                             median deinterlacer\n"
 +"fd     ffmpegdeint                             ffmpeg deinterlacer\n"
 +"l5     lowpass5                                FIR lowpass deinterlacer\n"
 +"de     default                                 hb:a,vb:a,dr:a\n"
 +"fa     fast                                    h1:a,v1:a,dr:a\n"
 +"ac                                             ha:a:128:7,va:a,dr:a\n"
 +"tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
 +"                     1. <= 2. <= 3.            larger -> stronger filtering\n"
 +"fq     forceQuant      <quantizer>             force quantizer\n"
 +"Usage:\n"
 +"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
 +"long form example:\n"
 +"vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
 +"short form example:\n"
 +"vb:a/hb:a/lb                                   de,-vb\n"
 +"more examples:\n"
 +"tn:64:128:256\n"
 +"\n"
 +;
 +
 +/**
 + * Parse a postprocessing filter string into a newly allocated mode.
 + *
 + * The string is a list of filters separated by ',' or '/', each optionally
 + * followed by options separated by ':' or '|' and optionally prefixed with
 + * '-' to disable the filter (see pp_help). Names found in replaceTable are
 + * expanded in place and parsed again.
 + *
 + * @param name    filter string; "help" logs pp_help and returns NULL
 + * @param quality value used as the filter quality when the "autoq"/"a"
 + *                option is given
 + * @return a mode to be released with pp_free_mode(), or NULL if name is
 + *         NULL, is "help", allocation fails or the string contains errors
 + */
 +pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
 +{
 +    char temp[GET_MODE_BUFFER_SIZE];
 +    char *p= temp;
 +    static const char filterDelimiters[] = ",/";
 +    static const char optionDelimiters[] = ":|";
 +    struct PPMode *ppMode;
 +    char *filterToken;
 +
 +    if (!name)  {
 +        av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
 +        return NULL;
 +    }
 +
 +    if (!strcmp(name, "help")) {
 +        const char *line;
 +        const char *eol;
 +        // log the help text one line at a time, using temp as line buffer
 +        for (line = pp_help; (eol = strchr(line, '\n')); line = eol + 1) {
 +            av_strlcpy(temp, line, FFMIN(sizeof(temp), eol - line + 2));
 +            av_log(NULL, AV_LOG_INFO, "%s", temp);
 +        }
 +        return NULL;
 +    }
 +
 +    ppMode= av_malloc(sizeof(PPMode));
 +    if (!ppMode)
 +        return NULL;
 +
 +    ppMode->lumMode= 0;
 +    ppMode->chromMode= 0;
 +    ppMode->maxTmpNoise[0]= 700;
 +    ppMode->maxTmpNoise[1]= 1500;
 +    ppMode->maxTmpNoise[2]= 3000;
 +    ppMode->maxAllowedY= 234;
 +    ppMode->minAllowedY= 16;
 +    ppMode->baseDcDiff= 256/8;
 +    ppMode->flatnessThreshold= 56-16-1;
 +    ppMode->maxClippedThreshold= (AVRational){1,100};
 +    ppMode->error=0;
 +
 +    // temp is zero filled and at least its last byte stays 0, so stepping one
 +    // byte past the terminator of the last token is still inside the buffer
 +    memset(temp, 0, GET_MODE_BUFFER_SIZE);
 +    av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
 +
 +    av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
 +
 +    for(;;){
 +        const char *filterName;
 +        int q= 1000000; //PP_QUALITY_MAX;
 +        int chrom=-1;
 +        int luma=-1;
 +        const char *option;
 +        const char *options[OPTIONS_ARRAY_SIZE];
 +        int i;
 +        int filterNameOk=0;
 +        int numOfUnknownOptions=0;
 +        int enable=1; //does the user want us to enabled or disabled the filter
 +        char *tokstate;
 +
 +        filterToken= av_strtok(p, filterDelimiters, &tokstate);
 +        if(!filterToken) break;
 +        // p points to the next filterToken. It has to be derived from
 +        // filterToken and not from the old p: av_strtok() skips leading
 +        // delimiters, so the token does not necessarily start at p.
 +        p= filterToken + strlen(filterToken) + 1;
 +        filterName= av_strtok(filterToken, optionDelimiters, &tokstate);
 +        if (!filterName) {
 +            ppMode->error++;
 +            break;
 +        }
 +        av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
 +
 +        if(*filterName == '-'){
 +            enable=0;
 +            filterName++;
 +        }
 +
 +        for(;;){ //for all options
 +            option= av_strtok(NULL, optionDelimiters, &tokstate);
 +            if(!option) break;
 +
 +            av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
 +            if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
 +            else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
 +            else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
 +            else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
 +            else{
 +                // not a generic option; kept for the filter specific parsing below
 +                options[numOfUnknownOptions] = option;
 +                numOfUnknownOptions++;
 +            }
 +            if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
 +        }
 +        options[numOfUnknownOptions] = NULL;
 +
 +        /* replace stuff from the replace Table */
 +        for(i=0; replaceTable[2*i]; i++){
 +            if(!strcmp(replaceTable[2*i], filterName)){
 +                size_t newlen = strlen(replaceTable[2*i + 1]);
 +                int plen;
 +                int spaceLeft;
 +
 +                // turn the terminator of the current token back into a
 +                // delimiter and insert the expansion in front of it
 +                p--, *p=',';
 +
 +                plen= strlen(p);
 +                spaceLeft= p - temp + plen;
 +                if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
 +                    ppMode->error++;
 +                    break;
 +                }
 +                memmove(p + newlen, p, plen+1);
 +                memcpy(p, replaceTable[2*i + 1], newlen);
 +                filterNameOk=1;
 +            }
 +        }
 +
 +        for(i=0; filters[i].shortName; i++){
 +            if(   !strcmp(filters[i].longName, filterName)
 +               || !strcmp(filters[i].shortName, filterName)){
 +                ppMode->lumMode &= ~filters[i].mask;
 +                ppMode->chromMode &= ~filters[i].mask;
 +
 +                filterNameOk=1;
 +                if(!enable) break; // user wants to disable it
 +
 +                if(q >= filters[i].minLumQuality && luma)
 +                    ppMode->lumMode|= filters[i].mask;
 +                if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
 +                    if(q >= filters[i].minChromQuality)
 +                            ppMode->chromMode|= filters[i].mask;
 +
 +                // filter specific options; every option consumed here is
 +                // removed from the count of unknown options
 +                if(filters[i].mask == LEVEL_FIX){
 +                    int o;
 +                    ppMode->minAllowedY= 16;
 +                    ppMode->maxAllowedY= 234;
 +                    for(o=0; options[o]; o++){
 +                        if(  !strcmp(options[o],"fullyrange")
 +                           ||!strcmp(options[o],"f")){
 +                            ppMode->minAllowedY= 0;
 +                            ppMode->maxAllowedY= 255;
 +                            numOfUnknownOptions--;
 +                        }
 +                    }
 +                }
 +                else if(filters[i].mask == TEMP_NOISE_FILTER)
 +                {
 +                    int o;
 +                    int numOfNoises=0;
 +
 +                    for(o=0; options[o]; o++){
 +                        char *tail;
 +                        ppMode->maxTmpNoise[numOfNoises]=
 +                            strtol(options[o], &tail, 0);
 +                        if(tail!=options[o]){
 +                            numOfNoises++;
 +                            numOfUnknownOptions--;
 +                            if(numOfNoises >= 3) break;
 +                        }
 +                    }
 +                }
 +                else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
 +                     || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
 +                    int o;
 +
 +                    for(o=0; options[o] && o<2; o++){
 +                        char *tail;
 +                        int val= strtol(options[o], &tail, 0);
 +                        if(tail==options[o]) break;
 +
 +                        numOfUnknownOptions--;
 +                        if(o==0) ppMode->baseDcDiff= val;
 +                        else ppMode->flatnessThreshold= val;
 +                    }
 +                }
 +                else if(filters[i].mask == FORCE_QUANT){
 +                    int o;
 +                    ppMode->forcedQuant= 15;
 +
 +                    for(o=0; options[o] && o<1; o++){
 +                        char *tail;
 +                        int val= strtol(options[o], &tail, 0);
 +                        if(tail==options[o]) break;
 +
 +                        numOfUnknownOptions--;
 +                        ppMode->forcedQuant= val;
 +                    }
 +                }
 +            }
 +        }
 +        if(!filterNameOk) ppMode->error++;
 +        ppMode->error += numOfUnknownOptions;
 +    }
 +
 +    av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
 +    if(ppMode->error){
 +        av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
 +        av_free(ppMode);
 +        return NULL;
 +    }
 +    return ppMode;
 +}
 +
 +/**
 + * Release a mode returned by pp_get_mode_by_name_and_quality().
 + * The pointer is handed straight to av_free().
 + */
 +void pp_free_mode(pp_mode *mode){
 +    av_free(mode);
 +}
 +
 +/**
 + * Replace the buffer at *p with a new zero-initialized one of size bytes.
 + * The old contents are not preserved (free + allocate, not a real realloc);
 + * *p receives whatever av_mallocz() returns and is not checked here.
 + */
 +static void reallocAlign(void **p, int size){
 +    av_free(*p);
 +    *p= av_mallocz(size);
 +}
 +
 +/**
 + * (Re)allocate all work buffers of the context for the given geometry.
 + *
 + * Stores stride and qpStride in the context and replaces every buffer with a
 + * zero-initialized one; previous contents are discarded.
 + * Allocation failures are not reported: the affected pointers are left NULL.
 + *
 + * @param stride   line size the temporary picture buffers are sized for
 + * @param qpStride line size the QP tables are sized for
 + */
 +static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
 +    int mbWidth = (width+15)>>4;
 +    int mbHeight= (height+15)>>4;
 +    int i;
 +
 +    c->stride= stride;
 +    c->qpStride= qpStride;
 +
 +    reallocAlign((void **)&c->tempDst, stride*24+32);
 +    reallocAlign((void **)&c->tempSrc, stride*24);
 +    reallocAlign((void **)&c->tempBlocks, 2*16*8);
 +    reallocAlign((void **)&c->yHistogram, 256*sizeof(uint64_t));
 +    // do not write through a failed allocation
 +    if(c->yHistogram){
 +        // computed in 64 bit so that width*height cannot overflow an int
 +        const uint64_t initialCount= (uint64_t)width*height/64*15/256;
 +        for(i=0; i<256; i++)
 +            c->yHistogram[i]= initialCount;
 +    }
 +
 +    for(i=0; i<3; i++){
 +        //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
 +        reallocAlign((void **)&c->tempBlurred[i], stride*mbHeight*16 + 17*1024);
 +        reallocAlign((void **)&c->tempBlurredPast[i], 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
 +    }
 +
 +    reallocAlign((void **)&c->deintTemp, 2*width+32);
 +    reallocAlign((void **)&c->nonBQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
 +    reallocAlign((void **)&c->stdQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
 +    reallocAlign((void **)&c->forcedQPTable, mbWidth*sizeof(QP_STORE_T));
 +}
 +
 +/**
 + * Name callback stored in av_codec_context_class; returns the fixed string
 + * "postproc" and ignores ptr.
 + */
 +static const char * context_to_name(void * ptr) {
 +    return "postproc";
 +}
 +
 +/* AVClass installed as av_class of every PPContext by pp_get_context(), which
 + * lets the context be passed as first argument of av_log().
 + * NOTE(review): positional initializer, presumably class_name, item_name,
 + * option -- confirm against the AVClass declaration. */
 +static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
 +
 +/**
 + * Allocate and initialize a postprocessing context.
 + *
 + * @param width   picture width, used to size the work buffers
 + * @param height  picture height, used to size the work buffers
 + * @param cpuCaps PP_CPU_CAPS_* flags, optionally combined with PP_FORMAT and
 + *                the chroma subsampling shifts in bits 0-1 (horizontal) and
 + *                4-5 (vertical)
 + * @return the new context, or NULL if it could not be allocated
 + */
 +av_cold pp_context *pp_get_context(int width, int height, int cpuCaps){
 +    /* Only initial guesses: pp_postprocess() reallocates the buffers when the
 +     * strides it is given turn out to be larger. */
 +    const int stride   = FFALIGN(width, 16);
 +    const int qpStride = (width+15)/16 + 2;
 +    const int hasFormat= cpuCaps & PP_FORMAT;
 +    PPContext *ctx     = av_mallocz(sizeof(PPContext));
 +
 +    if (!ctx)
 +        return NULL;
 +
 +    ctx->av_class = &av_codec_context_class;
 +
 +    /* chroma subsampling comes from the flags if present, else defaults to 1/1 */
 +    ctx->hChromaSubSample = hasFormat ? ( cpuCaps       & 0x3) : 1;
 +    ctx->vChromaSubSample = hasFormat ? ((cpuCaps >> 4) & 0x3) : 1;
 +
 +    if (cpuCaps & PP_CPU_CAPS_AUTO) {
 +        ctx->cpuCaps = av_get_cpu_flags();
 +    } else {
 +        /* translate the explicitly requested PP_CPU_CAPS_* into AV_CPU_FLAG_* */
 +        int flags = 0;
 +
 +        if (cpuCaps & PP_CPU_CAPS_MMX)
 +            flags |= AV_CPU_FLAG_MMX;
 +        if (cpuCaps & PP_CPU_CAPS_MMX2)
 +            flags |= AV_CPU_FLAG_MMXEXT;
 +        if (cpuCaps & PP_CPU_CAPS_3DNOW)
 +            flags |= AV_CPU_FLAG_3DNOW;
 +        if (cpuCaps & PP_CPU_CAPS_ALTIVEC)
 +            flags |= AV_CPU_FLAG_ALTIVEC;
 +
 +        ctx->cpuCaps = flags;
 +    }
 +
 +    reallocBuffers(ctx, width, height, stride, qpStride);
 +
 +    ctx->frameNum = -1;
 +
 +    return ctx;
 +}
 +
 +/**
 + * Free a context returned by pp_get_context() together with all of its work
 + * buffers.
 + *
 + * @param vc context to free; NULL is accepted and ignored, so the result of a
 + *           failed pp_get_context() can be passed in safely
 + */
 +av_cold void pp_free_context(void *vc){
 +    PPContext *c = (PPContext*)vc;
 +    int i;
 +
 +    if (!c)
 +        return;
 +
 +    for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurred); i++)
 +        av_free(c->tempBlurred[i]);
 +    for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurredPast); i++)
 +        av_free(c->tempBlurredPast[i]);
 +
 +    av_free(c->tempBlocks);
 +    av_free(c->yHistogram);
 +    av_free(c->tempDst);
 +    av_free(c->tempSrc);
 +    av_free(c->deintTemp);
 +    av_free(c->stdQPTable);
 +    av_free(c->nonBQPTable);
 +    av_free(c->forcedQPTable);
 +
 +    // clear the now dangling buffer pointers before releasing the context
 +    memset(c, 0, sizeof(PPContext));
 +
 +    av_free(c);
 +}
 +
 +/**
 + * Postprocess one picture.
 + *
 + * The luma plane (index 0) is always run through postProcess(). If all four
 + * chroma pointers are set, the chroma planes are either filtered as well
 + * (when mode->chromMode is non-zero) or copied from src to dst.
 + *
 + * @param src, srcStride  source planes and their line sizes
 + * @param dst, dstStride  destination planes and their line sizes
 + * @param width, height   luma dimensions; the chroma dimensions are derived
 + *                        with the subsampling shifts stored in the context
 + * @param QP_store        quantizer table, may be NULL
 + * @param QPStride        line size of QP_store, may be negative
 + * @param vm              mode, really a PPMode
 + * @param vc              context, really a PPContext
 + * @param pict_type       picture type in the low 3 bits, optionally combined
 + *                        with PP_PICT_TYPE_QP2
 + */
 +void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
 +                     uint8_t * dst[3], const int dstStride[3],
 +                     int width, int height,
 +                     const QP_STORE_T *QP_store,  int QPStride,
 +                     pp_mode *vm,  void *vc, int pict_type)
 +{
 +    int mbWidth = (width+15)>>4;
 +    int mbHeight= (height+15)>>4;
 +    PPMode *mode = vm;
 +    PPContext *c = vc;
 +    int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
 +    int absQPStride = FFABS(QPStride);
 +
 +    // c->stride and c->QPStride are always positive
 +    // grow the work buffers if they were sized for smaller strides
 +    if(c->stride < minStride || c->qpStride < absQPStride)
 +        reallocBuffers(c, width, height,
 +                       FFMAX(minStride, c->stride),
 +                       FFMAX(c->qpStride, absQPStride));
 +
 +    // no QP table given or quantizer forced: use a single constant row
 +    // (stride 0) holding the forced quantizer, or 1 if none is forced
 +    if(!QP_store || (mode->lumMode & FORCE_QUANT)){
 +        int i;
 +        QP_store= c->forcedQPTable;
 +        absQPStride = QPStride = 0;
 +        if(mode->lumMode & FORCE_QUANT)
 +            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
 +        else
 +            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
 +    }
 +
 +    // PP_PICT_TYPE_QP2: store every QP value halved in stdQPTable and use that
 +    // table from here on
 +    if(pict_type & PP_PICT_TYPE_QP2){
 +        int i;
 +        const int count= FFMAX(mbHeight * absQPStride, mbWidth);
 +        // four entries at a time; the mask drops the bit shifted in from the
 +        // neighbouring byte
 +        for(i=0; i<(count>>2); i++){
 +            AV_WN32(c->stdQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) >> 1 & 0x7F7F7F7F);
 +        }
 +        // remaining entries one by one
 +        for(i<<=2; i<count; i++){
 +            c->stdQPTable[i] = QP_store[i]>>1;
 +        }
 +        QP_store= c->stdQPTable;
 +        QPStride= absQPStride;
 +    }
 +
 +    // disabled debug dump of the QP table
 +    if(0){
 +        int x,y;
 +        for(y=0; y<mbHeight; y++){
 +            for(x=0; x<mbWidth; x++){
 +                av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
 +            }
 +            av_log(c, AV_LOG_INFO, "\n");
 +        }
 +        av_log(c, AV_LOG_INFO, "\n");
 +    }
 +
 +    // for every picture type except 3, keep a copy of the QP values (masked
 +    // to 6 bits) in nonBQPTable
 +    // NOTE(review): type 3 is presumably a B-frame, as the table name
 +    // suggests -- confirm
 +    if((pict_type&7)!=3){
 +        if (QPStride >= 0){
 +            int i;
 +            const int count= FFMAX(mbHeight * QPStride, mbWidth);
 +            for(i=0; i<(count>>2); i++){
 +                AV_WN32(c->nonBQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) & 0x3F3F3F3F);
 +            }
 +            for(i<<=2; i<count; i++){
 +                c->nonBQPTable[i] = QP_store[i] & 0x3F;
 +            }
 +        } else {
 +            // negative stride: copy row by row, nonBQPTable is written with
 +            // the positive stride
 +            int i,j;
 +            for(i=0; i<mbHeight; i++) {
 +                for(j=0; j<absQPStride; j++) {
 +                    c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
 +                }
 +            }
 +        }
 +    }
 +
 +    av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
 +           mode->lumMode, mode->chromMode);
 +
 +    // luma
 +    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
 +                width, height, QP_store, QPStride, 0, mode, c);
 +
 +    // nothing more to do unless all chroma planes are present
 +    if (!(src[1] && src[2] && dst[1] && dst[2]))
 +        return;
 +
 +    width  = (width )>>c->hChromaSubSample;
 +    height = (height)>>c->vChromaSubSample;
 +
 +    if(mode->chromMode){
 +        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
 +                    width, height, QP_store, QPStride, 1, mode, c);
 +        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
 +                    width, height, QP_store, QPStride, 2, mode, c);
 +    }
 +    // chroma filtering disabled: copy the planes unchanged, in one call per
 +    // plane when source and destination strides match, else line by line
 +    else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
 +        linecpy(dst[1], src[1], height, srcStride[1]);
 +        linecpy(dst[2], src[2], height, srcStride[2]);
 +    }else{
 +        int y;
 +        for(y=0; y<height; y++){
 +            memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
 +            memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
 +        }
 +    }
 +}
diff --cc libswscale/swscale_internal.h
index 84d5bee5ff,adfe1708e1..0f51df95d7
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@@ -22,12 -22,6 +22,7 @@@
  #define SWSCALE_SWSCALE_INTERNAL_H
  
  #include "config.h"
- 
- #if HAVE_ALTIVEC_H
- #include <altivec.h>
- #endif
- 
 +#include "version.h"
  
  #include "libavutil/avassert.h"
  #include "libavutil/avutil.h"



More information about the ffmpeg-cvslog mailing list