[Ffmpeg-devel] [PATCH] SIMD accelerated SNOW decoding

Sun Nov 27 22:41:26 CET 2005

Hi there,

A long time ago (6 months), yartrebo wrote two routines to speed up
SNOW decoding (30-40% faster). They were never committed because neither
of the two worked on AMD64.

Six months later, I suspect more talented people can look at it.

Attached is the work-in-progress patch yartrebo sent me before
going on summer break (never to return, it seems).

See below for the gdb backtrace of one of the routines (both trigger a
segfault). Unfortunately, it doesn't give the exact line number that
fails in the ASM (maybe because the program never actually reaches the
asm but fails when calling it?).

Guillaume

Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 46912537261024 (LWP 9416)]
0x00000000006a0c2c in ff_spatial_idwt_buffered_slice (cs=0x0,
slice_buf=0x2aaaad3f3720, width=720, height=576, stride_line=1,
type=0,
    decomposition_count=-1384878448, y=0) at snow.c:1736
1736            vertical_compose97i_asm
(gdb) bt
#0  0x00000000006a0c2c in ff_spatial_idwt_buffered_slice (cs=0x0,
slice_buf=0x2aaaad3f3720, width=720, height=576, stride_line=1,
type=0,
    decomposition_count=-1384878448, y=0) at snow.c:1736
#1  0x00000000006af1b0 in decode_frame (avctx=0xadb7c0, data=0xadb6a0,
data_size=0x7ffffffbeafc, buf=0x50 <Address 0x50 out of bounds>,
buf_size=16)
    at snow.c:4208
#2  0x000000000056a8d3 in avcodec_decode_video (avctx=0xadb7c0,
picture=0xadb6a0, got_picture_ptr=0x7ffffffbeafc,
    buf=0xab8910
"\u0627\uffffJ\210\v*\232Kq\uffff\216\uffffr38\220\226\uffff\uffff\uffff\uffff@\uffffk1\uffff\uffff\u043e'\uffff\212\uffff\uffff\uffff\2307\uffff\v/\uffff\uffff\uffffr\uffff\221\uffff\221\016\uffffoP5D\uffff\uffff\u026cK\uffff\uffff\"\uffff<\uffff\004\uffff\uffffP\030\220\221\023\uffff\035.h-\004\uffffu\204\uffff\uffff\tV\223\203~/\025\uffffg?9\uffff",
buf_size=106372) at utils.c:905
#3  0x0000000000452955 in decode (sh=0xab41d0, data=0xab8910,
len=106372, flags=0) at vd_ffmpeg.c:818
#4  0x000000000044f2ac in decode_video (sh_video=0xab41d0,
    start=0xab8910
"\u0627\uffffJ\210\v*\232Kq\uffff\216\uffffr38\220\226\uffff\uffff\uffff\uffff@\uffffk1\uffff\uffff\u043e'\uffff\212\uffff\uffff\uffff\2307\uffff\v/\uffff\uffff\uffffr\uffff\221\uffff\221\016\uffffoP5D\uffff\uffff\u026cK\uffff\uffff\"\uffff<\uffff\004\uffff\uffffP\030\220\221\023\uffff\035.h-\004\uffffu\204\uffff\uffff\tV\223\203~/\025\uffffg?9\uffff",
in_size=106372, drop_frame=0) at dec_video.c:316
#5  0x000000000040fd9e in main (argc=11223504, argv=0xffffffff) at
mplayer.c:2659
(gdb)

--
MPlayer's docs are offline. Find a fresh copy here:
http://tuxrip.free.fr//MPlayer-DOCS-HTML/en/
http://tuxrip.free.fr//MPlayer-DOCS-HTML/fr/
-------------- next part --------------
A non-text attachment was scrubbed...
Name: mmx.h
Type: text/x-chdr
Size: 10869 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20051127/181f974a/attachment.h>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: snow_mmx_sse2.h
Type: text/x-chdr
Size: 46816 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20051127/181f974a/attachment-0001.h>
-------------- next part --------------
Index: snow.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/snow.c,v
retrieving revision 1.63
diff -u -r1.63 snow.c

--- snow.c	21 Sep 2005 23:09:16 -0000	1.63
+++ snow.c	27 Nov 2005 21:34:45 -0000
@@ -25,6 +25,11 @@
 
 #include "mpegvideo.h"
 
+#ifdef HAVE_MMX
+#include "i386/mmx.h"
+#include "i386/snow_mmx_sse2.h"
+#endif
+
 #undef NDEBUG
 #include <assert.h>
 
@@ -181,7 +186,7 @@
 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
 };
 
-#define LOG2_OBMC_MAX 6
+#define LOG2_OBMC_MAX 8
 #define OBMC_MAX (1<<(LOG2_OBMC_MAX))
 #if 0 //64*cubic
 static const uint8_t obmc32[1024]={
@@ -240,6 +245,39 @@
 };
 #elif 1 // 64*linear
 static const uint8_t obmc32[1024]={
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
+/*
  0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
  0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
  0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
@@ -272,9 +310,27 @@
  0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
  0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
  0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+*/
  //error:0.000020
 };
 static const uint8_t obmc16[256]={
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
+/*
  0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
  1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
  1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
@@ -291,6 +347,7 @@
  1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
  1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
+*/
 //error:0.000015
 };
 #else //64*cos
@@ -352,6 +409,15 @@
 
 //linear *64
 static const uint8_t obmc8[64]={
+  4, 12, 20, 28, 28, 20, 12,  4,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+ 20, 60,100,140,140,100, 60, 20,
+ 28, 84,140,196,196,140, 84, 28,
+ 28, 84,140,196,196,140, 84, 28,
+ 20, 60,100,140,140,100, 60, 20,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+  4, 12, 20, 28, 28, 20, 12,  4,
+/*
  1, 3, 5, 7, 7, 5, 3, 1,
  3, 9,15,21,21,15, 9, 3,
  5,15,25,35,35,25,15, 5,
@@ -360,15 +426,22 @@
  5,15,25,35,35,25,15, 5,
  3, 9,15,21,21,15, 9, 3,
  1, 3, 5, 7, 7, 5, 3, 1,
+*/
 //error:0.000000
 };
 
 //linear *64
 static const uint8_t obmc4[16]={
+ 16, 48, 48, 16,
+ 48,144,144, 48,
+ 48,144,144, 48,
+ 16, 48, 48, 16,
+ /*
  4,12,12, 4,
 12,36,36,12,
 12,36,36,12,
  4,12,12, 4,
+ */
 //error:0.000000
 };
 
@@ -482,6 +555,8 @@
 static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
 {
     int i;
+    
+    line_width = (line_width + 3) & (~3); // Align on 16 byte line.
   
     buf->base_buffer = base_buffer;
     buf->line_count = line_count;
@@ -490,16 +565,20 @@
     buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
     buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
   
-    for (i = 0; i < max_allocated_lines; i++)
+    buf->data_stack[0] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width * line_count);
+    for (i = 1; i < max_allocated_lines; i++)
     {
-      buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
+      buf->data_stack[i] = buf->data_stack[0] + line_width * i;
     }
     
     buf->data_stack_top = max_allocated_lines - 1;
 }
 
+int min = 999999;
+
 static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
 {
+    int i;
     int offset;
     DWTELEM * buffer;
   
@@ -515,13 +594,18 @@
     buf->data_stack_top--;
     buf->line[line] = buffer;
   
-//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
+//    if (buf->data_stack_top + 1 < min)
+//    {
+//        av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
+//        min = buf->data_stack_top + 1;
+//    }
   
     return buffer;
 }
 
 static void slice_buffer_release(slice_buffer * buf, int line)
 {
+    int i;
     int offset;
     DWTELEM * buffer;
 
@@ -555,11 +639,8 @@
     int i;
     slice_buffer_flush(buf);
   
-    for (i = buf->data_count - 1; i >= 0; i--)
-    {
-        assert(buf->data_stack[i]);
-        av_free(buf->data_stack[i]);
-    }
+    assert(buf->data_stack[0]);
+    av_free(buf->data_stack[0]);
     assert(buf->data_stack);
     av_free(buf->data_stack);
     assert(buf->line);
@@ -931,7 +1012,7 @@
     DWTELEM temp[width];
     const int width2= width>>1;
     const int w2= (width+1)>>1;
-    int x;
+    int A1,A2,A3,A4, x;
 
     inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
     inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
@@ -950,7 +1031,7 @@
 static void horizontal_composeX(DWTELEM *b, int width){
     DWTELEM temp[width];
     const int width2= width>>1;
-    int x;
+    int A1,A2,A3,A4, x;
     const int w2= (width+1)>>1;
 
     memcpy(temp, b, width*sizeof(int));
@@ -1008,7 +1089,7 @@
 static void horizontal_decompose53i(DWTELEM *b, int width){
     DWTELEM temp[width];
     const int width2= width>>1;
-    int x;
+    int A1,A2,A3,A4, x;
     const int w2= (width+1)>>1;
 
     for(x=0; x<width2; x++){
@@ -1018,8 +1099,6 @@
     if(width&1)
         temp[x   ]= b[2*x    ];
 #if 0
-    {
-    int A1,A2,A3,A4;
     A2= temp[1       ];
     A4= temp[0       ];
     A1= temp[0+width2];
@@ -1047,7 +1126,6 @@
     A2 += (A1 + A3 + 2)>>2;
     b[width -1] = A3;
     b[width2-1] = A2;
-    }
 #else        
     lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
     lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
@@ -1269,10 +1347,9 @@
     DWTELEM temp[width];
     const int width2= width>>1;
     const int w2= (width+1)>>1;
-    int x;
+    int A1,A2,A3,A4, x;
 
 #if 0
-    int A1,A2,A3,A4;
     A2= temp[1       ];
     A4= temp[0       ];
     A1= temp[0+width2];
@@ -1301,15 +1378,19 @@
     b[width -1] = A3;
     b[width2-1] = A2;
 #else   
+//{START_TIMER
     lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
     lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
+//if (width > 200){STOP_TIMER("horiz_lifts")}}
 #endif
+//{START_TIMER
     for(x=0; x<width2; x++){
         b[2*x    ]= temp[x   ];
         b[2*x + 1]= temp[x+w2];
     }
     if(width&1)
         b[2*x    ]= temp[x   ];
+//if (width > 200){STOP_TIMER("horiz_copies")}}
 }
 
 static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
@@ -1355,12 +1436,12 @@
 {START_TIMER
         if(mirror1 <= mirror3) vertical_compose53iL0(b1, b2, b3, width);
         if(mirror0 <= mirror2) vertical_compose53iH0(b0, b1, b2, width);
-STOP_TIMER("vertical_compose53i*")}
+if (width>200){STOP_TIMER("vertical_compose53i*")}}
 
 {START_TIMER
         if(y-1 >= 0) horizontal_compose53i(b0, width);
         if(mirror0 <= mirror2) horizontal_compose53i(b1, width);
-STOP_TIMER("horizontal_compose53i")}
+if (width>200){STOP_TIMER("horizontal_compose53i")}}
 
     cs->b0 = b2;
     cs->b1 = b3;
@@ -1396,15 +1477,159 @@
         spatial_compose53i_dy(&cs, buffer, width, height, stride);
 }   
 
- 
+static void horizontal_compose97i_0_mmx(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int width){
+    const int mul = 3;
+    const int add = 4;
+    const int shift = 3;
+    const int w2= (width+1)>>1;
+    const int w= (width>>1) - 1;
+    int i;
+{    
+    START_TIMER
+
+#define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
+    dst[0] = LIFT(src[0], ((W_DM*2*ref[0]+W_DO)>>W_DS), 1);
+    dst += 1;
+    src += 1;
+    
+    for(i=0; i<w - 7; i+= 8){
+        dst[i+0] = src[i+0] - ((3*(ref[i+0] + ref[(i+1)])+4)>>3);
+        dst[i+1] = src[i+1] - ((3*(ref[i+1] + ref[(i+2)])+4)>>3);
+        dst[i+2] = src[i+2] - ((3*(ref[i+2] + ref[(i+3)])+4)>>3);
+        dst[i+3] = src[i+3] - ((3*(ref[i+3] + ref[(i+4)])+4)>>3);
+        dst[i+4] = src[i+4] - ((3*(ref[i+4] + ref[(i+5)])+4)>>3);
+        dst[i+5] = src[i+5] - ((3*(ref[i+5] + ref[(i+6)])+4)>>3);
+        dst[i+6] = src[i+6] - ((3*(ref[i+6] + ref[(i+7)])+4)>>3);
+        dst[i+7] = src[i+7] - ((3*(ref[i+7] + ref[(i+8)])+4)>>3);
+    }
+    
+    for(; i<w; i++){
+        dst[i] = src[i] - ((3*(ref[i] + ref[(i+1)])+4)>>3);
+    }
+    
+    if(width & 1){
+        dst[w] = LIFT(src[w], ((W_DM*2*ref[w]+W_DO)>>W_DS), 1);
+    }
+    if (width > 400){
+    STOP_TIMER ("horizontal_compose C - 0")}
+}
+}
+
+static void horizontal_compose97i_3_mmx(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int width){
+    const int w= (width>>1) - 1 + (1 & width);
+    int i;
+
+#define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
+{
+    START_TIMER
+    for(i=0; i<w-7; i+=8){
+        dst[(i+0)*2] = src[i+0] -(-(3*(ref[(i+0)*2] + ref[(i+1)*2]))>>1);
+        dst[(i+1)*2] = src[i+1] -(-(3*(ref[(i+1)*2] + ref[(i+2)*2]))>>1);
+        dst[(i+2)*2] = src[i+2] -(-(3*(ref[(i+2)*2] + ref[(i+3)*2]))>>1);
+        dst[(i+3)*2] = src[i+3] -(-(3*(ref[(i+3)*2] + ref[(i+4)*2]))>>1);
+        dst[(i+4)*2] = src[i+4] -(-(3*(ref[(i+4)*2] + ref[(i+5)*2]))>>1);
+        dst[(i+5)*2] = src[i+5] -(-(3*(ref[(i+5)*2] + ref[(i+6)*2]))>>1);
+        dst[(i+6)*2] = src[i+6] -(-(3*(ref[(i+6)*2] + ref[(i+7)*2]))>>1);
+        dst[(i+7)*2] = src[i+7] -(-(3*(ref[(i+7)*2] + ref[(i+8)*2]))>>1);
+    }
+    
+    for(; i<w; i++){
+        dst[i*2] = src[i] -(-(3*(ref[i*2] + ref[(i+1)*2]))>>1);
+    }
+    
+    if(!(width&1)){
+        dst[w*2] = LIFT(src[w], (-(3*2*ref[w*2])>>1), 1);
+    }
+    if (width > 400){
+    STOP_TIMER ("horizontal_compose C - 3")}
+}
+}
+
+static void horizontal_compose97i_2_mmx(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int width){
+//static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
+    const int w= (width>>1) - 1;
+    int i;
+    int shift = 4;
+    int add = 8;
+    int mul = -1;
+
+#define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
+// src - ((ref - 4 * src) >> shift);
+{    
+    START_TIMER
+    dst[0] = LIFTS(src[0], -2*ref[0]+add, 1);
+    dst += 2;
+    src += 1;
+    
+    for(i=0; i<w-7; i+=8){
+        dst[(i+0)*2] = src[i+0] - (((-(ref[i+0] + ref[i + 1])+8) - 4*src[i+0])>> 4);
+        dst[(i+1)*2] = src[i+1] - (((-(ref[i+1] + ref[i + 2])+8) - 4*src[i+1])>> 4);
+        dst[(i+2)*2] = src[i+2] - (((-(ref[i+2] + ref[i + 3])+8) - 4*src[i+2])>> 4);
+        dst[(i+3)*2] = src[i+3] - (((-(ref[i+3] + ref[i + 4])+8) - 4*src[i+3])>> 4);
+        dst[(i+4)*2] = src[i+4] - (((-(ref[i+4] + ref[i + 5])+8) - 4*src[i+4])>> 4);
+        dst[(i+5)*2] = src[i+5] - (((-(ref[i+5] + ref[i + 6])+8) - 4*src[i+5])>> 4);
+        dst[(i+6)*2] = src[i+6] - (((-(ref[i+6] + ref[i + 7])+8) - 4*src[i+6])>> 4);
+        dst[(i+7)*2] = src[i+7] - (((-(ref[i+7] + ref[i + 8])+8) - 4*src[i+7])>> 4);
+    }
+    
+    for(; i<w; i++){
+        dst[i*2] = src[i] - (((-(ref[i] + ref[i + 1])+8) - 4*src[i])>> 4);
+    }
+    
+    if(width&1){
+        dst[w*2] = LIFTS(src[w], -2*ref[w]+add, 1);
+    }
+    if (width > 400){
+    STOP_TIMER ("horizontal_compose C - 2")}
+}
+}
+
+static void horizontal_compose97i_1_mmx(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int width){
+//static always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
+    const int w= (width>>1) - 1 + (1 & width);
+    int mul = 1;
+    int add = 0;
+    int shift = 0;
+    int i;
+
+#define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
+{    
+    START_TIMER
+    for(i=0; i<w - 7; i+= 8){
+        dst[i+0] = src[i+0] - (ref[i+0] + ref[(i+1)]);
+        dst[i+1] = src[i+1] - (ref[i+1] + ref[(i+2)]);
+        dst[i+2] = src[i+2] - (ref[i+2] + ref[(i+3)]);
+        dst[i+3] = src[i+3] - (ref[i+3] + ref[(i+4)]);
+        dst[i+4] = src[i+4] - (ref[i+4] + ref[(i+5)]);
+        dst[i+5] = src[i+5] - (ref[i+5] + ref[(i+6)]);
+        dst[i+6] = src[i+6] - (ref[i+6] + ref[(i+7)]);
+        dst[i+7] = src[i+7] - (ref[i+7] + ref[(i+8)]);
+    }
+    
+    for(; i<w; i++){
+        dst[i] = LIFT(src[i], (((ref[i] + ref[(i+1)]))), 1);
+    }
+    
+    if(!(width&1)){
+        dst[w] = LIFT(src[w], ((2*ref[w])), 1);
+    }
+    if (width > 400){
+    STOP_TIMER ("horizontal_compose C - 1")}
+}
+}
+
 static void horizontal_compose97i(DWTELEM *b, int width){
     DWTELEM temp[width];
     const int w2= (width+1)>>1;
 
-    lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
-    lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
-    liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
-    lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
+    horizontal_compose97i_0_mmx(temp   , b      , b   +w2, width);
+//    lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
+    horizontal_compose97i_1_mmx(temp+w2, b   +w2, temp   , width);
+//    lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
+    horizontal_compose97i_2_mmx(b      , temp   , temp+w2, width);
+//    liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
+    horizontal_compose97i_3_mmx(b+1    , temp+w2, b      , width);
+//    lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
 }
 
 static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
@@ -1454,9 +1679,7 @@
     int i;
     
     for(i=0; i<width; i++){
-#ifndef lift5
         int r;
-#endif
         b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
 #ifdef lift5
         b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
@@ -1509,6 +1732,9 @@
         
 {START_TIMER
     if(y>0 && y+4<height){
+#ifdef HAVE_MMX
+        vertical_compose97i_asm
+#endif
         vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
     }else{
         if(mirror3 <= mirror5) vertical_compose97iL1(b3, b4, b5, width);
@@ -1881,7 +2107,7 @@
 
 static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
     const int w= b->width;
-    int y;
+    int x,y;
     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
     int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
     int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
@@ -2344,6 +2570,91 @@
     }
 }
 
+static void mc_block_16(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
+    int x, y;
+//    av_log(NULL, AV_LOG_DEBUG, "mc_block: w = %d, h = %d\n", b_w, b_h);
+START_TIMER
+{START_TIMER
+    for(y=0; y < b_h+5; y++){
+        for(x=0; x < b_w; x++){
+            int a0= src[x    ];
+            int a1= src[x + 1];
+            int a2= src[x + 2];
+            int a3= src[x + 3];
+            int a4= src[x + 4];
+            int a5= src[x + 5];
+            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+
+            if(dx<8) tmp[x]= (32*a2*( 8-dx) +    am* dx    + 128)>>8;
+            else     tmp[x]= (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
+        }
+        tmp += stride;
+        src += stride;
+    }
+    tmp -= (b_h+5)*stride;
+STOP_TIMER("mc_block-16-top")
+}
+    
+{START_TIMER
+    for(y=0; y < b_h; y+=4){
+        for(x=0; x < b_w; x++){
+            uint16_t temp[16];
+asm(
+           "pxor %%mm7, %%mm7          \n\t"
+           "movd (%%eax), %%mm0        \n\t"
+           "punpcklbw %%mm7, %%mm0;    \n\t"
+           "movd 1(%%eax), %%mm1       \n\t"
+           "punpcklbw %%mm7, %%mm1;    \n\t"
+           "movd 2(%%eax), %%mm2       \n\t"
+           "punpcklbw %%mm7, %%mm2;    \n\t"
+           "movd 3(%%eax), %%mm3       \n\t"
+           "punpcklbw %%mm7, %%mm3;    \n\t"
+           "movd 4(%%eax), %%mm4       \n\t"
+           "punpcklbw %%mm7, %%mm4;    \n\t"
+           "movd 5(%%eax), %%mm5       \n\t"
+           "punpcklbw %%mm7, %%mm5;    \n\t"
+           
+           "movq %%mm2, %%mm6;         \n\t"
+           "paddw %%mm3, %%mm6;        \n\t"
+           "movq %%mm6, (%%ecx);       \n\t"
+           ::
+           "a"(tmp+x),"b"(stride),"c"(temp)
+           );
+            int a0= tmp[x + 0*stride]; // mm0
+            int a1= tmp[x + 1*stride]; // mm1
+            int a2= tmp[x + 2*stride]; // mm2
+            int a3= tmp[x + 3*stride]; // mm3
+            int a4= tmp[x + 4*stride]; // mm4 -> mm0
+            int a5= tmp[x + 5*stride]; // mm5 -> mm1
+            int a6= tmp[x + 6*stride]; // NULL
+            int a7= tmp[x + 7*stride]; // NULL
+            int a8= tmp[x + 8*stride]; // NULL
+            int am0= 20*(a2+a3) - 5*(a1+a4) + (temp[0]+a5);
+            int am1= 20*(a3+a4) - 5*(a2+a5) + (a1+a6);
+            int am2= 20*(a4+a5) - 5*(a3+a6) + (a2+a7);
+            int am3= 20*(a5+a6) - 5*(a4+a7) + (a3+a8);
+
+            if(dy<8){
+                dst[x]= (32*a2*( 8-dy) +    am0* dy    + 128)>>8;
+                dst[x+stride]= (32*a3*( 8-dy) +    am1* dy    + 128)>>8;
+                dst[x+2*stride]= (32*a4*( 8-dy) +    am2* dy    + 128)>>8;
+                dst[x+3*stride]= (32*a5*( 8-dy) +    am3* dy    + 128)>>8;
+            }
+            else{
+                dst[x]= (   am0*(16-dy) + 32*a3*(dy-8) + 128)>>8;
+                dst[x+stride]= (   am1*(16-dy) + 32*a4*(dy-8) + 128)>>8;
+                dst[x+2*stride]= (   am2*(16-dy) + 32*a5*(dy-8) + 128)>>8;
+                dst[x+3*stride]= (   am3*(16-dy) + 32*a6*(dy-8) + 128)>>8;
+            }
+        }
+        dst += stride*4;
+        tmp += stride*4;
+    }
+STOP_TIMER("mc_block-16-bottom")
+}
+STOP_TIMER("mc_block-16")
+}
+
 static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
     int x, y;
 START_TIMER
@@ -2358,6 +2669,7 @@
 //            int am= 9*(a1+a2) - (a0+a3);
             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
 //            int am= 18*(a2+a3) - 2*(a1+a4);
+//            int am = 16*(a2+a3);
 //             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
 //             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
 
@@ -2366,7 +2678,6 @@
             if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
             else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
             
-            /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
             if(am&(~255)) am= ~(am>>31);
             
             tmp[x] = am;
@@ -2391,6 +2702,7 @@
             int a5= tmp[x + 5*stride];
             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
 //            int am= 18*(a2+a3) - 2*(a1+a4);
+//            int am = 16*(a2+a3);
 /*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
             
@@ -2398,10 +2710,13 @@
 
             if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
             else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
-
+            
+//            if (new < 0 || new > 255)
+//                av_log(NULL, AV_LOG_DEBUG, "bounds exception, new = %d\n", new);
             if(am&(~255)) am= ~(am>>31);
             
             dst[x] = am;
+
 /*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
             else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
             else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
@@ -2430,6 +2745,7 @@
 mca( 8, 8,8)
 
 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
+//{START_TIMER
     if(block->type){
         int x, y;
         const int color= block->color[plane_index];
@@ -2453,18 +2769,27 @@
             src= tmp + MB_SIZE;
         }
         if((dx&3) || (dy&3) || b_w!=b_h || (b_w!=4 && b_w!=8 && b_w!=16))
-            mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
+//            if (b_w == 16 && b_h == 8)
+//                mc_block_16(dst, src, tmp, stride, b_w, b_h, dx, dy);
+//            else
+                mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
         else
             s->dsp.put_h264_qpel_pixels_tab[2-(b_w>>3)][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
     }
+//STOP_TIMER("pred_block")}
 }
 
 static always_inline int same_block(BlockNode *a, BlockNode *b){
     return !((a->mx - b->mx) | (a->my - b->my) | a->type | b->type);
 }
 
+int ycount_1 = 0;
+int ycount_2 = 1;
+int ycount_3 = 2;
+int ycount_4 = 3;
+
 //FIXME name clenup (b_w, block_w, b_width stuff)
-static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
+static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, long src_x, long src_y, long b_w, long b_h, long w, long h, long dst_stride, long src_stride, long obmc_stride, int b_x, int b_y, int add, int plane_index){
     DWTELEM * dst = NULL;
     const int b_width = s->b_width  << s->block_max_depth;
     const int b_height= s->b_height << s->block_max_depth;
@@ -2474,9 +2799,7 @@
     BlockNode *lb= lt+b_stride;
     BlockNode *rb= lb+1;
     uint8_t *block[4]; 
-    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
-    uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
-    uint8_t *ptmp;
+    uint8_t tmp[src_stride*(b_h+5)]; //FIXME align
     int x,y;
 
     if(b_x<0){
@@ -2493,6 +2816,10 @@
         lb= lt;
         rb= rt;
     }
+    
+//    int ycount = 1;
+    
+//{START_TIMER
         
     if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
         obmc -= src_x;
@@ -2511,21 +2838,19 @@
     
     if(b_w<=0 || b_h<=0) return;
 
-assert(src_stride > 2*MB_SIZE + 5);
+assert(src_stride > 7*MB_SIZE);
 //    old_dst += src_x + src_y*dst_stride;
     dst8+= src_x + src_y*src_stride;
 //    src += src_x + src_y*src_stride;
 
-    ptmp= tmp + 3*tmp_step;
-    block[0]= ptmp;
-    ptmp+=tmp_step;
+    block[0]= tmp+3*MB_SIZE;
     pred_block(s, block[0], src, tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);    
 
     if(same_block(lt, rt)){
         block[1]= block[0];
     }else{
-        block[1]= ptmp;
-        ptmp+=tmp_step;
+//        ycount++;
+        block[1]= tmp + 4*MB_SIZE;
         pred_block(s, block[1], src, tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
     }
         
@@ -2534,8 +2859,8 @@
     }else if(same_block(rt, lb)){
         block[2]= block[1];
     }else{
-        block[2]= ptmp;
-        ptmp+=tmp_step;
+        block[2]= tmp+5*MB_SIZE;
+//        ycount++;
         pred_block(s, block[2], src, tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
     }
 
@@ -2546,9 +2871,23 @@
     }else if(same_block(lb, rb)){
         block[3]= block[2];
     }else{
-        block[3]= ptmp;
+        block[3]= tmp+6*MB_SIZE;
+//        ycount++;
         pred_block(s, block[3], src, tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
     }
+//STOP_TIMER("Upper add y block")}
+
+//    if (ycount == 1)
+//        ycount_1++;
+//    else if (ycount == 2)
+//        ycount_2++;
+//    else if (ycount == 3)
+//        ycount_3++;
+//    else if (ycount == 4)
+//        ycount_4++;
+
+//    if ((ycount_1 + ycount_2 + ycount_3 + ycount_4) % 1000 == 0)
+//        av_log(NULL, AV_LOG_DEBUG, "one = %d, two = %d, three = %d, four = %d\n", ycount_1, ycount_2, ycount_3, ycount_4);
 #if 0
     for(y=0; y<b_h; y++){
         for(x=0; x<b_w; x++){
@@ -2586,7 +2925,10 @@
 {
 
     START_TIMER
-    
+
+#ifdef HAVE_MMX
+    inner_add_yblock_mmx
+#endif
     for(y=0; y<b_h; y++){
         //FIXME ugly missue of obmc_stride
         uint8_t *obmc1= obmc + y*obmc_stride;
@@ -2632,9 +2974,7 @@
     BlockNode *lb= lt+b_stride;
     BlockNode *rb= lb+1;
     uint8_t *block[4]; 
-    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
-    uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
-    uint8_t *ptmp;
+    uint8_t tmp[src_stride*(b_h+5)]; //FIXME align
     int x,y;
 
     if(b_x<0){
@@ -2669,21 +3009,18 @@
     
     if(b_w<=0 || b_h<=0) return;
 
-assert(src_stride > 2*MB_SIZE + 5);
+assert(src_stride > 7*MB_SIZE);
     dst += src_x + src_y*dst_stride;
     dst8+= src_x + src_y*src_stride;
 //    src += src_x + src_y*src_stride;
 
-    ptmp= tmp + 3*tmp_step;
-    block[0]= ptmp;
-    ptmp+=tmp_step;
+    block[0]= tmp+3*MB_SIZE;
     pred_block(s, block[0], src, tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);    
 
     if(same_block(lt, rt)){
         block[1]= block[0];
     }else{
-        block[1]= ptmp;
-        ptmp+=tmp_step;
+        block[1]= tmp + 4*MB_SIZE;
         pred_block(s, block[1], src, tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
     }
         
@@ -2692,8 +3029,7 @@
     }else if(same_block(rt, lb)){
         block[2]= block[1];
     }else{
-        block[2]= ptmp;
-        ptmp+=tmp_step;
+        block[2]= tmp+5*MB_SIZE;
         pred_block(s, block[2], src, tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
     }
 
@@ -2704,7 +3040,7 @@
     }else if(same_block(lb, rb)){
         block[3]= block[2];
     }else{
-        block[3]= ptmp;
+        block[3]= tmp+6*MB_SIZE;
         pred_block(s, block[3], src, tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
     }
 #if 0
@@ -2911,7 +3247,7 @@
     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
     int x,y, thres1, thres2;
-//    START_TIMER
+    START_TIMER
 
     if(s->qlog == LOSSLESS_QLOG) return;
  
@@ -2967,6 +3303,7 @@
 
 static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
     const int w= b->width;
+    const int h= b->height;
     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
@@ -2992,6 +3329,34 @@
     }
 }
 
+static void dequantize_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride){
+    const int w= b->width;
+    const int h= b->height;
+    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16); //context qlog plus subband qlog; clip() presumably clamps to [0, QROOT*16]
+    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT); //multiplier: qexp[] entry picked by the low bits, shifted left by the remaining high bits
+    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT; //bias added to the magnitude before the final shift
+    int x,y;
+    START_TIMER
+    //Rescales every non-zero coefficient of subband b in place, over all h rows; src and stride are not used.
+    if(s->qlog == LOSSLESS_QLOG) return;
+    //Note: the early return above leaves without reaching STOP_TIMER, so that case is not reported.
+    for(y=0; y<h; y++){
+        //Row y of the subband is slice-buffer line y*stride_line + buf_y_offset, starting buf_x_offset elements in.
+        DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+        for(x=0; x<w; x++){
+            int i= line[x];
+            if(i<0){
+                line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
+            }else if(i>0){
+                line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT)); //same scaling as the negative branch, applied to the magnitude; zeros stay zero
+            }
+        }
+    }
+    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){ //only report timing for subbands wider than 200
+        STOP_TIMER("dquant")
+    }
+}
+
 static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
     const int w= b->width;
     const int h= b->height;
@@ -3044,6 +3409,7 @@
 
 static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
     const int w= b->width;
+    const int h= b->height;
     int x,y;
     
 //    START_TIMER
@@ -3076,6 +3442,38 @@
 //    STOP_TIMER("correlate")
 }
 
+static void correlate_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+    //Adds a predictor built from already-updated neighbours to each coefficient of subband b, in place, in raster order.
+//    START_TIMER
+    //Rows are fetched through sb; s, src, stride and inverse are not used here.
+    DWTELEM * line = NULL; //NULL so that "prev = line" on the first row does not read an uninitialized pointer
+    DWTELEM * prev;        //row above the current one; only dereferenced when y > 0
+    
+    for(y=0; y<h; y++){
+        prev = line;
+        //Row y of the subband is slice-buffer line y*stride_line + buf_y_offset, starting buf_x_offset elements in.
+        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+        for(x=0; x<w; x++){
+            if(x){
+                if(use_median){
+                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]); //left, top, top-right
+                    else  line[x] += line[x - 1]; //first row or last column: left neighbour only
+                }else{
+                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]); //left, top, left+top-topleft
+                    else  line[x] += line[x - 1]; //first row: left neighbour only
+                }
+            }else{
+                if(y) line[x] += prev[x]; //first column: top neighbour only; element (0,0) is left untouched
+            }
+        }
+    }
+    
+//    STOP_TIMER("correlate")
+}
+
 static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
     const int w= b->width;
     const int h= b->height;
@@ -3353,9 +3751,9 @@
     SnowContext *s = avctx->priv_data;
     int plane_index;
 
-    if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
-        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
-               "use vstrict=-2 / -strict -2 to use it anyway\n");
+    if(avctx->strict_std_compliance >= 0){
+        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it wont be decodeable with future versions!!!\n"
+               "use vstrict=-1 / -strict -1 to use it anyway\n");
         return -1;
     }
  
@@ -3665,17 +4063,41 @@
 {
     SnowContext *s = avctx->priv_data;
     int block_size;
-    
     avctx->pix_fmt= PIX_FMT_YUV420P;
 
     common_init(avctx);
     
     block_size = MB_SIZE >> s->block_max_depth;
-    slice_buffer_init(&s->sb, s->plane[0].height, (block_size) + (s->spatial_decomposition_count * (s->spatial_decomposition_count + 2)) + 1, s->plane[0].width, s->spatial_dwt_buffer);
+    /* FIXME block_size * 2 is determined empirically. block_size * 1.5 is definitely needed, but I (Robert) cannot figure out why more than that is needed. Perhaps there is a bug, or perhaps I overlooked some demands that are placed on the buffer. */
+    /* FIXME The formula is WRONG. For height > 480, the buffer will overflow. */
+    /* FIXME For now, I will use a full frame of lines. Fortunately, this should not materially affect cache performance because lines are allocated using a stack, so if in fact only 50 out of 496 lines are needed at a time, the other 446 will sit allocated but never accessed. */
+    slice_buffer_init(&s->sb, s->plane[0].height, (block_size) + (s->spatial_decomposition_count * (s->spatial_decomposition_count + 2)) + 2, s->plane[0].width, s->spatial_dwt_buffer);
+//    slice_buffer_init(&s->sb, s->plane[0].height, s->plane[0].height, s->plane[0].width, s->spatial_dwt_buffer);
     
     return 0;
 }
 
+static void draw_slice(SnowContext *s, int start_y, int end_y){
+    int h, start_cy;
+    int offset[4];
+    //Hands rows [start_y, end_y) of current_picture to the draw_horiz_band callback, if one is set.
+    //The callback is optional, so a missing one is not an error (no assert): there is simply nothing to draw.
+    if(s->avctx->draw_horiz_band==NULL) 
+        return;
+        
+    h= end_y - start_y;
+    
+    start_cy= start_y>>1; //row index used for planes 1 and 2: half the plane-0 row
+
+    offset[0] = s->current_picture.linesize[0]*start_y;
+    offset[1] = s->current_picture.linesize[1]*start_cy;
+    offset[2] = s->current_picture.linesize[2]*start_cy;
+    offset[3] = 0;
+    emms_c(); //presumably clears MMX state before calling into user code - TODO confirm
+
+    s->avctx->draw_horiz_band(s->avctx, &s->current_picture, offset, start_y, 3, h);
+}
+
 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size){
     SnowContext *s = avctx->priv_data;
     RangeCoder * const c= &s->c;
@@ -3725,7 +4147,7 @@
     }
     STOP_TIMER("unpack coeffs");
 }
-
+        
 {START_TIMER
     const int mb_h= s->b_height << s->block_max_depth;
     const int block_size = MB_SIZE >> s->block_max_depth;
@@ -3798,6 +4220,8 @@
         }
 
         predict_slice_buffered(s, &s->sb, s->spatial_dwt_buffer, plane_index, 1, mb_y);
+//        if (slice_starty != slice_h)
+//            draw_slice(s, slice_starty, slice_h);
         
         y = FFMIN(p->height, slice_starty);
         end_y = FFMIN(p->height, slice_h);
@@ -3906,7 +4330,7 @@
     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
         
     for(i=-256; i<256; i++){
-START_TIMER
+START_TIMER //times one put_symbol() call; pairs with STOP_TIMER("put_symbol") below
         put_symbol(&s.c, s.header_state, i*i*i/3*ABS(i), 1);
 STOP_TIMER("put_symbol")
     }