FFmpeg
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
vf_fspp.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
3  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
4  *
5  * This file is part of MPlayer.
6  *
7  * MPlayer is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * MPlayer is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License along
18  * with MPlayer; if not, write to the Free Software Foundation, Inc.,
19  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20  */
21 
22 /*
23  * This implementation is based on an algorithm described in
24  * "Aria Nosratinia Embedded Post-Processing for
25  * Enhancement of Compressed Images (1999)"
26  * (http://citeseer.nj.nec.com/nosratinia99embedded.html)
27  * Further, with splitting (i)dct into hor/ver passes, one of them can be
28  * performed once per block, not pixel. This allows for much better speed.
29  */
30 
31 /*
32  Heavily optimized version of SPP filter by Nikolaj
33  */
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <inttypes.h>
39 #include <math.h>
40 
41 #include "config.h"
42 
43 #include "mp_msg.h"
44 #include "cpudetect.h"
45 #include "img_format.h"
46 #include "mp_image.h"
47 #include "vf.h"
48 #include "av_helpers.h"
49 #include "libvo/fastmemcpy.h"
50 
51 #include "libavutil/internal.h"
52 #include "libavutil/intreadwrite.h"
53 #include "libavutil/mem.h"
54 #include "libavutil/x86/asm.h"
55 #include "libavcodec/avcodec.h"
56 #include "libavcodec/dsputil.h"
57 
58 #undef free
59 #undef malloc
60 
61 //===========================================================================//
62 #define BLOCKSZ 12
63 
/*
 * Base threshold matrix, laid out as 8 rows of 8 entries.
 * The values (e.g. 296) must not be too high: that causes too strong a
 * dependence on the quantizer, or maybe an overflow (to be checked),
 * which results in some flashing.
 */
static const short custom_threshold[64] = {
     71, 296, 295, 237,  71,  40,  38,  19,
    245, 193, 185, 121, 102,  73,  53,  27,
    158, 129, 141, 107,  97,  73,  50,  26,
    102, 116, 109,  98,  82,  66,  45,  23,
     71,  94,  95,  81,  70,  56,  38,  20,
     56,  77,  74,  66,  56,  44,  30,  15,
     38,  53,  50,  45,  38,  30,  21,  11,
     20,  27,  26,  23,  20,  15,  11,   5
};
77 
/* 8x8 ordered-dither matrix holding each value 0..63 once; one row per output line. */
static const uint8_t __attribute__((aligned(32))) dither[8][8] = {
    {  0, 48, 12, 60,  3, 51, 15, 63 },
    { 32, 16, 44, 28, 35, 19, 47, 31 },
    {  8, 56,  4, 52, 11, 59,  7, 55 },
    { 40, 24, 36, 20, 43, 27, 39, 23 },
    {  2, 50, 14, 62,  1, 49, 13, 61 },
    { 34, 18, 46, 30, 33, 17, 45, 29 },
    { 10, 58,  6, 54,  9, 57,  5, 53 },
    { 42, 26, 38, 22, 41, 25, 37, 21 }
};
88 
/*
 * Per-instance state of the fspp filter.
 *
 * NOTE(review): the embedded listing numbers jump 92 -> 95 and 97 -> 99, so
 * some member declarations are missing from this view. Other functions in
 * this file use p->log2_count, p->temp_stride and p->src, which are
 * presumably the elided members -- confirm against the real file.
 */
89 struct vf_priv_s { //align 16 !
 /* thresholds before multiplication by the quantizer; filled in vf_open() */
90  uint64_t threshold_mtx_noq[8*2];
 /* threshold_mtx_noq scaled by the current quantizer, see mul_thrmat_*();
    mul_thrmat_mmx() addresses it at a fixed offset from threshold_mtx_noq,
    so the two arrays must stay adjacent and in this order */
91  uint64_t threshold_mtx[8*2];//used in both C & MMX (& later SSE2) versions
92 
 /* forced quantizer parsed from the filter arguments; 0 = use per-block qp table */
95  int qp;
 /* copy of mpi->qscale_type, handed to norm_qscale() in filter() */
96  int mpeg2;
 /* quantizer that threshold_mtx was last computed for */
97  int prev_q;
 /* int16 accumulation buffer, allocated in config() */
99  int16_t *temp;
 /* argument flag: when non-zero put_image() reads mpi->qscale directly */
100  int bframes;
 /* copy of the qscale table taken from frames whose pict_type is not 3 */
101  char *non_b_qp;
102 };
103 
104 
105 #if !HAVE_MMX
106 
107 //This func reads from 1 slice, 1 and clears 0 & 1
108 static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
109 {int y, x;
110 #define STORE(pos) \
111  temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale); \
112  src[x + pos]=src[x + pos - 8*src_stride]=0; \
113  if(temp & 0x100) temp= ~(temp>>31); \
114  dst[x + pos]= temp;
115 
116  for(y=0; y<height; y++){
117  const uint8_t *d= dither[y];
118  for(x=0; x<width; x+=8){
119  int temp;
120  STORE(0);
121  STORE(1);
122  STORE(2);
123  STORE(3);
124  STORE(4);
125  STORE(5);
126  STORE(6);
127  STORE(7);
128  }
129  src+=src_stride;
130  dst+=dst_stride;
131  }
132 }
133 
134 //This func reads from 2 slices, 0 & 2 and clears 2-nd
135 static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
136 {int y, x;
137 #define STORE2(pos) \
138  temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale); \
139  src[x + pos + 16*src_stride]=0; \
140  if(temp & 0x100) temp= ~(temp>>31); \
141  dst[x + pos]= temp;
142 
143  for(y=0; y<height; y++){
144  const uint8_t *d= dither[y];
145  for(x=0; x<width; x+=8){
146  int temp;
147  STORE2(0);
148  STORE2(1);
149  STORE2(2);
150  STORE2(3);
151  STORE2(4);
152  STORE2(5);
153  STORE2(6);
154  STORE2(7);
155  }
156  src+=src_stride;
157  dst+=dst_stride;
158  }
159 }
160 
161 static void mul_thrmat_c(struct vf_priv_s *p,int q)
162 {
163  int a;
164  for(a=0;a<64;a++)
165  ((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];//ints faster in C
166 }
167 
/* Forward declarations of the C transform helpers; column_fidct_c is defined
   further down in this file. */
168 static void column_fidct_c(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt);
169 static void row_idct_c(int16_t* workspace,
170  int16_t* output_adr, int output_stride, int cnt);
171 static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt);
172 
/* Compile-time dispatch: when HAVE_MMX is 0 the generic "_s" names used by
   filter() and vf_open() resolve to the plain C implementations. */
173 //this is rather ugly, but there is no need for function pointers
174 #define store_slice_s store_slice_c
175 #define store_slice2_s store_slice2_c
176 #define mul_thrmat_s mul_thrmat_c
177 #define column_fidct_s column_fidct_c
178 #define row_idct_s row_idct_c
179 #define row_fdct_s row_fdct_c
180 
181 #else /* HAVE_MMX */
182 
/*
 * MMX counterpart of store_slice_c().
 * For each of `height` rows it loads the 8 dither bytes of that row, widens
 * them to words and shifts them right by log2_scale. Then, 8 pixels at a
 * time, it adds them to the int16 accumulators at src, shifts right by
 * (6 - log2_scale), packs to bytes with unsigned saturation and stores to dst.
 * The accumulators just read and those 16*src_stride bytes (8 int16 rows)
 * before them are overwritten with zero.
 *
 * NOTE(review): the asm stores to operand %1 (src_stride) although it is
 * declared as an input, and no "memory" clobber is listed -- confirm this is
 * safe with the compilers in use.
 */
183 //This func reads from 1 slice, 1 and clears 0 & 1
184 static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
185 {
186  const uint8_t *od=&dither[0][0];
187  const uint8_t *end=&dither[height][0];
 /* round width up to a multiple of 8; dst_stride becomes the per-row remainder */
188  width = (width+7)&~7;
189  dst_stride-=width;
190  //src_stride=(src_stride-width)*2;
191  __asm__ volatile(
 // setup: d=log2_scale, S=src, D=dst, a=src_stride
192  "mov %5, %%"REG_d" \n\t"
193  "mov %6, %%"REG_S" \n\t"
194  "mov %7, %%"REG_D" \n\t"
195  "mov %1, %%"REG_a" \n\t"
 // mm5 = log2_scale (shift count for the dither values)
196  "movd %%"REG_d", %%mm5 \n\t"
 // d = ~log2_scale + 7 = 6 - log2_scale, kept in mm2 as the final shift count
197  "xor $-1, %%"REG_d" \n\t"
198  "mov %%"REG_a", %%"REG_c" \n\t"
199  "add $7, %%"REG_d" \n\t"
200  "neg %%"REG_a" \n\t"
 // c = (src_stride - width) * 2: bytes to skip at the end of each src row
201  "sub %0, %%"REG_c" \n\t"
202  "add %%"REG_c", %%"REG_c" \n\t"
203  "movd %%"REG_d", %%mm2 \n\t"
204  "mov %%"REG_c", %1 \n\t"
205  "mov %2, %%"REG_d" \n\t"
 // a = -src_stride * 16: byte offset of the row 8 lines above
206  "shl $4, %%"REG_a" \n\t"
207 
 // outer loop: one iteration per output row / dither row
208  "2: \n\t"
209  "movq (%%"REG_d"), %%mm3 \n\t"
210  "movq %%mm3, %%mm4 \n\t"
211  "pxor %%mm7, %%mm7 \n\t"
212  "punpcklbw %%mm7, %%mm3 \n\t"
213  "punpckhbw %%mm7, %%mm4 \n\t"
214  "mov %0, %%"REG_c" \n\t"
215  "psraw %%mm5, %%mm3 \n\t"
216  "psraw %%mm5, %%mm4 \n\t"
 // inner loop: 8 pixels per iteration, c counts the remaining width
217  "1: \n\t"
218  "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
219  "movq (%%"REG_S"), %%mm0 \n\t"
220  "movq 8(%%"REG_S"), %%mm1 \n\t"
221 
222  "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
223  "paddw %%mm3, %%mm0 \n\t"
224  "paddw %%mm4, %%mm1 \n\t"
225 
226  "movq %%mm7, (%%"REG_S") \n\t"
227  "psraw %%mm2, %%mm0 \n\t"
228  "psraw %%mm2, %%mm1 \n\t"
229 
230  "movq %%mm7, 8(%%"REG_S") \n\t"
231  "packuswb %%mm1, %%mm0 \n\t"
232  "add $16, %%"REG_S" \n\t"
233 
234  "movq %%mm0, (%%"REG_D") \n\t"
235  "add $8, %%"REG_D" \n\t"
236  "sub $8, %%"REG_c" \n\t"
237  "jg 1b \n\t"
 // advance src/dst to the next row and the dither pointer to its next row
238  "add %1, %%"REG_S" \n\t"
239  "add $8, %%"REG_d" \n\t"
240  "add %3, %%"REG_D" \n\t"
241  "cmp %4, %%"REG_d" \n\t"
242  "jl 2b \n\t"
243 
244  :
245  : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
246  "m" (log2_scale), "m" (src), "m" (dst) //input
247  : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
248  );
249 }
250 
/*
 * MMX counterpart of store_slice2_c().
 * Same structure as store_slice_mmx(), but each output sample is built from
 * the accumulator at src plus the accumulator 32*src_stride bytes (16 int16
 * rows) further on. Only that second accumulator is zeroed afterwards.
 *
 * NOTE(review): the asm stores to operand %1 (src_stride) although it is
 * declared as an input, and no "memory" clobber is listed -- confirm this is
 * safe with the compilers in use.
 */
251 //This func reads from 2 slices, 0 & 2 and clears 2-nd
252 static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
253 {
254  const uint8_t *od=&dither[0][0];
255  const uint8_t *end=&dither[height][0];
 /* round width up to a multiple of 8; dst_stride becomes the per-row remainder */
256  width = (width+7)&~7;
257  dst_stride-=width;
258  //src_stride=(src_stride-width)*2;
259  __asm__ volatile(
 // setup: d=log2_scale, S=src, D=dst, a=src_stride
260  "mov %5, %%"REG_d" \n\t"
261  "mov %6, %%"REG_S" \n\t"
262  "mov %7, %%"REG_D" \n\t"
263  "mov %1, %%"REG_a" \n\t"
 // mm5 = log2_scale; mm2 = 6 - log2_scale (computed as ~x + 7)
264  "movd %%"REG_d", %%mm5 \n\t"
265  "xor $-1, %%"REG_d" \n\t"
266  "mov %%"REG_a", %%"REG_c" \n\t"
267  "add $7, %%"REG_d" \n\t"
 // c = (src_stride - width) * 2: bytes to skip at the end of each src row
268  "sub %0, %%"REG_c" \n\t"
269  "add %%"REG_c", %%"REG_c" \n\t"
270  "movd %%"REG_d", %%mm2 \n\t"
271  "mov %%"REG_c", %1 \n\t"
272  "mov %2, %%"REG_d" \n\t"
 // a = src_stride * 32: byte offset of the row 16 lines below
273  "shl $5, %%"REG_a" \n\t"
274 
 // outer loop: one iteration per output row / dither row
275  "2: \n\t"
276  "movq (%%"REG_d"), %%mm3 \n\t"
277  "movq %%mm3, %%mm4 \n\t"
278  "pxor %%mm7, %%mm7 \n\t"
279  "punpcklbw %%mm7, %%mm3 \n\t"
280  "punpckhbw %%mm7, %%mm4 \n\t"
281  "mov %0, %%"REG_c" \n\t"
282  "psraw %%mm5, %%mm3 \n\t"
283  "psraw %%mm5, %%mm4 \n\t"
 // inner loop: 8 pixels per iteration, c counts the remaining width
284  "1: \n\t"
285  "movq (%%"REG_S"), %%mm0 \n\t"
286  "movq 8(%%"REG_S"), %%mm1 \n\t"
287  "paddw %%mm3, %%mm0 \n\t"
288 
289  "paddw (%%"REG_S",%%"REG_a",), %%mm0 \n\t"
290  "paddw %%mm4, %%mm1 \n\t"
291  "movq 8(%%"REG_S",%%"REG_a",), %%mm6 \n\t"
292 
293  "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
294  "psraw %%mm2, %%mm0 \n\t"
295  "paddw %%mm6, %%mm1 \n\t"
296 
297  "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
298  "psraw %%mm2, %%mm1 \n\t"
299  "packuswb %%mm1, %%mm0 \n\t"
300 
301  "movq %%mm0, (%%"REG_D") \n\t"
302  "add $16, %%"REG_S" \n\t"
303  "add $8, %%"REG_D" \n\t"
304  "sub $8, %%"REG_c" \n\t"
305  "jg 1b \n\t"
 // advance src/dst to the next row and the dither pointer to its next row
306  "add %1, %%"REG_S" \n\t"
307  "add $8, %%"REG_d" \n\t"
308  "add %3, %%"REG_D" \n\t"
309  "cmp %4, %%"REG_d" \n\t"
310  "jl 2b \n\t"
311 
312  :
313  : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
314  "m" (log2_scale), "m" (src), "m" (dst) //input
315  : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
316  );
317 }
318 
/*
 * MMX counterpart of mul_thrmat_c(): multiplies the 64 16-bit thresholds in
 * p->threshold_mtx_noq by q and writes the low 16 bits of each product to
 * the memory 8*8*2 bytes after threshold_mtx_noq (where threshold_mtx is
 * declared in struct vf_priv_s).
 *
 * The source (REG_S) and destination (REG_D) registers both start out as
 * `adr`; the asm itself adds the 128-byte offset to the destination.
 * Sixteen quadwords are processed: offsets 0..6, 7+0..7+6 and 14+0..14+1.
 *
 * NOTE(review): `adr` is bound to two read-write operands ("+S" and "+D")
 * and no "memory" clobber is declared -- confirm the compilers in use
 * accept and honour this.
 */
319 static void mul_thrmat_mmx(struct vf_priv_s *p, int q)
320 {
321  uint64_t *adr=&p->threshold_mtx_noq[0];
322  __asm__ volatile(
 // mm7 = q broadcast to all four 16-bit lanes (completed by the two punpck ops below)
323  "movd %0, %%mm7 \n\t"
 // destination = adr + 128 bytes
324  "add $8*8*2, %%"REG_D" \n\t"
325  "movq 0*8(%%"REG_S"), %%mm0 \n\t"
326  "punpcklwd %%mm7, %%mm7 \n\t"
327  "movq 1*8(%%"REG_S"), %%mm1 \n\t"
328  "punpckldq %%mm7, %%mm7 \n\t"
329  "pmullw %%mm7, %%mm0 \n\t"
330 
 // from here on: interleaved load / multiply / store, four words at a time
331  "movq 2*8(%%"REG_S"), %%mm2 \n\t"
332  "pmullw %%mm7, %%mm1 \n\t"
333 
334  "movq 3*8(%%"REG_S"), %%mm3 \n\t"
335  "pmullw %%mm7, %%mm2 \n\t"
336 
337  "movq %%mm0, 0*8(%%"REG_D") \n\t"
338  "movq 4*8(%%"REG_S"), %%mm4 \n\t"
339  "pmullw %%mm7, %%mm3 \n\t"
340 
341  "movq %%mm1, 1*8(%%"REG_D") \n\t"
342  "movq 5*8(%%"REG_S"), %%mm5 \n\t"
343  "pmullw %%mm7, %%mm4 \n\t"
344 
345  "movq %%mm2, 2*8(%%"REG_D") \n\t"
346  "movq 6*8(%%"REG_S"), %%mm6 \n\t"
347  "pmullw %%mm7, %%mm5 \n\t"
348 
349  "movq %%mm3, 3*8(%%"REG_D") \n\t"
350  "movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t"
351  "pmullw %%mm7, %%mm6 \n\t"
352 
353  "movq %%mm4, 4*8(%%"REG_D") \n\t"
354  "movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t"
355  "pmullw %%mm7, %%mm0 \n\t"
356 
357  "movq %%mm5, 5*8(%%"REG_D") \n\t"
358  "movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t"
359  "pmullw %%mm7, %%mm1 \n\t"
360 
361  "movq %%mm6, 6*8(%%"REG_D") \n\t"
362  "movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t"
363  "pmullw %%mm7, %%mm2 \n\t"
364 
365  "movq %%mm0, 7*8+0*8(%%"REG_D") \n\t"
366  "movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t"
367  "pmullw %%mm7, %%mm3 \n\t"
368 
369  "movq %%mm1, 7*8+1*8(%%"REG_D") \n\t"
370  "movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t"
371  "pmullw %%mm7, %%mm4 \n\t"
372 
373  "movq %%mm2, 7*8+2*8(%%"REG_D") \n\t"
374  "movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t"
375  "pmullw %%mm7, %%mm5 \n\t"
376 
377  "movq %%mm3, 7*8+3*8(%%"REG_D") \n\t"
378  "movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t"
379  "pmullw %%mm7, %%mm6 \n\t"
380 
381  "movq %%mm4, 7*8+4*8(%%"REG_D") \n\t"
382  "movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t"
383  "pmullw %%mm7, %%mm0 \n\t"
384 
385  "movq %%mm5, 7*8+5*8(%%"REG_D") \n\t"
386  "pmullw %%mm7, %%mm1 \n\t"
387 
388  "movq %%mm6, 7*8+6*8(%%"REG_D") \n\t"
389  "movq %%mm0, 14*8+0*8(%%"REG_D") \n\t"
390  "movq %%mm1, 14*8+1*8(%%"REG_D") \n\t"
391 
392  : "+g" (q), "+S" (adr), "+D" (adr)
393  :
394  );
395 }
396 
/* Forward declarations of the MMX transform helpers; column_fidct_mmx is
   defined further down in this file. */
397 static void column_fidct_mmx(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt);
398 static void row_idct_mmx(int16_t* workspace,
399  int16_t* output_adr, int output_stride, int cnt);
400 static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt);
401 
/* Compile-time dispatch: when HAVE_MMX is set the generic "_s" names used by
   filter() and vf_open() resolve to the MMX implementations. */
402 #define store_slice_s store_slice_mmx
403 #define store_slice2_s store_slice2_mmx
404 #define mul_thrmat_s mul_thrmat_mmx
405 #define column_fidct_s column_fidct_mmx
406 #define row_idct_s row_idct_mmx
407 #define row_fdct_s row_fdct_mmx
408 #endif // HAVE_MMX
408 #endif // HAVE_MMX
409 
/*
 * Filters one image plane from src into dst.
 *
 *  p          filter state (thresholds, work buffers p->src and p->temp)
 *  dst, src   destination / source plane; nothing is done if either is NULL
 *  *_stride   line sizes of dst and src
 *  width, height  plane dimensions in pixels
 *  qp_store   quantizer table, read only when p->qp is 0
 *  qp_stride  line size of qp_store
 *  is_luma    non-zero for the luma plane; selects the work-buffer stride
 *             and the shift used to index qp_store
 *
 * Outline: the plane is copied into p->src with an 8 pixel mirrored border,
 * then every `step` lines a row FDCT, a thresholded column FDCT/IDCT and a
 * row IDCT are run, accumulating into p->temp; finished groups of 8 lines
 * are written to dst through store_slice_s()/store_slice2_s().
 */
410 static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src,
411  int dst_stride, int src_stride,
412  int width, int height,
413  uint8_t *qp_store, int qp_stride, int is_luma)
414 {
415  int x, x0, y, es, qy, t;
416  const int stride= is_luma ? p->temp_stride : (width+16);//((width+16+15)&(~15))
 /* vertical distance between processed lines */
417  const int step=6-p->log2_count;
 /* shift that maps a pixel coordinate to a qp_store index (4 for luma, 3 otherwise) */
418  const int qps= 3 + is_luma;
 /* one aligned array split into two halves: block (first) and block3 (second) */
419  int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
420  int16_t *block= (int16_t *)block_align;
421  int16_t *block3=(int16_t *)(block_align+4*8*BLOCKSZ);
422 
 /* NOTE(review): the size is given in bytes, so this clears only the first
    4*8*BLOCKSZ bytes of block3, not the whole half -- confirm this is intended */
423  memset(block3, 0, 4*8*BLOCKSZ);
424 
425  //p->src=src-src_stride*8-8;//!
426  if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
 /* copy each line into p->src at offset (8,8) and mirror 8 pixels to its left and right */
427  for(y=0; y<height; y++){
428  int index= 8 + 8*stride + y*stride;
429  fast_memcpy(p->src + index, src + y*src_stride, width);//this line can be avoided by using DR & user fr.buffers
430  for(x=0; x<8; x++){
431  p->src[index - x - 1]= p->src[index + x ];
432  p->src[index + width + x ]= p->src[index + width - x - 1];
433  }
434  }
 /* mirror 8 full lines above the first and below the last image line */
435  for(y=0; y<8; y++){
436  fast_memcpy(p->src + ( 7-y)*stride, p->src + ( y+8)*stride, stride);
437  fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
438  }
439  //FIXME (try edge emu)
440 
 /* clear rows 8..23 of the accumulation buffer */
441  for(y=8; y<24; y++)
442  memset(p->temp+ 8 +y*stride, 0,width*sizeof(int16_t));
443 
444  for(y=step; y<height+8; y+=step){ //step= 1,2
 /* row of qp_store for this line: (y-4) clamped to [0, height-1], scaled by qps */
445  qy=y-4;
446  if (qy>height-1) qy=height-1;
447  if (qy<0) qy=0;
448  qy=(qy>>qps)*qp_stride;
449  row_fdct_s(block, p->src + y*stride +2-(y&1), stride, 2);
 /* process the line in chunks of 8*(BLOCKSZ-1) pixels */
450  for(x0=0; x0<width+8-8*(BLOCKSZ-1); x0+=8*(BLOCKSZ-1)){
451  row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, 2*(BLOCKSZ-1));
 /* forced qp: thresholds are constant, so the whole chunk goes through in one call */
452  if(p->qp)
453  column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+0*8, block3+0*8, 8*(BLOCKSZ-1)); //yes, this is a HOTSPOT
454  else
 /* per-block qp: refresh the threshold matrix whenever the quantizer changes */
455  for (x=0; x<8*(BLOCKSZ-1); x+=8) {
456  t=x+x0-2; //correct t=x+x0-2-(y&1), but its the same
457  if (t<0) t=0;//t always < width-2
458  t=qp_store[qy+(t>>qps)];
459  t=norm_qscale(t, p->mpeg2);
460  if (t!=p->prev_q) p->prev_q=t, mul_thrmat_s(p, t);
461  column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8); //yes, this is a HOTSPOT
462  }
463  row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1));
 /* carry the tail of both work buffers over to the start for the next chunk */
464  memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(int16_t)); //cycling
465  memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(int16_t));
466  }
467  //
 /* remaining columns of the line after the last full chunk */
468  es=width+8-x0; // 8, ...
469  if (es>8)
470  row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, (es-4)>>2);
471  column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block, block3, es&(~1));
472  row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, es>>2);
 /* each time y1 reaches a non-zero multiple of 8, flush 8 finished lines;
    bit 3 of y1 selects which store variant / half of p->temp is used */
473  {const int y1=y-8+step;//l5-7 l4-6
474  if (!(y1&7) && y1) {
475  if (y1&8) store_slice_s(dst + (y1-8)*dst_stride, p->temp+ 8 +8*stride,
476  dst_stride, stride, width, 8, 5-p->log2_count);
477  else store_slice2_s(dst + (y1-8)*dst_stride, p->temp+ 8 +0*stride,
478  dst_stride, stride, width, 8, 5-p->log2_count);
479  } }
480  }
481 
 /* flush the last, partial group of lines (fewer than 8) */
482  if (y&7) { // == height & 7
483  if (y&8) store_slice_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +8*stride,
484  dst_stride, stride, width, y&7, 5-p->log2_count);
485  else store_slice2_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +0*stride,
486  dst_stride, stride, width, y&7, 5-p->log2_count);
487  }
488 }
489 
490 static int config(struct vf_instance *vf,
491  int width, int height, int d_width, int d_height,
492  unsigned int flags, unsigned int outfmt)
493 {
494  int h= (height+16+15)&(~15);
495 
496  vf->priv->temp_stride= (width+16+15)&(~15);
497  vf->priv->temp= (int16_t*)av_mallocz(vf->priv->temp_stride*3*8*sizeof(int16_t));
498  //this can also be avoided, see above
499  vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t));
500 
501  return ff_vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
502 }
503 
504 static void get_image(struct vf_instance *vf, mp_image_t *mpi)
505 {
506  if(mpi->flags&MP_IMGFLAG_PRESERVE) return; // don't change
507  // ok, we can do pp in-place (or pp disabled):
508  vf->dmpi=ff_vf_get_image(vf->next,mpi->imgfmt,
509  mpi->type, mpi->flags, mpi->width, mpi->height);
510  mpi->planes[0]=vf->dmpi->planes[0];
511  mpi->stride[0]=vf->dmpi->stride[0];
512  mpi->width=vf->dmpi->width;
513  if(mpi->flags&MP_IMGFLAG_PLANAR){
514  mpi->planes[1]=vf->dmpi->planes[1];
515  mpi->planes[2]=vf->dmpi->planes[2];
516  mpi->stride[1]=vf->dmpi->stride[1];
517  mpi->stride[2]=vf->dmpi->stride[2];
518  }
519  mpi->flags|=MP_IMGFLAG_DIRECT;
520 }
521 
/*
 * Processes one frame: picks the destination image, keeps a copy of the
 * quantizer table of frames whose pict_type is not 3, runs filter() on the
 * three planes (or copies them when neither a qp table nor a forced qp is
 * available) and passes the result to the next filter.
 *
 * Returns the result of ff_vf_next_put_image().
 */
522 static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
523 {
524  mp_image_t *dmpi;
525  if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
526  // no DR, so get a new image! hope we'll get DR buffer:
 /* NOTE(review): the embedded listing numbers jump 527 -> 530, so two lines of
    this call's argument list are missing from this view -- restore them from
    the real file before compiling */
527  dmpi=ff_vf_get_image(vf->next,mpi->imgfmt,
530  mpi->width,mpi->height);
531  ff_vf_clone_mpi_attributes(dmpi, mpi);
532  }else{
533  dmpi=vf->dmpi;
534  }
535 
536  vf->priv->mpeg2= mpi->qscale_type;
 /* remember the qp table of this frame, unless a fixed qp was requested */
537  if(mpi->pict_type != 3 && mpi->qscale && !vf->priv->qp){
538  int w = mpi->qstride;
539  int h = (mpi->h + 15) >> 4;
 /* qstride == 0: treat the table as a single row of (w+15)/16 entries */
540  if (!w) {
541  w = (mpi->w + 15) >> 4;
542  h = 1;
543  }
 /* NOTE(review): the malloc result is not checked before the copy below, and
    the buffer is never resized if w*h grows on a later frame */
544  if(!vf->priv->non_b_qp)
545  vf->priv->non_b_qp= malloc(w*h);
546  fast_memcpy(vf->priv->non_b_qp, mpi->qscale, w*h);
547  }
548  if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
 /* use the saved table unless bframes is set or nothing has been saved yet */
549  char *qp_tab= vf->priv->non_b_qp;
550  if(vf->priv->bframes || !qp_tab)
551  qp_tab= mpi->qscale;
552 
553  if(qp_tab || vf->priv->qp){
 /* luma plane, then the two chroma planes at their subsampled size */
554  filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0],
555  mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
556  filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1],
557  mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
558  filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2],
559  mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
560  }else{
 /* no quantizer information at all: pass the planes through unchanged */
561  memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
562  memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
563  memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
564  }
565  }
566 
 /* leave MMX state / flush pending stores before returning to other code */
567 #if HAVE_MMX
568  if(ff_gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
569 #endif
570 #if HAVE_MMX2
571  if(ff_gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
572 #endif
573  return ff_vf_next_put_image(vf,dmpi, pts);
574 }
575 
576 static void uninit(struct vf_instance *vf)
577 {
578  if(!vf->priv) return;
579 
580  av_free(vf->priv->temp);
581  vf->priv->temp= NULL;
582  av_free(vf->priv->src);
583  vf->priv->src= NULL;
584  //free(vf->priv->avctx);
585  //vf->priv->avctx= NULL;
586  free(vf->priv->non_b_qp);
587  vf->priv->non_b_qp= NULL;
588 
589  av_free(vf->priv);
590  vf->priv=NULL;
591 }
592 
593 //===========================================================================//
594 
595 static int query_format(struct vf_instance *vf, unsigned int fmt)
596 {
597  switch(fmt){
598  case IMGFMT_YVU9:
599  case IMGFMT_IF09:
600  case IMGFMT_YV12:
601  case IMGFMT_I420:
602  case IMGFMT_IYUV:
603  case IMGFMT_CLPL:
604  case IMGFMT_Y800:
605  case IMGFMT_Y8:
606  case IMGFMT_444P:
607  case IMGFMT_422P:
608  case IMGFMT_411P:
609  return ff_vf_next_query_format(vf,fmt);
610  }
611  return 0;
612 }
613 
/*
 * Handles filter control requests; requests not handled here are forwarded
 * to the next filter via ff_vf_next_control().
 */
614 static int control(struct vf_instance *vf, int request, void* data)
615 {
616  switch(request){
 /* NOTE(review): the embedded listing numbers jump 616 -> 618, so a line --
    presumably the case label that owns this `return 5` -- is missing from
    this view; as shown the statement is unreachable */
618  return 5;
619  case VFCTRL_SET_PP_LEVEL:
 /* data points to the requested level; values below 4 are raised to 4.
    NOTE(review): there is no upper clamp here, while vf_open() limits
    log2_count to 5 and filter() computes step=6-log2_count -- confirm that
    callers never pass more than 5 */
620  vf->priv->log2_count= *((unsigned int*)data);
621  if (vf->priv->log2_count < 4) vf->priv->log2_count=4;
622  return CONTROL_TRUE;
623  }
624  return ff_vf_next_control(vf,request,data);
625 }
626 
/*
 * Filter constructor: installs the callbacks, allocates the zeroed private
 * state, parses the option string and precomputes the threshold matrices.
 *
 * args, if not NULL, is parsed as four ':'-separated integers:
 *   log2 count : bits of quality (4..5 accepted, >=6 becomes 5, else default 4)
 *   qp         : forced quantizer (negative values become 0)
 *   bias offset: clamped to [-15, 32] and added to 16
 *   bframes    : stored as-is
 *
 * Returns 1.
 */
627 static int vf_open(vf_instance_t *vf, char *args)
628 {
629  int i=0, bias;
630  int custom_threshold_m[64];
631  int log2c=-1;
632 
633  vf->config=config;
634  vf->put_image=put_image;
635  vf->get_image=get_image;
 /* NOTE(review): the embedded listing numbers jump 635 -> 637, so one
    statement is missing from this view; query_format() is defined in this
    file but not installed in the visible lines -- confirm */
637  vf->uninit=uninit;
638  vf->control= control;
 /* NOTE(review): the allocation result is used below without a NULL check */
639  vf->priv=av_mallocz(sizeof(struct vf_priv_s));//assumes align 16 !
640 
641  ff_init_avcodec();
642 
643  //vf->priv->avctx= avcodec_alloc_context();
644  //dsputil_init(&vf->priv->dsp, vf->priv->avctx);
645 
646  vf->priv->log2_count= 4;
647  vf->priv->bframes = 0;
648 
 /* fields absent from args keep the defaults set above (qp stays 0 from av_mallocz) */
649  if (args) sscanf(args, "%d:%d:%d:%d", &log2c, &vf->priv->qp, &i, &vf->priv->bframes);
650 
651  if( log2c >=4 && log2c <=5 )
652  vf->priv->log2_count = log2c;
653  else if( log2c >= 6 )
654  vf->priv->log2_count = 5;
655 
656  if(vf->priv->qp < 0)
657  vf->priv->qp = 0;
658 
659  if (i < -15) i = -15;
660  if (i > 32) i = 32;
661 
662  bias= (1<<4)+i; //regulable
663  vf->priv->prev_q=0;
664  //
 /* scale the base thresholds by bias/71, rounding to nearest */
665  for(i=0;i<64;i++) //FIXME: tune custom_threshold[] and remove this !
666  custom_threshold_m[i]=(int)(custom_threshold[i]*(bias/71.)+ 0.5);
 /* pack each row of 8 thresholds into two 64-bit words, 16 bits per entry;
    column order from the lowest lane up: 2,6,0,4 and then 5,3,1,7 */
667  for(i=0;i<8;i++){
668  vf->priv->threshold_mtx_noq[2*i]=(uint64_t)custom_threshold_m[i*8+2]
669  |(((uint64_t)custom_threshold_m[i*8+6])<<16)
670  |(((uint64_t)custom_threshold_m[i*8+0])<<32)
671  |(((uint64_t)custom_threshold_m[i*8+4])<<48);
672  vf->priv->threshold_mtx_noq[2*i+1]=(uint64_t)custom_threshold_m[i*8+5]
673  |(((uint64_t)custom_threshold_m[i*8+3])<<16)
674  |(((uint64_t)custom_threshold_m[i*8+1])<<32)
675  |(((uint64_t)custom_threshold_m[i*8+7])<<48);
676  }
677 
 /* with a forced qp the scaled matrix never changes, so compute it once here */
678  if (vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv, vf->priv->qp);
679 
680  return 1;
681 }
682 
684  "fast simple postprocess",
685  "fspp",
686  "Michael Niedermayer, Nikolaj Poroshin",
687  "",
688  vf_open,
689  NULL
690 };
691 
692 //====================================================================
693 //Specific spp's dct, idct and threshold functions
694 //I'd prefer to have them in the separate file.
695 
696 //#define MANGLE(a) #a
697 
698 //typedef int16_t int16_t; //! only int16_t
699 
/* Size of one DCT block edge, as a number and as a string for inline asm. */
#define DCTSIZE 8
#define DCTSIZE_S "8"

/* x as fixed point with s fractional bits, rounded and reduced to its low 16 bits. */
#define FIX(x,s) (((int) ((x) * (1<<(s)) + 0.5))&0xffff)
/* Replicates the 16-bit value x into all four words of a 64-bit constant.
   Every shift is done on uint64_t so that values with bit 15 set do not
   overflow a signed int. x must be in the range 0..0xffff. */
#define C64(x) ((uint64_t)(x)<<48 | (uint64_t)(x)<<32 | (uint64_t)(x)<<16 | (uint64_t)(x))
#define FIX64(x,s) C64(FIX(x,s))

/* High part of the product x*k, i.e. the product shifted right by 16. */
#define MULTIPLY16H(x,k) (((x)*(k))>>16)
/* r = x if x lies outside [-t, t], otherwise r = 0. Evaluates x and t twice.
   Wrapped in do/while(0) so it behaves as a single statement, e.g. inside an
   unbraced if/else. */
#define THRESHOLD(r,x,t) do { if(((unsigned)((x)+(t)))>(t)*2) (r)=(x); else (r)=0; } while(0)
/* x divided by 2^n with rounding to nearest. */
#define DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
710 
/*
 * Fixed-point DCT constants.
 * The number in each name is the real value being encoded; the second
 * argument of FIX/FIX64 is the number of fractional bits (FIX multiplies by
 * 1<<s). The MMX variants use FIX64/C64, which repeat the 16-bit value in
 * all four words of a 64-bit constant; the C variants are plain int16_t.
 */
711 #if HAVE_MMX
712 
713 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)=FIX64(0.382683433, 14);
714 DECLARE_ALIGNED(8, uint64_t, ff_MM_FIX_0_541196100)=FIX64(0.541196100, 14);
715 DECLARE_ALIGNED(8, uint64_t, ff_MM_FIX_0_707106781)=FIX64(0.707106781, 14);
716 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)=FIX64(1.306562965, 14);
717 
718 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A)=FIX64(1.414213562, 14);
719 
720 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)=FIX64(1.847759065, 13);
 /* stored negated, as the trailing "-" marker indicates */
721 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)=FIX64(-2.613125930, 13); //-
722 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)=FIX64(1.414213562, 13);
723 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)=FIX64(1.082392200, 13);
724 //for t3,t5,t7 == 0 shortcut
725 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)=FIX64(0.847759065, 14);
726 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)=FIX64(0.566454497, 14);
727 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)=FIX64(0.198912367, 14);
728 
 /* four words of 4 resp. 2, used as packed addends in the MMX code */
729 DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)=C64(4);
730 DECLARE_ASM_CONST(8, uint64_t, MM_2)=C64(2);
731 
732 #else /* !HAVE_MMX */
733 
 /* type of the intermediates in the C transform code */
734 typedef int32_t int_simd16_t;
735 static const int16_t FIX_0_382683433=FIX(0.382683433, 14);
736 static const int16_t FIX_0_541196100=FIX(0.541196100, 14);
737 static const int16_t FIX_0_707106781=FIX(0.707106781, 14);
738 static const int16_t FIX_1_306562965=FIX(1.306562965, 14);
739 static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14);
740 static const int16_t FIX_1_847759065=FIX(1.847759065, 13);
 /* stored negated, as the trailing "-" marker indicates */
741 static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); //-
742 static const int16_t FIX_1_414213562=FIX(1.414213562, 13);
743 static const int16_t FIX_1_082392200=FIX(1.082392200, 13);
744 
745 #endif
746 
747 #if !HAVE_MMX
748 
/*
 * C version of the combined column transform: for each column it runs a
 * forward DCT over 8 input rows, zeroes every coefficient whose magnitude
 * does not exceed its threshold, runs the inverse DCT and adds the result
 * to the output workspace.
 *
 *  thr_adr  threshold table, read as threshold[row*8 + column]
 *  data     input samples, rows DCTSIZE elements apart
 *  output   workspace that receives the result, same layout as data
 *  cnt      amount of work; every outer iteration handles 8 columns,
 *           then skips the following 8, and lowers cnt by 2
 */
749 static void column_fidct_c(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt)
750 {
751  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
752  int_simd16_t tmp10, tmp11, tmp12, tmp13;
753  int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
754  int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
755 
756  int16_t* dataptr;
757  int16_t* wsptr;
758  int16_t *threshold;
759  int ctr;
760 
761  dataptr = data;
762  wsptr = output;
763 
764  for (; cnt > 0; cnt-=2) { //start positions
 /* restart at the first threshold column for every group of 8 columns */
765  threshold=(int16_t*)thr_adr;//threshold_mtx
766  for (ctr = DCTSIZE; ctr > 0; ctr--) {
767  // Process columns from input, add to output.
 /* butterfly stage: sums and differences of mirrored rows */
768  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
769  tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
770 
771  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
772  tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
773 
774  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
775  tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
776 
777  tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
778  tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
779 
780  // Even part of FDCT
781 
782  tmp10 = tmp0 + tmp3;
783  tmp13 = tmp0 - tmp3;
784  tmp11 = tmp1 + tmp2;
785  tmp12 = tmp1 - tmp2;
786 
787  d0 = tmp10 + tmp11;
788  d4 = tmp10 - tmp11;
789 
790  z1 = MULTIPLY16H((tmp12 + tmp13) <<2, FIX_0_707106781);
791  d2 = tmp13 + z1;
792  d6 = tmp13 - z1;
793 
794  // Even part of IDCT
795 
 /* keep a coefficient only if it lies outside [-threshold, threshold] */
796  THRESHOLD(tmp0, d0, threshold[0*8]);
797  THRESHOLD(tmp1, d2, threshold[2*8]);
798  THRESHOLD(tmp2, d4, threshold[4*8]);
799  THRESHOLD(tmp3, d6, threshold[6*8]);
 /* rounding offset for the >>2 that follows */
800  tmp0+=2;
801  tmp10 = (tmp0 + tmp2)>>2;
802  tmp11 = (tmp0 - tmp2)>>2;
803 
804  tmp13 = (tmp1 + tmp3)>>2; //+2 ! (psnr decides)
805  tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
806 
807  tmp0 = tmp10 + tmp13; //->temps
808  tmp3 = tmp10 - tmp13; //->temps
809  tmp1 = tmp11 + tmp12; //->temps
810  tmp2 = tmp11 - tmp12; //->temps
811 
812  // Odd part of FDCT
813 
814  tmp10 = tmp4 + tmp5;
815  tmp11 = tmp5 + tmp6;
816  tmp12 = tmp6 + tmp7;
817 
818  z5 = MULTIPLY16H((tmp10 - tmp12)<<2, FIX_0_382683433);
819  z2 = MULTIPLY16H(tmp10 <<2, FIX_0_541196100) + z5;
820  z4 = MULTIPLY16H(tmp12 <<2, FIX_1_306562965) + z5;
821  z3 = MULTIPLY16H(tmp11 <<2, FIX_0_707106781);
822 
823  z11 = tmp7 + z3;
824  z13 = tmp7 - z3;
825 
826  d5 = z13 + z2;
827  d3 = z13 - z2;
828  d1 = z11 + z4;
829  d7 = z11 - z4;
830 
831  // Odd part of IDCT
832 
833  THRESHOLD(tmp4, d1, threshold[1*8]);
834  THRESHOLD(tmp5, d3, threshold[3*8]);
835  THRESHOLD(tmp6, d5, threshold[5*8]);
836  THRESHOLD(tmp7, d7, threshold[7*8]);
837 
838  //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
839  z13 = tmp6 + tmp5;
840  z10 = (tmp6 - tmp5)<<1;
841  z11 = tmp4 + tmp7;
842  z12 = (tmp4 - tmp7)<<1;
843 
844  tmp7 = (z11 + z13)>>2; //+2 !
845  tmp11 = MULTIPLY16H((z11 - z13)<<1, FIX_1_414213562);
846  z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
847  tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
 /* FIX_2_613125930 holds the negated constant, hence the "+" */
848  tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
849 
850  tmp6 = tmp12 - tmp7;
851  tmp5 = tmp11 - tmp6;
852  tmp4 = tmp10 + tmp5;
853 
 /* rows 0..5 are accumulated into the workspace, rows 6 and 7 are overwritten
    NOTE(review): presumably because those two rows hold no earlier
    contribution at this point -- confirm against the caller's buffer cycling */
854  wsptr[DCTSIZE*0]+= (tmp0 + tmp7);
855  wsptr[DCTSIZE*1]+= (tmp1 + tmp6);
856  wsptr[DCTSIZE*2]+= (tmp2 + tmp5);
857  wsptr[DCTSIZE*3]+= (tmp3 - tmp4);
858  wsptr[DCTSIZE*4]+= (tmp3 + tmp4);
859  wsptr[DCTSIZE*5]+= (tmp2 - tmp5);
860  wsptr[DCTSIZE*6]= (tmp1 - tmp6);
861  wsptr[DCTSIZE*7]= (tmp0 - tmp7);
862  //
863  dataptr++; //next column
864  wsptr++;
865  threshold++;
866  }
867  dataptr+=8; //skip each second start pos
868  wsptr +=8;
869  }
870 }
871 
872 #else /* HAVE_MMX */
873 
874 static void column_fidct_mmx(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt)
875 {
876  uint64_t __attribute__((aligned(8))) temps[4];
877  __asm__ volatile(
878  ASMALIGN(4)
879  "1: \n\t"
880  "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
881  //
882  "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
883  "movq %%mm1, %%mm0 \n\t"
884 
885  "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
886  "movq %%mm7, %%mm3 \n\t"
887 
888  "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
889  "movq %%mm1, %%mm5 \n\t"
890 
891  "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
892  "psubw %%mm7, %%mm1 \n\t" //t13
893 
894  "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
895  "movq %%mm6, %%mm4 \n\t"
896 
897  "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
898  "paddw %%mm7, %%mm5 \n\t" //t10
899 
900  "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
901  "movq %%mm6, %%mm7 \n\t"
902 
903  "paddw %%mm2, %%mm6 \n\t" //t11
904  "psubw %%mm2, %%mm7 \n\t" //t12
905 
906  "movq %%mm5, %%mm2 \n\t"
907  "paddw %%mm6, %%mm5 \n\t" //d0
908  // i0 t13 t12 i3 i1 d0 - d4
909  "psubw %%mm6, %%mm2 \n\t" //d4
910  "paddw %%mm1, %%mm7 \n\t"
911 
912  "movq 4*16(%%"REG_d"), %%mm6 \n\t"
913  "psllw $2, %%mm7 \n\t"
914 
915  "psubw 0*16(%%"REG_d"), %%mm5 \n\t"
916  "psubw %%mm6, %%mm2 \n\t"
917 
918  "paddusw 0*16(%%"REG_d"), %%mm5 \n\t"
919  "paddusw %%mm6, %%mm2 \n\t"
920 
921  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
922  //
923  "paddw 0*16(%%"REG_d"), %%mm5 \n\t"
924  "paddw %%mm6, %%mm2 \n\t"
925 
926  "psubusw 0*16(%%"REG_d"), %%mm5 \n\t"
927  "psubusw %%mm6, %%mm2 \n\t"
928 
929 //This func is totally compute-bound, operates at huge speed. So, DC shortcut
930 // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
931 //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
932  "paddw "MANGLE(MM_2)", %%mm5 \n\t"
933  "movq %%mm2, %%mm6 \n\t"
934 
935  "paddw %%mm5, %%mm2 \n\t"
936  "psubw %%mm6, %%mm5 \n\t"
937 
938  "movq %%mm1, %%mm6 \n\t"
939  "paddw %%mm7, %%mm1 \n\t" //d2
940 
941  "psubw 2*16(%%"REG_d"), %%mm1 \n\t"
942  "psubw %%mm7, %%mm6 \n\t" //d6
943 
944  "movq 6*16(%%"REG_d"), %%mm7 \n\t"
945  "psraw $2, %%mm5 \n\t"
946 
947  "paddusw 2*16(%%"REG_d"), %%mm1 \n\t"
948  "psubw %%mm7, %%mm6 \n\t"
949  // t7 d2 /t11 t4 t6 - d6 /t10
950 
951  "paddw 2*16(%%"REG_d"), %%mm1 \n\t"
952  "paddusw %%mm7, %%mm6 \n\t"
953 
954  "psubusw 2*16(%%"REG_d"), %%mm1 \n\t"
955  "paddw %%mm7, %%mm6 \n\t"
956 
957  "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
958  "psubusw %%mm7, %%mm6 \n\t"
959 
960  //movq [edi+"DCTSIZE_S"*2*2], mm1
961  //movq [edi+"DCTSIZE_S"*6*2], mm6
962  "movq %%mm1, %%mm7 \n\t"
963  "psraw $2, %%mm2 \n\t"
964 
965  "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
966  "psubw %%mm6, %%mm1 \n\t"
967 
968  "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
969  "paddw %%mm7, %%mm6 \n\t" //'t13
970 
971  "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! ---
972  "movq %%mm2, %%mm7 \n\t"
973 
974  "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
975  "paddw %%mm6, %%mm2 \n\t" //'t0
976 
977  "movq %%mm2, 0*8+%3 \n\t" //!
978  "psubw %%mm6, %%mm7 \n\t" //'t3
979 
980  "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
981  "psubw %%mm6, %%mm1 \n\t" //'t12
982 
983  "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
984  "movq %%mm5, %%mm6 \n\t"
985 
986  "movq %%mm7, 3*8+%3 \n\t"
987  "paddw %%mm2, %%mm3 \n\t" //t10
988 
989  "paddw %%mm4, %%mm2 \n\t" //t11
990  "paddw %%mm0, %%mm4 \n\t" //t12
991 
992  "movq %%mm3, %%mm7 \n\t"
993  "psubw %%mm4, %%mm3 \n\t"
994 
995  "psllw $2, %%mm3 \n\t"
996  "psllw $2, %%mm7 \n\t" //opt for P6
997 
998  "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
999  "psllw $2, %%mm4 \n\t"
1000 
1001  "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
1002  "psllw $2, %%mm2 \n\t"
1003 
1004  "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
1005  "paddw %%mm1, %%mm5 \n\t" //'t1
1006 
1007  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
1008  "psubw %%mm1, %%mm6 \n\t" //'t2
1009  // t7 't12 't11 t4 t6 - 't13 't10 ---
1010 
1011  "paddw %%mm3, %%mm7 \n\t" //z2
1012 
1013  "movq %%mm5, 1*8+%3 \n\t"
1014  "paddw %%mm3, %%mm4 \n\t" //z4
1015 
1016  "movq 3*16(%%"REG_d"), %%mm3 \n\t"
1017  "movq %%mm0, %%mm1 \n\t"
1018 
1019  "movq %%mm6, 2*8+%3 \n\t"
1020  "psubw %%mm2, %%mm1 \n\t" //z13
1021 
1022 //===
1023  "paddw %%mm2, %%mm0 \n\t" //z11
1024  "movq %%mm1, %%mm5 \n\t"
1025 
1026  "movq 5*16(%%"REG_d"), %%mm2 \n\t"
1027  "psubw %%mm7, %%mm1 \n\t" //d3
1028 
1029  "paddw %%mm7, %%mm5 \n\t" //d5
1030  "psubw %%mm3, %%mm1 \n\t"
1031 
1032  "movq 1*16(%%"REG_d"), %%mm7 \n\t"
1033  "psubw %%mm2, %%mm5 \n\t"
1034 
1035  "movq %%mm0, %%mm6 \n\t"
1036  "paddw %%mm4, %%mm0 \n\t" //d1
1037 
1038  "paddusw %%mm3, %%mm1 \n\t"
1039  "psubw %%mm4, %%mm6 \n\t" //d7
1040 
1041  // d1 d3 - - - d5 d7 -
1042  "movq 7*16(%%"REG_d"), %%mm4 \n\t"
1043  "psubw %%mm7, %%mm0 \n\t"
1044 
1045  "psubw %%mm4, %%mm6 \n\t"
1046  "paddusw %%mm2, %%mm5 \n\t"
1047 
1048  "paddusw %%mm4, %%mm6 \n\t"
1049  "paddw %%mm3, %%mm1 \n\t"
1050 
1051  "paddw %%mm2, %%mm5 \n\t"
1052  "paddw %%mm4, %%mm6 \n\t"
1053 
1054  "psubusw %%mm3, %%mm1 \n\t"
1055  "psubusw %%mm2, %%mm5 \n\t"
1056 
1057  "psubusw %%mm4, %%mm6 \n\t"
1058  "movq %%mm1, %%mm4 \n\t"
1059 
1060  "por %%mm5, %%mm4 \n\t"
1061  "paddusw %%mm7, %%mm0 \n\t"
1062 
1063  "por %%mm6, %%mm4 \n\t"
1064  "paddw %%mm7, %%mm0 \n\t"
1065 
1066  "packssdw %%mm4, %%mm4 \n\t"
1067  "psubusw %%mm7, %%mm0 \n\t"
1068 
1069  "movd %%mm4, %%"REG_a" \n\t"
1070  "or %%"REG_a", %%"REG_a" \n\t"
1071  "jnz 2f \n\t"
1072  //movq [edi+"DCTSIZE_S"*3*2], mm1
1073  //movq [edi+"DCTSIZE_S"*5*2], mm5
1074  //movq [edi+"DCTSIZE_S"*1*2], mm0
1075  //movq [edi+"DCTSIZE_S"*7*2], mm6
1076  // t4 t5 - - - t6 t7 -
1077  //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
1078 //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
1079  "movq 0*8+%3, %%mm4 \n\t"
1080  "movq %%mm0, %%mm1 \n\t"
1081 
1082  "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
1083  "movq %%mm1, %%mm2 \n\t"
1084 
1085  "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
1086  "movq %%mm2, %%mm3 \n\t"
1087 
1088  "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
1089  "paddw %%mm4, %%mm5 \n\t"
1090 
1091  "movq 1*8+%3, %%mm6 \n\t"
1092  //paddw mm3, MM_2
1093  "psraw $2, %%mm3 \n\t" //tmp7
1094 
1095  "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
1096  "psubw %%mm3, %%mm4 \n\t"
1097 
1098  "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
1099  "paddw %%mm3, %%mm5 \n\t"
1100 
1101  "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
1102  "paddw %%mm6, %%mm7 \n\t"
1103 
1104  "movq 2*8+%3, %%mm3 \n\t"
1105  "psubw %%mm0, %%mm6 \n\t"
1106 
1107  "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
1108  "paddw %%mm0, %%mm7 \n\t"
1109 
1110  "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
1111  "paddw %%mm3, %%mm4 \n\t"
1112 
1113  "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
1114  "psubw %%mm1, %%mm3 \n\t"
1115 
1116  "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
1117  "paddw %%mm1, %%mm4 \n\t"
1118 
1119  "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
1120  "paddw %%mm3, %%mm5 \n\t"
1121 
1122  "movq 3*8+%3, %%mm0 \n\t"
1123  "add $8, %%"REG_S" \n\t"
1124 
1125  "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
1126  "paddw %%mm0, %%mm6 \n\t"
1127 
1128  "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
1129  "psubw %%mm2, %%mm0 \n\t"
1130 
1131  "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
1132  "paddw %%mm2, %%mm6 \n\t"
1133 
1134  "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
1135  "paddw %%mm0, %%mm7 \n\t"
1136 
1137  "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
1138 
1139  "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
1140  "add $8, %%"REG_D" \n\t"
1141  "jmp 4f \n\t"
1142 
1143  "2: \n\t"
1144  //--- non DC2
1145  //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
1146  //psraw mm5, 2
1147  //psraw mm0, 2
1148  //psraw mm6, 2
1149  "movq %%mm5, %%mm3 \n\t"
1150  "psubw %%mm1, %%mm5 \n\t"
1151 
1152  "psllw $1, %%mm5 \n\t" //'z10
1153  "paddw %%mm1, %%mm3 \n\t" //'z13
1154 
1155  "movq %%mm0, %%mm2 \n\t"
1156  "psubw %%mm6, %%mm0 \n\t"
1157 
1158  "movq %%mm5, %%mm1 \n\t"
1159  "psllw $1, %%mm0 \n\t" //'z12
1160 
1161  "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
1162  "paddw %%mm0, %%mm5 \n\t"
1163 
1164  "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
1165  "paddw %%mm6, %%mm2 \n\t" //'z11
1166 
1167  "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
1168  "movq %%mm2, %%mm7 \n\t"
1169 
1170  //---
1171  "movq 0*8+%3, %%mm4 \n\t"
1172  "psubw %%mm3, %%mm2 \n\t"
1173 
1174  "psllw $1, %%mm2 \n\t"
1175  "paddw %%mm3, %%mm7 \n\t" //'t7
1176 
1177  "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
1178  "movq %%mm4, %%mm6 \n\t"
1179  //paddw mm7, MM_2
1180  "psraw $2, %%mm7 \n\t"
1181 
1182  "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
1183  "psubw %%mm7, %%mm6 \n\t"
1184 
1185  "movq 1*8+%3, %%mm3 \n\t"
1186  "paddw %%mm7, %%mm4 \n\t"
1187 
1188  "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
1189  "paddw %%mm5, %%mm1 \n\t" //'t12
1190 
1191  "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
1192  "psubw %%mm7, %%mm1 \n\t" //'t6
1193 
1194  "movq 2*8+%3, %%mm7 \n\t"
1195  "psubw %%mm5, %%mm0 \n\t" //'t10
1196 
1197  "movq 3*8+%3, %%mm6 \n\t"
1198  "movq %%mm3, %%mm5 \n\t"
1199 
1200  "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
1201  "psubw %%mm1, %%mm5 \n\t"
1202 
1203  "psubw %%mm1, %%mm2 \n\t" //'t5
1204  "paddw %%mm1, %%mm3 \n\t"
1205 
1206  "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
1207  "movq %%mm7, %%mm4 \n\t"
1208 
1209  "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
1210  "psubw %%mm2, %%mm4 \n\t"
1211 
1212  "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
1213  "paddw %%mm2, %%mm7 \n\t"
1214 
1215  "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
1216  "paddw %%mm2, %%mm0 \n\t" //'t4
1217 
1218  // 't4 't6 't5 - - - - 't7
1219  "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
1220  "movq %%mm6, %%mm1 \n\t"
1221 
1222  "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
1223  "psubw %%mm0, %%mm1 \n\t"
1224 
1225  "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
1226  "paddw %%mm0, %%mm6 \n\t"
1227 
1228  "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
1229  "add $8, %%"REG_S" \n\t"
1230 
1231  "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
1232 
1233  "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
1234  "add $8, %%"REG_D" \n\t"
1235 
1236  "4: \n\t"
1237 //=part 2 (the same)===========================================================
1238  "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
1239  //
1240  "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
1241  "movq %%mm1, %%mm0 \n\t"
1242 
1243  "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
1244  "movq %%mm7, %%mm3 \n\t"
1245 
1246  "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
1247  "movq %%mm1, %%mm5 \n\t"
1248 
1249  "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
1250  "psubw %%mm7, %%mm1 \n\t" //t13
1251 
1252  "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
1253  "movq %%mm6, %%mm4 \n\t"
1254 
1255  "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
1256  "paddw %%mm7, %%mm5 \n\t" //t10
1257 
1258  "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
1259  "movq %%mm6, %%mm7 \n\t"
1260 
1261  "paddw %%mm2, %%mm6 \n\t" //t11
1262  "psubw %%mm2, %%mm7 \n\t" //t12
1263 
1264  "movq %%mm5, %%mm2 \n\t"
1265  "paddw %%mm6, %%mm5 \n\t" //d0
1266  // i0 t13 t12 i3 i1 d0 - d4
1267  "psubw %%mm6, %%mm2 \n\t" //d4
1268  "paddw %%mm1, %%mm7 \n\t"
1269 
1270  "movq 1*8+4*16(%%"REG_d"), %%mm6 \n\t"
1271  "psllw $2, %%mm7 \n\t"
1272 
1273  "psubw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
1274  "psubw %%mm6, %%mm2 \n\t"
1275 
1276  "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
1277  "paddusw %%mm6, %%mm2 \n\t"
1278 
1279  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
1280  //
1281  "paddw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
1282  "paddw %%mm6, %%mm2 \n\t"
1283 
1284  "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
1285  "psubusw %%mm6, %%mm2 \n\t"
1286 
1287 //This func is totally compute-bound, operates at huge speed. So, DC shortcut
1288 // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
1289 //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
1290  "paddw "MANGLE(MM_2)", %%mm5 \n\t"
1291  "movq %%mm2, %%mm6 \n\t"
1292 
1293  "paddw %%mm5, %%mm2 \n\t"
1294  "psubw %%mm6, %%mm5 \n\t"
1295 
1296  "movq %%mm1, %%mm6 \n\t"
1297  "paddw %%mm7, %%mm1 \n\t" //d2
1298 
1299  "psubw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
1300  "psubw %%mm7, %%mm6 \n\t" //d6
1301 
1302  "movq 1*8+6*16(%%"REG_d"), %%mm7 \n\t"
1303  "psraw $2, %%mm5 \n\t"
1304 
1305  "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
1306  "psubw %%mm7, %%mm6 \n\t"
1307  // t7 d2 /t11 t4 t6 - d6 /t10
1308 
1309  "paddw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
1310  "paddusw %%mm7, %%mm6 \n\t"
1311 
1312  "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
1313  "paddw %%mm7, %%mm6 \n\t"
1314 
1315  "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
1316  "psubusw %%mm7, %%mm6 \n\t"
1317 
1318  //movq [edi+"DCTSIZE_S"*2*2], mm1
1319  //movq [edi+"DCTSIZE_S"*6*2], mm6
1320  "movq %%mm1, %%mm7 \n\t"
1321  "psraw $2, %%mm2 \n\t"
1322 
1323  "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
1324  "psubw %%mm6, %%mm1 \n\t"
1325 
1326  "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
1327  "paddw %%mm7, %%mm6 \n\t" //'t13
1328 
1329  "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! ---
1330  "movq %%mm2, %%mm7 \n\t"
1331 
1332  "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
1333  "paddw %%mm6, %%mm2 \n\t" //'t0
1334 
1335  "movq %%mm2, 0*8+%3 \n\t" //!
1336  "psubw %%mm6, %%mm7 \n\t" //'t3
1337 
1338  "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
1339  "psubw %%mm6, %%mm1 \n\t" //'t12
1340 
1341  "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
1342  "movq %%mm5, %%mm6 \n\t"
1343 
1344  "movq %%mm7, 3*8+%3 \n\t"
1345  "paddw %%mm2, %%mm3 \n\t" //t10
1346 
1347  "paddw %%mm4, %%mm2 \n\t" //t11
1348  "paddw %%mm0, %%mm4 \n\t" //t12
1349 
1350  "movq %%mm3, %%mm7 \n\t"
1351  "psubw %%mm4, %%mm3 \n\t"
1352 
1353  "psllw $2, %%mm3 \n\t"
1354  "psllw $2, %%mm7 \n\t" //opt for P6
1355 
1356  "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
1357  "psllw $2, %%mm4 \n\t"
1358 
1359  "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
1360  "psllw $2, %%mm2 \n\t"
1361 
1362  "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
1363  "paddw %%mm1, %%mm5 \n\t" //'t1
1364 
1365  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
1366  "psubw %%mm1, %%mm6 \n\t" //'t2
1367  // t7 't12 't11 t4 t6 - 't13 't10 ---
1368 
1369  "paddw %%mm3, %%mm7 \n\t" //z2
1370 
1371  "movq %%mm5, 1*8+%3 \n\t"
1372  "paddw %%mm3, %%mm4 \n\t" //z4
1373 
1374  "movq 1*8+3*16(%%"REG_d"), %%mm3 \n\t"
1375  "movq %%mm0, %%mm1 \n\t"
1376 
1377  "movq %%mm6, 2*8+%3 \n\t"
1378  "psubw %%mm2, %%mm1 \n\t" //z13
1379 
1380 //===
1381  "paddw %%mm2, %%mm0 \n\t" //z11
1382  "movq %%mm1, %%mm5 \n\t"
1383 
1384  "movq 1*8+5*16(%%"REG_d"), %%mm2 \n\t"
1385  "psubw %%mm7, %%mm1 \n\t" //d3
1386 
1387  "paddw %%mm7, %%mm5 \n\t" //d5
1388  "psubw %%mm3, %%mm1 \n\t"
1389 
1390  "movq 1*8+1*16(%%"REG_d"), %%mm7 \n\t"
1391  "psubw %%mm2, %%mm5 \n\t"
1392 
1393  "movq %%mm0, %%mm6 \n\t"
1394  "paddw %%mm4, %%mm0 \n\t" //d1
1395 
1396  "paddusw %%mm3, %%mm1 \n\t"
1397  "psubw %%mm4, %%mm6 \n\t" //d7
1398 
1399  // d1 d3 - - - d5 d7 -
1400  "movq 1*8+7*16(%%"REG_d"), %%mm4 \n\t"
1401  "psubw %%mm7, %%mm0 \n\t"
1402 
1403  "psubw %%mm4, %%mm6 \n\t"
1404  "paddusw %%mm2, %%mm5 \n\t"
1405 
1406  "paddusw %%mm4, %%mm6 \n\t"
1407  "paddw %%mm3, %%mm1 \n\t"
1408 
1409  "paddw %%mm2, %%mm5 \n\t"
1410  "paddw %%mm4, %%mm6 \n\t"
1411 
1412  "psubusw %%mm3, %%mm1 \n\t"
1413  "psubusw %%mm2, %%mm5 \n\t"
1414 
1415  "psubusw %%mm4, %%mm6 \n\t"
1416  "movq %%mm1, %%mm4 \n\t"
1417 
1418  "por %%mm5, %%mm4 \n\t"
1419  "paddusw %%mm7, %%mm0 \n\t"
1420 
1421  "por %%mm6, %%mm4 \n\t"
1422  "paddw %%mm7, %%mm0 \n\t"
1423 
1424  "packssdw %%mm4, %%mm4 \n\t"
1425  "psubusw %%mm7, %%mm0 \n\t"
1426 
1427  "movd %%mm4, %%"REG_a" \n\t"
1428  "or %%"REG_a", %%"REG_a" \n\t"
1429  "jnz 3f \n\t"
1430  //movq [edi+"DCTSIZE_S"*3*2], mm1
1431  //movq [edi+"DCTSIZE_S"*5*2], mm5
1432  //movq [edi+"DCTSIZE_S"*1*2], mm0
1433  //movq [edi+"DCTSIZE_S"*7*2], mm6
1434  // t4 t5 - - - t6 t7 -
1435  //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
1436 //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
1437  "movq 0*8+%3, %%mm4 \n\t"
1438  "movq %%mm0, %%mm1 \n\t"
1439 
1440  "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
1441  "movq %%mm1, %%mm2 \n\t"
1442 
1443  "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
1444  "movq %%mm2, %%mm3 \n\t"
1445 
1446  "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
1447  "paddw %%mm4, %%mm5 \n\t"
1448 
1449  "movq 1*8+%3, %%mm6 \n\t"
1450  //paddw mm3, MM_2
1451  "psraw $2, %%mm3 \n\t" //tmp7
1452 
1453  "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
1454  "psubw %%mm3, %%mm4 \n\t"
1455 
1456  "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
1457  "paddw %%mm3, %%mm5 \n\t"
1458 
1459  "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
1460  "paddw %%mm6, %%mm7 \n\t"
1461 
1462  "movq 2*8+%3, %%mm3 \n\t"
1463  "psubw %%mm0, %%mm6 \n\t"
1464 
1465  "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
1466  "paddw %%mm0, %%mm7 \n\t"
1467 
1468  "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
1469  "paddw %%mm3, %%mm4 \n\t"
1470 
1471  "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
1472  "psubw %%mm1, %%mm3 \n\t"
1473 
1474  "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
1475  "paddw %%mm1, %%mm4 \n\t"
1476 
1477  "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
1478  "paddw %%mm3, %%mm5 \n\t"
1479 
1480  "movq 3*8+%3, %%mm0 \n\t"
1481  "add $24, %%"REG_S" \n\t"
1482 
1483  "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
1484  "paddw %%mm0, %%mm6 \n\t"
1485 
1486  "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
1487  "psubw %%mm2, %%mm0 \n\t"
1488 
1489  "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
1490  "paddw %%mm2, %%mm6 \n\t"
1491 
1492  "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
1493  "paddw %%mm0, %%mm7 \n\t"
1494 
1495  "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
1496 
1497  "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
1498  "add $24, %%"REG_D" \n\t"
1499  "sub $2, %%"REG_c" \n\t"
1500  "jnz 1b \n\t"
1501  "jmp 5f \n\t"
1502 
1503  "3: \n\t"
1504  //--- non DC2
1505  //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
1506  //psraw mm5, 2
1507  //psraw mm0, 2
1508  //psraw mm6, 2
1509  "movq %%mm5, %%mm3 \n\t"
1510  "psubw %%mm1, %%mm5 \n\t"
1511 
1512  "psllw $1, %%mm5 \n\t" //'z10
1513  "paddw %%mm1, %%mm3 \n\t" //'z13
1514 
1515  "movq %%mm0, %%mm2 \n\t"
1516  "psubw %%mm6, %%mm0 \n\t"
1517 
1518  "movq %%mm5, %%mm1 \n\t"
1519  "psllw $1, %%mm0 \n\t" //'z12
1520 
1521  "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
1522  "paddw %%mm0, %%mm5 \n\t"
1523 
1524  "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
1525  "paddw %%mm6, %%mm2 \n\t" //'z11
1526 
1527  "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
1528  "movq %%mm2, %%mm7 \n\t"
1529 
1530  //---
1531  "movq 0*8+%3, %%mm4 \n\t"
1532  "psubw %%mm3, %%mm2 \n\t"
1533 
1534  "psllw $1, %%mm2 \n\t"
1535  "paddw %%mm3, %%mm7 \n\t" //'t7
1536 
1537  "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
1538  "movq %%mm4, %%mm6 \n\t"
1539  //paddw mm7, MM_2
1540  "psraw $2, %%mm7 \n\t"
1541 
1542  "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
1543  "psubw %%mm7, %%mm6 \n\t"
1544 
1545  "movq 1*8+%3, %%mm3 \n\t"
1546  "paddw %%mm7, %%mm4 \n\t"
1547 
1548  "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
1549  "paddw %%mm5, %%mm1 \n\t" //'t12
1550 
1551  "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
1552  "psubw %%mm7, %%mm1 \n\t" //'t6
1553 
1554  "movq 2*8+%3, %%mm7 \n\t"
1555  "psubw %%mm5, %%mm0 \n\t" //'t10
1556 
1557  "movq 3*8+%3, %%mm6 \n\t"
1558  "movq %%mm3, %%mm5 \n\t"
1559 
1560  "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
1561  "psubw %%mm1, %%mm5 \n\t"
1562 
1563  "psubw %%mm1, %%mm2 \n\t" //'t5
1564  "paddw %%mm1, %%mm3 \n\t"
1565 
1566  "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
1567  "movq %%mm7, %%mm4 \n\t"
1568 
1569  "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
1570  "psubw %%mm2, %%mm4 \n\t"
1571 
1572  "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
1573  "paddw %%mm2, %%mm7 \n\t"
1574 
1575  "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
1576  "paddw %%mm2, %%mm0 \n\t" //'t4
1577 
1578  // 't4 't6 't5 - - - - 't7
1579  "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
1580  "movq %%mm6, %%mm1 \n\t"
1581 
1582  "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
1583  "psubw %%mm0, %%mm1 \n\t"
1584 
1585  "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
1586  "paddw %%mm0, %%mm6 \n\t"
1587 
1588  "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
1589  "add $24, %%"REG_S" \n\t"
1590 
1591  "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
1592 
1593  "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
1594  "add $24, %%"REG_D" \n\t"
1595  "sub $2, %%"REG_c" \n\t"
1596  "jnz 1b \n\t"
1597  "5: \n\t"
1598 
1599  : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps)
1600  : "d"(thr_adr)
1601  : "%"REG_a
1602  );
1603 }
1604 
1605 #endif // HAVE_MMX
1606 
1607 #if !HAVE_MMX
1608 
1609 static void row_idct_c(int16_t* workspace,
1610  int16_t* output_adr, int output_stride, int cnt)
1611 {
1612  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1613  int_simd16_t tmp10, tmp11, tmp12, tmp13;
1614  int_simd16_t z5, z10, z11, z12, z13;
1615  int16_t* outptr;
1616  int16_t* wsptr;
1617 
1618  cnt*=4;
1619  wsptr = workspace;
1620  outptr = output_adr;
1621  for (; cnt > 0; cnt--) {
1622  // Even part
1623  //Simd version reads 4x4 block and transposes it
1624  tmp10 = ( wsptr[2] + wsptr[3]);
1625  tmp11 = ( wsptr[2] - wsptr[3]);
1626 
1627  tmp13 = ( wsptr[0] + wsptr[1]);
1628  tmp12 = (MULTIPLY16H( wsptr[0] - wsptr[1], FIX_1_414213562_A)<<2) - tmp13;//this shift order to avoid overflow
1629 
1630  tmp0 = tmp10 + tmp13; //->temps
1631  tmp3 = tmp10 - tmp13; //->temps
1632  tmp1 = tmp11 + tmp12;
1633  tmp2 = tmp11 - tmp12;
1634 
1635  // Odd part
1636  //Also transpose, with previous:
1637  // ---- ---- ||||
1638  // ---- ---- idct ||||
1639  // ---- ---- ---> ||||
1640  // ---- ---- ||||
1641  z13 = wsptr[4] + wsptr[5];
1642  z10 = wsptr[4] - wsptr[5];
1643  z11 = wsptr[6] + wsptr[7];
1644  z12 = wsptr[6] - wsptr[7];
1645 
1646  tmp7 = z11 + z13;
1647  tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
1648 
1649  z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
1650  tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
1651  tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
1652 
1653  tmp6 = (tmp12<<3) - tmp7;
1654  tmp5 = (tmp11<<3) - tmp6;
1655  tmp4 = (tmp10<<3) + tmp5;
1656 
1657  // Final output stage: descale and write column
1658  outptr[0*output_stride]+= DESCALE(tmp0 + tmp7, 3);
1659  outptr[1*output_stride]+= DESCALE(tmp1 + tmp6, 3);
1660  outptr[2*output_stride]+= DESCALE(tmp2 + tmp5, 3);
1661  outptr[3*output_stride]+= DESCALE(tmp3 - tmp4, 3);
1662  outptr[4*output_stride]+= DESCALE(tmp3 + tmp4, 3);
1663  outptr[5*output_stride]+= DESCALE(tmp2 - tmp5, 3);
1664  outptr[6*output_stride]+= DESCALE(tmp1 - tmp6, 3); //no += ?
1665  outptr[7*output_stride]+= DESCALE(tmp0 - tmp7, 3); //no += ?
1666  outptr++;
1667 
1668  wsptr += DCTSIZE; // advance pointer to next row
1669  }
1670 }
1671 
1672 #else /* HAVE_MMX */
1673 
/*
 * MMX row pass of the inverse DCT (built when HAVE_MMX).
 *
 * Register binding, as given by the asm constraints below:
 *   REG_S = workspace, REG_D = output_adr, REG_c = cnt,
 *   REG_a = output_stride * sizeof(short) (row stride in bytes),
 *   REG_d = scratch (clobbered), set to 3 * REG_a by the leading lea.
 * %3 is the local 'temps' array; only slots 0 (t0) and 1 (t3) are used.
 *
 * Each loop iteration loads two 4x4 groups of 16-bit words from the
 * workspace, transposes them, runs the even/odd butterflies, applies the
 * MM_DESCALE_RND rounding constant followed by an arithmetic shift right
 * by 3, and adds the results to eight output rows (paddw from memory, then
 * movq back). It then advances REG_S by DCTSIZE_S*2*4 bytes and REG_D by
 * 8 bytes.
 *
 * The loop is do-while shaped (dec / jnz at the bottom), so the body runs
 * once before cnt is tested.
 *
 * NOTE(review): the asm stores through REG_D but the clobber list contains
 * only REG_d (no "memory" clobber, no MMX registers) -- confirm that this
 * is sufficient for the compilers in use.
 */
1674  static void row_idct_mmx (int16_t* workspace,
1675  int16_t* output_adr, int output_stride, int cnt)
1676 {
1677  uint64_t __attribute__((aligned(8))) temps[4];
1678  __asm__ volatile(
 // REG_d = REG_a + 2*REG_a, i.e. three row strides in bytes
1679  "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
1680  "1: \n\t"
 // load the first 4x4 group of words (first half of four workspace rows)
1681  "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t"
1682  //
1683 
1684  "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t"
1685  "movq %%mm0, %%mm4 \n\t"
1686 
1687  "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
1688  "punpcklwd %%mm1, %%mm0 \n\t"
1689 
1690  "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t"
1691  "punpckhwd %%mm1, %%mm4 \n\t"
1692 
1693  //transpose 4x4
1694  "movq %%mm2, %%mm7 \n\t"
1695  "punpcklwd %%mm3, %%mm2 \n\t"
1696 
1697  "movq %%mm0, %%mm6 \n\t"
1698  "punpckldq %%mm2, %%mm0 \n\t" //0
1699 
1700  "punpckhdq %%mm2, %%mm6 \n\t" //1
1701  "movq %%mm0, %%mm5 \n\t"
1702 
1703  "punpckhwd %%mm3, %%mm7 \n\t"
1704  "psubw %%mm6, %%mm0 \n\t"
1705 
1706  "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
1707  "movq %%mm4, %%mm2 \n\t"
1708 
1709  "punpckldq %%mm7, %%mm4 \n\t" //2
1710  "paddw %%mm6, %%mm5 \n\t"
1711 
1712  "punpckhdq %%mm7, %%mm2 \n\t" //3
1713  "movq %%mm4, %%mm1 \n\t"
1714 
1715  "psllw $2, %%mm0 \n\t"
1716  "paddw %%mm2, %%mm4 \n\t" //t10
1717 
 // start loading the second 4x4 group (same rows, offset by DCTSIZE_S bytes)
1718  "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
1719  "psubw %%mm2, %%mm1 \n\t" //t11
1720 
1721  "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
1722  "psubw %%mm5, %%mm0 \n\t"
1723 
1724  "movq %%mm4, %%mm6 \n\t"
1725  "paddw %%mm5, %%mm4 \n\t" //t0
1726 
1727  "psubw %%mm5, %%mm6 \n\t" //t3
1728  "movq %%mm1, %%mm7 \n\t"
1729 
1730  "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
1731  "paddw %%mm0, %%mm1 \n\t" //t1
1732 
 // spill t0 and t3 to temps[0] / temps[1]; they are reloaded for the output stage
1733  "movq %%mm4, 0*8+%3 \n\t" //t0
1734  "movq %%mm3, %%mm4 \n\t"
1735 
1736  "movq %%mm6, 1*8+%3 \n\t" //t3
1737  "punpcklwd %%mm2, %%mm3 \n\t"
1738 
1739  //transpose 4x4
1740  "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
1741  "punpckhwd %%mm2, %%mm4 \n\t"
1742 
1743  "movq %%mm5, %%mm2 \n\t"
1744  "punpcklwd %%mm6, %%mm5 \n\t"
1745 
1746  "psubw %%mm0, %%mm7 \n\t" //t2
1747  "punpckhwd %%mm6, %%mm2 \n\t"
1748 
1749  "movq %%mm3, %%mm0 \n\t"
1750  "punpckldq %%mm5, %%mm3 \n\t" //4
1751 
1752  "punpckhdq %%mm5, %%mm0 \n\t" //5
1753  "movq %%mm4, %%mm5 \n\t"
1754 
1755  //
1756  "movq %%mm3, %%mm6 \n\t"
1757  "punpckldq %%mm2, %%mm4 \n\t" //6
1758 
1759  "psubw %%mm0, %%mm3 \n\t" //z10
1760  "punpckhdq %%mm2, %%mm5 \n\t" //7
1761 
1762  "paddw %%mm0, %%mm6 \n\t" //z13
1763  "movq %%mm4, %%mm2 \n\t"
1764 
1765  "movq %%mm3, %%mm0 \n\t"
1766  "psubw %%mm5, %%mm4 \n\t" //z12
1767 
1768  "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0 \n\t" //-
1769  "paddw %%mm4, %%mm3 \n\t"
1770 
1771  "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3 \n\t" //z5
1772  "paddw %%mm5, %%mm2 \n\t" //z11 >
1773 
1774  "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4 \n\t"
1775  "movq %%mm2, %%mm5 \n\t"
1776 
1777  "psubw %%mm6, %%mm2 \n\t"
1778  "paddw %%mm6, %%mm5 \n\t" //t7
1779 
1780  "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //t11
1781  "paddw %%mm3, %%mm0 \n\t" //t12
1782 
1783  "psllw $3, %%mm0 \n\t"
1784  "psubw %%mm3, %%mm4 \n\t" //t10
1785 
1786  "movq 0*8+%3, %%mm6 \n\t"
1787  "movq %%mm1, %%mm3 \n\t"
1788 
1789  "psllw $3, %%mm4 \n\t"
1790  "psubw %%mm5, %%mm0 \n\t" //t6
1791 
1792  "psllw $3, %%mm2 \n\t"
1793  "paddw %%mm0, %%mm1 \n\t" //d1
1794 
1795  "psubw %%mm0, %%mm2 \n\t" //t5
1796  "psubw %%mm0, %%mm3 \n\t" //d6
1797 
1798  "paddw %%mm2, %%mm4 \n\t" //t4
1799  "movq %%mm7, %%mm0 \n\t"
1800 
1801  "paddw %%mm2, %%mm7 \n\t" //d2
1802  "psubw %%mm2, %%mm0 \n\t" //d5
1803 
 // output stage: add the rounding constant, shift right by 3, accumulate into the rows
1804  "movq "MANGLE(MM_DESCALE_RND)", %%mm2 \n\t" //4
1805  "psubw %%mm5, %%mm6 \n\t" //d7
1806 
1807  "paddw 0*8+%3, %%mm5 \n\t" //d0
1808  "paddw %%mm2, %%mm1 \n\t"
1809 
1810  "paddw %%mm2, %%mm5 \n\t"
1811  "psraw $3, %%mm1 \n\t"
1812 
1813  "paddw %%mm2, %%mm7 \n\t"
1814  "psraw $3, %%mm5 \n\t"
1815 
1816  "paddw (%%"REG_D"), %%mm5 \n\t"
1817  "psraw $3, %%mm7 \n\t"
1818 
1819  "paddw (%%"REG_D",%%"REG_a",), %%mm1 \n\t"
1820  "paddw %%mm2, %%mm0 \n\t"
1821 
1822  "paddw (%%"REG_D",%%"REG_a",2), %%mm7 \n\t"
1823  "paddw %%mm2, %%mm3 \n\t"
1824 
1825  "movq %%mm5, (%%"REG_D") \n\t"
1826  "paddw %%mm2, %%mm6 \n\t"
1827 
1828  "movq %%mm1, (%%"REG_D",%%"REG_a",) \n\t"
1829  "psraw $3, %%mm0 \n\t"
1830 
1831  "movq %%mm7, (%%"REG_D",%%"REG_a",2) \n\t"
 // REG_D now points three rows down; the remaining rows are addressed from there
1832  "add %%"REG_d", %%"REG_D" \n\t" //3*ls
1833 
1834  "movq 1*8+%3, %%mm5 \n\t" //t3
1835  "psraw $3, %%mm3 \n\t"
1836 
1837  "paddw (%%"REG_D",%%"REG_a",2), %%mm0 \n\t"
1838  "psubw %%mm4, %%mm5 \n\t" //d3
1839 
1840  "paddw (%%"REG_D",%%"REG_d",), %%mm3 \n\t"
1841  "psraw $3, %%mm6 \n\t"
1842 
1843  "paddw 1*8+%3, %%mm4 \n\t" //d4
1844  "paddw %%mm2, %%mm5 \n\t"
1845 
1846  "paddw (%%"REG_D",%%"REG_a",4), %%mm6 \n\t"
1847  "paddw %%mm2, %%mm4 \n\t"
1848 
1849  "movq %%mm0, (%%"REG_D",%%"REG_a",2) \n\t"
1850  "psraw $3, %%mm5 \n\t"
1851 
1852  "paddw (%%"REG_D"), %%mm5 \n\t"
1853  "psraw $3, %%mm4 \n\t"
1854 
1855  "paddw (%%"REG_D",%%"REG_a",), %%mm4 \n\t"
1856  "add $"DCTSIZE_S"*2*4, %%"REG_S" \n\t" //4 rows
1857 
1858  "movq %%mm3, (%%"REG_D",%%"REG_d",) \n\t"
1859  "movq %%mm6, (%%"REG_D",%%"REG_a",4) \n\t"
1860  "movq %%mm5, (%%"REG_D") \n\t"
1861  "movq %%mm4, (%%"REG_D",%%"REG_a",) \n\t"
1862 
 // undo the three-row offset, then step 8 bytes right in the output
1863  "sub %%"REG_d", %%"REG_D" \n\t"
1864  "add $8, %%"REG_D" \n\t"
1865  "dec %%"REG_c" \n\t"
1866  "jnz 1b \n\t"
1867 
1868  : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
1869  : "a"(output_stride*sizeof(short))
1870  : "%"REG_d
1871  );
1872 }
1873 
1874 #endif // HAVE_MMX
1875 
1876 #if !HAVE_MMX
1877 
1878 static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
1879 {
1880  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1881  int_simd16_t tmp10, tmp11, tmp12, tmp13;
1882  int_simd16_t z1, z2, z3, z4, z5, z11, z13;
1883  int16_t *dataptr;
1884 
1885  cnt*=4;
1886  // Pass 1: process rows.
1887 
1888  dataptr = data;
1889  for (; cnt > 0; cnt--) {
1890  tmp0 = pixels[line_size*0] + pixels[line_size*7];
1891  tmp7 = pixels[line_size*0] - pixels[line_size*7];
1892  tmp1 = pixels[line_size*1] + pixels[line_size*6];
1893  tmp6 = pixels[line_size*1] - pixels[line_size*6];
1894  tmp2 = pixels[line_size*2] + pixels[line_size*5];
1895  tmp5 = pixels[line_size*2] - pixels[line_size*5];
1896  tmp3 = pixels[line_size*3] + pixels[line_size*4];
1897  tmp4 = pixels[line_size*3] - pixels[line_size*4];
1898 
1899  // Even part
1900 
1901  tmp10 = tmp0 + tmp3;
1902  tmp13 = tmp0 - tmp3;
1903  tmp11 = tmp1 + tmp2;
1904  tmp12 = tmp1 - tmp2;
1905  //Even columns are written first, this leads to different order of columns
1906  //in column_fidct(), but they are processed independently, so all ok.
1907  //Later in the row_idct() columns readed at the same order.
1908  dataptr[2] = tmp10 + tmp11;
1909  dataptr[3] = tmp10 - tmp11;
1910 
1911  z1 = MULTIPLY16H((tmp12 + tmp13)<<2, FIX_0_707106781);
1912  dataptr[0] = tmp13 + z1;
1913  dataptr[1] = tmp13 - z1;
1914 
1915  // Odd part
1916 
1917  tmp10 = (tmp4 + tmp5) <<2;
1918  tmp11 = (tmp5 + tmp6) <<2;
1919  tmp12 = (tmp6 + tmp7) <<2;
1920 
1921  z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
1922  z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
1923  z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
1924  z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
1925 
1926  z11 = tmp7 + z3;
1927  z13 = tmp7 - z3;
1928 
1929  dataptr[4] = z13 + z2;
1930  dataptr[5] = z13 - z2;
1931  dataptr[6] = z11 + z4;
1932  dataptr[7] = z11 - z4;
1933 
1934  pixels++; // advance pointer to next column
1935  dataptr += DCTSIZE;
1936  }
1937 }
1938 
1939 #else /* HAVE_MMX */
1940 
/*
 * MMX row pass of the forward DCT (built when HAVE_MMX).
 *
 * Register binding, as given by the asm constraints below:
 *   REG_S = pixels, REG_D = data, REG_c = cnt, REG_a = line_size,
 *   REG_d = scratch (clobbered), set to 3 * REG_a by the leading lea.
 * %3 is the local 'temps' array; only slots 0 (t7) and 1 (t6) are used.
 *
 * Each loop iteration loads 4 bytes from each of eight pixel rows (movd),
 * widens them to 16-bit words against a zeroed mm7 (punpcklbw), runs the
 * even/odd butterflies, transposes the results as two 4x4 groups and
 * stores them to data. It then advances REG_S by 4 bytes and REG_D by
 * DCTSIZE_S*2*4 bytes.
 *
 * The loop is do-while shaped (dec / jnz at the bottom), so the body runs
 * once before cnt is tested.
 *
 * NOTE(review): line_size is an int bound to the "a" constraint while the
 * asm addresses memory through REG_a -- confirm the upper register bits are
 * well defined on 64-bit builds. The clobber list also contains only REG_d
 * (no "memory" clobber, no MMX registers) although the asm stores through
 * REG_D -- confirm that this is sufficient.
 */
1941  static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
1942 {
1943  uint64_t __attribute__((aligned(8))) temps[4];
1944  __asm__ volatile(
 // REG_d = REG_a + 2*REG_a, i.e. three line strides
1945  "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
1946  "6: \n\t"
 // rows 0..2: load 4 pixels each; mm7 is zeroed and used to widen bytes to words
1947  "movd (%%"REG_S"), %%mm0 \n\t"
1948  "pxor %%mm7, %%mm7 \n\t"
1949 
1950  "movd (%%"REG_S",%%"REG_a",), %%mm1 \n\t"
1951  "punpcklbw %%mm7, %%mm0 \n\t"
1952 
1953  "movd (%%"REG_S",%%"REG_a",2), %%mm2 \n\t"
1954  "punpcklbw %%mm7, %%mm1 \n\t"
1955 
1956  "punpcklbw %%mm7, %%mm2 \n\t"
 // REG_S now points at row 3; rows 3..7 are addressed relative to it
1957  "add %%"REG_d", %%"REG_S" \n\t"
1958 
1959  "movq %%mm0, %%mm5 \n\t"
1960  //
1961 
1962  "movd (%%"REG_S",%%"REG_a",4), %%mm3 \n\t" //7 ;prefetch!
1963  "movq %%mm1, %%mm6 \n\t"
1964 
1965  "movd (%%"REG_S",%%"REG_d",), %%mm4 \n\t" //6
1966  "punpcklbw %%mm7, %%mm3 \n\t"
1967 
1968  "psubw %%mm3, %%mm5 \n\t"
1969  "punpcklbw %%mm7, %%mm4 \n\t"
1970 
1971  "paddw %%mm3, %%mm0 \n\t"
1972  "psubw %%mm4, %%mm6 \n\t"
1973 
1974  "movd (%%"REG_S",%%"REG_a",2), %%mm3 \n\t" //5
1975  "paddw %%mm4, %%mm1 \n\t"
1976 
 // spill t7 and t6 to temps[0] / temps[1]; they are reloaded for the odd part
1977  "movq %%mm5, 0*8+%3 \n\t" //t7
1978  "punpcklbw %%mm7, %%mm3 \n\t"
1979 
1980  "movq %%mm6, 1*8+%3 \n\t" //t6
1981  "movq %%mm2, %%mm4 \n\t"
1982 
1983  "movd (%%"REG_S"), %%mm5 \n\t" //3
1984  "paddw %%mm3, %%mm2 \n\t"
1985 
1986  "movd (%%"REG_S",%%"REG_a",), %%mm6 \n\t" //4
1987  "punpcklbw %%mm7, %%mm5 \n\t"
1988 
1989  "psubw %%mm3, %%mm4 \n\t"
1990  "punpcklbw %%mm7, %%mm6 \n\t"
1991 
1992  "movq %%mm5, %%mm3 \n\t"
1993  "paddw %%mm6, %%mm5 \n\t" //t3
1994 
1995  "psubw %%mm6, %%mm3 \n\t" //t4 ; t0 t1 t2 t4 t5 t3 - -
1996  "movq %%mm0, %%mm6 \n\t"
1997 
1998  "movq %%mm1, %%mm7 \n\t"
1999  "psubw %%mm5, %%mm0 \n\t" //t13
2000 
2001  "psubw %%mm2, %%mm1 \n\t"
2002  "paddw %%mm2, %%mm7 \n\t" //t11
2003 
2004  "paddw %%mm0, %%mm1 \n\t"
2005  "movq %%mm7, %%mm2 \n\t"
2006 
2007  "psllw $2, %%mm1 \n\t"
2008  "paddw %%mm5, %%mm6 \n\t" //t10
2009 
2010  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm1 \n\t"
2011  "paddw %%mm6, %%mm7 \n\t" //d2
2012 
2013  "psubw %%mm2, %%mm6 \n\t" //d3
2014  "movq %%mm0, %%mm5 \n\t"
2015 
2016  //transpose 4x4
2017  "movq %%mm7, %%mm2 \n\t"
2018  "punpcklwd %%mm6, %%mm7 \n\t"
2019 
2020  "paddw %%mm1, %%mm0 \n\t" //d0
2021  "punpckhwd %%mm6, %%mm2 \n\t"
2022 
2023  "psubw %%mm1, %%mm5 \n\t" //d1
2024  "movq %%mm0, %%mm6 \n\t"
2025 
2026  "movq 1*8+%3, %%mm1 \n\t"
2027  "punpcklwd %%mm5, %%mm0 \n\t"
2028 
2029  "punpckhwd %%mm5, %%mm6 \n\t"
2030  "movq %%mm0, %%mm5 \n\t"
2031 
2032  "punpckldq %%mm7, %%mm0 \n\t" //0
2033  "paddw %%mm4, %%mm3 \n\t"
2034 
2035  "punpckhdq %%mm7, %%mm5 \n\t" //1
2036  "movq %%mm6, %%mm7 \n\t"
2037 
 // store the transposed even-part group (first half of four data rows)
2038  "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
2039  "punpckldq %%mm2, %%mm6 \n\t" //2
2040 
2041  "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
2042  "punpckhdq %%mm2, %%mm7 \n\t" //3
2043 
2044  "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
2045  "paddw %%mm1, %%mm4 \n\t"
2046 
2047  "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
2048  "psllw $2, %%mm3 \n\t" //t10
2049 
2050  "movq 0*8+%3, %%mm2 \n\t"
2051  "psllw $2, %%mm4 \n\t" //t11
2052 
2053  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm4 \n\t" //z3
2054  "paddw %%mm2, %%mm1 \n\t"
2055 
2056  "psllw $2, %%mm1 \n\t" //t12
2057  "movq %%mm3, %%mm0 \n\t"
2058 
2059  "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm0 \n\t"
2060  "psubw %%mm1, %%mm3 \n\t"
2061 
2062  "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5
2063  "movq %%mm2, %%mm5 \n\t"
2064 
2065  "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
2066  "psubw %%mm4, %%mm2 \n\t" //z13
2067 
2068  "paddw %%mm4, %%mm5 \n\t" //z11
2069  "movq %%mm2, %%mm6 \n\t"
2070 
2071  "paddw %%mm3, %%mm0 \n\t" //z2
2072  "movq %%mm5, %%mm7 \n\t"
2073 
2074  "paddw %%mm0, %%mm2 \n\t" //d4
2075  "psubw %%mm0, %%mm6 \n\t" //d5
2076 
2077  "movq %%mm2, %%mm4 \n\t"
2078  "paddw %%mm3, %%mm1 \n\t" //z4
2079 
2080  //transpose 4x4
2081  "punpcklwd %%mm6, %%mm2 \n\t"
2082  "paddw %%mm1, %%mm5 \n\t" //d6
2083 
2084  "punpckhwd %%mm6, %%mm4 \n\t"
2085  "psubw %%mm1, %%mm7 \n\t" //d7
2086 
2087  "movq %%mm5, %%mm6 \n\t"
2088  "punpcklwd %%mm7, %%mm5 \n\t"
2089 
2090  "punpckhwd %%mm7, %%mm6 \n\t"
2091  "movq %%mm2, %%mm7 \n\t"
2092 
2093  "punpckldq %%mm5, %%mm2 \n\t" //4
 // undo the three-line offset applied to REG_S earlier in the iteration
2094  "sub %%"REG_d", %%"REG_S" \n\t"
2095 
2096  "punpckhdq %%mm5, %%mm7 \n\t" //5
2097  "movq %%mm4, %%mm5 \n\t"
2098 
 // store the transposed odd-part group (same rows, offset by DCTSIZE_S bytes)
2099  "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
2100  "punpckldq %%mm6, %%mm4 \n\t" //6
2101 
2102  "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
2103  "punpckhdq %%mm6, %%mm5 \n\t" //7
2104 
2105  "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
 // step 4 pixels to the right in the source
2106  "add $4, %%"REG_S" \n\t"
2107 
2108  "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
2109  "add $"DCTSIZE_S"*2*4, %%"REG_D" \n\t" //4 rows
2110  "dec %%"REG_c" \n\t"
2111  "jnz 6b \n\t"
2112 
2113  : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps)
2114  : "a"(line_size)
2115  : "%"REG_d);
2116 }
2117 
2118 #endif // HAVE_MMX