FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
dsputil.c
Go to the documentation of this file.
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
25 /**
26  * @file
27  * DSP utils
28  */
29 
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
33 #include "avcodec.h"
34 #include "copy_block.h"
35 #include "dct.h"
36 #include "dsputil.h"
37 #include "simple_idct.h"
38 #include "faandct.h"
39 #include "faanidct.h"
40 #include "imgconvert.h"
41 #include "mathops.h"
42 #include "mpegvideo.h"
43 #include "config.h"
44 #include "diracdsp.h"
45 
/* 512-entry squared-value LUT, indexed as ff_squareTbl[256 + x] for
 * x in [-255, 255] (see the sse*_c functions below).
 * NOTE(review): declared zero here and presumably filled during DSP init;
 * the initializer is not visible in this chunk — confirm. */
uint32_t ff_squareTbl[512] = {0, };
47 
/* Instantiate the bit-depth-templated pixel functions twice:
 * once for 16-bit samples and once for the common 8-bit case
 * (dsputil_template.c keys off BIT_DEPTH). */
#define BIT_DEPTH 16
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 8
#include "dsputil_template.c"
54 
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* Per-byte constants for word-at-a-time (SWAR) byte arithmetic:
 * pb_7f = 0x7f replicated into every byte of an unsigned long,
 * pb_80 = 0x80 replicated likewise (sign/carry bit of each byte).
 * NOTE(review): presumably used by byte-parallel code later in this file —
 * the users are not visible in this chunk. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
58 
59 /* Specific zigzag scan for 248 idct. NOTE that unlike the
60  specification, we interleave the fields */
62  0, 8, 1, 9, 16, 24, 2, 10,
63  17, 25, 32, 40, 48, 56, 33, 41,
64  18, 26, 3, 11, 4, 12, 19, 27,
65  34, 42, 49, 57, 50, 58, 35, 43,
66  20, 28, 5, 13, 6, 14, 21, 29,
67  36, 44, 51, 59, 52, 60, 37, 45,
68  22, 30, 7, 15, 23, 31, 38, 46,
69  53, 61, 54, 62, 39, 47, 55, 63,
70 };
71 
73  0, 1, 2, 3, 8, 9, 16, 17,
74  10, 11, 4, 5, 6, 7, 15, 14,
75  13, 12, 19, 18, 24, 25, 32, 33,
76  26, 27, 20, 21, 22, 23, 28, 29,
77  30, 31, 34, 35, 40, 41, 48, 49,
78  42, 43, 36, 37, 38, 39, 44, 45,
79  46, 47, 50, 51, 56, 57, 58, 59,
80  52, 53, 54, 55, 60, 61, 62, 63,
81 };
82 
84  0, 8, 16, 24, 1, 9, 2, 10,
85  17, 25, 32, 40, 48, 56, 57, 49,
86  41, 33, 26, 18, 3, 11, 4, 12,
87  19, 27, 34, 42, 50, 58, 35, 43,
88  51, 59, 20, 28, 5, 13, 6, 14,
89  21, 29, 36, 44, 52, 60, 37, 45,
90  53, 61, 22, 30, 7, 15, 23, 31,
91  38, 46, 54, 62, 39, 47, 55, 63,
92 };
93 
/* Input permutation for the simple_idct_mmx */
/* 64-entry coefficient permutation (values are plain indices 0..63,
 * written in hex). Selected via FF_SIMPLE_IDCT_PERM below. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
105 
/* Within-row coefficient interleave used by the SSE2 IDCT permutation
 * (combined with the row bits under FF_SSE2_IDCT_PERM below). */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
107 
109  const uint8_t *src_scantable)
110 {
111  int i;
112  int end;
113 
114  st->scantable= src_scantable;
115 
116  for(i=0; i<64; i++){
117  int j;
118  j = src_scantable[i];
119  st->permutated[i] = permutation[j];
120  }
121 
122  end=-1;
123  for(i=0; i<64; i++){
124  int j;
125  j = st->permutated[i];
126  if(j>end) end=j;
127  st->raster_end[i]= end;
128  }
129 }
130 
132  int idct_permutation_type)
133 {
134  int i;
135 
136  switch(idct_permutation_type){
137  case FF_NO_IDCT_PERM:
138  for(i=0; i<64; i++)
139  idct_permutation[i]= i;
140  break;
142  for(i=0; i<64; i++)
143  idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
144  break;
145  case FF_SIMPLE_IDCT_PERM:
146  for(i=0; i<64; i++)
147  idct_permutation[i]= simple_mmx_permutation[i];
148  break;
150  for(i=0; i<64; i++)
151  idct_permutation[i]= ((i&7)<<3) | (i>>3);
152  break;
154  for(i=0; i<64; i++)
155  idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
156  break;
157  case FF_SSE2_IDCT_PERM:
158  for(i=0; i<64; i++)
159  idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
160  break;
161  default:
162  av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
163  }
164 }
165 
166 static int pix_sum_c(uint8_t * pix, int line_size)
167 {
168  int s, i, j;
169 
170  s = 0;
171  for (i = 0; i < 16; i++) {
172  for (j = 0; j < 16; j += 8) {
173  s += pix[0];
174  s += pix[1];
175  s += pix[2];
176  s += pix[3];
177  s += pix[4];
178  s += pix[5];
179  s += pix[6];
180  s += pix[7];
181  pix += 8;
182  }
183  pix += line_size - 16;
184  }
185  return s;
186 }
187 
/* Sum of squares of all 256 pixels of a 16x16 block, using the shared
 * ff_squareTbl LUT (biased by 256; only non-negative indices occur here). */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* Reference byte-at-a-time version, kept disabled for clarity. */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if HAVE_FAST_64BIT
            /* Load 8 pixels in one 64-bit word and square each byte.
             * NOTE(review): the pointer cast violates strict aliasing and
             * assumes the platform tolerates this access — longstanding
             * FFmpeg idiom, left unchanged. */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit fallback: two 4-byte loads per 8 pixels (same aliasing
             * caveat as above). */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
235 
236 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
237  int i;
238 
239  for(i=0; i+8<=w; i+=8){
240  dst[i+0]= av_bswap32(src[i+0]);
241  dst[i+1]= av_bswap32(src[i+1]);
242  dst[i+2]= av_bswap32(src[i+2]);
243  dst[i+3]= av_bswap32(src[i+3]);
244  dst[i+4]= av_bswap32(src[i+4]);
245  dst[i+5]= av_bswap32(src[i+5]);
246  dst[i+6]= av_bswap32(src[i+6]);
247  dst[i+7]= av_bswap32(src[i+7]);
248  }
249  for(;i<w; i++){
250  dst[i+0]= av_bswap32(src[i+0]);
251  }
252 }
253 
254 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
255 {
256  while (len--)
257  *dst++ = av_bswap16(*src++);
258 }
259 
260 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
261 {
262  int s, i;
263  uint32_t *sq = ff_squareTbl + 256;
264 
265  s = 0;
266  for (i = 0; i < h; i++) {
267  s += sq[pix1[0] - pix2[0]];
268  s += sq[pix1[1] - pix2[1]];
269  s += sq[pix1[2] - pix2[2]];
270  s += sq[pix1[3] - pix2[3]];
271  pix1 += line_size;
272  pix2 += line_size;
273  }
274  return s;
275 }
276 
277 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
278 {
279  int s, i;
280  uint32_t *sq = ff_squareTbl + 256;
281 
282  s = 0;
283  for (i = 0; i < h; i++) {
284  s += sq[pix1[0] - pix2[0]];
285  s += sq[pix1[1] - pix2[1]];
286  s += sq[pix1[2] - pix2[2]];
287  s += sq[pix1[3] - pix2[3]];
288  s += sq[pix1[4] - pix2[4]];
289  s += sq[pix1[5] - pix2[5]];
290  s += sq[pix1[6] - pix2[6]];
291  s += sq[pix1[7] - pix2[7]];
292  pix1 += line_size;
293  pix2 += line_size;
294  }
295  return s;
296 }
297 
298 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
299 {
300  int s, i;
301  uint32_t *sq = ff_squareTbl + 256;
302 
303  s = 0;
304  for (i = 0; i < h; i++) {
305  s += sq[pix1[ 0] - pix2[ 0]];
306  s += sq[pix1[ 1] - pix2[ 1]];
307  s += sq[pix1[ 2] - pix2[ 2]];
308  s += sq[pix1[ 3] - pix2[ 3]];
309  s += sq[pix1[ 4] - pix2[ 4]];
310  s += sq[pix1[ 5] - pix2[ 5]];
311  s += sq[pix1[ 6] - pix2[ 6]];
312  s += sq[pix1[ 7] - pix2[ 7]];
313  s += sq[pix1[ 8] - pix2[ 8]];
314  s += sq[pix1[ 9] - pix2[ 9]];
315  s += sq[pix1[10] - pix2[10]];
316  s += sq[pix1[11] - pix2[11]];
317  s += sq[pix1[12] - pix2[12]];
318  s += sq[pix1[13] - pix2[13]];
319  s += sq[pix1[14] - pix2[14]];
320  s += sq[pix1[15] - pix2[15]];
321 
322  pix1 += line_size;
323  pix2 += line_size;
324  }
325  return s;
326 }
327 
328 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
329  const uint8_t *s2, int stride){
330  int i;
331 
332  /* read the pixels */
333  for(i=0;i<8;i++) {
334  block[0] = s1[0] - s2[0];
335  block[1] = s1[1] - s2[1];
336  block[2] = s1[2] - s2[2];
337  block[3] = s1[3] - s2[3];
338  block[4] = s1[4] - s2[4];
339  block[5] = s1[5] - s2[5];
340  block[6] = s1[6] - s2[6];
341  block[7] = s1[7] - s2[7];
342  s1 += stride;
343  s2 += stride;
344  block += 8;
345  }
346 }
347 
348 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
349  int line_size)
350 {
351  int i;
352 
353  /* read the pixels */
354  for(i=0;i<8;i++) {
355  pixels[0] = av_clip_uint8(block[0]);
356  pixels[1] = av_clip_uint8(block[1]);
357  pixels[2] = av_clip_uint8(block[2]);
358  pixels[3] = av_clip_uint8(block[3]);
359  pixels[4] = av_clip_uint8(block[4]);
360  pixels[5] = av_clip_uint8(block[5]);
361  pixels[6] = av_clip_uint8(block[6]);
362  pixels[7] = av_clip_uint8(block[7]);
363 
364  pixels += line_size;
365  block += 8;
366  }
367 }
368 
369 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
370  int line_size)
371 {
372  int i;
373 
374  /* read the pixels */
375  for(i=0;i<4;i++) {
376  pixels[0] = av_clip_uint8(block[0]);
377  pixels[1] = av_clip_uint8(block[1]);
378  pixels[2] = av_clip_uint8(block[2]);
379  pixels[3] = av_clip_uint8(block[3]);
380 
381  pixels += line_size;
382  block += 8;
383  }
384 }
385 
386 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
387  int line_size)
388 {
389  int i;
390 
391  /* read the pixels */
392  for(i=0;i<2;i++) {
393  pixels[0] = av_clip_uint8(block[0]);
394  pixels[1] = av_clip_uint8(block[1]);
395 
396  pixels += line_size;
397  block += 8;
398  }
399 }
400 
401 static void put_signed_pixels_clamped_c(const int16_t *block,
402  uint8_t *av_restrict pixels,
403  int line_size)
404 {
405  int i, j;
406 
407  for (i = 0; i < 8; i++) {
408  for (j = 0; j < 8; j++) {
409  if (*block < -128)
410  *pixels = 0;
411  else if (*block > 127)
412  *pixels = 255;
413  else
414  *pixels = (uint8_t)(*block + 128);
415  block++;
416  pixels++;
417  }
418  pixels += (line_size - 8);
419  }
420 }
421 
422 static void add_pixels8_c(uint8_t *av_restrict pixels,
423  int16_t *block,
424  int line_size)
425 {
426  int i;
427 
428  for(i=0;i<8;i++) {
429  pixels[0] += block[0];
430  pixels[1] += block[1];
431  pixels[2] += block[2];
432  pixels[3] += block[3];
433  pixels[4] += block[4];
434  pixels[5] += block[5];
435  pixels[6] += block[6];
436  pixels[7] += block[7];
437  pixels += line_size;
438  block += 8;
439  }
440 }
441 
442 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
443  int line_size)
444 {
445  int i;
446 
447  /* read the pixels */
448  for(i=0;i<8;i++) {
449  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
450  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
451  pixels[2] = av_clip_uint8(pixels[2] + block[2]);
452  pixels[3] = av_clip_uint8(pixels[3] + block[3]);
453  pixels[4] = av_clip_uint8(pixels[4] + block[4]);
454  pixels[5] = av_clip_uint8(pixels[5] + block[5]);
455  pixels[6] = av_clip_uint8(pixels[6] + block[6]);
456  pixels[7] = av_clip_uint8(pixels[7] + block[7]);
457  pixels += line_size;
458  block += 8;
459  }
460 }
461 
462 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
463  int line_size)
464 {
465  int i;
466 
467  /* read the pixels */
468  for(i=0;i<4;i++) {
469  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
470  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
471  pixels[2] = av_clip_uint8(pixels[2] + block[2]);
472  pixels[3] = av_clip_uint8(pixels[3] + block[3]);
473  pixels += line_size;
474  block += 8;
475  }
476 }
477 
478 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
479  int line_size)
480 {
481  int i;
482 
483  /* read the pixels */
484  for(i=0;i<2;i++) {
485  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
486  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
487  pixels += line_size;
488  block += 8;
489  }
490 }
491 
492 static int sum_abs_dctelem_c(int16_t *block)
493 {
494  int sum=0, i;
495  for(i=0; i<64; i++)
496  sum+= FFABS(block[i]);
497  return sum;
498 }
499 
500 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
501 {
502  int i;
503 
504  for (i = 0; i < h; i++) {
505  memset(block, value, 16);
506  block += line_size;
507  }
508 }
509 
510 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
511 {
512  int i;
513 
514  for (i = 0; i < h; i++) {
515  memset(block, value, 8);
516  block += line_size;
517  }
518 }
519 
/* Pixel averaging helpers, round-to-nearest (half rounds up).
 * Fix: arguments are now fully parenthesized so expansions like
 * avg2(a & b, c) group correctly (the old form computed a & (b+c+1)). */
#define avg2(a,b) (((a) + (b) + 1) >> 1)
#define avg4(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
522 
523 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
524 {
525  const int A=(16-x16)*(16-y16);
526  const int B=( x16)*(16-y16);
527  const int C=(16-x16)*( y16);
528  const int D=( x16)*( y16);
529  int i;
530 
531  for(i=0; i<h; i++)
532  {
533  dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
534  dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
535  dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
536  dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
537  dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
538  dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
539  dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
540  dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
541  dst+= stride;
542  src+= stride;
543  }
544 }
545 
/**
 * Global motion compensation for one 8xh block with an affine transform.
 * (ox,oy) is the 16.16 fixed-point source position of the first pixel;
 * (dxx,dyx) is added per column and (dxy,dyy) per row. shift selects the
 * sub-pel precision (s = 1<<shift steps per pixel), r is the bilinear
 * rounding constant, and width/height bound the valid source area
 * (out-of-range coordinates are clamped, replicating edge pixels).
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s = 1<<shift;

    /* Turn the dimensions into the last valid coordinate for clamping. */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x;

        vx = ox;
        vy = oy;
        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* Split the 16.16 position into an integer pixel coordinate and
             * an s-step sub-pel fraction. */
            src_x = vx>>16;
            src_y = vy>>16;
            frac_x = src_x&(s-1);
            frac_y = src_y&(s-1);
            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned)src_x < width) {
                if ((unsigned)src_y < height) {
                    /* Fully inside the picture: full bilinear interpolation. */
                    index = src_x + src_y*stride;
                    dst[y*stride + x] = ( (  src[index         ]*(s-frac_x)
                                           + src[index       +1]* frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]* frac_x )* frac_y
                                        + r)>>(shift*2);
                } else {
                    /* Vertically outside: clamp y, interpolate in x only. */
                    index = src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x] = ( (  src[index  ]*(s-frac_x)
                                           + src[index+1]* frac_x )*s
                                        + r)>>(shift*2);
                }
            } else {
                if ((unsigned)src_y < height) {
                    /* Horizontally outside: clamp x, interpolate in y only. */
                    index = av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x] = ( (  src[index       ]*(s-frac_y)
                                           + src[index+stride]* frac_y )*s
                                        + r)>>(shift*2);
                } else {
                    /* Outside in both directions: replicate the corner pixel. */
                    index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        /* Advance the row origin. */
        ox += dxy;
        oy += dyy;
    }
}
603 
604 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
605  switch(width){
606  case 2: put_pixels2_8_c (dst, src, stride, height); break;
607  case 4: put_pixels4_8_c (dst, src, stride, height); break;
608  case 8: put_pixels8_8_c (dst, src, stride, height); break;
609  case 16:put_pixels16_8_c(dst, src, stride, height); break;
610  }
611 }
612 
613 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
614  int i,j;
615  for (i=0; i < height; i++) {
616  for (j=0; j < width; j++) {
617  dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
618  }
619  src += stride;
620  dst += stride;
621  }
622 }
623 
624 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
625  int i,j;
626  for (i=0; i < height; i++) {
627  for (j=0; j < width; j++) {
628  dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
629  }
630  src += stride;
631  dst += stride;
632  }
633 }
634 
635 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
636  int i,j;
637  for (i=0; i < height; i++) {
638  for (j=0; j < width; j++) {
639  dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
640  }
641  src += stride;
642  dst += stride;
643  }
644 }
645 
646 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
647  int i,j;
648  for (i=0; i < height; i++) {
649  for (j=0; j < width; j++) {
650  dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
651  }
652  src += stride;
653  dst += stride;
654  }
655 }
656 
657 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
658  int i,j;
659  for (i=0; i < height; i++) {
660  for (j=0; j < width; j++) {
661  dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
662  }
663  src += stride;
664  dst += stride;
665  }
666 }
667 
668 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
669  int i,j;
670  for (i=0; i < height; i++) {
671  for (j=0; j < width; j++) {
672  dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
673  }
674  src += stride;
675  dst += stride;
676  }
677 }
678 
679 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
680  int i,j;
681  for (i=0; i < height; i++) {
682  for (j=0; j < width; j++) {
683  dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
684  }
685  src += stride;
686  dst += stride;
687  }
688 }
689 
690 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
691  int i,j;
692  for (i=0; i < height; i++) {
693  for (j=0; j < width; j++) {
694  dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
695  }
696  src += stride;
697  dst += stride;
698  }
699 }
700 
701 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
702  switch(width){
703  case 2: avg_pixels2_8_c (dst, src, stride, height); break;
704  case 4: avg_pixels4_8_c (dst, src, stride, height); break;
705  case 8: avg_pixels8_8_c (dst, src, stride, height); break;
706  case 16:avg_pixels16_8_c(dst, src, stride, height); break;
707  }
708 }
709 
710 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
711  int i,j;
712  for (i=0; i < height; i++) {
713  for (j=0; j < width; j++) {
714  dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
715  }
716  src += stride;
717  dst += stride;
718  }
719 }
720 
721 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
722  int i,j;
723  for (i=0; i < height; i++) {
724  for (j=0; j < width; j++) {
725  dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
726  }
727  src += stride;
728  dst += stride;
729  }
730 }
731 
732 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
733  int i,j;
734  for (i=0; i < height; i++) {
735  for (j=0; j < width; j++) {
736  dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
737  }
738  src += stride;
739  dst += stride;
740  }
741 }
742 
743 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
744  int i,j;
745  for (i=0; i < height; i++) {
746  for (j=0; j < width; j++) {
747  dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
748  }
749  src += stride;
750  dst += stride;
751  }
752 }
753 
754 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
755  int i,j;
756  for (i=0; i < height; i++) {
757  for (j=0; j < width; j++) {
758  dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
759  }
760  src += stride;
761  dst += stride;
762  }
763 }
764 
765 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
766  int i,j;
767  for (i=0; i < height; i++) {
768  for (j=0; j < width; j++) {
769  dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
770  }
771  src += stride;
772  dst += stride;
773  }
774 }
775 
776 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
777  int i,j;
778  for (i=0; i < height; i++) {
779  for (j=0; j < width; j++) {
780  dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
781  }
782  src += stride;
783  dst += stride;
784  }
785 }
786 
787 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
788  int i,j;
789  for (i=0; i < height; i++) {
790  for (j=0; j < width; j++) {
791  dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
792  }
793  src += stride;
794  dst += stride;
795  }
796 }
797 
798 #define QPEL_MC(r, OPNAME, RND, OP) \
799 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
800  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
801  int i;\
802  for(i=0; i<h; i++)\
803  {\
804  OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
805  OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
806  OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
807  OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
808  OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
809  OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
810  OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
811  OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
812  dst+=dstStride;\
813  src+=srcStride;\
814  }\
815 }\
816 \
817 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
818  const int w=8;\
819  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
820  int i;\
821  for(i=0; i<w; i++)\
822  {\
823  const int src0= src[0*srcStride];\
824  const int src1= src[1*srcStride];\
825  const int src2= src[2*srcStride];\
826  const int src3= src[3*srcStride];\
827  const int src4= src[4*srcStride];\
828  const int src5= src[5*srcStride];\
829  const int src6= src[6*srcStride];\
830  const int src7= src[7*srcStride];\
831  const int src8= src[8*srcStride];\
832  OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
833  OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
834  OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
835  OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
836  OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
837  OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
838  OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
839  OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
840  dst++;\
841  src++;\
842  }\
843 }\
844 \
845 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
846  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
847  int i;\
848  \
849  for(i=0; i<h; i++)\
850  {\
851  OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
852  OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
853  OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
854  OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
855  OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
856  OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
857  OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
858  OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
859  OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
860  OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
861  OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
862  OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
863  OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
864  OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
865  OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
866  OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
867  dst+=dstStride;\
868  src+=srcStride;\
869  }\
870 }\
871 \
872 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
873  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
874  int i;\
875  const int w=16;\
876  for(i=0; i<w; i++)\
877  {\
878  const int src0= src[0*srcStride];\
879  const int src1= src[1*srcStride];\
880  const int src2= src[2*srcStride];\
881  const int src3= src[3*srcStride];\
882  const int src4= src[4*srcStride];\
883  const int src5= src[5*srcStride];\
884  const int src6= src[6*srcStride];\
885  const int src7= src[7*srcStride];\
886  const int src8= src[8*srcStride];\
887  const int src9= src[9*srcStride];\
888  const int src10= src[10*srcStride];\
889  const int src11= src[11*srcStride];\
890  const int src12= src[12*srcStride];\
891  const int src13= src[13*srcStride];\
892  const int src14= src[14*srcStride];\
893  const int src15= src[15*srcStride];\
894  const int src16= src[16*srcStride];\
895  OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
896  OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
897  OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
898  OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
899  OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
900  OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
901  OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
902  OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
903  OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
904  OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
905  OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
906  OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
907  OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
908  OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
909  OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
910  OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
911  dst++;\
912  src++;\
913  }\
914 }\
915 \
916 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
917 {\
918  uint8_t half[64];\
919  put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
920  OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
921 }\
922 \
923 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
924 {\
925  OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
926 }\
927 \
928 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
929 {\
930  uint8_t half[64];\
931  put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
932  OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
933 }\
934 \
935 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
936 {\
937  uint8_t full[16*9];\
938  uint8_t half[64];\
939  copy_block9(full, src, 16, stride, 9);\
940  put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
941  OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
942 }\
943 \
944 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
945 {\
946  uint8_t full[16*9];\
947  copy_block9(full, src, 16, stride, 9);\
948  OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
949 }\
950 \
951 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
952 {\
953  uint8_t full[16*9];\
954  uint8_t half[64];\
955  copy_block9(full, src, 16, stride, 9);\
956  put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
957  OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
958 }\
959 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
960 {\
961  uint8_t full[16*9];\
962  uint8_t halfH[72];\
963  uint8_t halfV[64];\
964  uint8_t halfHV[64];\
965  copy_block9(full, src, 16, stride, 9);\
966  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
967  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
968  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
969  OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
970 }\
971 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
972 {\
973  uint8_t full[16*9];\
974  uint8_t halfH[72];\
975  uint8_t halfHV[64];\
976  copy_block9(full, src, 16, stride, 9);\
977  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
978  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
979  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
980  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
981 }\
982 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
983 {\
984  uint8_t full[16*9];\
985  uint8_t halfH[72];\
986  uint8_t halfV[64];\
987  uint8_t halfHV[64];\
988  copy_block9(full, src, 16, stride, 9);\
989  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
990  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
991  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
992  OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
993 }\
994 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
995 {\
996  uint8_t full[16*9];\
997  uint8_t halfH[72];\
998  uint8_t halfHV[64];\
999  copy_block9(full, src, 16, stride, 9);\
1000  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1001  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1002  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1003  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1004 }\
1005 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1006 {\
1007  uint8_t full[16*9];\
1008  uint8_t halfH[72];\
1009  uint8_t halfV[64];\
1010  uint8_t halfHV[64];\
1011  copy_block9(full, src, 16, stride, 9);\
1012  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1013  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1014  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1015  OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1016 }\
1017 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1018 {\
1019  uint8_t full[16*9];\
1020  uint8_t halfH[72];\
1021  uint8_t halfHV[64];\
1022  copy_block9(full, src, 16, stride, 9);\
1023  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1024  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1025  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1026  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1027 }\
1028 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1029 {\
1030  uint8_t full[16*9];\
1031  uint8_t halfH[72];\
1032  uint8_t halfV[64];\
1033  uint8_t halfHV[64];\
1034  copy_block9(full, src, 16, stride, 9);\
1035  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1036  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1037  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1038  OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1039 }\
1040 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1041 {\
1042  uint8_t full[16*9];\
1043  uint8_t halfH[72];\
1044  uint8_t halfHV[64];\
1045  copy_block9(full, src, 16, stride, 9);\
1046  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1047  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1048  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1049  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1050 }\
1051 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1052 {\
1053  uint8_t halfH[72];\
1054  uint8_t halfHV[64];\
1055  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1056  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1057  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1058 }\
1059 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1060 {\
1061  uint8_t halfH[72];\
1062  uint8_t halfHV[64];\
1063  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1064  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1065  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1066 }\
1067 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1068 {\
1069  uint8_t full[16*9];\
1070  uint8_t halfH[72];\
1071  uint8_t halfV[64];\
1072  uint8_t halfHV[64];\
1073  copy_block9(full, src, 16, stride, 9);\
1074  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1075  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1076  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1077  OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1078 }\
1079 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1080 {\
1081  uint8_t full[16*9];\
1082  uint8_t halfH[72];\
1083  copy_block9(full, src, 16, stride, 9);\
1084  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1085  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1086  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1087 }\
1088 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1089 {\
1090  uint8_t full[16*9];\
1091  uint8_t halfH[72];\
1092  uint8_t halfV[64];\
1093  uint8_t halfHV[64];\
1094  copy_block9(full, src, 16, stride, 9);\
1095  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1096  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1097  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1098  OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1099 }\
1100 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1101 {\
1102  uint8_t full[16*9];\
1103  uint8_t halfH[72];\
1104  copy_block9(full, src, 16, stride, 9);\
1105  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1106  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1107  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1108 }\
1109 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1110 {\
1111  uint8_t halfH[72];\
1112  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1113  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1114 }\
1115 \
1116 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1117 {\
1118  uint8_t half[256];\
1119  put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1120  OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1121 }\
1122 \
1123 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1124 {\
1125  OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1126 }\
1127 \
1128 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1129 {\
1130  uint8_t half[256];\
1131  put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1132  OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1133 }\
1134 \
1135 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1136 {\
1137  uint8_t full[24*17];\
1138  uint8_t half[256];\
1139  copy_block17(full, src, 24, stride, 17);\
1140  put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1141  OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1142 }\
1143 \
1144 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1145 {\
1146  uint8_t full[24*17];\
1147  copy_block17(full, src, 24, stride, 17);\
1148  OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1149 }\
1150 \
1151 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1152 {\
1153  uint8_t full[24*17];\
1154  uint8_t half[256];\
1155  copy_block17(full, src, 24, stride, 17);\
1156  put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1157  OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1158 }\
1159 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1160 {\
1161  uint8_t full[24*17];\
1162  uint8_t halfH[272];\
1163  uint8_t halfV[256];\
1164  uint8_t halfHV[256];\
1165  copy_block17(full, src, 24, stride, 17);\
1166  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1167  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1168  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1169  OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1170 }\
1171 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1172 {\
1173  uint8_t full[24*17];\
1174  uint8_t halfH[272];\
1175  uint8_t halfHV[256];\
1176  copy_block17(full, src, 24, stride, 17);\
1177  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1178  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1179  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1180  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1181 }\
1182 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1183 {\
1184  uint8_t full[24*17];\
1185  uint8_t halfH[272];\
1186  uint8_t halfV[256];\
1187  uint8_t halfHV[256];\
1188  copy_block17(full, src, 24, stride, 17);\
1189  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1190  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1191  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1192  OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1193 }\
1194 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1195 {\
1196  uint8_t full[24*17];\
1197  uint8_t halfH[272];\
1198  uint8_t halfHV[256];\
1199  copy_block17(full, src, 24, stride, 17);\
1200  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1201  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1202  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1203  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1204 }\
1205 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1206 {\
1207  uint8_t full[24*17];\
1208  uint8_t halfH[272];\
1209  uint8_t halfV[256];\
1210  uint8_t halfHV[256];\
1211  copy_block17(full, src, 24, stride, 17);\
1212  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1213  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1214  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1215  OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1216 }\
1217 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1218 {\
1219  uint8_t full[24*17];\
1220  uint8_t halfH[272];\
1221  uint8_t halfHV[256];\
1222  copy_block17(full, src, 24, stride, 17);\
1223  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1224  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1225  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1226  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1227 }\
1228 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1229 {\
1230  uint8_t full[24*17];\
1231  uint8_t halfH[272];\
1232  uint8_t halfV[256];\
1233  uint8_t halfHV[256];\
1234  copy_block17(full, src, 24, stride, 17);\
1235  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1236  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1237  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1238  OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1239 }\
1240 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1241 {\
1242  uint8_t full[24*17];\
1243  uint8_t halfH[272];\
1244  uint8_t halfHV[256];\
1245  copy_block17(full, src, 24, stride, 17);\
1246  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1247  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1248  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1249  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1250 }\
1251 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1252 {\
1253  uint8_t halfH[272];\
1254  uint8_t halfHV[256];\
1255  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1256  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1257  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1258 }\
1259 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1260 {\
1261  uint8_t halfH[272];\
1262  uint8_t halfHV[256];\
1263  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1264  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1265  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1266 }\
1267 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1268 {\
1269  uint8_t full[24*17];\
1270  uint8_t halfH[272];\
1271  uint8_t halfV[256];\
1272  uint8_t halfHV[256];\
1273  copy_block17(full, src, 24, stride, 17);\
1274  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1275  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1276  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1277  OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1278 }\
1279 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1280 {\
1281  uint8_t full[24*17];\
1282  uint8_t halfH[272];\
1283  copy_block17(full, src, 24, stride, 17);\
1284  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1285  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1286  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1287 }\
1288 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1289 {\
1290  uint8_t full[24*17];\
1291  uint8_t halfH[272];\
1292  uint8_t halfV[256];\
1293  uint8_t halfHV[256];\
1294  copy_block17(full, src, 24, stride, 17);\
1295  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1296  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1297  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1298  OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1299 }\
1300 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1301 {\
1302  uint8_t full[24*17];\
1303  uint8_t halfH[272];\
1304  copy_block17(full, src, 24, stride, 17);\
1305  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1306  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1307  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1308 }\
1309 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1310 {\
1311  uint8_t halfH[272];\
1312  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1313  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1314 }
1315 
/* Per-pixel store ops used to instantiate QPEL_MC below.  b is a raw
 * filter accumulator; (b + 16) >> 5 (rounding) or (b + 15) >> 5
 * (no-rounding) scales it back and cm[] clamps the result, after which
 * the avg variants additionally average with the existing pixel a. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the rounding put/avg and non-rounding put qpel MC sets. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1329 
/**
 * Copy an 8x8 block of pixels from src to dst.
 * NOTE(review): the prototype line was lost during extraction; restored
 * to match the mc00 #define aliases below — confirm against dsputil.h.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Average an 8x8 block of pixels from src into dst.
 * NOTE(review): the prototype line was lost during extraction; restored
 * to match the mc00 #define aliases below — confirm against dsputil.h.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Copy a 16x16 block of pixels from src to dst.
 * NOTE(review): the prototype line was lost during extraction; restored
 * to match the mc00 #define aliases below — confirm against dsputil.h.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}
/**
 * Average a 16x16 block of pixels from src into dst.
 * NOTE(review): the prototype line was lost during extraction; restored
 * to match the mc00 #define aliases below — confirm against dsputil.h.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
1346 
/* Full-pel (mc00) qpel positions need no filtering: alias them directly
 * to the plain 8x8 / 16x16 copy and average helpers above. */
#define put_qpel8_mc00_c ff_put_pixels8x8_c
#define avg_qpel8_mc00_c ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1353 
1354 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1355  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1356  int i;
1357 
1358  for(i=0; i<h; i++){
1359  dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1360  dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1361  dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1362  dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1363  dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1364  dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1365  dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1366  dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1367  dst+=dstStride;
1368  src+=srcStride;
1369  }
1370 }
1371 
#if CONFIG_RV40_DECODER
/* RV40 (3,3) quarter-pel positions map onto the generic 2x2 bilinear
 * (xy2) half-pel helpers, for both 16x16 and 8x8, put and avg. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1390 
#if CONFIG_DIRAC_DECODER
/* Dirac MC adapters: map the diracdsp calling convention (an array of up
 * to 5 source pointers; only src[0]..src[3] are used here) onto the
 * generic put/avg pixel helpers.  The _l2 variants average two sources,
 * the _l4 variants four; the 32-pixel-wide variants are composed from
 * two 16-wide calls. */
#define DIRAC_MC(OPNAME)\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
}
/* Generate both the put and avg families. */
DIRAC_MC(put)
DIRAC_MC(avg)
#endif
1435 
1436 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1437  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1438  int i;
1439 
1440  for(i=0; i<w; i++){
1441  const int src_1= src[ -srcStride];
1442  const int src0 = src[0 ];
1443  const int src1 = src[ srcStride];
1444  const int src2 = src[2*srcStride];
1445  const int src3 = src[3*srcStride];
1446  const int src4 = src[4*srcStride];
1447  const int src5 = src[5*srcStride];
1448  const int src6 = src[6*srcStride];
1449  const int src7 = src[7*srcStride];
1450  const int src8 = src[8*srcStride];
1451  const int src9 = src[9*srcStride];
1452  dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1453  dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1454  dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1455  dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1456  dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1457  dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1458  dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1459  dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1460  src++;
1461  dst++;
1462  }
1463 }
1464 
/* WMV2 mspel 8x8, sub-pel position (1,0): average of the unfiltered
 * source block and the horizontally lowpass-filtered block. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64]; /* 8x8 horizontally filtered temp */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}
1471 
/* WMV2 mspel 8x8, sub-pel position (2,0): horizontal lowpass filter
 * written straight to the destination. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1476 
/* WMV2 mspel 8x8, sub-pel position (3,0): like mc10 but the filtered
 * block is averaged with the right-hand neighbour pixels (src+1). */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64]; /* 8x8 horizontally filtered temp */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
}
1483 
/* WMV2 mspel 8x8, sub-pel position (0,2): vertical lowpass filter
 * written straight to the destination. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1488 
/* WMV2 mspel 8x8, sub-pel position (1,2): average of the vertically
 * filtered block and the H+V filtered block.  halfH holds 11 filtered
 * rows starting one row above src (stride 8); the vertical pass on
 * halfH+8 (i.e. skipping the extra top row) can then read the -1 tap. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];  /* 11 rows x 8: horizontally filtered, row -1..9 */
    uint8_t halfV[64];  /* 8x8 vertically filtered */
    uint8_t halfHV[64]; /* 8x8 filtered in both directions */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel 8x8, sub-pel position (3,2): like mc12 but the pure
 * vertical filter runs on the right-hand neighbour column (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];  /* 11 rows x 8: horizontally filtered, row -1..9 */
    uint8_t halfV[64];  /* 8x8 vertically filtered (from src+1) */
    uint8_t halfHV[64]; /* 8x8 filtered in both directions */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel 8x8, sub-pel position (2,2): horizontal then vertical
 * lowpass; the 11-row halfH temp gives the vertical pass its -1 tap. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88]; /* 11 rows x 8: horizontally filtered, row -1..9 */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
1515 
/* Sum of absolute differences (SAD) between two 16-pixel-wide blocks of
 * h rows.  The context pointer v is unused; it exists only to match the
 * common compare-function signature. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1543 
/* SAD of pix1 against the horizontal half-pel interpolation of pix2
 * (rounded average of each pixel and its right neighbour), 16 wide. */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1571 
/* SAD of pix1 against the vertical half-pel interpolation of pix2
 * (rounded average of each pixel and the one a line below), 16 wide. */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix3[col]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1601 
/* SAD of pix1 against the 2x2 bilinear half-pel interpolation of pix2
 * (rounded average of a pixel, its right neighbour, and the two below). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        pix3[col], pix3[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1631 
/* Sum of absolute differences (SAD) between two 8-pixel-wide blocks of
 * h rows.  The context pointer v is unused. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1651 
/* SAD of pix1 against the horizontal half-pel interpolation of pix2,
 * 8 pixels wide. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1671 
/* SAD of pix1 against the vertical half-pel interpolation of pix2,
 * 8 pixels wide. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix3[col]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1693 
/* SAD of pix1 against the 2x2 bilinear half-pel interpolation of pix2,
 * 8 pixels wide. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        pix3[col], pix3[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1715 
/* NSSE compare for 16-wide blocks: score1 is the plain sum of squared
 * pixel differences; score2 accumulates, per 2x2 neighbourhood, the
 * difference between the second-order (cross) gradients of s1 and s2,
 * i.e. how differently the two blocks are textured.  The texture term
 * is scaled by avctx->nsse_weight, or by 8 when no context is given. */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v; /* may be NULL */
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x] - s2[x])*(s1[x] - s2[x]);
        }
        /* gradient term needs the row below, so skip the last row */
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
1741 
/* NSSE compare for 8-wide blocks; same structure as nsse16_c:
 * SSE plus a weighted texture-difference term. */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v; /* may be NULL */
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x] - s2[x])*(s1[x] - s2[x]);
        }
        /* gradient term needs the row below, so skip the last row */
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
1767 
/* Return the weighted squared error of the residual that would remain
 * after adding basis[]*scale to rem[]: for each of the 64 coefficients,
 * b is rem plus the rescaled basis contribution (rounded down from
 * BASIS_SHIFT to RECON_SHIFT precision, then to integer), and
 * (weight*b)^2 >> 4 is accumulated; the total is returned >> 2. */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        /* reconstructed values are expected to stay within 10 bits */
        av_assert2(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
1782 
/* Add basis[]*scale (rounded down from BASIS_SHIFT to RECON_SHIFT
 * precision) into the residual rem[] in place — the commit counterpart
 * of try_8x8basis_c above. */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
1790 
/**
 * Dummy comparison function for FF_CMP_ZERO: always reports a score of 0,
 * regardless of the input blocks.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
1794 
    int i;

    /* NOTE(review): the signature line of this function is elided from this
     * view; judging by the body it fills the 6-entry 'cmp' table with block
     * comparison functions taken from the DSPContext 'c', selected by the
     * low byte of 'type'. */
    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            /* SATD is implemented via the Hadamard-transform difference */
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            /* unknown type: cmp[i] stays NULL from the memset above */
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
1854 
1855 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1856  long i;
1857  for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
1858  long a = *(long*)(src+i);
1859  long b = *(long*)(dst+i);
1860  *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1861  }
1862  for(; i<w; i++)
1863  dst[i+0] += src[i+0];
1864 }
1865 
1866 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1867  long i;
1868 #if !HAVE_FAST_UNALIGNED
1869  if((long)src2 & (sizeof(long)-1)){
1870  for(i=0; i+7<w; i+=8){
1871  dst[i+0] = src1[i+0]-src2[i+0];
1872  dst[i+1] = src1[i+1]-src2[i+1];
1873  dst[i+2] = src1[i+2]-src2[i+2];
1874  dst[i+3] = src1[i+3]-src2[i+3];
1875  dst[i+4] = src1[i+4]-src2[i+4];
1876  dst[i+5] = src1[i+5]-src2[i+5];
1877  dst[i+6] = src1[i+6]-src2[i+6];
1878  dst[i+7] = src1[i+7]-src2[i+7];
1879  }
1880  }else
1881 #endif
1882  for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
1883  long a = *(long*)(src1+i);
1884  long b = *(long*)(src2+i);
1885  *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1886  }
1887  for(; i<w; i++)
1888  dst[i+0] = src1[i+0]-src2[i+0];
1889 }
1890 
/**
 * HuffYUV median-prediction decode: reconstruct dst[] from the residual
 * diff[] using the median of (left, top, left+top-topleft) as predictor.
 * *left / *left_top carry the running state across calls and are updated.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t cur_left = *left;
    uint8_t topleft  = *left_top;

    for (i = 0; i < w; i++) {
        cur_left = mid_pred(cur_left, src1[i], (cur_left + src1[i] - topleft) & 0xFF) + diff[i];
        topleft  = src1[i];      /* this row's top becomes next pixel's top-left */
        dst[i]   = cur_left;
    }

    *left     = cur_left;
    *left_top = topleft;
}
1907 
/**
 * HuffYUV median-prediction encode: the inverse of
 * add_hfyu_median_prediction_c(); emits residuals dst[] = src2[] - predictor.
 * *left / *left_top carry the running state across calls and are updated.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t cur_left = *left;
    uint8_t topleft  = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(cur_left, src1[i], (cur_left + src1[i] - topleft) & 0xFF);
        topleft  = src1[i];
        cur_left = src2[i];
        dst[i]   = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = topleft;
}
1925 
/**
 * HuffYUV left-prediction decode: prefix-sum the residuals in src[] into
 * dst[] starting from 'acc'.  Stores are truncated to 8 bits by the uint8_t
 * destination; the untruncated running accumulator is returned (callers mask
 * it as needed).
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i = 0;

    /* main loop handles pairs, mirroring the original 2x unroll */
    while (i + 1 < w) {
        acc += src[i];
        dst[i++] = acc;
        acc += src[i];
        dst[i++] = acc;
    }

    /* odd trailing element, if any */
    if (i < w) {
        acc += src[i];
        dst[i] = acc;
    }

    return acc;
}
1944 
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * HuffYUV left-prediction decode for packed 32-bit BGRA pixels: each channel
 * is prefix-summed independently.  The four running accumulators are passed
 * in and updated through the red/green/blue/alpha pointers; stores truncate
 * to 8 bits via the uint8_t destination.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r = *red, g = *green, b = *blue, a = *alpha;

    for (i = 0; i < w; i++) {
        const uint8_t *sp = src + 4 * i;
        uint8_t       *dp = dst + 4 * i;

        b += sp[B];
        g += sp[G];
        r += sp[R];
        a += sp[A];

        dp[B] = b;
        dp[G] = g;
        dp[R] = r;
        dp[A] = a;
    }

    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
#undef B
#undef G
#undef R
#undef A
1985 
/* One Hadamard/Walsh butterfly stage: writes the sum and difference of the
 * two inputs into two distinct outputs.  The outputs must not alias the
 * inputs (o2 is computed after o1 has been written). */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x,y <- x+y, x-y.  The inputs are copied into locals
 * first so the updates do not interfere with each other. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly stage folded directly into the cost: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2000 
/**
 * SATD-style metric: sum of absolute values of the 2-D 8x8 Hadamard (Walsh)
 * transform of the difference block src - dst.  The transform is built from
 * the BUTTERFLY* macros: a horizontal pass over each row, then a vertical
 * pass over each column with the last stage fused into the |.| accumulation.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];   /* row-transformed coefficients */
    int sum=0;

    av_assert2(h==8);

    /* horizontal pass: 3 butterfly stages per row of (src - dst) */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: 2 butterfly stages per column, the 3rd stage is done by
     * BUTTERFLYA which also accumulates the absolute values */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
2045 
/**
 * Intra variant of hadamard8_diff8x8_c: transforms the source block itself
 * (the 'dummy' argument is ignored) and subtracts the DC term at the end so
 * the score measures activity around the block mean rather than including it.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];   /* row-transformed coefficients */
    int sum=0;

    av_assert2(h==8);

    /* horizontal pass: 3 butterfly stages per row of src */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass with fused |.| accumulation, as in hadamard8_diff8x8_c */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* |temp[0]+temp[32]| added by the i==0 BUTTERFLYA above is the DC
     * coefficient; remove it so the mean does not contribute */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2093 
2094 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2095  MpegEncContext * const s= (MpegEncContext *)c;
2096  LOCAL_ALIGNED_16(int16_t, temp, [64]);
2097 
2098  av_assert2(h==8);
2099 
2100  s->dsp.diff_pixels(temp, src1, src2, stride);
2101  s->dsp.fdct(temp);
2102  return s->dsp.sum_abs_dctelem(temp);
2103 }
2104 
#if CONFIG_GPL
/* One 1-D pass of an 8-point integer transform with >>1 / >>2 scaled lifting
 * terms — this appears to be the H.264 8x8 forward transform (hence the GPL
 * gate and the "264" naming).  SRC()/DST() are (re)defined by the caller so
 * the same code serves both row and column passes. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * SAD of the H.264-style 8x8 transform of the difference block: the row pass
 * transforms dct[][] in place, the column pass redefines DST() to accumulate
 * absolute values instead of storing.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

/* row pass: transform each row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

/* column pass: DST() now folds |v| straight into the score */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2157 
2158 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2159  MpegEncContext * const s= (MpegEncContext *)c;
2160  LOCAL_ALIGNED_16(int16_t, temp, [64]);
2161  int sum=0, i;
2162 
2163  av_assert2(h==8);
2164 
2165  s->dsp.diff_pixels(temp, src1, src2, stride);
2166  s->dsp.fdct(temp);
2167 
2168  for(i=0; i<64; i++)
2169  sum= FFMAX(sum, FFABS(temp[i]));
2170 
2171  return sum;
2172 }
2173 
2174 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2175  MpegEncContext * const s= (MpegEncContext *)c;
2176  LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2177  int16_t * const bak = temp+64;
2178  int sum=0, i;
2179 
2180  av_assert2(h==8);
2181  s->mb_intra=0;
2182 
2183  s->dsp.diff_pixels(temp, src1, src2, stride);
2184 
2185  memcpy(bak, temp, 64*sizeof(int16_t));
2186 
2187  s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2188  s->dct_unquantize_inter(s, temp, 0, s->qscale);
2189  ff_simple_idct_8(temp); //FIXME
2190 
2191  for(i=0; i<64; i++)
2192  sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2193 
2194  return sum;
2195 }
2196 
/**
 * Rate-distortion metric for one 8x8 block: quantize the difference, count
 * the VLC bits needed to code the coefficients, reconstruct, and combine the
 * reconstruction SSE with a qscale-weighted rate term.
 * Note: clobbers s->block_last_index[0] as a side effect.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    av_assert2(h==8);

    /* work on aligned local copies so the idct_add below does not modify the
     * caller's buffers */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables for the current macroblock type; intra
     * additionally pays for the luma DC coefficient */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* sum VLC lengths of all (run, level) pairs in scan order; levels
         * outside the table range cost a full escape code */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;   /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        av_assert2(level - 64);

        /* the final coefficient is coded with the "last" table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and inverse-transform onto the local copy */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    /* distortion + rate weighted by qscale^2; 109/128 is an empirical lambda
     * factor (presumably tuned — not documented here) */
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2272 
/**
 * Rate-only metric for one 8x8 block: quantize the difference and count the
 * VLC bits needed to code the resulting coefficients (same bit-counting as
 * rd8x8_c, but without reconstruction or distortion).
 * Note: clobbers s->block_last_index[0] as a side effect.
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    av_assert2(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables for the current macroblock type; intra
     * additionally pays for the luma DC coefficient */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* sum VLC lengths of all (run, level) pairs in scan order */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;   /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        av_assert2(level - 64);

        /* the final coefficient is coded with the "last" table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2331 
/* Generator for vsad_intra8_c / vsad_intra16_c: sum of absolute differences
 * between each pixel and the pixel directly below it — a measure of vertical
 * activity within a single block.  The second operand ('dummy') is ignored.
 * The inner loop is unrolled by 4. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
 \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
 \
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2349 
/**
 * Vertical SAD of the difference signal: sums |(s1-s2)[row y] - (s1-s2)[row
 * y+1]| over a 16-pixel-wide block, penalizing row-to-row changes of the
 * residual.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            total += d < 0 ? -d : d;   /* same as FFABS(d) */
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2364 
/* Square of a value; argument is evaluated twice, so pass side-effect-free
 * expressions only. */
#define SQ(a) ((a)*(a))

/* Generator for vsse_intra8_c / vsse_intra16_c: squared-error variant of
 * VSAD_INTRA — sum of squared differences between vertically adjacent
 * pixels within one block.  'dummy' is ignored; inner loop unrolled by 4. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
 \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
 \
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2383 
/**
 * Vertical SSE of the difference signal: squared-error counterpart of
 * vsad16_c over a 16-pixel-wide block.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            total += d * d;   /* same as SQ(d) */
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2398 
/**
 * Sum of squared differences between an int8 array and an int16 array of the
 * same length.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int i, acc = 0;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        acc += d * d;
    }

    return acc;
}
2407 
/* Builds a 16-wide compare function (name16) out of an 8x8 one (name8) by
 * summing quadrant scores: always the two top 8x8 halves, plus the two
 * bottom ones when h==16. */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst           , src           , stride, 8);\
    score +=name8(s, dst+8         , src+8         , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst           , src           , stride, 8);\
        score +=name8(s, dst+8         , src+8         , stride, 8);\
    }\
    return score;\
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2432 
/**
 * Clip one IEEE-754 float, given as its raw bit pattern, when min < 0 < max:
 * bit patterns above mini are negative floats below min; flipping the sign
 * bit orders positive floats so that values above maxisign exceed max.
 * Returns the (possibly clipped) bit pattern.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    return a > mini                    ? mini
         : (a ^ (1U << 31)) > maxisign ? maxi
         :                               a;
}
2441 
/**
 * Clip a float vector to [*min, *max] where *min < 0 < *max, operating on
 * the raw bit patterns via clipf_c_one().  len is assumed to be a multiple
 * of 8 (presumably guaranteed by callers — same assumption as the original).
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, j;
    const uint32_t mini     = *(uint32_t *)min;
    const uint32_t maxi     = *(uint32_t *)max;
    const uint32_t maxisign = maxi ^ (1U << 31);
    uint32_t *dsti          = (uint32_t *)dst;
    const uint32_t *srci    = (const uint32_t *)src;

    for (i = 0; i < len; i += 8)
        for (j = 0; j < 8; j++)
            dsti[i + j] = clipf_c_one(srci[i + j], mini, maxi, maxisign);
}
/**
 * Clip a float vector to [min, max].  When the range straddles zero the
 * bit-pattern fast path is used; otherwise a straightforward av_clipf loop.
 * len is processed in groups of 8, matching the original unroll.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i, j;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (i = 0; i < len; i += 8)
        for (j = 0; j < 8; j++)
            dst[i + j] = av_clipf(src[i + j], min, max);
}
2477 
/**
 * Dot product of two int16 vectors of length 'order'; accumulated in a
 * plain int, as in the original (no saturation / widening).
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int i;
    int sum = 0;

    for (i = 0; i < order; i++)
        sum += v1[i] * v2[i];

    return sum;
}
2487 
/**
 * Returns the dot product of v1 and v2 (using v1's values *before* the
 * update) while simultaneously doing v1[i] += mul * v3[i].
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int i;
    int sum = 0;

    for (i = 0; i < order; i++) {
        sum   += v1[i] * v2[i];      /* accumulate first... */
        v1[i] += mul * v3[i];        /* ...then update in place */
    }

    return sum;
}
2497 
/**
 * Clip each element of an int32 vector to [min, max].
 * len is normally a multiple of 8 (matching the SIMD implementations); the
 * bulk loop handles it 8 at a time as before.
 *
 * Fix: the original 'do { ... } while (len > 0)' unconditionally wrote 8
 * elements even when len == 0, and since 'len' is unsigned, 'len -= 8' for a
 * non-multiple-of-8 length wrapped around and kept the loop running far out
 * of bounds.  A pre-tested loop plus a scalar tail preserves the behavior
 * for all valid (multiple-of-8) inputs and degrades safely otherwise.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    while (len >= 8) {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        len -= 8;
    }
    /* scalar tail for lengths that are not a multiple of 8 */
    while (len--)
        *dst++ = av_clip(*src++, min, max);
}
2513 
/* Full 8x8 integer inverse DCT (jrevdct) followed by a clamped store of the
 * result into the destination. */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* As above, but adds the IDCT result onto the existing destination pixels
 * (with clamping) instead of overwriting them. */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2524 
/* 4x4 reduced-resolution (lowres=1) variant: 4-point jrevdct plus clamped
 * put of the 4x4 result. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* 4x4 variant with clamped add instead of put. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
2535 
/* 2x2 reduced-resolution (lowres=2) variant: 2-point jrevdct plus clamped
 * put of the 2x2 result. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* 2x2 variant with clamped add instead of put. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
2546 
/* 1x1 reduced-resolution (lowres=3) "IDCT": only the DC coefficient matters;
 * (DC + 4) >> 3 applies the transform's output scaling with rounding.
 * line_size is unused for a single pixel. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
{
    dest[0] = av_clip_uint8((block[0] + 4)>>3);
}
/* 1x1 variant: adds the scaled DC onto the existing pixel, clamped. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
{
    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
}
2555 
/* init static data */
{
    /* NOTE(review): the signature line is elided from this view; the body
     * fills ff_squareTbl so that ff_squareTbl[i] == (i-256)^2, i.e. a
     * 256-biased square lookup table used by the SSE helpers. */
    int i;

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }
}
2565 
    /* NOTE(review): the signature line is elided from this view; the body
     * verifies that the compiler actually honors 16-byte stack alignment of
     * LOCAL_ALIGNED_16 variables, returning 0 on success and -1 (after a
     * one-time error log on SIMD-capable builds) when miscompiled. */
    static int did_fail=0;           /* ensures the warning is printed once */
    LOCAL_ALIGNED_16(int, aligned, [4]);

    if((intptr_t)aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
2585 
2587 {
2589 
2590 #if CONFIG_ENCODERS
2591  if (avctx->bits_per_raw_sample == 10) {
2594  } else {
2595  if(avctx->dct_algo==FF_DCT_FASTINT) {
2596  c->fdct = ff_fdct_ifast;
2598  }
2599  else if(avctx->dct_algo==FF_DCT_FAAN) {
2600  c->fdct = ff_faandct;
2601  c->fdct248 = ff_faandct248;
2602  }
2603  else {
2604  c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2606  }
2607  }
2608 #endif //CONFIG_ENCODERS
2609 
2610  if(avctx->lowres==1){
2613  c->idct = ff_j_rev_dct4;
2615  }else if(avctx->lowres==2){
2618  c->idct = ff_j_rev_dct2;
2620  }else if(avctx->lowres==3){
2623  c->idct = ff_j_rev_dct1;
2625  }else{
2626  if (avctx->bits_per_raw_sample == 10) {
2629  c->idct = ff_simple_idct_10;
2631  } else if (avctx->bits_per_raw_sample == 12) {
2634  c->idct = ff_simple_idct_12;
2636  } else {
2637  if(avctx->idct_algo==FF_IDCT_INT){
2638  c->idct_put= jref_idct_put;
2639  c->idct_add= jref_idct_add;
2640  c->idct = ff_j_rev_dct;
2642  }else if(avctx->idct_algo==FF_IDCT_FAAN){
2645  c->idct = ff_faanidct;
2647  }else{ //accurate/default
2650  c->idct = ff_simple_idct_8;
2652  }
2653  }
2654  }
2655 
2661  c->gmc1 = gmc1_c;
2662  c->gmc = ff_gmc_c;
2663  c->pix_sum = pix_sum_c;
2664  c->pix_norm1 = pix_norm1_c;
2665 
2667  c->fill_block_tab[1] = fill_block8_c;
2668 
2669  /* TODO [0] 16 [1] 8 */
2670  c->pix_abs[0][0] = pix_abs16_c;
2671  c->pix_abs[0][1] = pix_abs16_x2_c;
2672  c->pix_abs[0][2] = pix_abs16_y2_c;
2673  c->pix_abs[0][3] = pix_abs16_xy2_c;
2674  c->pix_abs[1][0] = pix_abs8_c;
2675  c->pix_abs[1][1] = pix_abs8_x2_c;
2676  c->pix_abs[1][2] = pix_abs8_y2_c;
2677  c->pix_abs[1][3] = pix_abs8_xy2_c;
2678 
2688 
2698 
2699 #define dspfunc(PFX, IDX, NUM) \
2700  c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2701  c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2702  c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2703  c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2704  c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2705  c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2706  c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2707  c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2708  c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2709  c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2710  c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2711  c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2712  c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2713  c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2714  c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2715  c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2716 
2717  dspfunc(put_qpel, 0, 16);
2718  dspfunc(put_no_rnd_qpel, 0, 16);
2719 
2720  dspfunc(avg_qpel, 0, 16);
2721  /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2722 
2723  dspfunc(put_qpel, 1, 8);
2724  dspfunc(put_no_rnd_qpel, 1, 8);
2725 
2726  dspfunc(avg_qpel, 1, 8);
2727  /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2728 
2729 #undef dspfunc
2730 
2739 
2740 #define SET_CMP_FUNC(name) \
2741  c->name[0]= name ## 16_c;\
2742  c->name[1]= name ## 8x8_c;
2743 
2744  SET_CMP_FUNC(hadamard8_diff)
2745  c->hadamard8_diff[4]= hadamard8_intra16_c;
2747  SET_CMP_FUNC(dct_sad)
2748  SET_CMP_FUNC(dct_max)
2749 #if CONFIG_GPL
2750  SET_CMP_FUNC(dct264_sad)
2751 #endif
2752  c->sad[0]= pix_abs16_c;
2753  c->sad[1]= pix_abs8_c;
2754  c->sse[0]= sse16_c;
2755  c->sse[1]= sse8_c;
2756  c->sse[2]= sse4_c;
2757  SET_CMP_FUNC(quant_psnr)
2758  SET_CMP_FUNC(rd)
2759  SET_CMP_FUNC(bit)
2760  c->vsad[0]= vsad16_c;
2761  c->vsad[4]= vsad_intra16_c;
2762  c->vsad[5]= vsad_intra8_c;
2763  c->vsse[0]= vsse16_c;
2764  c->vsse[4]= vsse_intra16_c;
2765  c->vsse[5]= vsse_intra8_c;
2766  c->nsse[0]= nsse16_c;
2767  c->nsse[1]= nsse8_c;
2768 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2770 #endif
2771 
2773 
2774  c->add_bytes= add_bytes_c;
2780  c->bswap_buf= bswap_buf;
2781  c->bswap16_buf = bswap16_buf;
2782 
2785 
2790 
2791  c->shrink[0]= av_image_copy_plane;
2792  c->shrink[1]= ff_shrink22;
2793  c->shrink[2]= ff_shrink44;
2794  c->shrink[3]= ff_shrink88;
2795 
2797 
2798 #undef FUNC
2799 #undef FUNCC
2800 #define FUNC(f, depth) f ## _ ## depth
2801 #define FUNCC(f, depth) f ## _ ## depth ## _c
2802 
2803  c->draw_edges = FUNCC(draw_edges, 8);
2804  c->clear_block = FUNCC(clear_block, 8);
2805  c->clear_blocks = FUNCC(clear_blocks, 8);
2806 
2807 #define BIT_DEPTH_FUNCS(depth) \
2808  c->get_pixels = FUNCC(get_pixels, depth);
2809 
2810  switch (avctx->bits_per_raw_sample) {
2811  case 9:
2812  case 10:
2813  case 12:
2814  case 14:
2815  BIT_DEPTH_FUNCS(16);
2816  break;
2817  default:
2818  if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2819  BIT_DEPTH_FUNCS(8);
2820  }
2821  break;
2822  }
2823 
2824 
2825  if (ARCH_ALPHA)
2826  ff_dsputil_init_alpha(c, avctx);
2827  if (ARCH_ARM)
2828  ff_dsputil_init_arm(c, avctx);
2829  if (ARCH_BFIN)
2830  ff_dsputil_init_bfin(c, avctx);
2831  if (ARCH_PPC)
2832  ff_dsputil_init_ppc(c, avctx);
2833  if (ARCH_SH4)
2834  ff_dsputil_init_sh4(c, avctx);
2835  if (HAVE_VIS)
2836  ff_dsputil_init_vis(c, avctx);
2837  if (ARCH_X86)
2838  ff_dsputil_init_x86(c, avctx);
2839 
2842 }
2843 
2845 {
2846  ff_dsputil_init(c, avctx);
2847 }
2848 
2850 {
2851  ff_dsputil_init(c, avctx);
2852 }