dsputil.c
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
25 /**
26  * @file
27  * DSP utils
28  */
29 
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
33 #include "avcodec.h"
34 #include "copy_block.h"
35 #include "dct.h"
36 #include "dsputil.h"
37 #include "simple_idct.h"
38 #include "faandct.h"
39 #include "faanidct.h"
40 #include "imgconvert.h"
41 #include "mathops.h"
42 #include "mpegvideo.h"
43 #include "config.h"
44 #include "diracdsp.h"
45 
46 uint32_t ff_squareTbl[512] = {0, };
47 
48 #define BIT_DEPTH 16
49 #include "dsputil_template.c"
50 #undef BIT_DEPTH
51 
52 #define BIT_DEPTH 8
53 #include "dsputil_template.c"
54 
55 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
56 #define pb_7f (~0UL/255 * 0x7f)
57 #define pb_80 (~0UL/255 * 0x80)
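/* Editor's aside (not part of the upstream file): ~0UL/255 evaluates to
 * 0x0101...01 at the native word size, so multiplying it by a byte value
 * replicates that byte across every byte lane. A minimal self-check of the
 * two macros above, kept compiled out: */
#if 0
#include <assert.h>
static void pb_macros_selftest(void)
{
    if (sizeof(unsigned long) == 8) {
        assert(pb_7f == 0x7f7f7f7f7f7f7f7fUL);
        assert(pb_80 == 0x8080808080808080UL);
    } else {
        assert(pb_7f == 0x7f7f7f7fUL);
        assert(pb_80 == 0x80808080UL);
    }
}
#endif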
58 
59 /* Specific zigzag scan for 248 idct. NOTE that unlike the
60  specification, we interleave the fields */
61 const uint8_t ff_zigzag248_direct[64] = {
62  0, 8, 1, 9, 16, 24, 2, 10,
63  17, 25, 32, 40, 48, 56, 33, 41,
64  18, 26, 3, 11, 4, 12, 19, 27,
65  34, 42, 49, 57, 50, 58, 35, 43,
66  20, 28, 5, 13, 6, 14, 21, 29,
67  36, 44, 51, 59, 52, 60, 37, 45,
68  22, 30, 7, 15, 23, 31, 38, 46,
69  53, 61, 54, 62, 39, 47, 55, 63,
70 };
71 
72 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
73 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
74 
75 const uint8_t ff_alternate_horizontal_scan[64] = {
76  0, 1, 2, 3, 8, 9, 16, 17,
77  10, 11, 4, 5, 6, 7, 15, 14,
78  13, 12, 19, 18, 24, 25, 32, 33,
79  26, 27, 20, 21, 22, 23, 28, 29,
80  30, 31, 34, 35, 40, 41, 48, 49,
81  42, 43, 36, 37, 38, 39, 44, 45,
82  46, 47, 50, 51, 56, 57, 58, 59,
83  52, 53, 54, 55, 60, 61, 62, 63,
84 };
85 
86 const uint8_t ff_alternate_vertical_scan[64] = {
87  0, 8, 16, 24, 1, 9, 2, 10,
88  17, 25, 32, 40, 48, 56, 57, 49,
89  41, 33, 26, 18, 3, 11, 4, 12,
90  19, 27, 34, 42, 50, 58, 35, 43,
91  51, 59, 20, 28, 5, 13, 6, 14,
92  21, 29, 36, 44, 52, 60, 37, 45,
93  53, 61, 22, 30, 7, 15, 23, 31,
94  38, 46, 54, 62, 39, 47, 55, 63,
95 };
96 
97 /* Input permutation for the simple_idct_mmx */
98 static const uint8_t simple_mmx_permutation[64]={
99  0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
100  0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
101  0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
102  0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
103  0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
104  0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
105  0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
106  0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
107 };
108 
109 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
110 
111 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
112  const uint8_t *src_scantable)
113 {
114  int i;
115  int end;
116 
117  st->scantable= src_scantable;
118 
119  for(i=0; i<64; i++){
120  int j;
121  j = src_scantable[i];
122  st->permutated[i] = permutation[j];
123  }
124 
125  end=-1;
126  for(i=0; i<64; i++){
127  int j;
128  j = st->permutated[i];
129  if(j>end) end=j;
130  st->raster_end[i]= end;
131  }
132 }
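/* Editor's usage sketch (hypothetical caller, not in this file): with the
 * identity permutation, st->permutated reduces to the scan order itself,
 * and raster_end[i] holds the highest raster index reached after i+1 scan
 * steps. Kept compiled out: */
#if 0
static void init_scantable_example(void)
{
    ScanTable st;
    uint8_t identity[64];
    int i;
    for (i = 0; i < 64; i++)
        identity[i] = i;
    ff_init_scantable(identity, &st, ff_zigzag_direct);
    /* now st.permutated[i] == ff_zigzag_direct[i] for every i */
}
#endif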
133 
134 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
135  int idct_permutation_type)
136 {
137  int i;
138 
139  switch(idct_permutation_type){
140  case FF_NO_IDCT_PERM:
141  for(i=0; i<64; i++)
142  idct_permutation[i]= i;
143  break;
144  case FF_LIBMPEG2_IDCT_PERM:
145  for(i=0; i<64; i++)
146  idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
147  break;
148  case FF_SIMPLE_IDCT_PERM:
149  for(i=0; i<64; i++)
150  idct_permutation[i]= simple_mmx_permutation[i];
151  break;
152  case FF_TRANSPOSE_IDCT_PERM:
153  for(i=0; i<64; i++)
154  idct_permutation[i]= ((i&7)<<3) | (i>>3);
155  break;
156  case FF_PARTTRANS_IDCT_PERM:
157  for(i=0; i<64; i++)
158  idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
159  break;
160  case FF_SSE2_IDCT_PERM:
161  for(i=0; i<64; i++)
162  idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
163  break;
164  default:
165  av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
166  }
167 }
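/* Editor's note: reading the 6-bit index as row*8 + column makes the bit
 * tricks above easier to follow; e.g. FF_TRANSPOSE_IDCT_PERM swaps the
 * 3-bit row and column fields, mapping coefficient (r,c) to (c,r):
 * i = 8*r + c  ->  ((i&7)<<3) | (i>>3) = 8*c + r. */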
168 
169 static int pix_sum_c(uint8_t * pix, int line_size)
170 {
171  int s, i, j;
172 
173  s = 0;
174  for (i = 0; i < 16; i++) {
175  for (j = 0; j < 16; j += 8) {
176  s += pix[0];
177  s += pix[1];
178  s += pix[2];
179  s += pix[3];
180  s += pix[4];
181  s += pix[5];
182  s += pix[6];
183  s += pix[7];
184  pix += 8;
185  }
186  pix += line_size - 16;
187  }
188  return s;
189 }
190 
191 static int pix_norm1_c(uint8_t * pix, int line_size)
192 {
193  int s, i, j;
194  uint32_t *sq = ff_squareTbl + 256;
195 
196  s = 0;
197  for (i = 0; i < 16; i++) {
198  for (j = 0; j < 16; j += 8) {
199 #if 0
200  s += sq[pix[0]];
201  s += sq[pix[1]];
202  s += sq[pix[2]];
203  s += sq[pix[3]];
204  s += sq[pix[4]];
205  s += sq[pix[5]];
206  s += sq[pix[6]];
207  s += sq[pix[7]];
208 #else
209 #if HAVE_FAST_64BIT
210  register uint64_t x=*(uint64_t*)pix;
211  s += sq[x&0xff];
212  s += sq[(x>>8)&0xff];
213  s += sq[(x>>16)&0xff];
214  s += sq[(x>>24)&0xff];
215  s += sq[(x>>32)&0xff];
216  s += sq[(x>>40)&0xff];
217  s += sq[(x>>48)&0xff];
218  s += sq[(x>>56)&0xff];
219 #else
220  register uint32_t x=*(uint32_t*)pix;
221  s += sq[x&0xff];
222  s += sq[(x>>8)&0xff];
223  s += sq[(x>>16)&0xff];
224  s += sq[(x>>24)&0xff];
225  x=*(uint32_t*)(pix+4);
226  s += sq[x&0xff];
227  s += sq[(x>>8)&0xff];
228  s += sq[(x>>16)&0xff];
229  s += sq[(x>>24)&0xff];
230 #endif
231 #endif
232  pix += 8;
233  }
234  pix += line_size - 16;
235  }
236  return s;
237 }
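/* Editor's note: ff_squareTbl is indexed with a +256 bias so that sq[x] ==
 * x*x also holds for negative differences in [-255, 255]; the sse*_c
 * functions below rely on this when indexing with pix1[i] - pix2[i]. The
 * table is filled at init time elsewhere, along the lines of this sketch
 * (kept compiled out): */
#if 0
static void square_tbl_fill_sketch(void)
{
    int i;
    for (i = 0; i < 512; i++)
        ff_squareTbl[i] = (i - 256) * (i - 256);
}
#endif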
238 
239 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
240  int i;
241 
242  for(i=0; i+8<=w; i+=8){
243  dst[i+0]= av_bswap32(src[i+0]);
244  dst[i+1]= av_bswap32(src[i+1]);
245  dst[i+2]= av_bswap32(src[i+2]);
246  dst[i+3]= av_bswap32(src[i+3]);
247  dst[i+4]= av_bswap32(src[i+4]);
248  dst[i+5]= av_bswap32(src[i+5]);
249  dst[i+6]= av_bswap32(src[i+6]);
250  dst[i+7]= av_bswap32(src[i+7]);
251  }
252  for(;i<w; i++){
253  dst[i+0]= av_bswap32(src[i+0]);
254  }
255 }
256 
257 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
258 {
259  while (len--)
260  *dst++ = av_bswap16(*src++);
261 }
262 
263 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
264 {
265  int s, i;
266  uint32_t *sq = ff_squareTbl + 256;
267 
268  s = 0;
269  for (i = 0; i < h; i++) {
270  s += sq[pix1[0] - pix2[0]];
271  s += sq[pix1[1] - pix2[1]];
272  s += sq[pix1[2] - pix2[2]];
273  s += sq[pix1[3] - pix2[3]];
274  pix1 += line_size;
275  pix2 += line_size;
276  }
277  return s;
278 }
279 
280 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
281 {
282  int s, i;
283  uint32_t *sq = ff_squareTbl + 256;
284 
285  s = 0;
286  for (i = 0; i < h; i++) {
287  s += sq[pix1[0] - pix2[0]];
288  s += sq[pix1[1] - pix2[1]];
289  s += sq[pix1[2] - pix2[2]];
290  s += sq[pix1[3] - pix2[3]];
291  s += sq[pix1[4] - pix2[4]];
292  s += sq[pix1[5] - pix2[5]];
293  s += sq[pix1[6] - pix2[6]];
294  s += sq[pix1[7] - pix2[7]];
295  pix1 += line_size;
296  pix2 += line_size;
297  }
298  return s;
299 }
300 
301 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
302 {
303  int s, i;
304  uint32_t *sq = ff_squareTbl + 256;
305 
306  s = 0;
307  for (i = 0; i < h; i++) {
308  s += sq[pix1[ 0] - pix2[ 0]];
309  s += sq[pix1[ 1] - pix2[ 1]];
310  s += sq[pix1[ 2] - pix2[ 2]];
311  s += sq[pix1[ 3] - pix2[ 3]];
312  s += sq[pix1[ 4] - pix2[ 4]];
313  s += sq[pix1[ 5] - pix2[ 5]];
314  s += sq[pix1[ 6] - pix2[ 6]];
315  s += sq[pix1[ 7] - pix2[ 7]];
316  s += sq[pix1[ 8] - pix2[ 8]];
317  s += sq[pix1[ 9] - pix2[ 9]];
318  s += sq[pix1[10] - pix2[10]];
319  s += sq[pix1[11] - pix2[11]];
320  s += sq[pix1[12] - pix2[12]];
321  s += sq[pix1[13] - pix2[13]];
322  s += sq[pix1[14] - pix2[14]];
323  s += sq[pix1[15] - pix2[15]];
324 
325  pix1 += line_size;
326  pix2 += line_size;
327  }
328  return s;
329 }
330 
331 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
332  const uint8_t *s2, int stride){
333  int i;
334 
335  /* read the pixels */
336  for(i=0;i<8;i++) {
337  block[0] = s1[0] - s2[0];
338  block[1] = s1[1] - s2[1];
339  block[2] = s1[2] - s2[2];
340  block[3] = s1[3] - s2[3];
341  block[4] = s1[4] - s2[4];
342  block[5] = s1[5] - s2[5];
343  block[6] = s1[6] - s2[6];
344  block[7] = s1[7] - s2[7];
345  s1 += stride;
346  s2 += stride;
347  block += 8;
348  }
349 }
350 
351 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
352  int line_size)
353 {
354  int i;
355 
356  /* read the pixels */
357  for(i=0;i<8;i++) {
358  pixels[0] = av_clip_uint8(block[0]);
359  pixels[1] = av_clip_uint8(block[1]);
360  pixels[2] = av_clip_uint8(block[2]);
361  pixels[3] = av_clip_uint8(block[3]);
362  pixels[4] = av_clip_uint8(block[4]);
363  pixels[5] = av_clip_uint8(block[5]);
364  pixels[6] = av_clip_uint8(block[6]);
365  pixels[7] = av_clip_uint8(block[7]);
366 
367  pixels += line_size;
368  block += 8;
369  }
370 }
371 
372 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
373  int line_size)
374 {
375  int i;
376 
377  /* read the pixels */
378  for(i=0;i<4;i++) {
379  pixels[0] = av_clip_uint8(block[0]);
380  pixels[1] = av_clip_uint8(block[1]);
381  pixels[2] = av_clip_uint8(block[2]);
382  pixels[3] = av_clip_uint8(block[3]);
383 
384  pixels += line_size;
385  block += 8;
386  }
387 }
388 
389 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
390  int line_size)
391 {
392  int i;
393 
394  /* read the pixels */
395  for(i=0;i<2;i++) {
396  pixels[0] = av_clip_uint8(block[0]);
397  pixels[1] = av_clip_uint8(block[1]);
398 
399  pixels += line_size;
400  block += 8;
401  }
402 }
403 
404 static void put_signed_pixels_clamped_c(const int16_t *block,
405  uint8_t *av_restrict pixels,
406  int line_size)
407 {
408  int i, j;
409 
410  for (i = 0; i < 8; i++) {
411  for (j = 0; j < 8; j++) {
412  if (*block < -128)
413  *pixels = 0;
414  else if (*block > 127)
415  *pixels = 255;
416  else
417  *pixels = (uint8_t)(*block + 128);
418  block++;
419  pixels++;
420  }
421  pixels += (line_size - 8);
422  }
423 }
424 
425 static void add_pixels8_c(uint8_t *av_restrict pixels,
426  int16_t *block,
427  int line_size)
428 {
429  int i;
430 
431  for(i=0;i<8;i++) {
432  pixels[0] += block[0];
433  pixels[1] += block[1];
434  pixels[2] += block[2];
435  pixels[3] += block[3];
436  pixels[4] += block[4];
437  pixels[5] += block[5];
438  pixels[6] += block[6];
439  pixels[7] += block[7];
440  pixels += line_size;
441  block += 8;
442  }
443 }
444 
445 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
446  int line_size)
447 {
448  int i;
449 
450  /* read the pixels */
451  for(i=0;i<8;i++) {
452  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
453  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
454  pixels[2] = av_clip_uint8(pixels[2] + block[2]);
455  pixels[3] = av_clip_uint8(pixels[3] + block[3]);
456  pixels[4] = av_clip_uint8(pixels[4] + block[4]);
457  pixels[5] = av_clip_uint8(pixels[5] + block[5]);
458  pixels[6] = av_clip_uint8(pixels[6] + block[6]);
459  pixels[7] = av_clip_uint8(pixels[7] + block[7]);
460  pixels += line_size;
461  block += 8;
462  }
463 }
464 
465 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
466  int line_size)
467 {
468  int i;
469 
470  /* read the pixels */
471  for(i=0;i<4;i++) {
472  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
473  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
474  pixels[2] = av_clip_uint8(pixels[2] + block[2]);
475  pixels[3] = av_clip_uint8(pixels[3] + block[3]);
476  pixels += line_size;
477  block += 8;
478  }
479 }
480 
481 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
482  int line_size)
483 {
484  int i;
485 
486  /* read the pixels */
487  for(i=0;i<2;i++) {
488  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
489  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
490  pixels += line_size;
491  block += 8;
492  }
493 }
494 
495 static int sum_abs_dctelem_c(int16_t *block)
496 {
497  int sum=0, i;
498  for(i=0; i<64; i++)
499  sum+= FFABS(block[i]);
500  return sum;
501 }
502 
503 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
504 {
505  int i;
506 
507  for (i = 0; i < h; i++) {
508  memset(block, value, 16);
509  block += line_size;
510  }
511 }
512 
513 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
514 {
515  int i;
516 
517  for (i = 0; i < h; i++) {
518  memset(block, value, 8);
519  block += line_size;
520  }
521 }
522 
523 #define avg2(a,b) ((a+b+1)>>1)
524 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
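/* Editor's note: avg2 is a 2-tap mean that rounds up on ties (the +1), and
 * avg4 a 4-tap mean rounded via +2 before >>2; the half-pel pix_abs16_*_c
 * SAD routines below are built on them. */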
525 
526 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
527 {
528  const int A=(16-x16)*(16-y16);
529  const int B=( x16)*(16-y16);
530  const int C=(16-x16)*( y16);
531  const int D=( x16)*( y16);
532  int i;
533 
534  for(i=0; i<h; i++)
535  {
536  dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
537  dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
538  dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
539  dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
540  dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
541  dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
542  dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
543  dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
544  dst+= stride;
545  src+= stride;
546  }
547 }
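/* Editor's note: the four gmc1_c weights form an exact fixed-point bilinear
 * blend; they always sum to 256, which the final >>8 renormalizes:
 * A+B+C+D = (16-x16)(16-y16) + x16(16-y16) + (16-x16)y16 + x16*y16
 *         = ((16-x16) + x16) * ((16-y16) + y16) = 16*16 = 256. */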
548 
549 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
550  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
551 {
552  int y, vx, vy;
553  const int s= 1<<shift;
554 
555  width--;
556  height--;
557 
558  for(y=0; y<h; y++){
559  int x;
560 
561  vx= ox;
562  vy= oy;
563  for(x=0; x<8; x++){ //XXX FIXME optimize
564  int src_x, src_y, frac_x, frac_y, index;
565 
566  src_x= vx>>16;
567  src_y= vy>>16;
568  frac_x= src_x&(s-1);
569  frac_y= src_y&(s-1);
570  src_x>>=shift;
571  src_y>>=shift;
572 
573  if((unsigned)src_x < width){
574  if((unsigned)src_y < height){
575  index= src_x + src_y*stride;
576  dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
577  + src[index +1]* frac_x )*(s-frac_y)
578  + ( src[index+stride ]*(s-frac_x)
579  + src[index+stride+1]* frac_x )* frac_y
580  + r)>>(shift*2);
581  }else{
582  index= src_x + av_clip(src_y, 0, height)*stride;
583  dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
584  + src[index +1]* frac_x )*s
585  + r)>>(shift*2);
586  }
587  }else{
588  if((unsigned)src_y < height){
589  index= av_clip(src_x, 0, width) + src_y*stride;
590  dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
591  + src[index+stride ]* frac_y )*s
592  + r)>>(shift*2);
593  }else{
594  index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
595  dst[y*stride + x]= src[index ];
596  }
597  }
598 
599  vx+= dxx;
600  vy+= dyx;
601  }
602  ox += dxy;
603  oy += dyy;
604  }
605 }
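/* Editor's note: ff_gmc_c walks an affine motion field in fixed point: the
 * source position (vx, vy) advances by (dxx, dyx) per output pixel within a
 * row, and the row origin (ox, oy) by (dxy, dyy) per row; the bilinear
 * weights come from the sub-pel remainder masked off with s-1, and samples
 * outside the picture are clamped to the border. */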
606 
607 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
608  switch(width){
609  case 2: put_pixels2_8_c (dst, src, stride, height); break;
610  case 4: put_pixels4_8_c (dst, src, stride, height); break;
611  case 8: put_pixels8_8_c (dst, src, stride, height); break;
612  case 16:put_pixels16_8_c(dst, src, stride, height); break;
613  }
614 }
615 
616 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
617  int i,j;
618  for (i=0; i < height; i++) {
619  for (j=0; j < width; j++) {
620  dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
621  }
622  src += stride;
623  dst += stride;
624  }
625 }
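/* Editor's note: the third-pel filters replace division by 3 (or 12) with a
 * fixed-point reciprocal: 683 ~= 2^11/3 and 2731 ~= 2^15/12, and the
 * approximation is exact over the 8-bit input range. An exhaustive check of
 * the horizontal case above, kept compiled out: */
#if 0
#include <assert.h>
static void tpel_rounding_check(void)
{
    int a, b;
    for (a = 0; a < 256; a++)
        for (b = 0; b < 256; b++)
            assert(((683 * (2*a + b + 1)) >> 11) == (2*a + b + 1) / 3);
}
#endif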
626 
627 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
628  int i,j;
629  for (i=0; i < height; i++) {
630  for (j=0; j < width; j++) {
631  dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
632  }
633  src += stride;
634  dst += stride;
635  }
636 }
637 
638 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
639  int i,j;
640  for (i=0; i < height; i++) {
641  for (j=0; j < width; j++) {
642  dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
643  }
644  src += stride;
645  dst += stride;
646  }
647 }
648 
649 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
650  int i,j;
651  for (i=0; i < height; i++) {
652  for (j=0; j < width; j++) {
653  dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
654  }
655  src += stride;
656  dst += stride;
657  }
658 }
659 
660 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
661  int i,j;
662  for (i=0; i < height; i++) {
663  for (j=0; j < width; j++) {
664  dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
665  }
666  src += stride;
667  dst += stride;
668  }
669 }
670 
671 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
672  int i,j;
673  for (i=0; i < height; i++) {
674  for (j=0; j < width; j++) {
675  dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
676  }
677  src += stride;
678  dst += stride;
679  }
680 }
681 
682 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
683  int i,j;
684  for (i=0; i < height; i++) {
685  for (j=0; j < width; j++) {
686  dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
687  }
688  src += stride;
689  dst += stride;
690  }
691 }
692 
693 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
694  int i,j;
695  for (i=0; i < height; i++) {
696  for (j=0; j < width; j++) {
697  dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
698  }
699  src += stride;
700  dst += stride;
701  }
702 }
703 
704 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
705  switch(width){
706  case 2: avg_pixels2_8_c (dst, src, stride, height); break;
707  case 4: avg_pixels4_8_c (dst, src, stride, height); break;
708  case 8: avg_pixels8_8_c (dst, src, stride, height); break;
709  case 16:avg_pixels16_8_c(dst, src, stride, height); break;
710  }
711 }
712 
713 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
714  int i,j;
715  for (i=0; i < height; i++) {
716  for (j=0; j < width; j++) {
717  dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
718  }
719  src += stride;
720  dst += stride;
721  }
722 }
723 
724 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
725  int i,j;
726  for (i=0; i < height; i++) {
727  for (j=0; j < width; j++) {
728  dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
729  }
730  src += stride;
731  dst += stride;
732  }
733 }
734 
735 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
736  int i,j;
737  for (i=0; i < height; i++) {
738  for (j=0; j < width; j++) {
739  dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
740  }
741  src += stride;
742  dst += stride;
743  }
744 }
745 
746 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
747  int i,j;
748  for (i=0; i < height; i++) {
749  for (j=0; j < width; j++) {
750  dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
751  }
752  src += stride;
753  dst += stride;
754  }
755 }
756 
757 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
758  int i,j;
759  for (i=0; i < height; i++) {
760  for (j=0; j < width; j++) {
761  dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
762  }
763  src += stride;
764  dst += stride;
765  }
766 }
767 
768 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
769  int i,j;
770  for (i=0; i < height; i++) {
771  for (j=0; j < width; j++) {
772  dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
773  }
774  src += stride;
775  dst += stride;
776  }
777 }
778 
779 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
780  int i,j;
781  for (i=0; i < height; i++) {
782  for (j=0; j < width; j++) {
783  dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
784  }
785  src += stride;
786  dst += stride;
787  }
788 }
789 
790 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
791  int i,j;
792  for (i=0; i < height; i++) {
793  for (j=0; j < width; j++) {
794  dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
795  }
796  src += stride;
797  dst += stride;
798  }
799 }
800 
801 #define QPEL_MC(r, OPNAME, RND, OP) \
802 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
803  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
804  int i;\
805  for(i=0; i<h; i++)\
806  {\
807  OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
808  OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
809  OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
810  OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
811  OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
812  OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
813  OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
814  OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
815  dst+=dstStride;\
816  src+=srcStride;\
817  }\
818 }\
819 \
820 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
821  const int w=8;\
822  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
823  int i;\
824  for(i=0; i<w; i++)\
825  {\
826  const int src0= src[0*srcStride];\
827  const int src1= src[1*srcStride];\
828  const int src2= src[2*srcStride];\
829  const int src3= src[3*srcStride];\
830  const int src4= src[4*srcStride];\
831  const int src5= src[5*srcStride];\
832  const int src6= src[6*srcStride];\
833  const int src7= src[7*srcStride];\
834  const int src8= src[8*srcStride];\
835  OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
836  OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
837  OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
838  OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
839  OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
840  OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
841  OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
842  OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
843  dst++;\
844  src++;\
845  }\
846 }\
847 \
848 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
849  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
850  int i;\
851  \
852  for(i=0; i<h; i++)\
853  {\
854  OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
855  OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
856  OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
857  OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
858  OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
859  OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
860  OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
861  OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
862  OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
863  OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
864  OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
865  OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
866  OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
867  OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
868  OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
869  OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
870  dst+=dstStride;\
871  src+=srcStride;\
872  }\
873 }\
874 \
875 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
876  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
877  int i;\
878  const int w=16;\
879  for(i=0; i<w; i++)\
880  {\
881  const int src0= src[0*srcStride];\
882  const int src1= src[1*srcStride];\
883  const int src2= src[2*srcStride];\
884  const int src3= src[3*srcStride];\
885  const int src4= src[4*srcStride];\
886  const int src5= src[5*srcStride];\
887  const int src6= src[6*srcStride];\
888  const int src7= src[7*srcStride];\
889  const int src8= src[8*srcStride];\
890  const int src9= src[9*srcStride];\
891  const int src10= src[10*srcStride];\
892  const int src11= src[11*srcStride];\
893  const int src12= src[12*srcStride];\
894  const int src13= src[13*srcStride];\
895  const int src14= src[14*srcStride];\
896  const int src15= src[15*srcStride];\
897  const int src16= src[16*srcStride];\
898  OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
899  OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
900  OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
901  OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
902  OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
903  OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
904  OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
905  OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
906  OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
907  OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
908  OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
909  OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
910  OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
911  OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
912  OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
913  OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
914  dst++;\
915  src++;\
916  }\
917 }\
918 \
919 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
920 {\
921  uint8_t half[64];\
922  put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
923  OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
924 }\
925 \
926 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
927 {\
928  OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
929 }\
930 \
931 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
932 {\
933  uint8_t half[64];\
934  put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
935  OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
936 }\
937 \
938 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
939 {\
940  uint8_t full[16*9];\
941  uint8_t half[64];\
942  copy_block9(full, src, 16, stride, 9);\
943  put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
944  OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
945 }\
946 \
947 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
948 {\
949  uint8_t full[16*9];\
950  copy_block9(full, src, 16, stride, 9);\
951  OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
952 }\
953 \
954 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
955 {\
956  uint8_t full[16*9];\
957  uint8_t half[64];\
958  copy_block9(full, src, 16, stride, 9);\
959  put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
960  OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
961 }\
962 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
963 {\
964  uint8_t full[16*9];\
965  uint8_t halfH[72];\
966  uint8_t halfV[64];\
967  uint8_t halfHV[64];\
968  copy_block9(full, src, 16, stride, 9);\
969  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
970  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
971  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
972  OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
973 }\
974 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
975 {\
976  uint8_t full[16*9];\
977  uint8_t halfH[72];\
978  uint8_t halfHV[64];\
979  copy_block9(full, src, 16, stride, 9);\
980  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
981  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
982  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
983  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
984 }\
985 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
986 {\
987  uint8_t full[16*9];\
988  uint8_t halfH[72];\
989  uint8_t halfV[64];\
990  uint8_t halfHV[64];\
991  copy_block9(full, src, 16, stride, 9);\
992  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
993  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
994  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
995  OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
996 }\
997 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
998 {\
999  uint8_t full[16*9];\
1000  uint8_t halfH[72];\
1001  uint8_t halfHV[64];\
1002  copy_block9(full, src, 16, stride, 9);\
1003  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1004  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1005  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1006  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1007 }\
1008 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1009 {\
1010  uint8_t full[16*9];\
1011  uint8_t halfH[72];\
1012  uint8_t halfV[64];\
1013  uint8_t halfHV[64];\
1014  copy_block9(full, src, 16, stride, 9);\
1015  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1017  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018  OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1019 }\
1020 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1021 {\
1022  uint8_t full[16*9];\
1023  uint8_t halfH[72];\
1024  uint8_t halfHV[64];\
1025  copy_block9(full, src, 16, stride, 9);\
1026  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1028  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1030 }\
1031 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1032 {\
1033  uint8_t full[16*9];\
1034  uint8_t halfH[72];\
1035  uint8_t halfV[64];\
1036  uint8_t halfHV[64];\
1037  copy_block9(full, src, 16, stride, 9);\
1038  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1039  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1040  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1041  OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1042 }\
1043 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1044 {\
1045  uint8_t full[16*9];\
1046  uint8_t halfH[72];\
1047  uint8_t halfHV[64];\
1048  copy_block9(full, src, 16, stride, 9);\
1049  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1050  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1051  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1052  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1053 }\
1054 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1055 {\
1056  uint8_t halfH[72];\
1057  uint8_t halfHV[64];\
1058  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1059  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1060  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1061 }\
1062 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1063 {\
1064  uint8_t halfH[72];\
1065  uint8_t halfHV[64];\
1066  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1067  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1068  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1069 }\
1070 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1071 {\
1072  uint8_t full[16*9];\
1073  uint8_t halfH[72];\
1074  uint8_t halfV[64];\
1075  uint8_t halfHV[64];\
1076  copy_block9(full, src, 16, stride, 9);\
1077  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1078  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1079  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1080  OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1081 }\
1082 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1083 {\
1084  uint8_t full[16*9];\
1085  uint8_t halfH[72];\
1086  copy_block9(full, src, 16, stride, 9);\
1087  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1088  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1089  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1090 }\
1091 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1092 {\
1093  uint8_t full[16*9];\
1094  uint8_t halfH[72];\
1095  uint8_t halfV[64];\
1096  uint8_t halfHV[64];\
1097  copy_block9(full, src, 16, stride, 9);\
1098  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1099  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1100  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1101  OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1102 }\
1103 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1104 {\
1105  uint8_t full[16*9];\
1106  uint8_t halfH[72];\
1107  copy_block9(full, src, 16, stride, 9);\
1108  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1109  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1110  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1111 }\
1112 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1113 {\
1114  uint8_t halfH[72];\
1115  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1116  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1117 }\
1118 \
1119 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1120 {\
1121  uint8_t half[256];\
1122  put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1123  OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1124 }\
1125 \
1126 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1127 {\
1128  OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1129 }\
1130 \
1131 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1132 {\
1133  uint8_t half[256];\
1134  put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1135  OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1136 }\
1137 \
1138 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1139 {\
1140  uint8_t full[24*17];\
1141  uint8_t half[256];\
1142  copy_block17(full, src, 24, stride, 17);\
1143  put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1144  OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1145 }\
1146 \
1147 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1148 {\
1149  uint8_t full[24*17];\
1150  copy_block17(full, src, 24, stride, 17);\
1151  OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1152 }\
1153 \
1154 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1155 {\
1156  uint8_t full[24*17];\
1157  uint8_t half[256];\
1158  copy_block17(full, src, 24, stride, 17);\
1159  put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1160  OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1161 }\
1162 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1163 {\
1164  uint8_t full[24*17];\
1165  uint8_t halfH[272];\
1166  uint8_t halfV[256];\
1167  uint8_t halfHV[256];\
1168  copy_block17(full, src, 24, stride, 17);\
1169  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1170  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1171  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1172  OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1173 }\
1174 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1175 {\
1176  uint8_t full[24*17];\
1177  uint8_t halfH[272];\
1178  uint8_t halfHV[256];\
1179  copy_block17(full, src, 24, stride, 17);\
1180  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1181  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1182  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1183  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1184 }\
1185 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1186 {\
1187  uint8_t full[24*17];\
1188  uint8_t halfH[272];\
1189  uint8_t halfV[256];\
1190  uint8_t halfHV[256];\
1191  copy_block17(full, src, 24, stride, 17);\
1192  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1193  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1194  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1195  OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1196 }\
1197 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1198 {\
1199  uint8_t full[24*17];\
1200  uint8_t halfH[272];\
1201  uint8_t halfHV[256];\
1202  copy_block17(full, src, 24, stride, 17);\
1203  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1204  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1205  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1206  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1207 }\
1208 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1209 {\
1210  uint8_t full[24*17];\
1211  uint8_t halfH[272];\
1212  uint8_t halfV[256];\
1213  uint8_t halfHV[256];\
1214  copy_block17(full, src, 24, stride, 17);\
1215  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1217  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218  OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1219 }\
1220 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1221 {\
1222  uint8_t full[24*17];\
1223  uint8_t halfH[272];\
1224  uint8_t halfHV[256];\
1225  copy_block17(full, src, 24, stride, 17);\
1226  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1227  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1228  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1229  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1230 }\
1231 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1232 {\
1233  uint8_t full[24*17];\
1234  uint8_t halfH[272];\
1235  uint8_t halfV[256];\
1236  uint8_t halfHV[256];\
1237  copy_block17(full, src, 24, stride, 17);\
1238  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1239  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1240  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1241  OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1242 }\
1243 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1244 {\
1245  uint8_t full[24*17];\
1246  uint8_t halfH[272];\
1247  uint8_t halfHV[256];\
1248  copy_block17(full, src, 24, stride, 17);\
1249  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1250  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1251  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1252  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1253 }\
1254 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1255 {\
1256  uint8_t halfH[272];\
1257  uint8_t halfHV[256];\
1258  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1259  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1260  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1261 }\
1262 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1263 {\
1264  uint8_t halfH[272];\
1265  uint8_t halfHV[256];\
1266  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1267  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1268  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1269 }\
1270 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1271 {\
1272  uint8_t full[24*17];\
1273  uint8_t halfH[272];\
1274  uint8_t halfV[256];\
1275  uint8_t halfHV[256];\
1276  copy_block17(full, src, 24, stride, 17);\
1277  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1278  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1279  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1280  OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1281 }\
1282 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1283 {\
1284  uint8_t full[24*17];\
1285  uint8_t halfH[272];\
1286  copy_block17(full, src, 24, stride, 17);\
1287  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1288  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1289  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1290 }\
1291 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1292 {\
1293  uint8_t full[24*17];\
1294  uint8_t halfH[272];\
1295  uint8_t halfV[256];\
1296  uint8_t halfHV[256];\
1297  copy_block17(full, src, 24, stride, 17);\
1298  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1299  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1300  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1301  OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1302 }\
1303 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1304 {\
1305  uint8_t full[24*17];\
1306  uint8_t halfH[272];\
1307  copy_block17(full, src, 24, stride, 17);\
1308  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1309  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1310  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1311 }\
1312 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1313 {\
1314  uint8_t halfH[272];\
1315  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1316  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1317 }
1318 
1319 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1320 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1321 #define op_put(a, b) a = cm[((b) + 16)>>5]
1322 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
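/* Editor's note: the qpel lowpass taps (20, -6, 3, -1) above are applied to
 * pixel pairs, so the coefficients sum to 2*(20-6+3-1) = 32; op_put's
 * ((b)+16)>>5 is therefore a divide by 32 with rounding, and the no_rnd
 * variants add 15 instead of 16 to round down on ties. */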
1323 
1324 QPEL_MC(0, put_ , _ , op_put)
1325 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1326 QPEL_MC(0, avg_ , _ , op_avg)
1327 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1328 #undef op_avg
1329 #undef op_avg_no_rnd
1330 #undef op_put
1331 #undef op_put_no_rnd
1332 
1333 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1334 {
1335  put_pixels8_8_c(dst, src, stride, 8);
1336 }
1337 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1338 {
1339  avg_pixels8_8_c(dst, src, stride, 8);
1340 }
1341 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1342 {
1343  put_pixels16_8_c(dst, src, stride, 16);
1344 }
1345 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1346 {
1347  avg_pixels16_8_c(dst, src, stride, 16);
1348 }
1349 
1350 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1351 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1352 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1353 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1354 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1355 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1356 
1357 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1358  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1359  int i;
1360 
1361  for(i=0; i<h; i++){
1362  dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1363  dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1364  dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1365  dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1366  dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1367  dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1368  dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1369  dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1370  dst+=dstStride;
1371  src+=srcStride;
1372  }
1373 }
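/* Editor's note: the WMV2 mspel taps (-1, 9, 9, -1) sum to 16, so the
 * (... + 8) >> 4 is a rounded divide by 16; since the negative taps can
 * overshoot [0,255], the result goes through the ff_cropTbl clamp. */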
1374 
1375 #if CONFIG_RV40_DECODER
1376 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1377 {
1378  put_pixels16_xy2_8_c(dst, src, stride, 16);
1379 }
1380 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1381 {
1382  avg_pixels16_xy2_8_c(dst, src, stride, 16);
1383 }
1384 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1385 {
1386  put_pixels8_xy2_8_c(dst, src, stride, 8);
1387 }
1388 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1389 {
1390  avg_pixels8_xy2_8_c(dst, src, stride, 8);
1391 }
1392 #endif /* CONFIG_RV40_DECODER */
1393 
1394 #if CONFIG_DIRAC_DECODER
1395 #define DIRAC_MC(OPNAME)\
1396 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1397 {\
1398  OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1399 }\
1400 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1401 {\
1402  OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1403 }\
1404 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1405 {\
1406  OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
1407  OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1408 }\
1409 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1410 {\
1411  OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1412 }\
1413 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1414 {\
1415  OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1416 }\
1417 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1418 {\
1419  OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
1420  OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1421 }\
1422 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1423 {\
1424  OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1425 }\
1426 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1427 {\
1428  OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1429 }\
1430 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1431 {\
1432  OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
1433  OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
1434 }
1435 DIRAC_MC(put)
1436 DIRAC_MC(avg)
1437 #endif
1438 
1439 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1440  const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1441  int i;
1442 
1443  for(i=0; i<w; i++){
1444  const int src_1= src[ -srcStride];
1445  const int src0 = src[0 ];
1446  const int src1 = src[ srcStride];
1447  const int src2 = src[2*srcStride];
1448  const int src3 = src[3*srcStride];
1449  const int src4 = src[4*srcStride];
1450  const int src5 = src[5*srcStride];
1451  const int src6 = src[6*srcStride];
1452  const int src7 = src[7*srcStride];
1453  const int src8 = src[8*srcStride];
1454  const int src9 = src[9*srcStride];
1455  dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1456  dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1457  dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1458  dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1459  dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1460  dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1461  dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1462  dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1463  src++;
1464  dst++;
1465  }
1466 }
1467 
1468 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1469 {
1470  uint8_t half[64];
1471  wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1472  put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1473 }
1474 
1475 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1476 {
1477  wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1478 }
1479 
1480 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1481 {
1482  uint8_t half[64];
1483  wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1484  put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1485 }
1486 
1487 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1488 {
1489  wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1490 }
1491 
1492 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1493 {
1494  uint8_t halfH[88];
1495  uint8_t halfV[64];
1496  uint8_t halfHV[64];
1497  wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1498  wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1499  wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1500  put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1501 }
1502 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1503 {
1504  uint8_t halfH[88];
1505  uint8_t halfV[64];
1506  uint8_t halfHV[64];
1507  wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1508  wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1509  wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1510  put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1511 }
1512 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1513 {
1514  uint8_t halfH[88];
1515  wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1516  wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
1517 }
1518 
1519 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1520  if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1521  int x;
1522  const int strength= ff_h263_loop_filter_strength[qscale];
1523 
1524  for(x=0; x<8; x++){
1525  int d1, d2, ad1;
1526  int p0= src[x-2*stride];
1527  int p1= src[x-1*stride];
1528  int p2= src[x+0*stride];
1529  int p3= src[x+1*stride];
1530  int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1531 
1532  if (d<-2*strength) d1= 0;
1533  else if(d<- strength) d1=-2*strength - d;
1534  else if(d< strength) d1= d;
1535  else if(d< 2*strength) d1= 2*strength - d;
1536  else d1= 0;
1537 
1538  p1 += d1;
1539  p2 -= d1;
1540  if(p1&256) p1= ~(p1>>31);
1541  if(p2&256) p2= ~(p2>>31);
1542 
1543  src[x-1*stride] = p1;
1544  src[x+0*stride] = p2;
1545 
1546  ad1= FFABS(d1)>>1;
1547 
1548  d2= av_clip((p0-p3)/4, -ad1, ad1);
1549 
1550  src[x-2*stride] = p0 - d2;
1551  src[x+ stride] = p3 + d2;
1552  }
1553  }
1554 }
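/* Editor's note: "if (p1&256) p1 = ~(p1>>31);" is a compact clip to
 * [0,255], assuming the value stays within [-256, 511]: out-of-range values
 * have bit 8 or the sign bit set; p1>>31 is 0 for positive overflow (~0 ==
 * -1, stored as 255 in the uint8_t) and -1 for negative values (~(-1) == 0). */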
1555 
1556 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1557  if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1558  int y;
1559  const int strength= ff_h263_loop_filter_strength[qscale];
1560 
1561  for(y=0; y<8; y++){
1562  int d1, d2, ad1;
1563  int p0= src[y*stride-2];
1564  int p1= src[y*stride-1];
1565  int p2= src[y*stride+0];
1566  int p3= src[y*stride+1];
1567  int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1568 
1569  if (d<-2*strength) d1= 0;
1570  else if(d<- strength) d1=-2*strength - d;
1571  else if(d< strength) d1= d;
1572  else if(d< 2*strength) d1= 2*strength - d;
1573  else d1= 0;
1574 
1575  p1 += d1;
1576  p2 -= d1;
1577  if(p1&256) p1= ~(p1>>31);
1578  if(p2&256) p2= ~(p2>>31);
1579 
1580  src[y*stride-1] = p1;
1581  src[y*stride+0] = p2;
1582 
1583  ad1= FFABS(d1)>>1;
1584 
1585  d2= av_clip((p0-p3)/4, -ad1, ad1);
1586 
1587  src[y*stride-2] = p0 - d2;
1588  src[y*stride+1] = p3 + d2;
1589  }
1590  }
1591 }
1592 
1593 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1594 {
1595  int s, i;
1596 
1597  s = 0;
1598  for(i=0;i<h;i++) {
1599  s += abs(pix1[0] - pix2[0]);
1600  s += abs(pix1[1] - pix2[1]);
1601  s += abs(pix1[2] - pix2[2]);
1602  s += abs(pix1[3] - pix2[3]);
1603  s += abs(pix1[4] - pix2[4]);
1604  s += abs(pix1[5] - pix2[5]);
1605  s += abs(pix1[6] - pix2[6]);
1606  s += abs(pix1[7] - pix2[7]);
1607  s += abs(pix1[8] - pix2[8]);
1608  s += abs(pix1[9] - pix2[9]);
1609  s += abs(pix1[10] - pix2[10]);
1610  s += abs(pix1[11] - pix2[11]);
1611  s += abs(pix1[12] - pix2[12]);
1612  s += abs(pix1[13] - pix2[13]);
1613  s += abs(pix1[14] - pix2[14]);
1614  s += abs(pix1[15] - pix2[15]);
1615  pix1 += line_size;
1616  pix2 += line_size;
1617  }
1618  return s;
1619 }
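/* pix_abs16_c is a plain 16-wide SAD (sum of absolute differences).
 * The _x2, _y2 and _xy2 variants below score pix1 against a half-pel
 * interpolated reference, averaging two horizontal, two vertical or
 * four diagonal neighbours of pix2 on the fly with avg2()/avg4(), so
 * the motion estimator can evaluate half-pel candidates without first
 * writing out an interpolated block. */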
1620 
1621 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1622 {
1623  int s, i;
1624 
1625  s = 0;
1626  for(i=0;i<h;i++) {
1627  s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1628  s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1629  s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1630  s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1631  s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1632  s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1633  s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1634  s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1635  s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1636  s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1637  s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1638  s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1639  s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1640  s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1641  s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1642  s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1643  pix1 += line_size;
1644  pix2 += line_size;
1645  }
1646  return s;
1647 }
1648 
1649 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1650 {
1651  int s, i;
1652  uint8_t *pix3 = pix2 + line_size;
1653 
1654  s = 0;
1655  for(i=0;i<h;i++) {
1656  s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1657  s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1658  s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1659  s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1660  s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1661  s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1662  s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1663  s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1664  s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1665  s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1666  s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1667  s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1668  s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1669  s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1670  s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1671  s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1672  pix1 += line_size;
1673  pix2 += line_size;
1674  pix3 += line_size;
1675  }
1676  return s;
1677 }
1678 
1679 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1680 {
1681  int s, i;
1682  uint8_t *pix3 = pix2 + line_size;
1683 
1684  s = 0;
1685  for(i=0;i<h;i++) {
1686  s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1687  s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1688  s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1689  s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1690  s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1691  s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1692  s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1693  s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1694  s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1695  s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1696  s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1697  s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1698  s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1699  s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1700  s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1701  s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1702  pix1 += line_size;
1703  pix2 += line_size;
1704  pix3 += line_size;
1705  }
1706  return s;
1707 }
1708 
1709 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1710 {
1711  int s, i;
1712 
1713  s = 0;
1714  for(i=0;i<h;i++) {
1715  s += abs(pix1[0] - pix2[0]);
1716  s += abs(pix1[1] - pix2[1]);
1717  s += abs(pix1[2] - pix2[2]);
1718  s += abs(pix1[3] - pix2[3]);
1719  s += abs(pix1[4] - pix2[4]);
1720  s += abs(pix1[5] - pix2[5]);
1721  s += abs(pix1[6] - pix2[6]);
1722  s += abs(pix1[7] - pix2[7]);
1723  pix1 += line_size;
1724  pix2 += line_size;
1725  }
1726  return s;
1727 }
1728 
1729 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1730 {
1731  int s, i;
1732 
1733  s = 0;
1734  for(i=0;i<h;i++) {
1735  s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1736  s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1737  s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1738  s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1739  s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1740  s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1741  s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1742  s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1743  pix1 += line_size;
1744  pix2 += line_size;
1745  }
1746  return s;
1747 }
1748 
1749 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1750 {
1751  int s, i;
1752  uint8_t *pix3 = pix2 + line_size;
1753 
1754  s = 0;
1755  for(i=0;i<h;i++) {
1756  s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1757  s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1758  s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1759  s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1760  s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1761  s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1762  s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1763  s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1764  pix1 += line_size;
1765  pix2 += line_size;
1766  pix3 += line_size;
1767  }
1768  return s;
1769 }
1770 
1771 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1772 {
1773  int s, i;
1774  uint8_t *pix3 = pix2 + line_size;
1775 
1776  s = 0;
1777  for(i=0;i<h;i++) {
1778  s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1779  s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1780  s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1781  s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1782  s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1783  s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1784  s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1785  s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1786  pix1 += line_size;
1787  pix2 += line_size;
1788  pix3 += line_size;
1789  }
1790  return s;
1791 }
1792 
1793 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1794  MpegEncContext *c = v;
1795  int score1=0;
1796  int score2=0;
1797  int x,y;
1798 
1799  for(y=0; y<h; y++){
1800  for(x=0; x<16; x++){
1801  score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1802  }
1803  if(y+1<h){
1804  for(x=0; x<15; x++){
1805  score2+= FFABS( s1[x ] - s1[x +stride]
1806  - s1[x+1] + s1[x+1+stride])
1807  -FFABS( s2[x ] - s2[x +stride]
1808  - s2[x+1] + s2[x+1+stride]);
1809  }
1810  }
1811  s1+= stride;
1812  s2+= stride;
1813  }
1814 
1815  if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1816  else return score1 + FFABS(score2)*8;
1817 }
1818 
1819 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1820  MpegEncContext *c = v;
1821  int score1=0;
1822  int score2=0;
1823  int x,y;
1824 
1825  for(y=0; y<h; y++){
1826  for(x=0; x<8; x++){
1827  score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1828  }
1829  if(y+1<h){
1830  for(x=0; x<7; x++){
1831  score2+= FFABS( s1[x ] - s1[x +stride]
1832  - s1[x+1] + s1[x+1+stride])
1833  -FFABS( s2[x ] - s2[x +stride]
1834  - s2[x+1] + s2[x+1+stride]);
1835  }
1836  }
1837  s1+= stride;
1838  s2+= stride;
1839  }
1840 
1841  if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1842  else return score1 + FFABS(score2)*8;
1843 }
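/* The NSSE ("noise preserving" SSE) metrics combine plain squared error
 * (score1) with the difference in local 2x2 gradient structure between
 * the two blocks (score2); a candidate that smears away texture scores
 * badly on score2 even when its SSE is low. The weight is taken from
 * avctx->nsse_weight when a context is available, with 8 as fallback. */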
1844 
1845 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1846  int i;
1847  unsigned int sum=0;
1848 
1849  for(i=0; i<8*8; i++){
1850  int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1851  int w= weight[i];
1852  b>>= RECON_SHIFT;
1853  av_assert2(-512<b && b<512);
1854 
1855  sum += (w*b)*(w*b)>>4;
1856  }
1857  return sum>>2;
1858 }
1859 
1860 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1861  int i;
1862 
1863  for(i=0; i<8*8; i++){
1864  rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1865  }
1866 }
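/* try_8x8basis/add_8x8basis appear to serve the encoder's RD coefficient
 * refinement: try_8x8basis estimates the weighted squared error left if
 * `basis` were blended into the residual at the given scale, and
 * add_8x8basis commits that change. Both use the same
 * (1 << (BASIS_SHIFT - RECON_SHIFT - 1)) rounding term when converting
 * from basis precision down to reconstruction precision. */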
1867 
1868 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
1869  return 0;
1870 }
1871 
1872 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1873  int i;
1874 
1875  memset(cmp, 0, sizeof(void*)*6);
1876 
1877  for(i=0; i<6; i++){
1878  switch(type&0xFF){
1879  case FF_CMP_SAD:
1880  cmp[i]= c->sad[i];
1881  break;
1882  case FF_CMP_SATD:
1883  cmp[i]= c->hadamard8_diff[i];
1884  break;
1885  case FF_CMP_SSE:
1886  cmp[i]= c->sse[i];
1887  break;
1888  case FF_CMP_DCT:
1889  cmp[i]= c->dct_sad[i];
1890  break;
1891  case FF_CMP_DCT264:
1892  cmp[i]= c->dct264_sad[i];
1893  break;
1894  case FF_CMP_DCTMAX:
1895  cmp[i]= c->dct_max[i];
1896  break;
1897  case FF_CMP_PSNR:
1898  cmp[i]= c->quant_psnr[i];
1899  break;
1900  case FF_CMP_BIT:
1901  cmp[i]= c->bit[i];
1902  break;
1903  case FF_CMP_RD:
1904  cmp[i]= c->rd[i];
1905  break;
1906  case FF_CMP_VSAD:
1907  cmp[i]= c->vsad[i];
1908  break;
1909  case FF_CMP_VSSE:
1910  cmp[i]= c->vsse[i];
1911  break;
1912  case FF_CMP_ZERO:
1913  cmp[i]= zero_cmp;
1914  break;
1915  case FF_CMP_NSSE:
1916  cmp[i]= c->nsse[i];
1917  break;
1918 #if CONFIG_DWT
1919  case FF_CMP_W53:
1920  cmp[i]= c->w53[i];
1921  break;
1922  case FF_CMP_W97:
1923  cmp[i]= c->w97[i];
1924  break;
1925 #endif
1926  default:
1927  av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1928  }
1929  }
1930 }
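/* ff_set_cmp() fills a 6-entry comparison-function table from the
 * FF_CMP_* id in the low byte of `type`: slot [0] holds the 16x16
 * variant and [1] the 8x8 one (further slots are used by some metrics,
 * e.g. vsad/vsse intra). A caller would typically do something like
 * ff_set_cmp(&s->dsp, s->dsp.me_cmp, avctx->me_cmp) -- illustrative
 * only, see the motion estimation code for the real call sites.
 * Unrecognized ids leave the zeroed table in place and log an error. */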
1931 
1932 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1933  long i;
1934  for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
1935  long a = *(long*)(src+i);
1936  long b = *(long*)(dst+i);
1937  *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1938  }
1939  for(; i<w; i++)
1940  dst[i+0] += src[i+0];
1941 }
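/* add_bytes_c adds bytes SIMD-within-a-register: the pb_7f mask keeps
 * only the low 7 bits of each byte so their sum cannot carry into the
 * neighbouring byte, then the top bits are patched in with
 * ((a^b) & pb_80), the msb of each per-byte sum being the xor of the
 * operand msbs with the carry already sitting in the masked sum.
 * E.g. for single bytes a=0x85, b=0x83:
 * (0x05 + 0x03) ^ ((0x85^0x83) & 0x80) = 0x08, which matches
 * (0x85 + 0x83) & 0xFF. */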
1942 
1943 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1944  long i;
1945 #if !HAVE_FAST_UNALIGNED
1946  if((long)src2 & (sizeof(long)-1)){
1947  for(i=0; i+7<w; i+=8){
1948  dst[i+0] = src1[i+0]-src2[i+0];
1949  dst[i+1] = src1[i+1]-src2[i+1];
1950  dst[i+2] = src1[i+2]-src2[i+2];
1951  dst[i+3] = src1[i+3]-src2[i+3];
1952  dst[i+4] = src1[i+4]-src2[i+4];
1953  dst[i+5] = src1[i+5]-src2[i+5];
1954  dst[i+6] = src1[i+6]-src2[i+6];
1955  dst[i+7] = src1[i+7]-src2[i+7];
1956  }
1957  }else
1958 #endif
1959  for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
1960  long a = *(long*)(src1+i);
1961  long b = *(long*)(src2+i);
1962  *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1963  }
1964  for(; i<w; i++)
1965  dst[i+0] = src1[i+0]-src2[i+0];
1966 }
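/* diff_bytes_c is the subtraction counterpart of add_bytes_c with the
 * same per-byte carry containment: setting the top bit of every byte of
 * src1 (a|pb_80) guarantees the per-byte subtraction can never borrow
 * from its neighbour, and the final xor restores the correct sign bit.
 * The byte-wise fallback above it is only compiled in for targets
 * without fast unaligned loads. */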
1967 
1968 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1969  int i;
1970  uint8_t l, lt;
1971 
1972  l= *left;
1973  lt= *left_top;
1974 
1975  for(i=0; i<w; i++){
1976  l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1977  lt= src1[i];
1978  dst[i]= l;
1979  }
1980 
1981  *left= l;
1982  *left_top= lt;
1983 }
1984 
1985 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1986  int i;
1987  uint8_t l, lt;
1988 
1989  l= *left;
1990  lt= *left_top;
1991 
1992  for(i=0; i<w; i++){
1993  const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1994  lt= src1[i];
1995  l= src2[i];
1996  dst[i]= l - pred;
1997  }
1998 
1999  *left= l;
2000  *left_top= lt;
2001 }
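/* HuffYUV median prediction: mid_pred(l, t, l+t-tl) picks the median of
 * the left neighbour, the top neighbour and their gradient prediction.
 * add_hfyu_median_prediction_c reconstructs pixels from stored residuals
 * (decode side); sub_hfyu_median_prediction_c produces the residuals
 * (encode side). In both, src1 is the previous line. */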
2002 
2003 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
2004  int i;
2005 
2006  for(i=0; i<w-1; i++){
2007  acc+= src[i];
2008  dst[i]= acc;
2009  i++;
2010  acc+= src[i];
2011  dst[i]= acc;
2012  }
2013 
2014  for(; i<w; i++){
2015  acc+= src[i];
2016  dst[i]= acc;
2017  }
2018 
2019  return acc;
2020 }
2021 
2022 #if HAVE_BIGENDIAN
2023 #define B 3
2024 #define G 2
2025 #define R 1
2026 #define A 0
2027 #else
2028 #define B 0
2029 #define G 1
2030 #define R 2
2031 #define A 3
2032 #endif
2033 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
2034  int i;
2035  int r,g,b,a;
2036  r= *red;
2037  g= *green;
2038  b= *blue;
2039  a= *alpha;
2040 
2041  for(i=0; i<w; i++){
2042  b+= src[4*i+B];
2043  g+= src[4*i+G];
2044  r+= src[4*i+R];
2045  a+= src[4*i+A];
2046 
2047  dst[4*i+B]= b;
2048  dst[4*i+G]= g;
2049  dst[4*i+R]= r;
2050  dst[4*i+A]= a;
2051  }
2052 
2053  *red= r;
2054  *green= g;
2055  *blue= b;
2056  *alpha= a;
2057 }
2058 #undef B
2059 #undef G
2060 #undef R
2061 #undef A
2062 
2063 #define BUTTERFLY2(o1,o2,i1,i2) \
2064 o1= (i1)+(i2);\
2065 o2= (i1)-(i2);
2066 
2067 #define BUTTERFLY1(x,y) \
2068 {\
2069  int a,b;\
2070  a= x;\
2071  b= y;\
2072  x= a+b;\
2073  y= a-b;\
2074 }
2075 
2076 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2077 
2078 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2079  int i;
2080  int temp[64];
2081  int sum=0;
2082 
2083  av_assert2(h==8);
2084 
2085  for(i=0; i<8; i++){
2086  //FIXME try pointer walks
2087  BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2088  BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2089  BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2090  BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2091 
2092  BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2093  BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2094  BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2095  BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2096 
2097  BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2098  BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2099  BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2100  BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2101  }
2102 
2103  for(i=0; i<8; i++){
2104  BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2105  BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2106  BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2107  BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2108 
2109  BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2110  BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2111  BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2112  BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2113 
2114  sum +=
2115  BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2116  +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2117  +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2118  +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2119  }
2120  return sum;
2121 }
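/* This is SATD: the first loop applies an 8x8 Hadamard transform to the
 * rows of the src-dst difference, the second to the columns, and the
 * result is the sum of absolute transform coefficients -- a cheap
 * frequency-domain distortion measure. The intra variant below
 * transforms the block itself and subtracts the DC term (the "-mean"
 * line) so that a constant offset does not dominate the score. */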
2122 
2123 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2124  int i;
2125  int temp[64];
2126  int sum=0;
2127 
2128  av_assert2(h==8);
2129 
2130  for(i=0; i<8; i++){
2131  //FIXME try pointer walks
2132  BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2133  BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2134  BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2135  BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2136 
2137  BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2138  BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2139  BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2140  BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2141 
2142  BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2143  BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2144  BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2145  BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2146  }
2147 
2148  for(i=0; i<8; i++){
2149  BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2150  BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2151  BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2152  BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2153 
2154  BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2155  BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2156  BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2157  BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2158 
2159  sum +=
2160  BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2161  +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2162  +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2163  +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2164  }
2165 
2166  sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
2167 
2168  return sum;
2169 }
2170 
2171 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2172  MpegEncContext * const s= (MpegEncContext *)c;
2173  LOCAL_ALIGNED_16(int16_t, temp, [64]);
2174 
2175  av_assert2(h==8);
2176 
2177  s->dsp.diff_pixels(temp, src1, src2, stride);
2178  s->dsp.fdct(temp);
2179  return s->dsp.sum_abs_dctelem(temp);
2180 }
2181 
2182 #if CONFIG_GPL
2183 #define DCT8_1D {\
2184  const int s07 = SRC(0) + SRC(7);\
2185  const int s16 = SRC(1) + SRC(6);\
2186  const int s25 = SRC(2) + SRC(5);\
2187  const int s34 = SRC(3) + SRC(4);\
2188  const int a0 = s07 + s34;\
2189  const int a1 = s16 + s25;\
2190  const int a2 = s07 - s34;\
2191  const int a3 = s16 - s25;\
2192  const int d07 = SRC(0) - SRC(7);\
2193  const int d16 = SRC(1) - SRC(6);\
2194  const int d25 = SRC(2) - SRC(5);\
2195  const int d34 = SRC(3) - SRC(4);\
2196  const int a4 = d16 + d25 + (d07 + (d07>>1));\
2197  const int a5 = d07 - d34 - (d25 + (d25>>1));\
2198  const int a6 = d07 + d34 - (d16 + (d16>>1));\
2199  const int a7 = d16 - d25 + (d34 + (d34>>1));\
2200  DST(0, a0 + a1 ) ;\
2201  DST(1, a4 + (a7>>2)) ;\
2202  DST(2, a2 + (a3>>1)) ;\
2203  DST(3, a5 + (a6>>2)) ;\
2204  DST(4, a0 - a1 ) ;\
2205  DST(5, a6 - (a5>>2)) ;\
2206  DST(6, (a2>>1) - a3 ) ;\
2207  DST(7, (a4>>2) - a7 ) ;\
2208 }
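/* DCT8_1D appears to be the 1-D integer 8x8 transform in the style of
 * the H.264 high-profile DCT (note the >>1 and >>2 scaled terms).
 * dct264_sad8x8_c runs it over the rows writing back into dct[], then
 * redefines DST() and runs it over the columns accumulating FFABS(v),
 * which avoids a second temporary array. */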
2209 
2210 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2211  MpegEncContext * const s= (MpegEncContext *)c;
2212  int16_t dct[8][8];
2213  int i;
2214  int sum=0;
2215 
2216  s->dsp.diff_pixels(dct[0], src1, src2, stride);
2217 
2218 #define SRC(x) dct[i][x]
2219 #define DST(x,v) dct[i][x]= v
2220  for( i = 0; i < 8; i++ )
2221  DCT8_1D
2222 #undef SRC
2223 #undef DST
2224 
2225 #define SRC(x) dct[x][i]
2226 #define DST(x,v) sum += FFABS(v)
2227  for( i = 0; i < 8; i++ )
2228  DCT8_1D
2229 #undef SRC
2230 #undef DST
2231  return sum;
2232 }
2233 #endif
2234 
2235 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2236  MpegEncContext * const s= (MpegEncContext *)c;
2237  LOCAL_ALIGNED_16(int16_t, temp, [64]);
2238  int sum=0, i;
2239 
2240  av_assert2(h==8);
2241 
2242  s->dsp.diff_pixels(temp, src1, src2, stride);
2243  s->dsp.fdct(temp);
2244 
2245  for(i=0; i<64; i++)
2246  sum= FFMAX(sum, FFABS(temp[i]));
2247 
2248  return sum;
2249 }
2250 
2251 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2252  MpegEncContext * const s= (MpegEncContext *)c;
2253  LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2254  int16_t * const bak = temp+64;
2255  int sum=0, i;
2256 
2257  av_assert2(h==8);
2258  s->mb_intra=0;
2259 
2260  s->dsp.diff_pixels(temp, src1, src2, stride);
2261 
2262  memcpy(bak, temp, 64*sizeof(int16_t));
2263 
2264  s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2265  s->dct_unquantize_inter(s, temp, 0, s->qscale);
2266  ff_simple_idct_8(temp); //FIXME
2267 
2268  for(i=0; i<64; i++)
2269  sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2270 
2271  return sum;
2272 }
2273 
2274 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2275  MpegEncContext * const s= (MpegEncContext *)c;
2276  const uint8_t *scantable= s->intra_scantable.permutated;
2277  LOCAL_ALIGNED_16(int16_t, temp, [64]);
2278  LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2279  LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2280  int i, last, run, bits, level, distortion, start_i;
2281  const int esc_length= s->ac_esc_length;
2282  uint8_t * length;
2283  uint8_t * last_length;
2284 
2285  av_assert2(h==8);
2286 
2287  copy_block8(lsrc1, src1, 8, stride, 8);
2288  copy_block8(lsrc2, src2, 8, stride, 8);
2289 
2290  s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2291 
2292  s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2293 
2294  bits=0;
2295 
2296  if (s->mb_intra) {
2297  start_i = 1;
2298  length = s->intra_ac_vlc_length;
2299  last_length= s->intra_ac_vlc_last_length;
2300  bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2301  } else {
2302  start_i = 0;
2303  length = s->inter_ac_vlc_length;
2304  last_length= s->inter_ac_vlc_last_length;
2305  }
2306 
2307  if(last>=start_i){
2308  run=0;
2309  for(i=start_i; i<last; i++){
2310  int j= scantable[i];
2311  level= temp[j];
2312 
2313  if(level){
2314  level+=64;
2315  if((level&(~127)) == 0){
2316  bits+= length[UNI_AC_ENC_INDEX(run, level)];
2317  }else
2318  bits+= esc_length;
2319  run=0;
2320  }else
2321  run++;
2322  }
2323  i= scantable[last];
2324 
2325  level= temp[i] + 64;
2326 
2327  av_assert2(level - 64);
2328 
2329  if((level&(~127)) == 0){
2330  bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2331  }else
2332  bits+= esc_length;
2333 
2334  }
2335 
2336  if(last>=0){
2337  if(s->mb_intra)
2338  s->dct_unquantize_intra(s, temp, 0, s->qscale);
2339  else
2340  s->dct_unquantize_inter(s, temp, 0, s->qscale);
2341  }
2342 
2343  s->dsp.idct_add(lsrc2, 8, temp);
2344 
2345  distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2346 
2347  return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2348 }
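/* rd8x8_c computes an actual rate-distortion score: it quantizes the
 * difference block, counts the VLC bits its run/level pairs would cost
 * (escapes priced at ac_esc_length), then dequantizes, applies the IDCT
 * and measures the SSE against the original. The final
 * (bits*qscale*qscale*109 + 64) >> 7 term folds the bit count into
 * distortion units; the 109/128 factor is presumably an empirically
 * tuned lambda-style constant. */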
2349 
2350 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2351  MpegEncContext * const s= (MpegEncContext *)c;
2352  const uint8_t *scantable= s->intra_scantable.permutated;
2353  LOCAL_ALIGNED_16(int16_t, temp, [64]);
2354  int i, last, run, bits, level, start_i;
2355  const int esc_length= s->ac_esc_length;
2356  uint8_t * length;
2357  uint8_t * last_length;
2358 
2359  av_assert2(h==8);
2360 
2361  s->dsp.diff_pixels(temp, src1, src2, stride);
2362 
2363  s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2364 
2365  bits=0;
2366 
2367  if (s->mb_intra) {
2368  start_i = 1;
2369  length = s->intra_ac_vlc_length;
2370  last_length= s->intra_ac_vlc_last_length;
2371  bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2372  } else {
2373  start_i = 0;
2374  length = s->inter_ac_vlc_length;
2375  last_length= s->inter_ac_vlc_last_length;
2376  }
2377 
2378  if(last>=start_i){
2379  run=0;
2380  for(i=start_i; i<last; i++){
2381  int j= scantable[i];
2382  level= temp[j];
2383 
2384  if(level){
2385  level+=64;
2386  if((level&(~127)) == 0){
2387  bits+= length[UNI_AC_ENC_INDEX(run, level)];
2388  }else
2389  bits+= esc_length;
2390  run=0;
2391  }else
2392  run++;
2393  }
2394  i= scantable[last];
2395 
2396  level= temp[i] + 64;
2397 
2398  av_assert2(level - 64);
2399 
2400  if((level&(~127)) == 0){
2401  bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2402  }else
2403  bits+= esc_length;
2404  }
2405 
2406  return bits;
2407 }
2408 
2409 #define VSAD_INTRA(size) \
2410 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2411  int score=0; \
2412  int x,y; \
2413  \
2414  for(y=1; y<h; y++){ \
2415  for(x=0; x<size; x+=4){ \
2416  score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2417  +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2418  } \
2419  s+= stride; \
2420  } \
2421  \
2422  return score; \
2423 }
2424 VSAD_INTRA(8)
2425 VSAD_INTRA(16)
2426 
2427 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2428  int score=0;
2429  int x,y;
2430 
2431  for(y=1; y<h; y++){
2432  for(x=0; x<16; x++){
2433  score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2434  }
2435  s1+= stride;
2436  s2+= stride;
2437  }
2438 
2439  return score;
2440 }
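/* The vsad/vsse family measures vertical activity, i.e. differences
 * between vertically adjacent rows. The intra variants score a single
 * block (useful e.g. as an interlace/texture heuristic), while
 * vsad16_c/vsse16_c score how much the row-to-row structure of s1
 * deviates from that of s2. */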
2441 
2442 #define SQ(a) ((a)*(a))
2443 #define VSSE_INTRA(size) \
2444 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2445  int score=0; \
2446  int x,y; \
2447  \
2448  for(y=1; y<h; y++){ \
2449  for(x=0; x<size; x+=4){ \
2450  score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2451  +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
2452  } \
2453  s+= stride; \
2454  } \
2455  \
2456  return score; \
2457 }
2458 VSSE_INTRA(8)
2459 VSSE_INTRA(16)
2460 
2461 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2462  int score=0;
2463  int x,y;
2464 
2465  for(y=1; y<h; y++){
2466  for(x=0; x<16; x++){
2467  score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2468  }
2469  s1+= stride;
2470  s2+= stride;
2471  }
2472 
2473  return score;
2474 }
2475 
2476 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2477  int size){
2478  int score=0;
2479  int i;
2480  for(i=0; i<size; i++)
2481  score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
2482  return score;
2483 }
2484 
2485 #define WRAPPER8_16_SQ(name8, name16)\
2486 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
2487  int score=0;\
2488  score +=name8(s, dst , src , stride, 8);\
2489  score +=name8(s, dst+8 , src+8 , stride, 8);\
2490  if(h==16){\
2491  dst += 8*stride;\
2492  src += 8*stride;\
2493  score +=name8(s, dst , src , stride, 8);\
2494  score +=name8(s, dst+8 , src+8 , stride, 8);\
2495  }\
2496  return score;\
2497 }
2498 
2499 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2500 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2501 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2502 #if CONFIG_GPL
2503 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2504 #endif
2505 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2506 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2507 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2508 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2509 
2510 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2511  uint32_t maxi, uint32_t maxisign)
2512 {
2513 
2514  if(a > mini) return mini;
2515  else if((a^(1U<<31)) > maxisign) return maxi;
2516  else return a;
2517 }
2518 
2519 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2520  int i;
2521  uint32_t mini = *(uint32_t*)min;
2522  uint32_t maxi = *(uint32_t*)max;
2523  uint32_t maxisign = maxi ^ (1U<<31);
2524  uint32_t *dsti = (uint32_t*)dst;
2525  const uint32_t *srci = (const uint32_t*)src;
2526  for(i=0; i<len; i+=8) {
2527  dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2528  dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2529  dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2530  dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2531  dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2532  dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2533  dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2534  dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
2535  }
2536 }
2537 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2538  int i;
2539  if(min < 0 && max > 0) {
2540  vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2541  } else {
2542  for(i=0; i < len; i+=8) {
2543  dst[i ] = av_clipf(src[i ], min, max);
2544  dst[i + 1] = av_clipf(src[i + 1], min, max);
2545  dst[i + 2] = av_clipf(src[i + 2], min, max);
2546  dst[i + 3] = av_clipf(src[i + 3], min, max);
2547  dst[i + 4] = av_clipf(src[i + 4], min, max);
2548  dst[i + 5] = av_clipf(src[i + 5], min, max);
2549  dst[i + 6] = av_clipf(src[i + 6], min, max);
2550  dst[i + 7] = av_clipf(src[i + 7], min, max);
2551  }
2552  }
2553 }
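/* The opposite-sign fast path reinterprets IEEE-754 floats as 32-bit
 * integers: with min < 0 < max, values above max exceed maxisign after
 * flipping the sign bit, and values below min exceed the raw bits of
 * min directly, so both clips become unsigned compares (clipf_c_one).
 * This assumes NaN-free input; the generic path simply calls av_clipf
 * per element, unrolled by 8 (the loops assume len is a multiple of 8). */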
2554 
2555 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2556 {
2557  int res = 0;
2558 
2559  while (order--)
2560  res += *v1++ * *v2++;
2561 
2562  return res;
2563 }
2564 
2565 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2566 {
2567  int res = 0;
2568  while (order--) {
2569  res += *v1 * *v2++;
2570  *v1++ += mul * *v3++;
2571  }
2572  return res;
2573 }
2574 
2575 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2576  const int16_t *window, unsigned int len)
2577 {
2578  int i;
2579  int len2 = len >> 1;
2580 
2581  for (i = 0; i < len2; i++) {
2582  int16_t w = window[i];
2583  output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2584  output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
2585  }
2586 }
2587 
2588 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2589  int32_t max, unsigned int len)
2590 {
2591  do {
2592  *dst++ = av_clip(*src++, min, max);
2593  *dst++ = av_clip(*src++, min, max);
2594  *dst++ = av_clip(*src++, min, max);
2595  *dst++ = av_clip(*src++, min, max);
2596  *dst++ = av_clip(*src++, min, max);
2597  *dst++ = av_clip(*src++, min, max);
2598  *dst++ = av_clip(*src++, min, max);
2599  *dst++ = av_clip(*src++, min, max);
2600  len -= 8;
2601  } while (len > 0);
2602 }
2603 
2604 static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2605 {
2606  ff_j_rev_dct (block);
2607  put_pixels_clamped_c(block, dest, line_size);
2608 }
2609 static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2610 {
2611  ff_j_rev_dct (block);
2612  add_pixels_clamped_c(block, dest, line_size);
2613 }
2614 
2615 static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
2616 {
2617  ff_j_rev_dct4 (block);
2618  put_pixels_clamped4_c(block, dest, line_size);
2619 }
2620 static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
2621 {
2622  ff_j_rev_dct4 (block);
2623  add_pixels_clamped4_c(block, dest, line_size);
2624 }
2625 
2626 static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
2627 {
2628  ff_j_rev_dct2 (block);
2629  put_pixels_clamped2_c(block, dest, line_size);
2630 }
2631 static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
2632 {
2633  ff_j_rev_dct2 (block);
2634  add_pixels_clamped2_c(block, dest, line_size);
2635 }
2636 
2637 static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
2638 {
2639  dest[0] = av_clip_uint8((block[0] + 4)>>3);
2640 }
2641 static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
2642 {
2643  dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2644 }
2645 
2646 /* init static data */
2647 av_cold void ff_dsputil_static_init(void)
2648 {
2649  int i;
2650 
2651  for(i=0;i<512;i++) {
2652  ff_squareTbl[i] = (i - 256) * (i - 256);
2653  }
2654 
2655  for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2656 }
2657 
2658 int ff_check_alignment(void){
2659  static int did_fail=0;
2660  LOCAL_ALIGNED_16(int, aligned, [4]);
2661 
2662  if((intptr_t)aligned & 15){
2663  if(!did_fail){
2664 #if HAVE_MMX || HAVE_ALTIVEC
2665  av_log(NULL, AV_LOG_ERROR,
2666  "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2667  "and may be very slow or crash. This is not a bug in libavcodec,\n"
2668  "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2669  "Do not report crashes to FFmpeg developers.\n");
2670 #endif
2671  did_fail=1;
2672  }
2673  return -1;
2674  }
2675  return 0;
2676 }
2677 
2678 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2679 {
2680  ff_check_alignment();
2681 
2682 #if CONFIG_ENCODERS
2683  if (avctx->bits_per_raw_sample == 10) {
2684  c->fdct = ff_jpeg_fdct_islow_10;
2685  c->fdct248 = ff_fdct248_islow_10;
2686  } else {
2687  if(avctx->dct_algo==FF_DCT_FASTINT) {
2688  c->fdct = ff_fdct_ifast;
2689  c->fdct248 = ff_fdct_ifast248;
2690  }
2691  else if(avctx->dct_algo==FF_DCT_FAAN) {
2692  c->fdct = ff_faandct;
2693  c->fdct248 = ff_faandct248;
2694  }
2695  else {
2696  c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2697  c->fdct248 = ff_fdct248_islow_8;
2698  }
2699  }
2700 #endif //CONFIG_ENCODERS
2701 
2702  if(avctx->lowres==1){
2703  c->idct_put= ff_jref_idct4_put;
2704  c->idct_add= ff_jref_idct4_add;
2705  c->idct = ff_j_rev_dct4;
2706  c->idct_permutation_type= FF_NO_IDCT_PERM;
2707  }else if(avctx->lowres==2){
2708  c->idct_put= ff_jref_idct2_put;
2709  c->idct_add= ff_jref_idct2_add;
2710  c->idct = ff_j_rev_dct2;
2711  c->idct_permutation_type= FF_NO_IDCT_PERM;
2712  }else if(avctx->lowres==3){
2713  c->idct_put= ff_jref_idct1_put;
2714  c->idct_add= ff_jref_idct1_add;
2715  c->idct = ff_j_rev_dct1;
2716  c->idct_permutation_type= FF_NO_IDCT_PERM;
2717  }else{
2718  if (avctx->bits_per_raw_sample == 10) {
2719  c->idct_put = ff_simple_idct_put_10;
2720  c->idct_add = ff_simple_idct_add_10;
2721  c->idct = ff_simple_idct_10;
2722  c->idct_permutation_type = FF_NO_IDCT_PERM;
2723  } else if (avctx->bits_per_raw_sample == 12) {
2724  c->idct_put = ff_simple_idct_put_12;
2725  c->idct_add = ff_simple_idct_add_12;
2726  c->idct = ff_simple_idct_12;
2727  c->idct_permutation_type = FF_NO_IDCT_PERM;
2728  } else {
2729  if(avctx->idct_algo==FF_IDCT_INT){
2730  c->idct_put= jref_idct_put;
2731  c->idct_add= jref_idct_add;
2732  c->idct = ff_j_rev_dct;
2733  c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2734  }else if(avctx->idct_algo==FF_IDCT_FAAN){
2735  c->idct_put= ff_faanidct_put;
2736  c->idct_add= ff_faanidct_add;
2737  c->idct = ff_faanidct;
2738  c->idct_permutation_type= FF_NO_IDCT_PERM;
2739  }else{ //accurate/default
2740  c->idct_put = ff_simple_idct_put_8;
2741  c->idct_add = ff_simple_idct_add_8;
2742  c->idct = ff_simple_idct_8;
2743  c->idct_permutation_type = FF_NO_IDCT_PERM;
2744  }
2745  }
2746  }
2747 
2748  c->diff_pixels = diff_pixels_c;
2749  c->put_pixels_clamped = put_pixels_clamped_c;
2750  c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2751  c->add_pixels_clamped = add_pixels_clamped_c;
2752  c->sum_abs_dctelem = sum_abs_dctelem_c;
2753  c->gmc1 = gmc1_c;
2754  c->gmc = ff_gmc_c;
2755  c->pix_sum = pix_sum_c;
2756  c->pix_norm1 = pix_norm1_c;
2757 
2758  c->fill_block_tab[0] = fill_block16_c;
2759  c->fill_block_tab[1] = fill_block8_c;
2760 
2761  /* TODO [0] 16 [1] 8 */
2762  c->pix_abs[0][0] = pix_abs16_c;
2763  c->pix_abs[0][1] = pix_abs16_x2_c;
2764  c->pix_abs[0][2] = pix_abs16_y2_c;
2765  c->pix_abs[0][3] = pix_abs16_xy2_c;
2766  c->pix_abs[1][0] = pix_abs8_c;
2767  c->pix_abs[1][1] = pix_abs8_x2_c;
2768  c->pix_abs[1][2] = pix_abs8_y2_c;
2769  c->pix_abs[1][3] = pix_abs8_xy2_c;
2770 
2780 
2790 
2791 #define dspfunc(PFX, IDX, NUM) \
2792  c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2793  c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2794  c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2795  c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2796  c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2797  c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2798  c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2799  c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2800  c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2801  c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2802  c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2803  c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2804  c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2805  c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2806  c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2807  c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2808 
2809  dspfunc(put_qpel, 0, 16);
2810  dspfunc(put_no_rnd_qpel, 0, 16);
2811 
2812  dspfunc(avg_qpel, 0, 16);
2813  /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2814 
2815  dspfunc(put_qpel, 1, 8);
2816  dspfunc(put_no_rnd_qpel, 1, 8);
2817 
2818  dspfunc(avg_qpel, 1, 8);
2819  /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2820 
2821 #undef dspfunc
2822 
2823  c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2824  c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2825  c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2826  c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2827  c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2828  c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2829  c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2830  c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2831 
2832 #define SET_CMP_FUNC(name) \
2833  c->name[0]= name ## 16_c;\
2834  c->name[1]= name ## 8x8_c;
2835 
2836  SET_CMP_FUNC(hadamard8_diff)
2837  c->hadamard8_diff[4]= hadamard8_intra16_c;
2838  c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2839  SET_CMP_FUNC(dct_sad)
2840  SET_CMP_FUNC(dct_max)
2841 #if CONFIG_GPL
2842  SET_CMP_FUNC(dct264_sad)
2843 #endif
2844  c->sad[0]= pix_abs16_c;
2845  c->sad[1]= pix_abs8_c;
2846  c->sse[0]= sse16_c;
2847  c->sse[1]= sse8_c;
2848  c->sse[2]= sse4_c;
2849  SET_CMP_FUNC(quant_psnr)
2850  SET_CMP_FUNC(rd)
2851  SET_CMP_FUNC(bit)
2852  c->vsad[0]= vsad16_c;
2853  c->vsad[4]= vsad_intra16_c;
2854  c->vsad[5]= vsad_intra8_c;
2855  c->vsse[0]= vsse16_c;
2856  c->vsse[4]= vsse_intra16_c;
2857  c->vsse[5]= vsse_intra8_c;
2858  c->nsse[0]= nsse16_c;
2859  c->nsse[1]= nsse8_c;
2860 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2861  ff_dsputil_init_dwt(c);
2862 #endif
2863 
2864  c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2865 
2866  c->add_bytes= add_bytes_c;
2867  c->diff_bytes= diff_bytes_c;
2868  c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2869  c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2870  c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2871  c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2872  c->bswap_buf= bswap_buf;
2873  c->bswap16_buf = bswap16_buf;
2874 
2875  if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2876  c->h263_h_loop_filter= h263_h_loop_filter_c;
2877  c->h263_v_loop_filter= h263_v_loop_filter_c;
2878  }
2879 
2880  c->try_8x8basis= try_8x8basis_c;
2881  c->add_8x8basis= add_8x8basis_c;
2882 
2883  c->scalarproduct_int16 = scalarproduct_int16_c;
2884  c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2885  c->apply_window_int16 = apply_window_int16_c;
2886  c->vector_clip_int32 = vector_clip_int32_c;
2887  c->vector_clipf = vector_clipf_c;
2888 
2889  c->shrink[0]= av_image_copy_plane;
2890  c->shrink[1]= ff_shrink22;
2891  c->shrink[2]= ff_shrink44;
2892  c->shrink[3]= ff_shrink88;
2893 
2895 
2896 #undef FUNC
2897 #undef FUNCC
2898 #define FUNC(f, depth) f ## _ ## depth
2899 #define FUNCC(f, depth) f ## _ ## depth ## _c
2900 
2901  c->draw_edges = FUNCC(draw_edges, 8);
2902  c->clear_block = FUNCC(clear_block, 8);
2903  c->clear_blocks = FUNCC(clear_blocks, 8);
2904 
2905 #define BIT_DEPTH_FUNCS(depth) \
2906  c->get_pixels = FUNCC(get_pixels, depth);
2907 
2908  switch (avctx->bits_per_raw_sample) {
2909  case 9:
2910  case 10:
2911  case 12:
2912  case 14:
2913  BIT_DEPTH_FUNCS(16);
2914  break;
2915  default:
2916  if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2917  BIT_DEPTH_FUNCS(8);
2918  }
2919  break;
2920  }
2921 
2922 
2923  if (ARCH_ALPHA)
2924  ff_dsputil_init_alpha(c, avctx);
2925  if (ARCH_ARM)
2926  ff_dsputil_init_arm(c, avctx);
2927  if (ARCH_BFIN)
2928  ff_dsputil_init_bfin(c, avctx);
2929  if (ARCH_PPC)
2930  ff_dsputil_init_ppc(c, avctx);
2931  if (ARCH_SH4)
2932  ff_dsputil_init_sh4(c, avctx);
2933  if (HAVE_VIS)
2934  ff_dsputil_init_vis(c, avctx);
2935  if (ARCH_X86)
2936  ff_dsputil_init_x86(c, avctx);
2937 
2938  ff_init_scantable_permutation(c->idct_permutation,
2939  c->idct_permutation_type);
2940 }
2941 
2942 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2943 {
2944  ff_dsputil_init(c, avctx);
2945 }
2946 
2947 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2948 {
2949  ff_dsputil_init(c, avctx);
2950 }