FFmpeg
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
dsputil.c
Go to the documentation of this file.
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
25 /**
26  * @file
27  * DSP utils
28  */
29 
30 #include "libavutil/imgutils.h"
31 #include "libavutil/internal.h"
32 #include "avcodec.h"
33 #include "copy_block.h"
34 #include "dct.h"
35 #include "dsputil.h"
36 #include "simple_idct.h"
37 #include "faandct.h"
38 #include "faanidct.h"
39 #include "imgconvert.h"
40 #include "mathops.h"
41 #include "mpegvideo.h"
42 #include "config.h"
43 #include "diracdsp.h"
44 
/* Square table indexed with a +256 bias (see `sq = ff_squareTbl + 256` in the
 * sse*/pix_norm helpers below), so entries cover offsets -256..255.
 * Zero-initialized here; presumably filled with (i-256)^2 by the DSP init
 * code — TODO confirm against the init function (not visible in this chunk). */
uint32_t ff_squareTbl[512] = {0, };
46 
47 #define BIT_DEPTH 9
48 #include "dsputil_template.c"
49 #undef BIT_DEPTH
50 
51 #define BIT_DEPTH 10
52 #include "dsputil_template.c"
53 #undef BIT_DEPTH
54 
55 #define BIT_DEPTH 12
56 #include "dsputil_template.c"
57 #undef BIT_DEPTH
58 
59 #define BIT_DEPTH 14
60 #include "dsputil_template.c"
61 #undef BIT_DEPTH
62 
63 #define BIT_DEPTH 8
64 #include "dsputil_template.c"
65 
66 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
67 #define pb_7f (~0UL/255 * 0x7f)
68 #define pb_80 (~0UL/255 * 0x80)
69 
/* Specific zigzag scan for 248 idct. NOTE that unlike the
 * specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
82 
83 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
85 
/* Alternate horizontal scan order (declaration line was lost in extraction;
 * restored to match the data that follows). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
96 
/* Alternate vertical scan order (declaration line was lost in extraction;
 * restored to match the data that follows). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
107 
/* Input permutation for the simple_idct_mmx: entry i gives the coefficient
 * index (0x00..0x3F) the MMX IDCT expects at position i.  Copied verbatim
 * into idct_permutation for FF_SIMPLE_IDCT_PERM (see
 * ff_init_scantable_permutation() below). */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
119 
/* Within-row coefficient order for the SSE2 IDCT; combined with the row bits
 * in the FF_SSE2_IDCT_PERM case of ff_init_scantable_permutation(). */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
121 
122 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
123  int i;
124  int end;
125 
126  st->scantable= src_scantable;
127 
128  for(i=0; i<64; i++){
129  int j;
130  j = src_scantable[i];
131  st->permutated[i] = permutation[j];
132  }
133 
134  end=-1;
135  for(i=0; i<64; i++){
136  int j;
137  j = st->permutated[i];
138  if(j>end) end=j;
139  st->raster_end[i]= end;
140  }
141 }
142 
143 void ff_init_scantable_permutation(uint8_t *idct_permutation,
144  int idct_permutation_type)
145 {
146  int i;
147 
148  switch(idct_permutation_type){
149  case FF_NO_IDCT_PERM:
150  for(i=0; i<64; i++)
151  idct_permutation[i]= i;
152  break;
154  for(i=0; i<64; i++)
155  idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
156  break;
157  case FF_SIMPLE_IDCT_PERM:
158  for(i=0; i<64; i++)
159  idct_permutation[i]= simple_mmx_permutation[i];
160  break;
162  for(i=0; i<64; i++)
163  idct_permutation[i]= ((i&7)<<3) | (i>>3);
164  break;
166  for(i=0; i<64; i++)
167  idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
168  break;
169  case FF_SSE2_IDCT_PERM:
170  for(i=0; i<64; i++)
171  idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
172  break;
173  default:
174  av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
175  }
176 }
177 
/* Sum of all 256 pixel values of a 16x16 block; rows are line_size apart. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
199 
/* Sum of squares of all pixels of a 16x16 block (squared L2 norm).
 * Squares are read from ff_squareTbl via the +256-biased pointer.
 * NOTE(review): the fast path loads 8 bytes at a time through
 * uint64_t*/uint32_t* casts, which aliases the byte buffer — this relies on
 * the build's aliasing settings and on pix being safely word-readable;
 * confirm against the project's compile flags. */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* straightforward byte-wise reference, kept disabled */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if HAVE_FAST_64BIT
            /* one 64-bit load, then table lookups per byte */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* two 32-bit loads on targets without fast 64-bit arithmetic */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        /* step to the next row (16 bytes of this row already consumed) */
        pix += line_size - 16;
    }
    return s;
}
247 
/* Byte-swap w 32-bit words from src into dst (may be the same buffer);
 * the main loop is unrolled eight-wide, the tail is handled one word at a time. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    while (i + 8 <= w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = av_bswap32(src[i + k]);
        i += 8;
    }
    while (i < w) {
        dst[i] = av_bswap32(src[i]);
        i++;
    }
}
265 
/* Byte-swap len 16-bit words from src into dst. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
271 
272 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
273 {
274  int s, i;
275  uint32_t *sq = ff_squareTbl + 256;
276 
277  s = 0;
278  for (i = 0; i < h; i++) {
279  s += sq[pix1[0] - pix2[0]];
280  s += sq[pix1[1] - pix2[1]];
281  s += sq[pix1[2] - pix2[2]];
282  s += sq[pix1[3] - pix2[3]];
283  pix1 += line_size;
284  pix2 += line_size;
285  }
286  return s;
287 }
288 
289 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
290 {
291  int s, i;
292  uint32_t *sq = ff_squareTbl + 256;
293 
294  s = 0;
295  for (i = 0; i < h; i++) {
296  s += sq[pix1[0] - pix2[0]];
297  s += sq[pix1[1] - pix2[1]];
298  s += sq[pix1[2] - pix2[2]];
299  s += sq[pix1[3] - pix2[3]];
300  s += sq[pix1[4] - pix2[4]];
301  s += sq[pix1[5] - pix2[5]];
302  s += sq[pix1[6] - pix2[6]];
303  s += sq[pix1[7] - pix2[7]];
304  pix1 += line_size;
305  pix2 += line_size;
306  }
307  return s;
308 }
309 
310 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
311 {
312  int s, i;
313  uint32_t *sq = ff_squareTbl + 256;
314 
315  s = 0;
316  for (i = 0; i < h; i++) {
317  s += sq[pix1[ 0] - pix2[ 0]];
318  s += sq[pix1[ 1] - pix2[ 1]];
319  s += sq[pix1[ 2] - pix2[ 2]];
320  s += sq[pix1[ 3] - pix2[ 3]];
321  s += sq[pix1[ 4] - pix2[ 4]];
322  s += sq[pix1[ 5] - pix2[ 5]];
323  s += sq[pix1[ 6] - pix2[ 6]];
324  s += sq[pix1[ 7] - pix2[ 7]];
325  s += sq[pix1[ 8] - pix2[ 8]];
326  s += sq[pix1[ 9] - pix2[ 9]];
327  s += sq[pix1[10] - pix2[10]];
328  s += sq[pix1[11] - pix2[11]];
329  s += sq[pix1[12] - pix2[12]];
330  s += sq[pix1[13] - pix2[13]];
331  s += sq[pix1[14] - pix2[14]];
332  s += sq[pix1[15] - pix2[15]];
333 
334  pix1 += line_size;
335  pix2 += line_size;
336  }
337  return s;
338 }
339 
340 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
341  const uint8_t *s2, int stride){
342  int i;
343 
344  /* read the pixels */
345  for(i=0;i<8;i++) {
346  block[0] = s1[0] - s2[0];
347  block[1] = s1[1] - s2[1];
348  block[2] = s1[2] - s2[2];
349  block[3] = s1[3] - s2[3];
350  block[4] = s1[4] - s2[4];
351  block[5] = s1[5] - s2[5];
352  block[6] = s1[6] - s2[6];
353  block[7] = s1[7] - s2[7];
354  s1 += stride;
355  s2 += stride;
356  block += 8;
357  }
358 }
359 
360 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
361  int line_size)
362 {
363  int i;
364 
365  /* read the pixels */
366  for(i=0;i<8;i++) {
367  pixels[0] = av_clip_uint8(block[0]);
368  pixels[1] = av_clip_uint8(block[1]);
369  pixels[2] = av_clip_uint8(block[2]);
370  pixels[3] = av_clip_uint8(block[3]);
371  pixels[4] = av_clip_uint8(block[4]);
372  pixels[5] = av_clip_uint8(block[5]);
373  pixels[6] = av_clip_uint8(block[6]);
374  pixels[7] = av_clip_uint8(block[7]);
375 
376  pixels += line_size;
377  block += 8;
378  }
379 }
380 
381 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
382  int line_size)
383 {
384  int i;
385 
386  /* read the pixels */
387  for(i=0;i<4;i++) {
388  pixels[0] = av_clip_uint8(block[0]);
389  pixels[1] = av_clip_uint8(block[1]);
390  pixels[2] = av_clip_uint8(block[2]);
391  pixels[3] = av_clip_uint8(block[3]);
392 
393  pixels += line_size;
394  block += 8;
395  }
396 }
397 
398 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
399  int line_size)
400 {
401  int i;
402 
403  /* read the pixels */
404  for(i=0;i<2;i++) {
405  pixels[0] = av_clip_uint8(block[0]);
406  pixels[1] = av_clip_uint8(block[1]);
407 
408  pixels += line_size;
409  block += 8;
410  }
411 }
412 
413 static void put_signed_pixels_clamped_c(const int16_t *block,
414  uint8_t *av_restrict pixels,
415  int line_size)
416 {
417  int i, j;
418 
419  for (i = 0; i < 8; i++) {
420  for (j = 0; j < 8; j++) {
421  if (*block < -128)
422  *pixels = 0;
423  else if (*block > 127)
424  *pixels = 255;
425  else
426  *pixels = (uint8_t)(*block + 128);
427  block++;
428  pixels++;
429  }
430  pixels += (line_size - 8);
431  }
432 }
433 
434 static void add_pixels8_c(uint8_t *av_restrict pixels,
435  int16_t *block,
436  int line_size)
437 {
438  int i;
439 
440  for(i=0;i<8;i++) {
441  pixels[0] += block[0];
442  pixels[1] += block[1];
443  pixels[2] += block[2];
444  pixels[3] += block[3];
445  pixels[4] += block[4];
446  pixels[5] += block[5];
447  pixels[6] += block[6];
448  pixels[7] += block[7];
449  pixels += line_size;
450  block += 8;
451  }
452 }
453 
454 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
455  int line_size)
456 {
457  int i;
458 
459  /* read the pixels */
460  for(i=0;i<8;i++) {
461  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
462  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
463  pixels[2] = av_clip_uint8(pixels[2] + block[2]);
464  pixels[3] = av_clip_uint8(pixels[3] + block[3]);
465  pixels[4] = av_clip_uint8(pixels[4] + block[4]);
466  pixels[5] = av_clip_uint8(pixels[5] + block[5]);
467  pixels[6] = av_clip_uint8(pixels[6] + block[6]);
468  pixels[7] = av_clip_uint8(pixels[7] + block[7]);
469  pixels += line_size;
470  block += 8;
471  }
472 }
473 
474 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
475  int line_size)
476 {
477  int i;
478 
479  /* read the pixels */
480  for(i=0;i<4;i++) {
481  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
482  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
483  pixels[2] = av_clip_uint8(pixels[2] + block[2]);
484  pixels[3] = av_clip_uint8(pixels[3] + block[3]);
485  pixels += line_size;
486  block += 8;
487  }
488 }
489 
490 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
491  int line_size)
492 {
493  int i;
494 
495  /* read the pixels */
496  for(i=0;i<2;i++) {
497  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
498  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
499  pixels += line_size;
500  block += 8;
501  }
502 }
503 
/* Sum of absolute values of all 64 coefficients of an 8x8 DCT block. */
static int sum_abs_dctelem_c(int16_t *block)
{
    int total = 0, k;

    for (k = 0; k < 64; k++)
        total += block[k] < 0 ? -block[k] : block[k];
    return total;
}
511 
/* Fill h rows of 16 bytes each with value; rows are line_size apart. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int row;

    for (row = h; row > 0; row--) {
        memset(block, value, 16);
        block += line_size;
    }
}
521 
/* Fill h rows of 8 bytes each with value; rows are line_size apart. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int row;

    for (row = h; row > 0; row--) {
        memset(block, value, 8);
        block += line_size;
    }
}
531 
/* Rounded averages of 2 and 4 values.  Arguments and expansions are fully
 * parenthesized so operator precedence is preserved for compound argument
 * expressions (the previous form expanded `a+b+1` unparenthesized). */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
534 
/* 8-pixel-wide bilinear motion compensation with 1/16-pel fractional
 * position (x16, y16); each output is a weighted average of the 2x2
 * neighborhood, rounded via `rounder` and scaled down by 256. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = x16        * (16 - y16);
    const int C = (16 - x16) * y16;
    const int D = x16        * y16;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (A * src[col]              +
                        B * src[col + 1]          +
                        C * src[stride + col]     +
                        D * src[stride + col + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
557 
/**
 * Global motion compensation for one 8-pixel-wide strip of height h.
 * (ox, oy) is the start position and (dxx, dxy, dyx, dyy) the affine
 * transform increments, all in fixed point: the integer source coordinate
 * is (v >> 16) >> shift, and frac_* are the low `shift` bits of (v >> 16).
 * r is the rounding constant; width/height bound the valid source area.
 * Out-of-range coordinates are clamped and interpolation along the
 * out-of-range axis is skipped.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;  /* one full pel in fractional units */

    /* convert to inclusive maximum coordinates for the clip/range tests */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            /* fractional part is taken BEFORE the shift, from the low bits */
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear interpolation of the 2x2 block */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                                         + src[index +1]* frac_x )*(s-frac_y)
                                       + ( src[index+stride ]*(s-frac_x)
                                         + src[index+stride+1]* frac_x )* frac_y
                                       + r)>>(shift*2);
                }else{
                    /* y out of range: clamp y, interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                                         + src[index +1]* frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* x out of range: clamp x, interpolate vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
                                         + src[index+stride ]* frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* both out of range: nearest clamped sample, no filtering */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index ];
                }
            }

            /* advance position along the row ... */
            vx+= dxx;
            vy+= dyx;
        }
        /* ... and step the row start by the column increments */
        ox += dxy;
        oy += dyy;
    }
}
615 
/* Full-pel copy: dispatch to the fixed-width 8-bit copy helper matching
 * width; unsupported widths are silently ignored (as before). */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_8_c(dst, src, stride, height);
}
624 
/* Third-pel put, horizontal 1/3 phase: out = round((2*a + b) / 3),
 * computed as a fixed-point multiply by 683/2048. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (2 * s[col] + s[col + 1] + 1)) >> 11;
    }
}
635 
/* Third-pel put, horizontal 2/3 phase: out = round((a + 2*b) / 3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (s[col] + 2 * s[col + 1] + 1)) >> 11;
    }
}
646 
/* Third-pel put, vertical 1/3 phase: out = round((2*top + bottom) / 3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (2 * s[col] + s[col + stride] + 1)) >> 11;
    }
}
657 
/* Third-pel put, diagonal (1/3, 1/3) phase: weighted average of the 2x2
 * neighborhood with weights 4/3/3/2, scaled by 2731/32768. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (4 * s[col] + 3 * s[col + 1] +
                              3 * s[col + stride] + 2 * s[col + stride + 1] + 6)) >> 15;
    }
}
668 
/* Third-pel put, (1/3, 2/3) phase: 2x2 weights 3/2/4/3, scale 2731/32768. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (3 * s[col] + 2 * s[col + 1] +
                              4 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15;
    }
}
679 
/* Third-pel put, vertical 2/3 phase: out = round((top + 2*bottom) / 3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (s[col] + 2 * s[col + stride] + 1)) >> 11;
    }
}
690 
/* Third-pel put, (2/3, 1/3) phase: 2x2 weights 3/4/2/3, scale 2731/32768. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (3 * s[col] + 4 * s[col + 1] +
                              2 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15;
    }
}
701 
/* Third-pel put, diagonal (2/3, 2/3) phase: 2x2 weights 2/3/3/4. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (2 * s[col] + 3 * s[col + 1] +
                              3 * s[col + stride] + 4 * s[col + stride + 1] + 6)) >> 15;
    }
}
712 
/* Full-pel average: dispatch to the fixed-width 8-bit averaging helper
 * matching width; unsupported widths are silently ignored (as before). */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_8_c(dst, src, stride, height);
}
721 
/* Third-pel averaging variant of mc10: rounded average of dst with the
 * horizontally interpolated value. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (2 * s[col] + s[col + 1] + 1)) >> 11) + 1) >> 1;
    }
}
732 
/* Third-pel averaging variant of mc20. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (s[col] + 2 * s[col + 1] + 1)) >> 11) + 1) >> 1;
    }
}
743 
/* Third-pel averaging variant of mc01. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (2 * s[col] + s[col + stride] + 1)) >> 11) + 1) >> 1;
    }
}
754 
/* Third-pel averaging variant of mc11. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (4 * s[col] + 3 * s[col + 1] +
                                         3 * s[col + stride] + 2 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
765 
/* Third-pel averaging variant of mc12. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (3 * s[col] + 2 * s[col + 1] +
                                         4 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
776 
/* Third-pel averaging variant of mc02. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (s[col] + 2 * s[col + stride] + 1)) >> 11) + 1) >> 1;
    }
}
787 
/* Third-pel averaging variant of mc21. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (3 * s[col] + 4 * s[col + 1] +
                                         2 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
798 
/* Third-pel averaging variant of mc22. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (2 * s[col] + 3 * s[col + 1] +
                                         3 * s[col + stride] + 4 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
809 
810 #define QPEL_MC(r, OPNAME, RND, OP) \
811 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
812  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
813  int i;\
814  for(i=0; i<h; i++)\
815  {\
816  OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
817  OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
818  OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
819  OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
820  OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
821  OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
822  OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
823  OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
824  dst+=dstStride;\
825  src+=srcStride;\
826  }\
827 }\
828 \
829 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
830  const int w=8;\
831  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
832  int i;\
833  for(i=0; i<w; i++)\
834  {\
835  const int src0= src[0*srcStride];\
836  const int src1= src[1*srcStride];\
837  const int src2= src[2*srcStride];\
838  const int src3= src[3*srcStride];\
839  const int src4= src[4*srcStride];\
840  const int src5= src[5*srcStride];\
841  const int src6= src[6*srcStride];\
842  const int src7= src[7*srcStride];\
843  const int src8= src[8*srcStride];\
844  OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
845  OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
846  OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
847  OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
848  OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
849  OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
850  OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
851  OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
852  dst++;\
853  src++;\
854  }\
855 }\
856 \
857 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
858  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
859  int i;\
860  \
861  for(i=0; i<h; i++)\
862  {\
863  OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
864  OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
865  OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
866  OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
867  OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
868  OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
869  OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
870  OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
871  OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
872  OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
873  OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
874  OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
875  OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
876  OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
877  OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
878  OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
879  dst+=dstStride;\
880  src+=srcStride;\
881  }\
882 }\
883 \
884 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
885  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
886  int i;\
887  const int w=16;\
888  for(i=0; i<w; i++)\
889  {\
890  const int src0= src[0*srcStride];\
891  const int src1= src[1*srcStride];\
892  const int src2= src[2*srcStride];\
893  const int src3= src[3*srcStride];\
894  const int src4= src[4*srcStride];\
895  const int src5= src[5*srcStride];\
896  const int src6= src[6*srcStride];\
897  const int src7= src[7*srcStride];\
898  const int src8= src[8*srcStride];\
899  const int src9= src[9*srcStride];\
900  const int src10= src[10*srcStride];\
901  const int src11= src[11*srcStride];\
902  const int src12= src[12*srcStride];\
903  const int src13= src[13*srcStride];\
904  const int src14= src[14*srcStride];\
905  const int src15= src[15*srcStride];\
906  const int src16= src[16*srcStride];\
907  OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
908  OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
909  OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
910  OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
911  OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
912  OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
913  OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
914  OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
915  OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
916  OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
917  OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
918  OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
919  OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
920  OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
921  OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
922  OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
923  dst++;\
924  src++;\
925  }\
926 }\
927 \
928 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
929  uint8_t half[64];\
930  put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
931  OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
932 }\
933 \
934 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
935  OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
936 }\
937 \
938 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
939  uint8_t half[64];\
940  put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
941  OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
942 }\
943 \
944 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
945  uint8_t full[16*9];\
946  uint8_t half[64];\
947  copy_block9(full, src, 16, stride, 9);\
948  put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
949  OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
950 }\
951 \
952 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
953  uint8_t full[16*9];\
954  copy_block9(full, src, 16, stride, 9);\
955  OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
956 }\
957 \
958 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
959  uint8_t full[16*9];\
960  uint8_t half[64];\
961  copy_block9(full, src, 16, stride, 9);\
962  put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
963  OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
964 }\
965 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
966  uint8_t full[16*9];\
967  uint8_t halfH[72];\
968  uint8_t halfV[64];\
969  uint8_t halfHV[64];\
970  copy_block9(full, src, 16, stride, 9);\
971  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
972  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
973  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
974  OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
975 }\
976 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
977  uint8_t full[16*9];\
978  uint8_t halfH[72];\
979  uint8_t halfHV[64];\
980  copy_block9(full, src, 16, stride, 9);\
981  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
982  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
983  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
984  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
985 }\
986 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
987  uint8_t full[16*9];\
988  uint8_t halfH[72];\
989  uint8_t halfV[64];\
990  uint8_t halfHV[64];\
991  copy_block9(full, src, 16, stride, 9);\
992  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
993  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
994  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
995  OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
996 }\
997 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
998  uint8_t full[16*9];\
999  uint8_t halfH[72];\
1000  uint8_t halfHV[64];\
1001  copy_block9(full, src, 16, stride, 9);\
1002  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1003  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1004  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1005  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1006 }\
1007 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1008  uint8_t full[16*9];\
1009  uint8_t halfH[72];\
1010  uint8_t halfV[64];\
1011  uint8_t halfHV[64];\
1012  copy_block9(full, src, 16, stride, 9);\
1013  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1014  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1015  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1016  OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1017 }\
1018 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1019  uint8_t full[16*9];\
1020  uint8_t halfH[72];\
1021  uint8_t halfHV[64];\
1022  copy_block9(full, src, 16, stride, 9);\
1023  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1024  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1025  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1026  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1027 }\
1028 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1029  uint8_t full[16*9];\
1030  uint8_t halfH[72];\
1031  uint8_t halfV[64];\
1032  uint8_t halfHV[64];\
1033  copy_block9(full, src, 16, stride, 9);\
1034  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1035  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1036  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1037  OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1038 }\
1039 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1040  uint8_t full[16*9];\
1041  uint8_t halfH[72];\
1042  uint8_t halfHV[64];\
1043  copy_block9(full, src, 16, stride, 9);\
1044  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1045  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1046  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1047  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1048 }\
1049 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1050  uint8_t halfH[72];\
1051  uint8_t halfHV[64];\
1052  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1053  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1054  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1055 }\
1056 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1057  uint8_t halfH[72];\
1058  uint8_t halfHV[64];\
1059  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1060  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1061  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1062 }\
1063 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1064  uint8_t full[16*9];\
1065  uint8_t halfH[72];\
1066  uint8_t halfV[64];\
1067  uint8_t halfHV[64];\
1068  copy_block9(full, src, 16, stride, 9);\
1069  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1070  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1071  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1072  OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1073 }\
1074 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1075  uint8_t full[16*9];\
1076  uint8_t halfH[72];\
1077  copy_block9(full, src, 16, stride, 9);\
1078  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1079  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1080  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1081 }\
1082 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1083  uint8_t full[16*9];\
1084  uint8_t halfH[72];\
1085  uint8_t halfV[64];\
1086  uint8_t halfHV[64];\
1087  copy_block9(full, src, 16, stride, 9);\
1088  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1089  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1090  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1091  OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1092 }\
1093 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1094  uint8_t full[16*9];\
1095  uint8_t halfH[72];\
1096  copy_block9(full, src, 16, stride, 9);\
1097  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1098  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1099  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1100 }\
1101 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1102  uint8_t halfH[72];\
1103  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1104  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1105 }\
1106 \
1107 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1108  uint8_t half[256];\
1109  put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1110  OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1111 }\
1112 \
1113 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1114  OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1115 }\
1116 \
1117 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1118  uint8_t half[256];\
1119  put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1120  OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1121 }\
1122 \
1123 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1124  uint8_t full[24*17];\
1125  uint8_t half[256];\
1126  copy_block17(full, src, 24, stride, 17);\
1127  put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1128  OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1129 }\
1130 \
1131 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1132  uint8_t full[24*17];\
1133  copy_block17(full, src, 24, stride, 17);\
1134  OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1135 }\
1136 \
1137 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1138  uint8_t full[24*17];\
1139  uint8_t half[256];\
1140  copy_block17(full, src, 24, stride, 17);\
1141  put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1142  OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1143 }\
1144 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1145  uint8_t full[24*17];\
1146  uint8_t halfH[272];\
1147  uint8_t halfV[256];\
1148  uint8_t halfHV[256];\
1149  copy_block17(full, src, 24, stride, 17);\
1150  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1151  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1152  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1153  OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1154 }\
1155 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1156  uint8_t full[24*17];\
1157  uint8_t halfH[272];\
1158  uint8_t halfHV[256];\
1159  copy_block17(full, src, 24, stride, 17);\
1160  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1161  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1162  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1163  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1164 }\
1165 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1166  uint8_t full[24*17];\
1167  uint8_t halfH[272];\
1168  uint8_t halfV[256];\
1169  uint8_t halfHV[256];\
1170  copy_block17(full, src, 24, stride, 17);\
1171  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1172  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1173  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1174  OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1175 }\
1176 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1177  uint8_t full[24*17];\
1178  uint8_t halfH[272];\
1179  uint8_t halfHV[256];\
1180  copy_block17(full, src, 24, stride, 17);\
1181  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1182  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1183  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1184  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1185 }\
1186 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1187  uint8_t full[24*17];\
1188  uint8_t halfH[272];\
1189  uint8_t halfV[256];\
1190  uint8_t halfHV[256];\
1191  copy_block17(full, src, 24, stride, 17);\
1192  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1193  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1194  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1195  OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1196 }\
1197 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1198  uint8_t full[24*17];\
1199  uint8_t halfH[272];\
1200  uint8_t halfHV[256];\
1201  copy_block17(full, src, 24, stride, 17);\
1202  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1203  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1204  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1205  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1206 }\
1207 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1208  uint8_t full[24*17];\
1209  uint8_t halfH[272];\
1210  uint8_t halfV[256];\
1211  uint8_t halfHV[256];\
1212  copy_block17(full, src, 24, stride, 17);\
1213  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1214  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1215  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1216  OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1217 }\
1218 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1219  uint8_t full[24*17];\
1220  uint8_t halfH[272];\
1221  uint8_t halfHV[256];\
1222  copy_block17(full, src, 24, stride, 17);\
1223  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1224  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1225  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1226  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1227 }\
1228 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1229  uint8_t halfH[272];\
1230  uint8_t halfHV[256];\
1231  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1232  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1233  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1234 }\
1235 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1236  uint8_t halfH[272];\
1237  uint8_t halfHV[256];\
1238  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1239  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1240  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1241 }\
1242 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1243  uint8_t full[24*17];\
1244  uint8_t halfH[272];\
1245  uint8_t halfV[256];\
1246  uint8_t halfHV[256];\
1247  copy_block17(full, src, 24, stride, 17);\
1248  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1249  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1250  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1251  OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1252 }\
1253 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1254  uint8_t full[24*17];\
1255  uint8_t halfH[272];\
1256  copy_block17(full, src, 24, stride, 17);\
1257  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1258  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1259  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1260 }\
1261 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1262  uint8_t full[24*17];\
1263  uint8_t halfH[272];\
1264  uint8_t halfV[256];\
1265  uint8_t halfHV[256];\
1266  copy_block17(full, src, 24, stride, 17);\
1267  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1268  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1269  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1270  OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1271 }\
1272 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1273  uint8_t full[24*17];\
1274  uint8_t halfH[272];\
1275  copy_block17(full, src, 24, stride, 17);\
1276  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1277  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1278  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1279 }\
1280 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1281  uint8_t halfH[272];\
1282  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1283  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1284 }
1285 
/* Per-pixel write primitives plugged into the QPEL_MC template above.
 * b is the filtered value scaled by 32; (b+16)>>5 is a rounded divide,
 * (b+15)>>5 the round-down ("no_rnd") variant.  cm is the clipping
 * lookup table in scope at each expansion site.  op_avg additionally
 * averages with the existing destination pixel (rounding up). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_        , _        , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_        , _        , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

/* Full-pel (mc00) cases need no filtering: alias them to the plain
 * block-copy helpers.
 * NOTE(review): put_no_rnd_qpel16_mc00_c maps to ff_put_pixels16x16_8_c
 * while the neighbouring aliases use the un-suffixed wrappers --
 * confirm this asymmetry is intentional. */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1306 
1307 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1309  int i;
1310 
1311  for(i=0; i<h; i++){
1312  dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1313  dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1314  dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1315  dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1316  dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1317  dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1318  dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1319  dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1320  dst+=dstStride;
1321  src+=srcStride;
1322  }
1323 }
1324 
#if CONFIG_RV40_DECODER
/* RV40: the (3/4,3/4) quarter-pel position is computed as the plain
 * 2x2 pixel average, so the mc33 cases map onto the generic xy2
 * helpers.
 * NOTE(review): the four function signatures were dropped by this
 * extraction; reconstructed from the call bodies and FFmpeg's
 * put/avg_rv40_qpel naming scheme -- confirm against upstream. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1339 
#if CONFIG_DIRAC_DECODER
/* Glue for the Dirac decoder: adapt the generic 8-bit put/avg pixel
 * copy helpers and their 2- and 4-source averaging (l2/l4) variants to
 * the Dirac motion-compensation signature, where src[] carries up to 5
 * source pointers.  The 32-pixel-wide cases are built from two 16-wide
 * calls.  (Comments are kept outside the macro body: a // comment
 * before a line-continuation backslash would splice the next line into
 * the comment.) */
#define DIRAC_MC(OPNAME)\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
}
DIRAC_MC(put)
DIRAC_MC(avg)
#endif
1384 
1385 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1387  int i;
1388 
1389  for(i=0; i<w; i++){
1390  const int src_1= src[ -srcStride];
1391  const int src0 = src[0 ];
1392  const int src1 = src[ srcStride];
1393  const int src2 = src[2*srcStride];
1394  const int src3 = src[3*srcStride];
1395  const int src4 = src[4*srcStride];
1396  const int src5 = src[5*srcStride];
1397  const int src6 = src[6*srcStride];
1398  const int src7 = src[7*srcStride];
1399  const int src8 = src[8*srcStride];
1400  const int src9 = src[9*srcStride];
1401  dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1402  dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1403  dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1404  dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1405  dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1406  dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1407  dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1408  dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1409  src++;
1410  dst++;
1411  }
1412 }
1413 
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    /* Average the source block with its horizontal half-pel lowpass. */
    uint8_t hbuf[64];

    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, hbuf, stride, stride, 8, 8);
}
1419 
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    /* Pure horizontal half-pel: write the lowpass result straight to dst. */
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1423 
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    /* Average the horizontal lowpass with the source shifted one pixel
     * to the right. */
    uint8_t hbuf[64];

    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, hbuf, stride, stride, 8, 8);
}
1429 
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    /* Pure vertical half-pel: write the lowpass result straight to dst. */
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1433 
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    /* Average of the vertical lowpass and the H-then-V lowpass.  hbuf
     * holds 11 horizontally filtered rows starting one row above the
     * block (the vertical filter needs rows -1..9); +8 skips that
     * leading guard row. */
    uint8_t hbuf[88];
    uint8_t vbuf[64];
    uint8_t hvbuf[64];

    wmv2_mspel8_h_lowpass(hbuf, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvbuf, hbuf + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    /* Like mc12 but the vertical lowpass is taken one pixel to the
     * right (src+1); the H buffer again carries 11 rows starting one
     * row above the block, with +8 skipping the guard row. */
    uint8_t hbuf[88];
    uint8_t vbuf[64];
    uint8_t hvbuf[64];

    wmv2_mspel8_h_lowpass(hbuf, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvbuf, hbuf + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    /* Horizontal then vertical lowpass.  halfH holds 11 filtered rows
     * starting one row above the block so the vertical filter has its
     * full -1..9 row support; +8 skips the leading guard row. */
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
1457 
/* H.263 deblocking filter across a horizontal block edge.  src points
 * at the first row below the edge; two rows above and two rows below
 * the edge are modified.  Filter strength is looked up from the
 * quantizer. */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        int x;
        const int strength= ff_h263_loop_filter_strength[qscale];

        for(x=0; x<8; x++){
            int d1, d2, ad1;
            int p0= src[x-2*stride];
            int p1= src[x-1*stride];
            int p2= src[x+0*stride];
            int p3= src[x+1*stride];
            int d = (p0 - p3 + 4*(p2 - p1)) / 8;

            /* Piecewise-linear ramp: the correction grows with |d| up
             * to 'strength', then decays back to 0 at 2*strength, so
             * large discontinuities (real edges) are left untouched. */
            if     (d<-2*strength) d1= 0;
            else if(d<-  strength) d1=-2*strength - d;
            else if(d<   strength) d1= d;
            else if(d< 2*strength) d1= 2*strength - d;
            else                   d1= 0;

            p1 += d1;
            p2 -= d1;
            /* Branchless-ish clamp to [0,255]: the &256 test fires for
             * any value outside the byte range (|value| < 512 here),
             * and ~(v>>31) yields 0 for negatives, 255 otherwise. */
            if(p1&256) p1= ~(p1>>31);
            if(p2&256) p2= ~(p2>>31);

            src[x-1*stride] = p1;
            src[x+0*stride] = p2;

            /* Weaker secondary correction of the outer pixel pair,
             * bounded by half the inner correction. */
            ad1= FFABS(d1)>>1;

            d2= av_clip((p0-p3)/4, -ad1, ad1);

            src[x-2*stride] = p0 - d2;
            src[x+  stride] = p3 + d2;
        }
    }
}
1494 
/* H.263 deblocking filter across a vertical block edge.  src points at
 * the first column right of the edge; two columns on each side of the
 * edge are modified.  Same algorithm as h263_v_loop_filter_c, rotated. */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        int y;
        const int strength= ff_h263_loop_filter_strength[qscale];

        for(y=0; y<8; y++){
            int d1, d2, ad1;
            int p0= src[y*stride-2];
            int p1= src[y*stride-1];
            int p2= src[y*stride+0];
            int p3= src[y*stride+1];
            int d = (p0 - p3 + 4*(p2 - p1)) / 8;

            /* Piecewise-linear ramp: correction peaks at |d|==strength
             * and falls to 0 at 2*strength, leaving strong edges alone. */
            if     (d<-2*strength) d1= 0;
            else if(d<-  strength) d1=-2*strength - d;
            else if(d<   strength) d1= d;
            else if(d< 2*strength) d1= 2*strength - d;
            else                   d1= 0;

            p1 += d1;
            p2 -= d1;
            /* Clamp to [0,255]: &256 detects out-of-range values
             * (|value| < 512 here); ~(v>>31) maps negatives to 0 and
             * overflows to 255. */
            if(p1&256) p1= ~(p1>>31);
            if(p2&256) p2= ~(p2>>31);

            src[y*stride-1] = p1;
            src[y*stride+0] = p2;

            /* Weaker secondary correction of the outer pixel pair,
             * bounded by half the inner correction. */
            ad1= FFABS(d1)>>1;

            d2= av_clip((p0-p3)/4, -ad1, ad1);

            src[y*stride-2] = p0 - d2;
            src[y*stride+1] = p3 + d2;
        }
    }
}
1531 
static void h261_loop_filter_c(uint8_t *src, int stride){
    /* In-place separable [1 2 1]/4 smoothing of an 8x8 block (H.261
     * loop filter).  Border rows and columns only pass through the
     * scaling, not the 3-tap kernel. */
    int row, col;
    int vtmp[64];                 /* vertically filtered block, scaled x4 */

    /* Vertical pass: top/bottom rows are copied (scaled by 4), inner
     * rows get the [1 2 1] kernel. */
    for (col = 0; col < 8; col++) {
        vtmp[col]         = 4 * src[col];
        vtmp[col + 7 * 8] = 4 * src[col + 7 * stride];
    }
    for (row = 1; row < 7; row++) {
        for (col = 0; col < 8; col++) {
            const int s = row * stride + col;
            vtmp[row * 8 + col] = src[s - stride] + 2 * src[s] + src[s + stride];
        }
    }

    /* Horizontal pass with rounding; edge columns just undo the x4
     * scale, inner columns apply [1 2 1] and divide by 16. */
    for (row = 0; row < 8; row++) {
        const int *t = vtmp + row * 8;
        uint8_t   *d = src  + row * stride;

        d[0] = (t[0] + 2) >> 2;
        d[7] = (t[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            d[col] = (t[col - 1] + 2 * t[col] + t[col + 1] + 8) >> 4;
    }
}
1558 
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* Sum of absolute differences over a 16-pixel-wide block of height h. */
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1586 
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD of pix1 against the horizontally interpolated reference:
     * each reference sample is avg2 of a pixel and its right neighbour. */
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1614 
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD of pix1 against the vertically interpolated reference:
     * each reference sample is avg2 of a pixel and the one a line below. */
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1644 
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD of pix1 against the diagonally interpolated reference:
     * each reference sample is avg4 of a 2x2 pixel neighbourhood. */
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1674 
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* Sum of absolute differences over an 8-pixel-wide block of height h. */
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1694 
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* 8-wide SAD against the horizontally interpolated (avg2) reference. */
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1714 
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* 8-wide SAD against the vertically interpolated (avg2) reference. */
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1736 
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* 8-wide SAD against the diagonally interpolated (avg4) reference. */
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1758 
1759 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1760  MpegEncContext *c = v;
1761  int score1=0;
1762  int score2=0;
1763  int x,y;
1764 
1765  for(y=0; y<h; y++){
1766  for(x=0; x<16; x++){
1767  score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1768  }
1769  if(y+1<h){
1770  for(x=0; x<15; x++){
1771  score2+= FFABS( s1[x ] - s1[x +stride]
1772  - s1[x+1] + s1[x+1+stride])
1773  -FFABS( s2[x ] - s2[x +stride]
1774  - s2[x+1] + s2[x+1+stride]);
1775  }
1776  }
1777  s1+= stride;
1778  s2+= stride;
1779  }
1780 
1781  if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1782  else return score1 + FFABS(score2)*8;
1783 }
1784 
1785 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1786  MpegEncContext *c = v;
1787  int score1=0;
1788  int score2=0;
1789  int x,y;
1790 
1791  for(y=0; y<h; y++){
1792  for(x=0; x<8; x++){
1793  score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1794  }
1795  if(y+1<h){
1796  for(x=0; x<7; x++){
1797  score2+= FFABS( s1[x ] - s1[x +stride]
1798  - s1[x+1] + s1[x+1+stride])
1799  -FFABS( s2[x ] - s2[x +stride]
1800  - s2[x+1] + s2[x+1+stride]);
1801  }
1802  }
1803  s1+= stride;
1804  s2+= stride;
1805  }
1806 
1807  if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1808  else return score1 + FFABS(score2)*8;
1809 }
1810 
1811 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1812  int i;
1813  unsigned int sum=0;
1814 
1815  for(i=0; i<8*8; i++){
1816  int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1817  int w= weight[i];
1818  b>>= RECON_SHIFT;
1819  av_assert2(-512<b && b<512);
1820 
1821  sum += (w*b)*(w*b)>>4;
1822  }
1823  return sum>>2;
1824 }
1825 
1826 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1827  int i;
1828 
1829  for(i=0; i<8*8; i++){
1830  rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1831  }
1832 }
1833 
/* Dummy comparison function that always reports a score of 0
 * (selected via FF_CMP_ZERO, see the cmp-selection switch below). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
1837 
    /* NOTE(review): the enclosing function header (ff_set_cmp, taking the
     * DSPContext, a cmp-function array of 6 entries and the FF_CMP_* type)
     * appears to have been lost in this extraction; the body below fills
     * cmp[0..5] with the comparison functions matching `type`. */
    int i;

    /* clear all entries so unassigned slots stay NULL */
    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        /* only the low byte selects the metric */
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
1897 
1898 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1899  long i;
1900  for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1901  long a = *(long*)(src+i);
1902  long b = *(long*)(dst+i);
1903  *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1904  }
1905  for(; i<w; i++)
1906  dst[i+0] += src[i+0];
1907 }
1908 
1909 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1910  long i;
1911 #if !HAVE_FAST_UNALIGNED
1912  if((long)src2 & (sizeof(long)-1)){
1913  for(i=0; i+7<w; i+=8){
1914  dst[i+0] = src1[i+0]-src2[i+0];
1915  dst[i+1] = src1[i+1]-src2[i+1];
1916  dst[i+2] = src1[i+2]-src2[i+2];
1917  dst[i+3] = src1[i+3]-src2[i+3];
1918  dst[i+4] = src1[i+4]-src2[i+4];
1919  dst[i+5] = src1[i+5]-src2[i+5];
1920  dst[i+6] = src1[i+6]-src2[i+6];
1921  dst[i+7] = src1[i+7]-src2[i+7];
1922  }
1923  }else
1924 #endif
1925  for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1926  long a = *(long*)(src1+i);
1927  long b = *(long*)(src2+i);
1928  *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1929  }
1930  for(; i<w; i++)
1931  dst[i+0] = src1[i+0]-src2[i+0];
1932 }
1933 
/**
 * HuffYUV median prediction, decode side: reconstruct w pixels by adding
 * the stored differences to the median of (left, above, left+above-topleft).
 * *left / *left_top carry state between calls and are updated on return.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t prev    = *left;
    uint8_t topleft = *left_top;

    for (i = 0; i < w; i++) {
        prev    = mid_pred(prev, src1[i], (prev + src1[i] - topleft) & 0xFF) + diff[i];
        topleft = src1[i];
        dst[i]  = prev;
    }

    *left     = prev;
    *left_top = topleft;
}
1950 
/**
 * HuffYUV median prediction, encode side: emit for each pixel the
 * difference between src2 and the median predictor built from src1.
 * *left / *left_top carry state between calls and are updated on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t prev    = *left;
    uint8_t topleft = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(prev, src1[i], (prev + src1[i] - topleft) & 0xFF);
        topleft = src1[i];
        prev    = src2[i];
        dst[i]  = prev - pred;
    }

    *left     = prev;
    *left_top = topleft;
}
1968 
/**
 * HuffYUV left prediction, decode side: running sum of src into dst
 * (each byte truncated mod 256 on store), starting from acc.
 * Returns the final accumulator so the caller can chain rows.
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for (i = 0; i < w; i++) {
        acc   += src[i];
        dst[i] = acc;
    }

    return acc;
}
1987 
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * HuffYUV left prediction for packed 32-bit BGRA pixels: each channel is
 * a running sum of its own samples (truncated mod 256 on store). The
 * B/G/R/A byte offsets depend on endianness. The per-channel accumulators
 * are read from and written back through the four int pointers.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r = *red, g = *green, b = *blue, a = *alpha;

    for (i = 0; i < w; i++) {
        const uint8_t *sp = src + 4 * i;
        uint8_t       *dp = dst + 4 * i;

        b += sp[B];
        g += sp[G];
        r += sp[R];
        a += sp[A];

        dp[B] = b;
        dp[G] = g;
        dp[R] = r;
        dp[A] = a;
    }

    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
#undef B
#undef G
#undef R
#undef A
2028 
/* One butterfly stage: o1/o2 receive the sum and difference of i1/i2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: replaces (x, y) with (x+y, x-y). */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly stage fused with the absolute-value accumulation:
 * |x+y| + |x-y|, used to terminate the Hadamard transforms below. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2043 
/**
 * SATD of an 8x8 block: applies a 2-D 8x8 Hadamard transform to the
 * difference (src - dst) and returns the sum of the absolute transform
 * coefficients. h must be 8 (asserted); 16x16 use goes through the
 * hadamard8_diff16 wrapper generated by WRAPPER8_16_SQ below.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    av_assert2(h==8);

    /* horizontal pass: 8-point Hadamard of each difference row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: 8-point Hadamard of each column; the last stage is
       fused with the absolute-value accumulation (BUTTERFLYA) */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
2088 
/**
 * Intra variant of hadamard8_diff8x8_c: 2-D 8x8 Hadamard transform of the
 * source block itself (no reference), sum of absolute coefficients, with
 * the DC term's contribution subtracted at the end so the score measures
 * AC energy only. h must be 8 (asserted).
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    av_assert2(h==8);

    /* horizontal pass: 8-point Hadamard of each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass, last stage fused with |.| accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* remove the DC coefficient's contribution from the score */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2136 
/**
 * DCT-domain SAD: forward-DCT the 8x8 difference between src1 and src2
 * and return the sum of absolute DCT coefficients. h must be 8 (asserted).
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);

    av_assert2(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
2147 
#if CONFIG_GPL
/* One 8-point pass of the H.264 8x8 integer transform, expressed through
 * the SRC/DST macros so the same code can run over rows or columns. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0, a0 + a1 ) ;\
    DST(1, a4 + (a7>>2)) ;\
    DST(2, a2 + (a3>>1)) ;\
    DST(3, a5 + (a6>>2)) ;\
    DST(4, a0 - a1 ) ;\
    DST(5, a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * SAD in the H.264 8x8 integer-transform domain (GPL-only code path):
 * row transform in place, then a column pass whose DST macro directly
 * accumulates absolute coefficient values into sum.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

/* row pass: transform each row of the difference block in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

/* column pass: DST folds |coefficient| straight into the score */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2200 
/**
 * DCT-max metric: forward-DCT the 8x8 difference block and return the
 * largest absolute DCT coefficient. h must be 8 (asserted).
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int sum=0, i;

    av_assert2(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
2216 
/**
 * Quantization-noise metric: the 8x8 difference block is quantized,
 * dequantized and inverse transformed, and the SSE between that round
 * trip and the untouched difference is returned. h must be 8 (asserted).
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
    int16_t * const bak = temp+64;
    int sum=0, i;

    av_assert2(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* keep a pristine copy of the difference to compare against */
    memcpy(bak, temp, 64*sizeof(int16_t));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct_8(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2239 
/**
 * Rate-distortion metric for an 8x8 block: quantizes the difference,
 * estimates the VLC bit cost of the coefficients (rate), reconstructs
 * the block and measures the SSE against the source (distortion), then
 * returns distortion plus a qscale-weighted rate term. h must be 8.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    av_assert2(h==8);

    /* work on local 8x8 copies so the reconstruction does not touch src2 */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC tables for the current coding mode; intra also pays
       for the DC coefficient up front */
    if (s->mb_intra) {
        start_i = 1;
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* estimate the run/level VLC cost of all coefficients in scan order;
       levels outside [-64, 63] are charged the escape-code length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        av_assert2(level - 64);

        /* final coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct and measure distortion against the local source copy */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    /* 109/128 is an empirical rate weight; see qscale^2 lambda scaling */
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2315 
/**
 * Bit-cost metric: quantizes the 8x8 difference block and returns the
 * estimated number of VLC bits needed to code its coefficients
 * (rate only, no distortion term). h must be 8 (asserted).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    av_assert2(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC tables for the current coding mode; intra also pays
       for the DC coefficient up front */
    if (s->mb_intra) {
        start_i = 1;
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* run/level cost in scan order; out-of-range levels cost esc_length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        av_assert2(level - 64);

        /* final coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2374 
/* Intra vertical-gradient SAD generator: sums |s[x] - s[x+stride]| over
 * a `size`-wide block, i.e. the absolute vertical differences within a
 * single block (no reference). Instantiated for widths 8 and 16. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2392 
/**
 * Vertical-gradient SAD between two 16-wide blocks: for each pixel of
 * h-1 line pairs, accumulate the absolute difference between s1's
 * vertical delta and s2's vertical delta.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d1 = s1[col] - s1[col + stride];
            const int d2 = s2[col] - s2[col + stride];
            total += abs(d1 - d2);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2407 
/* Square helper for the vertical-gradient SSE metrics below. */
#define SQ(a) ((a)*(a))
/* Intra vertical-gradient SSE generator: sums (s[x] - s[x+stride])^2
 * over a `size`-wide block. Instantiated for widths 8 and 16. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2426 
/**
 * Vertical-gradient SSE between two 16-wide blocks: squared difference
 * of the per-pixel vertical deltas of s1 and s2 over h-1 line pairs.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2441 
/**
 * Sum of squared differences between an int8 array and an int16 array
 * of `size` elements.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int sum = 0;

    while (size-- > 0) {
        const int d = *pix1++ - *pix2++;
        sum += d * d;
    }
    return sum;
}
2450 
/* Build a 16-wide compare function (name16) from an 8x8 one (name8):
 * score the two horizontal 8x8 halves, and when h==16 also the two
 * halves of the lower 16x8, summing the four sub-scores. */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst , src , stride, 8);\
    score +=name8(s, dst+8 , src+8 , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst , src , stride, 8);\
        score +=name8(s, dst+8 , src+8 , stride, 8);\
    }\
    return score;\
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2475 
/**
 * Clip one float (as raw IEEE-754 bits) when min < 0 < max: `mini` is the
 * bit pattern of min (sign bit set, so it compares larger than any
 * positive-float pattern), `maxisign` is maxi with the sign bit flipped so
 * positive patterns above max are detected after the same flip on `a`.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    return a > mini            ? mini
         : (a ^ (1U << 31)) > maxisign ? maxi
         : a;
}
2484 
/**
 * Clip len floats (len assumed to be a multiple of 8) into [*min, *max]
 * for the case min < 0 < max, operating on the raw bit patterns via
 * clipf_c_one. Note: the float->uint32 reinterpretation uses pointer
 * casts, as elsewhere in this file.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, j;
    const uint32_t mini     = *(const uint32_t*)min;
    const uint32_t maxi     = *(const uint32_t*)max;
    const uint32_t maxisign = maxi ^ (1U << 31);
    uint32_t *dsti          = (uint32_t*)dst;
    const uint32_t *srci    = (const uint32_t*)src;

    for (i = 0; i < len; i += 8)
        for (j = 0; j < 8; j++)
            dsti[i + j] = clipf_c_one(srci[i + j], mini, maxi, maxisign);
}
/**
 * Clip len floats (len assumed to be a multiple of 8) into [min, max].
 * When the range straddles zero the bit-pattern fast path is used;
 * otherwise a plain av_clipf loop runs.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i, j;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (i = 0; i < len; i += 8)
        for (j = 0; j < 8; j++)
            dst[i + j] = av_clipf(src[i + j], min, max);
}
2520 
/**
 * Dot product of two int16 vectors of `order` elements.
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int i;
    int32_t acc = 0;

    for (i = 0; i < order; i++)
        acc += v1[i] * v2[i];

    return acc;
}
2530 
/**
 * Fused dot product and multiply-accumulate: returns v1 . v2 (using the
 * values of v1 before modification) while updating v1 += mul * v3 in place.
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int i;
    int32_t acc = 0;

    for (i = 0; i < order; i++) {
        acc   += v1[i] * v2[i];   /* read v1 before it is updated */
        v1[i] += mul * v3[i];
    }
    return acc;
}
2540 
/**
 * Apply a symmetric Q15 window to len samples: the first half of the
 * window is mirrored onto the second half of the input. Each product is
 * rounded (+1<<14) and shifted back to Q0.
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    unsigned int i, j;

    for (i = 0, j = len - 1; i < len / 2; i++, j--) {
        const int w = window[i];
        output[i] = (input[i] * w + (1 << 14)) >> 15;
        output[j] = (input[j] * w + (1 << 14)) >> 15;
    }
}
2553 
/**
 * Clip len int32 samples into [min, max].
 * len is assumed to be a positive multiple of 8 (the loop always
 * processes a full group of 8 before re-testing).
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;
        for (k = 0; k < 8; k++)
            *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}
2569 
/* JPEG reference IDCT wrappers: inverse transform the block in place,
 * then clamp the result into dest (put) or add it to dest (add). */
static void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2580 
/* 4x4 reduced-resolution variants of the reference IDCT wrappers
 * (used for lowres==1 decoding, see ff_dsputil_init). */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
2591 
/* 2x2 reduced-resolution variants of the reference IDCT wrappers
 * (used for lowres==2 decoding, see ff_dsputil_init). */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
2602 
/* 1x1 (DC-only) IDCT: the reconstructed sample is (DC + 4) >> 3,
 * clamped to 8 bits; "add" accumulates onto the existing pixel. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
{
    dest[0] = av_clip_uint8((block[0] + 4)>>3);
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
{
    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
}
2611 
/* init static data */
/* NOTE(review): the function header (presumably ff_dsputil_static_init)
 * appears to have been lost in this extraction; the body below fills
 * static lookup tables once at startup. */
{
    int i;

    /* ff_squareTbl[i] = (i-256)^2, so (squareTbl+256)[d] squares a signed
       pixel difference d in [-256, 255] without a multiply */
    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    /* inverse zigzag scan table, stored off by one so 0 can mean "unset" */
    for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
2623 
    /* NOTE(review): the function header (ff_check_alignment) appears to
     * have been lost in this extraction. The body verifies that the
     * compiler actually honors 16-byte stack alignment; SIMD code would
     * crash or crawl otherwise. Returns 0 on success, -1 on failure,
     * warning only once. */
    static int did_fail=0;
    LOCAL_ALIGNED_16(int, aligned, [4]);

    if((intptr_t)aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            /* NOTE(review): the opening of the av_log(...) call was elided
             * by the extraction; only its message strings remain below. */
            "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
            "and may be very slow or crash. This is not a bug in libavcodec,\n"
            "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
            "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
2643 
2645 {
2647 
2648 #if CONFIG_ENCODERS
2649  if (avctx->bits_per_raw_sample == 10) {
2652  } else {
2653  if(avctx->dct_algo==FF_DCT_FASTINT) {
2654  c->fdct = ff_fdct_ifast;
2656  }
2657  else if(avctx->dct_algo==FF_DCT_FAAN) {
2658  c->fdct = ff_faandct;
2659  c->fdct248 = ff_faandct248;
2660  }
2661  else {
2662  c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2664  }
2665  }
2666 #endif //CONFIG_ENCODERS
2667 
2668  if(avctx->lowres==1){
2671  c->idct = ff_j_rev_dct4;
2673  }else if(avctx->lowres==2){
2676  c->idct = ff_j_rev_dct2;
2678  }else if(avctx->lowres==3){
2681  c->idct = ff_j_rev_dct1;
2683  }else{
2684  if (avctx->bits_per_raw_sample == 10) {
2687  c->idct = ff_simple_idct_10;
2689  } else {
2690  if(avctx->idct_algo==FF_IDCT_INT){
2693  c->idct = ff_j_rev_dct;
2695  }else if(avctx->idct_algo==FF_IDCT_FAAN){
2698  c->idct = ff_faanidct;
2700  }else{ //accurate/default
2703  c->idct = ff_simple_idct_8;
2705  }
2706  }
2707  }
2708 
2714  c->gmc1 = gmc1_c;
2715  c->gmc = ff_gmc_c;
2716  c->pix_sum = pix_sum_c;
2717  c->pix_norm1 = pix_norm1_c;
2718 
2720  c->fill_block_tab[1] = fill_block8_c;
2721 
2722  /* TODO [0] 16 [1] 8 */
2723  c->pix_abs[0][0] = pix_abs16_c;
2724  c->pix_abs[0][1] = pix_abs16_x2_c;
2725  c->pix_abs[0][2] = pix_abs16_y2_c;
2726  c->pix_abs[0][3] = pix_abs16_xy2_c;
2727  c->pix_abs[1][0] = pix_abs8_c;
2728  c->pix_abs[1][1] = pix_abs8_x2_c;
2729  c->pix_abs[1][2] = pix_abs8_y2_c;
2730  c->pix_abs[1][3] = pix_abs8_xy2_c;
2731 
2741 
2751 
2752 #define dspfunc(PFX, IDX, NUM) \
2753  c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2754  c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2755  c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2756  c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2757  c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2758  c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2759  c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2760  c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2761  c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2762  c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2763  c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2764  c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2765  c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2766  c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2767  c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2768  c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2769 
2770  dspfunc(put_qpel, 0, 16);
2771  dspfunc(put_no_rnd_qpel, 0, 16);
2772 
2773  dspfunc(avg_qpel, 0, 16);
2774  /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2775 
2776  dspfunc(put_qpel, 1, 8);
2777  dspfunc(put_no_rnd_qpel, 1, 8);
2778 
2779  dspfunc(avg_qpel, 1, 8);
2780  /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2781 
2782 #undef dspfunc
2783 
2792 
2793 #define SET_CMP_FUNC(name) \
2794  c->name[0]= name ## 16_c;\
2795  c->name[1]= name ## 8x8_c;
2796 
2797  SET_CMP_FUNC(hadamard8_diff)
2798  c->hadamard8_diff[4]= hadamard8_intra16_c;
2800  SET_CMP_FUNC(dct_sad)
2801  SET_CMP_FUNC(dct_max)
2802 #if CONFIG_GPL
2803  SET_CMP_FUNC(dct264_sad)
2804 #endif
2805  c->sad[0]= pix_abs16_c;
2806  c->sad[1]= pix_abs8_c;
2807  c->sse[0]= sse16_c;
2808  c->sse[1]= sse8_c;
2809  c->sse[2]= sse4_c;
2810  SET_CMP_FUNC(quant_psnr)
2811  SET_CMP_FUNC(rd)
2812  SET_CMP_FUNC(bit)
2813  c->vsad[0]= vsad16_c;
2814  c->vsad[4]= vsad_intra16_c;
2815  c->vsad[5]= vsad_intra8_c;
2816  c->vsse[0]= vsse16_c;
2817  c->vsse[4]= vsse_intra16_c;
2818  c->vsse[5]= vsse_intra8_c;
2819  c->nsse[0]= nsse16_c;
2820  c->nsse[1]= nsse8_c;
2821 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2823 #endif
2824 
2826 
2827  c->add_bytes= add_bytes_c;
2833  c->bswap_buf= bswap_buf;
2834  c->bswap16_buf = bswap16_buf;
2835 
2836  if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2839  }
2840 
2842 
2845 
2851 
2852  c->shrink[0]= av_image_copy_plane;
2853  c->shrink[1]= ff_shrink22;
2854  c->shrink[2]= ff_shrink44;
2855  c->shrink[3]= ff_shrink88;
2856 
2858 
2859 #define hpel_funcs(prefix, idx, num) \
2860  c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \
2861  c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \
2862  c->prefix ## _pixels_tab idx [2] = prefix ## _pixels ## num ## _y2_8_c; \
2863  c->prefix ## _pixels_tab idx [3] = prefix ## _pixels ## num ## _xy2_8_c
2864 
2865  hpel_funcs(put, [0], 16);
2866  hpel_funcs(put, [1], 8);
2867  hpel_funcs(put, [2], 4);
2868  hpel_funcs(put, [3], 2);
2869  hpel_funcs(put_no_rnd, [0], 16);
2870  hpel_funcs(put_no_rnd, [1], 8);
2871  hpel_funcs(avg, [0], 16);
2872  hpel_funcs(avg, [1], 8);
2873  hpel_funcs(avg, [2], 4);
2874  hpel_funcs(avg, [3], 2);
2875  hpel_funcs(avg_no_rnd,, 16);
2876 
2877 #undef FUNC
2878 #undef FUNCC
2879 #define FUNC(f, depth) f ## _ ## depth
2880 #define FUNCC(f, depth) f ## _ ## depth ## _c
2881 
2882 #define BIT_DEPTH_FUNCS(depth, dct)\
2883  c->get_pixels = FUNCC(get_pixels ## dct , depth);\
2884  c->draw_edges = FUNCC(draw_edges , depth);\
2885  c->clear_block = FUNCC(clear_block ## dct , depth);\
2886  c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
2887 
2888  switch (avctx->bits_per_raw_sample) {
2889  case 9:
2890  if (c->dct_bits == 32) {
2891  BIT_DEPTH_FUNCS(9, _32);
2892  } else {
2893  BIT_DEPTH_FUNCS(9, _16);
2894  }
2895  break;
2896  case 10:
2897  if (c->dct_bits == 32) {
2898  BIT_DEPTH_FUNCS(10, _32);
2899  } else {
2900  BIT_DEPTH_FUNCS(10, _16);
2901  }
2902  break;
2903  case 12:
2904  if (c->dct_bits == 32) {
2905  BIT_DEPTH_FUNCS(12, _32);
2906  } else {
2907  BIT_DEPTH_FUNCS(12, _16);
2908  }
2909  break;
2910  case 14:
2911  if (c->dct_bits == 32) {
2912  BIT_DEPTH_FUNCS(14, _32);
2913  } else {
2914  BIT_DEPTH_FUNCS(14, _16);
2915  }
2916  break;
2917  default:
2918  if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2919  BIT_DEPTH_FUNCS(8, _16);
2920  }
2921  break;
2922  }
2923 
2924 
2925  if (HAVE_MMX) ff_dsputil_init_mmx (c, avctx);
2926  if (ARCH_ARM) ff_dsputil_init_arm (c, avctx);
2927  if (HAVE_VIS) ff_dsputil_init_vis (c, avctx);
2928  if (ARCH_ALPHA) ff_dsputil_init_alpha (c, avctx);
2929  if (ARCH_PPC) ff_dsputil_init_ppc (c, avctx);
2930  if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
2931  if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
2932 
2935 }
2936 
2938 {
2939  ff_dsputil_init(c, avctx);
2940 }