00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00030 #include "avcodec.h"
00031 #include "dsputil.h"
00032 #include "simple_idct.h"
00033 #include "faandct.h"
00034 #include "faanidct.h"
00035 #include "mathops.h"
00036 #include "h263.h"
00037 #include "snow.h"
00038
00039
00040 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
00041
00042
00043 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
00044
00045
00046 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
00047
00048
00049 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
00050
00051
00052 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
00053
00054
00055 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
00056
/* Clamp table: ff_cropTbl[MAX_NEG_CROP + x] == av_clip_uint8(x).
 * Declared zeroed here; presumably filled at runtime by the dsputil init
 * code -- TODO confirm the initializer location (not visible in this chunk). */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares of -256..255, indexed as sq = ff_squareTbl + 256 so negative
 * differences index correctly; also presumably filled at runtime. */
uint32_t ff_squareTbl[512] = {0, };


/* 0x7f (resp. 0x80) replicated into every byte of a native unsigned long,
 * for byte-parallel SIMD-within-a-register tricks. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
00063
/* Standard zigzag scan order for an 8x8 coefficient block (as used by
 * JPEG/MPEG progressive-frame coding): entry i is the raster index of the
 * i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
00074
00075
00076
/* Alternate zigzag ordering; the name suggests the 2-4-8 scan used for
 * field/interlaced (2x4x8) DCT blocks -- TODO confirm against the callers. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
00087
00088
/* 16-bit inverse zigzag mapping, 8-byte aligned (name/alignment suggest MMX
 * use); zeroed here and presumably filled at runtime -- TODO confirm. */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
00090
/* Alternate horizontally-biased scan order for an 8x8 block (companion to
 * ff_alternate_vertical_scan below). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
00101
/* Alternate vertically-biased scan order for an 8x8 block (companion to
 * ff_alternate_horizontal_scan above). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
00112
00113
/* ff_inverse[i] == ceil(2^32 / i) for i >= 2 (entry 1 saturates to 2^32-1,
 * entry 0 is unused), enabling division by a small constant to be done as a
 * multiply + shift.  The values below match ceil(2^32/i) exactly, e.g.
 * ff_inverse[3] == 1431655766. */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
00148
00149
/* Coefficient permutation for the "simple" MMX IDCT -- NOTE(review): purpose
 * inferred from the name; each entry maps a raster position to the IDCT's
 * internal coefficient ordering.  TODO confirm against the MMX IDCT code. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
00160
/* Row permutation for the SSE2 IDCT -- inferred from the name; TODO confirm. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
00162
00163 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
00164 int i;
00165 int end;
00166
00167 st->scantable= src_scantable;
00168
00169 for(i=0; i<64; i++){
00170 int j;
00171 j = src_scantable[i];
00172 st->permutated[i] = permutation[j];
00173 #if ARCH_PPC
00174 st->inverse[j] = i;
00175 #endif
00176 }
00177
00178 end=-1;
00179 for(i=0; i<64; i++){
00180 int j;
00181 j = st->permutated[i];
00182 if(j>end) end=j;
00183 st->raster_end[i]= end;
00184 }
00185 }
00186
/* Sum of all 256 pixel values of a 16x16 block.
 * pix points at the top-left pixel; consecutive rows are line_size bytes
 * apart.  Returns the plain arithmetic sum. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
00208
/* Sum of squares of all 256 pixels in a 16x16 block (the L2 "norm" before
 * the square root).  Squares come from the ff_squareTbl lookup, biased by
 * 256 so that negative indices would also be valid (only 0..255 occurs here).
 * The active variants read eight pixels per iteration through a single
 * uint64_t (LP64) or two uint32_t (ILP32) loads and square each byte.
 * NOTE(review): those word loads assume pix is suitably aligned and are a
 * strict-aliasing cast; kept as-is to match the file's conventions. */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* disabled byte-at-a-time reference version */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit long: one 8-byte load, square each extracted byte */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit long: two 4-byte loads */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        /* advance to the next row (16 bytes already consumed) */
        pix += line_size - 16;
    }
    return s;
}
00256
/* Byte-swap w 32-bit words from src into dst (element-wise, so dst == src
 * works too).  The main loop is unrolled eight words at a time; a scalar
 * tail loop handles the remainder. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int n = 0;

    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
00274
00275 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
00276 {
00277 int s, i;
00278 uint32_t *sq = ff_squareTbl + 256;
00279
00280 s = 0;
00281 for (i = 0; i < h; i++) {
00282 s += sq[pix1[0] - pix2[0]];
00283 s += sq[pix1[1] - pix2[1]];
00284 s += sq[pix1[2] - pix2[2]];
00285 s += sq[pix1[3] - pix2[3]];
00286 pix1 += line_size;
00287 pix2 += line_size;
00288 }
00289 return s;
00290 }
00291
00292 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
00293 {
00294 int s, i;
00295 uint32_t *sq = ff_squareTbl + 256;
00296
00297 s = 0;
00298 for (i = 0; i < h; i++) {
00299 s += sq[pix1[0] - pix2[0]];
00300 s += sq[pix1[1] - pix2[1]];
00301 s += sq[pix1[2] - pix2[2]];
00302 s += sq[pix1[3] - pix2[3]];
00303 s += sq[pix1[4] - pix2[4]];
00304 s += sq[pix1[5] - pix2[5]];
00305 s += sq[pix1[6] - pix2[6]];
00306 s += sq[pix1[7] - pix2[7]];
00307 pix1 += line_size;
00308 pix2 += line_size;
00309 }
00310 return s;
00311 }
00312
00313 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
00314 {
00315 int s, i;
00316 uint32_t *sq = ff_squareTbl + 256;
00317
00318 s = 0;
00319 for (i = 0; i < h; i++) {
00320 s += sq[pix1[ 0] - pix2[ 0]];
00321 s += sq[pix1[ 1] - pix2[ 1]];
00322 s += sq[pix1[ 2] - pix2[ 2]];
00323 s += sq[pix1[ 3] - pix2[ 3]];
00324 s += sq[pix1[ 4] - pix2[ 4]];
00325 s += sq[pix1[ 5] - pix2[ 5]];
00326 s += sq[pix1[ 6] - pix2[ 6]];
00327 s += sq[pix1[ 7] - pix2[ 7]];
00328 s += sq[pix1[ 8] - pix2[ 8]];
00329 s += sq[pix1[ 9] - pix2[ 9]];
00330 s += sq[pix1[10] - pix2[10]];
00331 s += sq[pix1[11] - pix2[11]];
00332 s += sq[pix1[12] - pix2[12]];
00333 s += sq[pix1[13] - pix2[13]];
00334 s += sq[pix1[14] - pix2[14]];
00335 s += sq[pix1[15] - pix2[15]];
00336
00337 pix1 += line_size;
00338 pix2 += line_size;
00339 }
00340 return s;
00341 }
00342
00343
00344 #if CONFIG_SNOW_ENCODER //dwt is in snow.c
/* Wavelet-domain distortion metric for the snow encoder: take the pix1-pix2
 * difference (scaled by 16), run a spatial DWT over it, and return the sum
 * of absolute subband coefficients, each weighted by an empirical
 * per-subband scale factor.  w selects the block width (8 -> 3 decomposition
 * levels, otherwise 4) and must equal h (asserted below).  type is passed to
 * ff_spatial_dwt; the w53_*/w97_* wrapper names suggest type 1 = 5/3 and
 * type 0 = 9/7 wavelets -- TODO confirm against snow.c. */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;  /* number of DWT decomposition levels */
    int tmp[32*32];                     /* difference buffer, fixed stride 32 */
    int level, ori;
    /* scale[type][dec_count-3][level][orientation]; the 3-level entries list
     * only 3 rows, the 4th row is implicitly zero-initialized */
    static const int scale[2][2][4][4]={
      {
        {
            /* 3 decomposition levels (8x8 blocks) */
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            /* 4 decomposition levels */
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            /* 3 decomposition levels */
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            /* 4 decomposition levels */
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* load the scaled difference into the stride-32 temporary.
     * NOTE(review): <<4 on a possibly negative int is formally UB in C;
     * kept as-is since it matches the file's conventions. */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    /* walk every subband: ori 0 is the LL band (level 0 only), ori&1 selects
     * the right half (sx), ori&2 the bottom half (sy) at each level */
    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);     /* subband edge length */
            int sx= (ori&1) ? size : 0;         /* horizontal subband offset */
            int stride= 32<<(dec_count-level);  /* row stride at this level */
            int sy= (ori&2) ? stride>>1 : 0;    /* vertical subband offset */

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;   /* undo the <<4 input scaling and the weight magnitudes */
}
00413
/* Thin wrappers binding w_c() to the comparison-function signature:
 * wNN_S_c scores an S-pixel-wide block; w53_* pass type 1 and w97_* type 0
 * (names suggest 5/3 resp. 9/7 wavelets -- TODO confirm against snow.c). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

/* the 32-pixel variants have external linkage -- presumably referenced
 * from outside this file */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
00437 #endif
00438
00439
00440
/* Replicate the border pixels of a width x height image outwards by w pixels
 * on every side.  buf points at the top-left image pixel and rows are wrap
 * bytes apart; the caller must provide w extra rows/columns around buf. */
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *bottom = buf + (height - 1) * wrap;  /* last image row */
    uint8_t *row;
    int k;

    /* top and bottom margins: copy the first/last row outwards */
    for (k = 1; k <= w; k++) {
        memcpy(buf    - k * wrap, buf,    width);
        memcpy(bottom + k * wrap, bottom, width);
    }

    /* left and right margins: replicate each row's first/last pixel */
    row = buf;
    for (k = 0; k < height; k++) {
        memset(row - w,     row[0],         w);
        memset(row + width, row[width - 1], w);
        row += wrap;
    }

    /* the four corners: fill with the nearest corner pixel */
    for (k = 1; k <= w; k++) {
        memset(buf    - k * wrap - w,     buf[0],            w);
        memset(buf    - k * wrap + width, buf[width - 1],    w);
        memset(bottom + k * wrap - w,     bottom[0],         w);
        memset(bottom + k * wrap + width, bottom[width - 1], w);
    }
}
00467
/**
 * Copy a block_w x block_h block into buf, replicating edge pixels for the
 * parts that fall outside the w x h picture.  (src_x, src_y) is the block's
 * position and src already points at that position, so in-range pixels are
 * read at src[x + y*linesize].  buf and src share the same linesize.
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int copy_x0, copy_y0, copy_x1, copy_y1;

    /* pull completely-outside positions back so that at least one source
     * row/column overlaps the picture */
    if (src_y >= h) {
        src   += (h - 1 - src_y) * linesize;
        src_y  = h - 1;
    } else if (src_y <= -block_h) {
        src   += (1 - block_h - src_y) * linesize;
        src_y  = 1 - block_h;
    }
    if (src_x >= w) {
        src   += w - 1 - src_x;
        src_x  = w - 1;
    } else if (src_x <= -block_w) {
        src   += 1 - block_w - src_x;
        src_x  = 1 - block_w;
    }

    /* in-picture sub-rectangle of the block, in block coordinates */
    copy_y0 = -src_y > 0 ? -src_y : 0;
    copy_x0 = -src_x > 0 ? -src_x : 0;
    copy_y1 = h - src_y < block_h ? h - src_y : block_h;
    copy_x1 = w - src_x < block_w ? w - src_x : block_w;

    /* copy the part of the block that lies inside the picture */
    for (y = copy_y0; y < copy_y1; y++)
        for (x = copy_x0; x < copy_x1; x++)
            buf[x + y*linesize] = src[x + y*linesize];

    /* replicate the first copied row upwards */
    for (y = 0; y < copy_y0; y++)
        for (x = copy_x0; x < copy_x1; x++)
            buf[x + y*linesize] = buf[x + copy_y0*linesize];

    /* replicate the last copied row downwards */
    for (y = copy_y1; y < block_h; y++)
        for (x = copy_x0; x < copy_x1; x++)
            buf[x + y*linesize] = buf[x + (copy_y1 - 1)*linesize];

    /* replicate the leftmost/rightmost filled columns sideways */
    for (y = 0; y < block_h; y++) {
        for (x = 0; x < copy_x0; x++)
            buf[x + y*linesize] = buf[copy_x0 + y*linesize];
        for (x = copy_x1; x < block_w; x++)
            buf[x + y*linesize] = buf[copy_x1 - 1 + y*linesize];
    }
}
00538
00539 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
00540 {
00541 int i;
00542
00543
00544 for(i=0;i<8;i++) {
00545 block[0] = pixels[0];
00546 block[1] = pixels[1];
00547 block[2] = pixels[2];
00548 block[3] = pixels[3];
00549 block[4] = pixels[4];
00550 block[5] = pixels[5];
00551 block[6] = pixels[6];
00552 block[7] = pixels[7];
00553 pixels += line_size;
00554 block += 8;
00555 }
00556 }
00557
00558 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
00559 const uint8_t *s2, int stride){
00560 int i;
00561
00562
00563 for(i=0;i<8;i++) {
00564 block[0] = s1[0] - s2[0];
00565 block[1] = s1[1] - s2[1];
00566 block[2] = s1[2] - s2[2];
00567 block[3] = s1[3] - s2[3];
00568 block[4] = s1[4] - s2[4];
00569 block[5] = s1[5] - s2[5];
00570 block[6] = s1[6] - s2[6];
00571 block[7] = s1[7] - s2[7];
00572 s1 += stride;
00573 s2 += stride;
00574 block += 8;
00575 }
00576 }
00577
00578
00579 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
00580 int line_size)
00581 {
00582 int i;
00583 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00584
00585
00586 for(i=0;i<8;i++) {
00587 pixels[0] = cm[block[0]];
00588 pixels[1] = cm[block[1]];
00589 pixels[2] = cm[block[2]];
00590 pixels[3] = cm[block[3]];
00591 pixels[4] = cm[block[4]];
00592 pixels[5] = cm[block[5]];
00593 pixels[6] = cm[block[6]];
00594 pixels[7] = cm[block[7]];
00595
00596 pixels += line_size;
00597 block += 8;
00598 }
00599 }
00600
00601 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
00602 int line_size)
00603 {
00604 int i;
00605 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00606
00607
00608 for(i=0;i<4;i++) {
00609 pixels[0] = cm[block[0]];
00610 pixels[1] = cm[block[1]];
00611 pixels[2] = cm[block[2]];
00612 pixels[3] = cm[block[3]];
00613
00614 pixels += line_size;
00615 block += 8;
00616 }
00617 }
00618
00619 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
00620 int line_size)
00621 {
00622 int i;
00623 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00624
00625
00626 for(i=0;i<2;i++) {
00627 pixels[0] = cm[block[0]];
00628 pixels[1] = cm[block[1]];
00629
00630 pixels += line_size;
00631 block += 8;
00632 }
00633 }
00634
00635 static void put_signed_pixels_clamped_c(const DCTELEM *block,
00636 uint8_t *restrict pixels,
00637 int line_size)
00638 {
00639 int i, j;
00640
00641 for (i = 0; i < 8; i++) {
00642 for (j = 0; j < 8; j++) {
00643 if (*block < -128)
00644 *pixels = 0;
00645 else if (*block > 127)
00646 *pixels = 255;
00647 else
00648 *pixels = (uint8_t)(*block + 128);
00649 block++;
00650 pixels++;
00651 }
00652 pixels += (line_size - 8);
00653 }
00654 }
00655
00656 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
00657 int line_size)
00658 {
00659 int i;
00660 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00661
00662
00663 for(i=0;i<8;i++) {
00664 pixels[0] = cm[pixels[0] + block[0]];
00665 pixels[1] = cm[pixels[1] + block[1]];
00666 pixels[2] = cm[pixels[2] + block[2]];
00667 pixels[3] = cm[pixels[3] + block[3]];
00668 pixels[4] = cm[pixels[4] + block[4]];
00669 pixels[5] = cm[pixels[5] + block[5]];
00670 pixels[6] = cm[pixels[6] + block[6]];
00671 pixels[7] = cm[pixels[7] + block[7]];
00672 pixels += line_size;
00673 block += 8;
00674 }
00675 }
00676
00677 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
00678 int line_size)
00679 {
00680 int i;
00681 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00682
00683
00684 for(i=0;i<4;i++) {
00685 pixels[0] = cm[pixels[0] + block[0]];
00686 pixels[1] = cm[pixels[1] + block[1]];
00687 pixels[2] = cm[pixels[2] + block[2]];
00688 pixels[3] = cm[pixels[3] + block[3]];
00689 pixels += line_size;
00690 block += 8;
00691 }
00692 }
00693
00694 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
00695 int line_size)
00696 {
00697 int i;
00698 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00699
00700
00701 for(i=0;i<2;i++) {
00702 pixels[0] = cm[pixels[0] + block[0]];
00703 pixels[1] = cm[pixels[1] + block[1]];
00704 pixels += line_size;
00705 block += 8;
00706 }
00707 }
00708
00709 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
00710 {
00711 int i;
00712 for(i=0;i<8;i++) {
00713 pixels[0] += block[0];
00714 pixels[1] += block[1];
00715 pixels[2] += block[2];
00716 pixels[3] += block[3];
00717 pixels[4] += block[4];
00718 pixels[5] += block[5];
00719 pixels[6] += block[6];
00720 pixels[7] += block[7];
00721 pixels += line_size;
00722 block += 8;
00723 }
00724 }
00725
00726 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
00727 {
00728 int i;
00729 for(i=0;i<4;i++) {
00730 pixels[0] += block[0];
00731 pixels[1] += block[1];
00732 pixels[2] += block[2];
00733 pixels[3] += block[3];
00734 pixels += line_size;
00735 block += 4;
00736 }
00737 }
00738
00739 static int sum_abs_dctelem_c(DCTELEM *block)
00740 {
00741 int sum=0, i;
00742 for(i=0; i<64; i++)
00743 sum+= FFABS(block[i]);
00744 return sum;
00745 }
00746
00747 #if 0
00748
00749 #define PIXOP2(OPNAME, OP) \
00750 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00751 {\
00752 int i;\
00753 for(i=0; i<h; i++){\
00754 OP(*((uint64_t*)block), AV_RN64(pixels));\
00755 pixels+=line_size;\
00756 block +=line_size;\
00757 }\
00758 }\
00759 \
00760 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00761 {\
00762 int i;\
00763 for(i=0; i<h; i++){\
00764 const uint64_t a= AV_RN64(pixels );\
00765 const uint64_t b= AV_RN64(pixels+1);\
00766 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
00767 pixels+=line_size;\
00768 block +=line_size;\
00769 }\
00770 }\
00771 \
00772 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00773 {\
00774 int i;\
00775 for(i=0; i<h; i++){\
00776 const uint64_t a= AV_RN64(pixels );\
00777 const uint64_t b= AV_RN64(pixels+1);\
00778 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
00779 pixels+=line_size;\
00780 block +=line_size;\
00781 }\
00782 }\
00783 \
00784 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00785 {\
00786 int i;\
00787 for(i=0; i<h; i++){\
00788 const uint64_t a= AV_RN64(pixels );\
00789 const uint64_t b= AV_RN64(pixels+line_size);\
00790 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
00791 pixels+=line_size;\
00792 block +=line_size;\
00793 }\
00794 }\
00795 \
00796 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00797 {\
00798 int i;\
00799 for(i=0; i<h; i++){\
00800 const uint64_t a= AV_RN64(pixels );\
00801 const uint64_t b= AV_RN64(pixels+line_size);\
00802 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
00803 pixels+=line_size;\
00804 block +=line_size;\
00805 }\
00806 }\
00807 \
00808 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00809 {\
00810 int i;\
00811 const uint64_t a= AV_RN64(pixels );\
00812 const uint64_t b= AV_RN64(pixels+1);\
00813 uint64_t l0= (a&0x0303030303030303ULL)\
00814 + (b&0x0303030303030303ULL)\
00815 + 0x0202020202020202ULL;\
00816 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00817 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00818 uint64_t l1,h1;\
00819 \
00820 pixels+=line_size;\
00821 for(i=0; i<h; i+=2){\
00822 uint64_t a= AV_RN64(pixels );\
00823 uint64_t b= AV_RN64(pixels+1);\
00824 l1= (a&0x0303030303030303ULL)\
00825 + (b&0x0303030303030303ULL);\
00826 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00827 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00828 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
00829 pixels+=line_size;\
00830 block +=line_size;\
00831 a= AV_RN64(pixels );\
00832 b= AV_RN64(pixels+1);\
00833 l0= (a&0x0303030303030303ULL)\
00834 + (b&0x0303030303030303ULL)\
00835 + 0x0202020202020202ULL;\
00836 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00837 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00838 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
00839 pixels+=line_size;\
00840 block +=line_size;\
00841 }\
00842 }\
00843 \
00844 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00845 {\
00846 int i;\
00847 const uint64_t a= AV_RN64(pixels );\
00848 const uint64_t b= AV_RN64(pixels+1);\
00849 uint64_t l0= (a&0x0303030303030303ULL)\
00850 + (b&0x0303030303030303ULL)\
00851 + 0x0101010101010101ULL;\
00852 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00853 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00854 uint64_t l1,h1;\
00855 \
00856 pixels+=line_size;\
00857 for(i=0; i<h; i+=2){\
00858 uint64_t a= AV_RN64(pixels );\
00859 uint64_t b= AV_RN64(pixels+1);\
00860 l1= (a&0x0303030303030303ULL)\
00861 + (b&0x0303030303030303ULL);\
00862 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00863 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00864 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
00865 pixels+=line_size;\
00866 block +=line_size;\
00867 a= AV_RN64(pixels );\
00868 b= AV_RN64(pixels+1);\
00869 l0= (a&0x0303030303030303ULL)\
00870 + (b&0x0303030303030303ULL)\
00871 + 0x0101010101010101ULL;\
00872 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00873 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00874 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
00875 pixels+=line_size;\
00876 block +=line_size;\
00877 }\
00878 }\
00879 \
00880 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
00881 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
00882 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
00883 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
00884 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
00885 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
00886 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
00887
00888 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
00889 #else // 64 bit variant
00890
00891 #define PIXOP2(OPNAME, OP) \
00892 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00893 int i;\
00894 for(i=0; i<h; i++){\
00895 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
00896 pixels+=line_size;\
00897 block +=line_size;\
00898 }\
00899 }\
00900 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00901 int i;\
00902 for(i=0; i<h; i++){\
00903 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
00904 pixels+=line_size;\
00905 block +=line_size;\
00906 }\
00907 }\
00908 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00909 int i;\
00910 for(i=0; i<h; i++){\
00911 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
00912 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
00913 pixels+=line_size;\
00914 block +=line_size;\
00915 }\
00916 }\
00917 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00918 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
00919 }\
00920 \
00921 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00922 int src_stride1, int src_stride2, int h){\
00923 int i;\
00924 for(i=0; i<h; i++){\
00925 uint32_t a,b;\
00926 a= AV_RN32(&src1[i*src_stride1 ]);\
00927 b= AV_RN32(&src2[i*src_stride2 ]);\
00928 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
00929 a= AV_RN32(&src1[i*src_stride1+4]);\
00930 b= AV_RN32(&src2[i*src_stride2+4]);\
00931 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
00932 }\
00933 }\
00934 \
00935 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00936 int src_stride1, int src_stride2, int h){\
00937 int i;\
00938 for(i=0; i<h; i++){\
00939 uint32_t a,b;\
00940 a= AV_RN32(&src1[i*src_stride1 ]);\
00941 b= AV_RN32(&src2[i*src_stride2 ]);\
00942 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
00943 a= AV_RN32(&src1[i*src_stride1+4]);\
00944 b= AV_RN32(&src2[i*src_stride2+4]);\
00945 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
00946 }\
00947 }\
00948 \
00949 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00950 int src_stride1, int src_stride2, int h){\
00951 int i;\
00952 for(i=0; i<h; i++){\
00953 uint32_t a,b;\
00954 a= AV_RN32(&src1[i*src_stride1 ]);\
00955 b= AV_RN32(&src2[i*src_stride2 ]);\
00956 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
00957 }\
00958 }\
00959 \
00960 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00961 int src_stride1, int src_stride2, int h){\
00962 int i;\
00963 for(i=0; i<h; i++){\
00964 uint32_t a,b;\
00965 a= AV_RN16(&src1[i*src_stride1 ]);\
00966 b= AV_RN16(&src2[i*src_stride2 ]);\
00967 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
00968 }\
00969 }\
00970 \
00971 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00972 int src_stride1, int src_stride2, int h){\
00973 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
00974 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
00975 }\
00976 \
00977 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00978 int src_stride1, int src_stride2, int h){\
00979 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
00980 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
00981 }\
00982 \
00983 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00984 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
00985 }\
00986 \
00987 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00988 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
00989 }\
00990 \
00991 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00992 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
00993 }\
00994 \
00995 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00996 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
00997 }\
00998 \
00999 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
01000 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
01001 int i;\
01002 for(i=0; i<h; i++){\
01003 uint32_t a, b, c, d, l0, l1, h0, h1;\
01004 a= AV_RN32(&src1[i*src_stride1]);\
01005 b= AV_RN32(&src2[i*src_stride2]);\
01006 c= AV_RN32(&src3[i*src_stride3]);\
01007 d= AV_RN32(&src4[i*src_stride4]);\
01008 l0= (a&0x03030303UL)\
01009 + (b&0x03030303UL)\
01010 + 0x02020202UL;\
01011 h0= ((a&0xFCFCFCFCUL)>>2)\
01012 + ((b&0xFCFCFCFCUL)>>2);\
01013 l1= (c&0x03030303UL)\
01014 + (d&0x03030303UL);\
01015 h1= ((c&0xFCFCFCFCUL)>>2)\
01016 + ((d&0xFCFCFCFCUL)>>2);\
01017 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01018 a= AV_RN32(&src1[i*src_stride1+4]);\
01019 b= AV_RN32(&src2[i*src_stride2+4]);\
01020 c= AV_RN32(&src3[i*src_stride3+4]);\
01021 d= AV_RN32(&src4[i*src_stride4+4]);\
01022 l0= (a&0x03030303UL)\
01023 + (b&0x03030303UL)\
01024 + 0x02020202UL;\
01025 h0= ((a&0xFCFCFCFCUL)>>2)\
01026 + ((b&0xFCFCFCFCUL)>>2);\
01027 l1= (c&0x03030303UL)\
01028 + (d&0x03030303UL);\
01029 h1= ((c&0xFCFCFCFCUL)>>2)\
01030 + ((d&0xFCFCFCFCUL)>>2);\
01031 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01032 }\
01033 }\
01034 \
01035 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
01036 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
01037 }\
01038 \
01039 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
01040 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
01041 }\
01042 \
01043 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
01044 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
01045 }\
01046 \
01047 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
01048 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
01049 }\
01050 \
01051 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
01052 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
01053 int i;\
01054 for(i=0; i<h; i++){\
01055 uint32_t a, b, c, d, l0, l1, h0, h1;\
01056 a= AV_RN32(&src1[i*src_stride1]);\
01057 b= AV_RN32(&src2[i*src_stride2]);\
01058 c= AV_RN32(&src3[i*src_stride3]);\
01059 d= AV_RN32(&src4[i*src_stride4]);\
01060 l0= (a&0x03030303UL)\
01061 + (b&0x03030303UL)\
01062 + 0x01010101UL;\
01063 h0= ((a&0xFCFCFCFCUL)>>2)\
01064 + ((b&0xFCFCFCFCUL)>>2);\
01065 l1= (c&0x03030303UL)\
01066 + (d&0x03030303UL);\
01067 h1= ((c&0xFCFCFCFCUL)>>2)\
01068 + ((d&0xFCFCFCFCUL)>>2);\
01069 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01070 a= AV_RN32(&src1[i*src_stride1+4]);\
01071 b= AV_RN32(&src2[i*src_stride2+4]);\
01072 c= AV_RN32(&src3[i*src_stride3+4]);\
01073 d= AV_RN32(&src4[i*src_stride4+4]);\
01074 l0= (a&0x03030303UL)\
01075 + (b&0x03030303UL)\
01076 + 0x01010101UL;\
01077 h0= ((a&0xFCFCFCFCUL)>>2)\
01078 + ((b&0xFCFCFCFCUL)>>2);\
01079 l1= (c&0x03030303UL)\
01080 + (d&0x03030303UL);\
01081 h1= ((c&0xFCFCFCFCUL)>>2)\
01082 + ((d&0xFCFCFCFCUL)>>2);\
01083 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01084 }\
01085 }\
01086 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
01087 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
01088 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
01089 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
01090 }\
01091 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
01092 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
01093 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
01094 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
01095 }\
01096 \
01097 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
01098 {\
01099 int i, a0, b0, a1, b1;\
01100 a0= pixels[0];\
01101 b0= pixels[1] + 2;\
01102 a0 += b0;\
01103 b0 += pixels[2];\
01104 \
01105 pixels+=line_size;\
01106 for(i=0; i<h; i+=2){\
01107 a1= pixels[0];\
01108 b1= pixels[1];\
01109 a1 += b1;\
01110 b1 += pixels[2];\
01111 \
01112 block[0]= (a1+a0)>>2; \
01113 block[1]= (b1+b0)>>2;\
01114 \
01115 pixels+=line_size;\
01116 block +=line_size;\
01117 \
01118 a0= pixels[0];\
01119 b0= pixels[1] + 2;\
01120 a0 += b0;\
01121 b0 += pixels[2];\
01122 \
01123 block[0]= (a1+a0)>>2;\
01124 block[1]= (b1+b0)>>2;\
01125 pixels+=line_size;\
01126 block +=line_size;\
01127 }\
01128 }\
01129 \
01130 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
01131 {\
01132 int i;\
01133 const uint32_t a= AV_RN32(pixels );\
01134 const uint32_t b= AV_RN32(pixels+1);\
01135 uint32_t l0= (a&0x03030303UL)\
01136 + (b&0x03030303UL)\
01137 + 0x02020202UL;\
01138 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
01139 + ((b&0xFCFCFCFCUL)>>2);\
01140 uint32_t l1,h1;\
01141 \
01142 pixels+=line_size;\
01143 for(i=0; i<h; i+=2){\
01144 uint32_t a= AV_RN32(pixels );\
01145 uint32_t b= AV_RN32(pixels+1);\
01146 l1= (a&0x03030303UL)\
01147 + (b&0x03030303UL);\
01148 h1= ((a&0xFCFCFCFCUL)>>2)\
01149 + ((b&0xFCFCFCFCUL)>>2);\
01150 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01151 pixels+=line_size;\
01152 block +=line_size;\
01153 a= AV_RN32(pixels );\
01154 b= AV_RN32(pixels+1);\
01155 l0= (a&0x03030303UL)\
01156 + (b&0x03030303UL)\
01157 + 0x02020202UL;\
01158 h0= ((a&0xFCFCFCFCUL)>>2)\
01159 + ((b&0xFCFCFCFCUL)>>2);\
01160 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01161 pixels+=line_size;\
01162 block +=line_size;\
01163 }\
01164 }\
01165 \
01166 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
01167 {\
01168 int j;\
01169 for(j=0; j<2; j++){\
01170 int i;\
01171 const uint32_t a= AV_RN32(pixels );\
01172 const uint32_t b= AV_RN32(pixels+1);\
01173 uint32_t l0= (a&0x03030303UL)\
01174 + (b&0x03030303UL)\
01175 + 0x02020202UL;\
01176 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
01177 + ((b&0xFCFCFCFCUL)>>2);\
01178 uint32_t l1,h1;\
01179 \
01180 pixels+=line_size;\
01181 for(i=0; i<h; i+=2){\
01182 uint32_t a= AV_RN32(pixels );\
01183 uint32_t b= AV_RN32(pixels+1);\
01184 l1= (a&0x03030303UL)\
01185 + (b&0x03030303UL);\
01186 h1= ((a&0xFCFCFCFCUL)>>2)\
01187 + ((b&0xFCFCFCFCUL)>>2);\
01188 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01189 pixels+=line_size;\
01190 block +=line_size;\
01191 a= AV_RN32(pixels );\
01192 b= AV_RN32(pixels+1);\
01193 l0= (a&0x03030303UL)\
01194 + (b&0x03030303UL)\
01195 + 0x02020202UL;\
01196 h0= ((a&0xFCFCFCFCUL)>>2)\
01197 + ((b&0xFCFCFCFCUL)>>2);\
01198 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01199 pixels+=line_size;\
01200 block +=line_size;\
01201 }\
01202 pixels+=4-line_size*(h+1);\
01203 block +=4-line_size*h;\
01204 }\
01205 }\
01206 \
01207 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
01208 {\
01209 int j;\
01210 for(j=0; j<2; j++){\
01211 int i;\
01212 const uint32_t a= AV_RN32(pixels );\
01213 const uint32_t b= AV_RN32(pixels+1);\
01214 uint32_t l0= (a&0x03030303UL)\
01215 + (b&0x03030303UL)\
01216 + 0x01010101UL;\
01217 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
01218 + ((b&0xFCFCFCFCUL)>>2);\
01219 uint32_t l1,h1;\
01220 \
01221 pixels+=line_size;\
01222 for(i=0; i<h; i+=2){\
01223 uint32_t a= AV_RN32(pixels );\
01224 uint32_t b= AV_RN32(pixels+1);\
01225 l1= (a&0x03030303UL)\
01226 + (b&0x03030303UL);\
01227 h1= ((a&0xFCFCFCFCUL)>>2)\
01228 + ((b&0xFCFCFCFCUL)>>2);\
01229 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01230 pixels+=line_size;\
01231 block +=line_size;\
01232 a= AV_RN32(pixels );\
01233 b= AV_RN32(pixels+1);\
01234 l0= (a&0x03030303UL)\
01235 + (b&0x03030303UL)\
01236 + 0x01010101UL;\
01237 h0= ((a&0xFCFCFCFCUL)>>2)\
01238 + ((b&0xFCFCFCFCUL)>>2);\
01239 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01240 pixels+=line_size;\
01241 block +=line_size;\
01242 }\
01243 pixels+=4-line_size*(h+1);\
01244 block +=4-line_size*h;\
01245 }\
01246 }\
01247 \
01248 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
01249 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
01250 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
01251 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
01252 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
01253 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
01254 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
01255 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
01256
/* Pixel-op kernels plugged into PIXOP2: "avg" rounds-averages the new value
 * into the destination, "put" overwrites it. */
#define op_avg(a, b) a = rnd_avg32(a, b)
/* NOTE(review): this #endif closes a word-size #if/#else opened above this
 * excerpt (32-bit vs 64-bit PIXOP2 variants) — confirm against full file. */
#endif
#define op_put(a, b) a = b

/* Instantiate the whole family of put_/avg_ pixel copy/average functions. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Scalar rounded averages of 2 and 4 byte values (used by pel helpers below). */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
01268
/* Adapter: expose the macro-generated three-stride put_no_rnd_pixels16_l2()
 * with the common single-stride signature used by function-pointer tables.
 * Presumably combines sources `a` and `b` without the rounding bias — see the
 * PIXOP2 _l2 helpers (defined above this excerpt) to confirm. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
01272
/* Adapter: same as put_no_rnd_pixels16_l2_c() but for 8-pixel-wide blocks;
 * forwards a single stride as all three stride arguments. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
01276
/*
 * One-point global motion compensation: bilinear interpolation of an
 * 8-pixel-wide block at a fixed 1/16-pel offset (x16, y16).
 * The four corner weights A..D always sum to 256, so the result is
 * renormalised with `rounder` and a >>8.
 *
 * dst/src  : destination / source pixels (top-left of the block)
 * stride   : line size of both buffers, in bytes
 * h        : number of rows to produce
 * x16, y16 : sub-pel position in 1/16ths of a pixel (0..15 useful range)
 * rounder  : rounding constant added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        /* blend the 2x2 neighbourhood of every output pixel */
        for (col = 0; col < 8; col++)
            dst[col] = (A * src[col]          + B * src[col + 1] +
                        C * src[stride + col] + D * src[stride + col + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
01299
/*
 * Global motion compensation with an affine per-pixel motion field.
 * For each destination pixel the source position advances by (dxx, dyx)
 * per column and (dxy, dyy) per row; the position is split into an integer
 * coordinate and a sub-pel fraction with denominator s = 1<<shift, and the
 * sample is bilinearly interpolated.  Coordinates outside [0, width/height)
 * are clamped edge-wise (av_clip), degrading to 1-D or nearest sampling.
 *
 * NOTE(review): ox/oy and the deltas appear to be fixed point with the
 * sub-pel fraction in bits [16, 16+shift) (see the vx>>16 then &(s-1)
 * sequence) — confirm the exact format against callers.
 *
 * r is the rounding constant applied before the final >>(shift*2).
 * Only 8 columns per row are produced (x loops 0..7).
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;   /* sub-pel denominator */

    /* turn sizes into last valid coordinates for the range checks below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);   /* low `shift` bits = sub-pel fraction */
            frac_y= src_y&(s-1);
            src_x>>=shift;         /* integer source coordinate */
            src_y>>=shift;

            /* unsigned compare also rejects negative coordinates */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear blend of the 2x2 neighbourhood */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                                         + src[index +1]* frac_x )*(s-frac_y)
                                       + ( src[index+stride ]*(s-frac_x)
                                         + src[index+stride+1]* frac_x )* frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate in x only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                                         + src[index +1]* frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate in y only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
                                         + src[index+stride ]* frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* outside both ways: nearest clamped sample, no filtering */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index ];
                }
            }

            vx+= dxx;   /* advance the motion vector along the row */
            vy+= dyx;
        }
        ox += dxy;      /* advance the row start position */
        oy += dyy;
    }
}
01357
/* Full-pel position (no interpolation): copy a width x height block by
 * dispatching to the macro-generated put_pixelsN_c routines.
 * Widths other than 2/4/8/16 fall through the switch and write nothing. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
01366
/* Horizontal 1/3-phase interpolation: out ~= (2*a + b + 1)/3 of each pixel
 * and its right neighbour; 683 ~= 2^11/3 turns the division into a shift. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (683 * (2*s[x] + s[x+1] + 1)) >> 11;
    }
}
01377
/* Horizontal 2/3-phase interpolation: out ~= (a + 2*b + 1)/3 of each pixel
 * and its right neighbour (mirror of mc10). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (683 * (s[x] + 2*s[x+1] + 1)) >> 11;
    }
}
01388
/* Vertical 1/3-phase interpolation: out ~= (2*top + bottom + 1)/3 of each
 * pixel and the one directly below it. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (683 * (2*s[x] + s[x+stride] + 1)) >> 11;
    }
}
01399
/* 2-D 1/3,1/3-phase interpolation over the 2x2 neighbourhood with weights
 * 4/3/3/2 (sum 12); 2731 ~= 2^15/12 implements the /12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (4*s[x] + 3*s[x+1] +
                            3*s[x+stride] + 2*s[x+stride+1] + 6)) >> 15;
    }
}
01410
/* 2-D 1/3,2/3-phase interpolation over the 2x2 neighbourhood with weights
 * 3/2/4/3 (sum 12), biased toward the bottom row. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (3*s[x] + 2*s[x+1] +
                            4*s[x+stride] + 3*s[x+stride+1] + 6)) >> 15;
    }
}
01421
/* Vertical 2/3-phase interpolation: out ~= (top + 2*bottom + 1)/3 of each
 * pixel and the one directly below it (mirror of mc01). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (683 * (s[x] + 2*s[x+stride] + 1)) >> 11;
    }
}
01432
/* 2-D 2/3,1/3-phase interpolation over the 2x2 neighbourhood with weights
 * 3/4/2/3 (sum 12), biased toward the right column. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (3*s[x] + 4*s[x+1] +
                            2*s[x+stride] + 3*s[x+stride+1] + 6)) >> 15;
    }
}
01443
/* 2-D 2/3,2/3-phase interpolation over the 2x2 neighbourhood with weights
 * 2/3/3/4 (sum 12), biased toward the bottom-right sample. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (2*s[x] + 3*s[x+1] +
                            3*s[x+stride] + 4*s[x+stride+1] + 6)) >> 15;
    }
}
01454
/* Full-pel position, averaging variant: rounds-averages the source block
 * into dst via the macro-generated avg_pixelsN_c routines.
 * Widths other than 2/4/8/16 fall through the switch and write nothing. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
01463
/* Averaging variant of put_tpel_pixels_mc10_c: compute the same horizontal
 * 1/3-phase prediction, then round-average it with the existing dst pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++) {
            int pred = (683 * (2*s[x] + s[x+1] + 1)) >> 11;
            d[x] = (d[x] + pred + 1) >> 1;
        }
    }
}
01474
/* Averaging variant of put_tpel_pixels_mc20_c: horizontal 2/3-phase
 * prediction, round-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++) {
            int pred = (683 * (s[x] + 2*s[x+1] + 1)) >> 11;
            d[x] = (d[x] + pred + 1) >> 1;
        }
    }
}
01485
/* Averaging variant of put_tpel_pixels_mc01_c: vertical 1/3-phase
 * prediction, round-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++) {
            int pred = (683 * (2*s[x] + s[x+stride] + 1)) >> 11;
            d[x] = (d[x] + pred + 1) >> 1;
        }
    }
}
01496
/* Averaging variant of put_tpel_pixels_mc11_c: 2-D 1/3,1/3-phase prediction
 * (weights 4/3/3/2), round-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++) {
            int pred = (2731 * (4*s[x] + 3*s[x+1] +
                                3*s[x+stride] + 2*s[x+stride+1] + 6)) >> 15;
            d[x] = (d[x] + pred + 1) >> 1;
        }
    }
}
01507
/* Averaging variant of put_tpel_pixels_mc12_c: 2-D 1/3,2/3-phase prediction
 * (weights 3/2/4/3), round-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++) {
            int pred = (2731 * (3*s[x] + 2*s[x+1] +
                                4*s[x+stride] + 3*s[x+stride+1] + 6)) >> 15;
            d[x] = (d[x] + pred + 1) >> 1;
        }
    }
}
01518
/* Averaging variant of put_tpel_pixels_mc02_c: vertical 2/3-phase
 * prediction, round-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++) {
            int pred = (683 * (s[x] + 2*s[x+stride] + 1)) >> 11;
            d[x] = (d[x] + pred + 1) >> 1;
        }
    }
}
01529
/* Averaging variant of put_tpel_pixels_mc21_c: 2-D 2/3,1/3-phase prediction
 * (weights 3/4/2/3), round-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++) {
            int pred = (2731 * (3*s[x] + 4*s[x+1] +
                                2*s[x+stride] + 3*s[x+stride+1] + 6)) >> 15;
            d[x] = (d[x] + pred + 1) >> 1;
        }
    }
}
01540
/* Averaging variant of put_tpel_pixels_mc22_c: 2-D 2/3,2/3-phase prediction
 * (weights 2/3/3/4), round-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++) {
            int pred = (2731 * (2*s[x] + 3*s[x+1] +
                                3*s[x+stride] + 4*s[x+stride+1] + 6)) >> 15;
            d[x] = (d[x] + pred + 1) >> 1;
        }
    }
}
/* Dead code: generator for fixed-width tpel wrappers.  Note that as written
 * the bodies are not valid C — `void put_tpel_pixels_mc00_c(dst, ...)` is a
 * declaration-style line, not a call — so this block would not compile if
 * the #if 0 were removed without fixing it. */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
01572
/*
 * H.264 chroma motion compensation for 2/4/8-pixel-wide blocks.
 * (x, y) is the 1/8-pel offset (asserted to be in 0..7); the 2x2 bilinear
 * weights A..D always sum to 64, and OP folds in the +32 rounding and >>6.
 * When D == 0 (x or y is zero) the 2-D blend degenerates to a 1-D blend of
 * the pixel with a single neighbour: `step` selects the vertical (stride)
 * or horizontal (1) neighbour depending on which weight survives.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/* put: write the rounded prediction; avg: round-average it into dst. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
01681
/*
 * H.264 chroma MC, 8-wide block, "no rounding" variant: same 2x2 bilinear
 * blend as the H264_CHROMA_MC functions above (weights A..D sum to 64) but
 * with the rounding constant reduced from 32 to 32 - 4 = 28 before the >>6.
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (A * src[col]          + B * src[col + 1] +
                        C * src[stride + col] + D * src[stride + col + 1] +
                        32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
01705
01706 #define QPEL_MC(r, OPNAME, RND, OP) \
01707 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01708 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
01709 int i;\
01710 for(i=0; i<h; i++)\
01711 {\
01712 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
01713 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
01714 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
01715 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
01716 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
01717 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
01718 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
01719 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
01720 dst+=dstStride;\
01721 src+=srcStride;\
01722 }\
01723 }\
01724 \
01725 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01726 const int w=8;\
01727 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
01728 int i;\
01729 for(i=0; i<w; i++)\
01730 {\
01731 const int src0= src[0*srcStride];\
01732 const int src1= src[1*srcStride];\
01733 const int src2= src[2*srcStride];\
01734 const int src3= src[3*srcStride];\
01735 const int src4= src[4*srcStride];\
01736 const int src5= src[5*srcStride];\
01737 const int src6= src[6*srcStride];\
01738 const int src7= src[7*srcStride];\
01739 const int src8= src[8*srcStride];\
01740 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
01741 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
01742 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
01743 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
01744 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
01745 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
01746 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
01747 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
01748 dst++;\
01749 src++;\
01750 }\
01751 }\
01752 \
01753 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01754 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
01755 int i;\
01756 \
01757 for(i=0; i<h; i++)\
01758 {\
01759 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
01760 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
01761 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
01762 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
01763 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
01764 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
01765 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
01766 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
01767 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
01768 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
01769 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
01770 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
01771 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
01772 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
01773 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
01774 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
01775 dst+=dstStride;\
01776 src+=srcStride;\
01777 }\
01778 }\
01779 \
01780 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01781 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
01782 int i;\
01783 const int w=16;\
01784 for(i=0; i<w; i++)\
01785 {\
01786 const int src0= src[0*srcStride];\
01787 const int src1= src[1*srcStride];\
01788 const int src2= src[2*srcStride];\
01789 const int src3= src[3*srcStride];\
01790 const int src4= src[4*srcStride];\
01791 const int src5= src[5*srcStride];\
01792 const int src6= src[6*srcStride];\
01793 const int src7= src[7*srcStride];\
01794 const int src8= src[8*srcStride];\
01795 const int src9= src[9*srcStride];\
01796 const int src10= src[10*srcStride];\
01797 const int src11= src[11*srcStride];\
01798 const int src12= src[12*srcStride];\
01799 const int src13= src[13*srcStride];\
01800 const int src14= src[14*srcStride];\
01801 const int src15= src[15*srcStride];\
01802 const int src16= src[16*srcStride];\
01803 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
01804 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
01805 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
01806 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
01807 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
01808 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
01809 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
01810 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
01811 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
01812 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
01813 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
01814 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
01815 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
01816 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
01817 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
01818 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
01819 dst++;\
01820 src++;\
01821 }\
01822 }\
01823 \
01824 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
01825 OPNAME ## pixels8_c(dst, src, stride, 8);\
01826 }\
01827 \
01828 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
01829 uint8_t half[64];\
01830 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
01831 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
01832 }\
01833 \
01834 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
01835 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
01836 }\
01837 \
01838 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
01839 uint8_t half[64];\
01840 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
01841 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
01842 }\
01843 \
01844 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
01845 uint8_t full[16*9];\
01846 uint8_t half[64];\
01847 copy_block9(full, src, 16, stride, 9);\
01848 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
01849 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
01850 }\
01851 \
01852 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
01853 uint8_t full[16*9];\
01854 copy_block9(full, src, 16, stride, 9);\
01855 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
01856 }\
01857 \
01858 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
01859 uint8_t full[16*9];\
01860 uint8_t half[64];\
01861 copy_block9(full, src, 16, stride, 9);\
01862 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
01863 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
01864 }\
01865 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
01866 uint8_t full[16*9];\
01867 uint8_t halfH[72];\
01868 uint8_t halfV[64];\
01869 uint8_t halfHV[64];\
01870 copy_block9(full, src, 16, stride, 9);\
01871 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01872 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
01873 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01874 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01875 }\
01876 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
01877 uint8_t full[16*9];\
01878 uint8_t halfH[72];\
01879 uint8_t halfHV[64];\
01880 copy_block9(full, src, 16, stride, 9);\
01881 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01882 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
01883 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01884 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
01885 }\
01886 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
01887 uint8_t full[16*9];\
01888 uint8_t halfH[72];\
01889 uint8_t halfV[64];\
01890 uint8_t halfHV[64];\
01891 copy_block9(full, src, 16, stride, 9);\
01892 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01893 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
01894 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01895 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01896 }\
01897 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
01898 uint8_t full[16*9];\
01899 uint8_t halfH[72];\
01900 uint8_t halfHV[64];\
01901 copy_block9(full, src, 16, stride, 9);\
01902 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01903 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
01904 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01905 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
01906 }\
01907 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
01908 uint8_t full[16*9];\
01909 uint8_t halfH[72];\
01910 uint8_t halfV[64];\
01911 uint8_t halfHV[64];\
01912 copy_block9(full, src, 16, stride, 9);\
01913 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01914 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
01915 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01916 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01917 }\
01918 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
01919 uint8_t full[16*9];\
01920 uint8_t halfH[72];\
01921 uint8_t halfHV[64];\
01922 copy_block9(full, src, 16, stride, 9);\
01923 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01924 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
01925 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01926 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01927 }\
01928 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
01929 uint8_t full[16*9];\
01930 uint8_t halfH[72];\
01931 uint8_t halfV[64];\
01932 uint8_t halfHV[64];\
01933 copy_block9(full, src, 16, stride, 9);\
01934 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
01935 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
01936 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01937 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01938 }\
01939 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
01940 uint8_t full[16*9];\
01941 uint8_t halfH[72];\
01942 uint8_t halfHV[64];\
01943 copy_block9(full, src, 16, stride, 9);\
01944 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01945 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
01946 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01947 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01948 }\
01949 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
01950 uint8_t halfH[72];\
01951 uint8_t halfHV[64];\
01952 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
01953 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01954 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
01955 }\
01956 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
01957 uint8_t halfH[72];\
01958 uint8_t halfHV[64];\
01959 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
01960 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01961 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01962 }\
01963 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
01964 uint8_t full[16*9];\
01965 uint8_t halfH[72];\
01966 uint8_t halfV[64];\
01967 uint8_t halfHV[64];\
01968 copy_block9(full, src, 16, stride, 9);\
01969 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01970 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
01971 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01972 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
01973 }\
01974 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
01975 uint8_t full[16*9];\
01976 uint8_t halfH[72];\
01977 copy_block9(full, src, 16, stride, 9);\
01978 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01979 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
01980 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
01981 }\
01982 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
01983 uint8_t full[16*9];\
01984 uint8_t halfH[72];\
01985 uint8_t halfV[64];\
01986 uint8_t halfHV[64];\
01987 copy_block9(full, src, 16, stride, 9);\
01988 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01989 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
01990 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01991 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
01992 }\
01993 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
01994 uint8_t full[16*9];\
01995 uint8_t halfH[72];\
01996 copy_block9(full, src, 16, stride, 9);\
01997 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01998 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
01999 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
02000 }\
02001 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
02002 uint8_t halfH[72];\
02003 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
02004 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
02005 }\
02006 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
02007 OPNAME ## pixels16_c(dst, src, stride, 16);\
02008 }\
02009 \
02010 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
02011 uint8_t half[256];\
02012 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
02013 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
02014 }\
02015 \
02016 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
02017 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
02018 }\
02019 \
02020 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
02021 uint8_t half[256];\
02022 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
02023 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
02024 }\
02025 \
02026 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
02027 uint8_t full[24*17];\
02028 uint8_t half[256];\
02029 copy_block17(full, src, 24, stride, 17);\
02030 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
02031 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
02032 }\
02033 \
02034 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
02035 uint8_t full[24*17];\
02036 copy_block17(full, src, 24, stride, 17);\
02037 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
02038 }\
02039 \
02040 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
02041 uint8_t full[24*17];\
02042 uint8_t half[256];\
02043 copy_block17(full, src, 24, stride, 17);\
02044 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
02045 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
02046 }\
02047 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
02048 uint8_t full[24*17];\
02049 uint8_t halfH[272];\
02050 uint8_t halfV[256];\
02051 uint8_t halfHV[256];\
02052 copy_block17(full, src, 24, stride, 17);\
02053 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02054 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
02055 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02056 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
02057 }\
02058 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
02059 uint8_t full[24*17];\
02060 uint8_t halfH[272];\
02061 uint8_t halfHV[256];\
02062 copy_block17(full, src, 24, stride, 17);\
02063 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02064 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
02065 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02066 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
02067 }\
02068 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
02069 uint8_t full[24*17];\
02070 uint8_t halfH[272];\
02071 uint8_t halfV[256];\
02072 uint8_t halfHV[256];\
02073 copy_block17(full, src, 24, stride, 17);\
02074 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02075 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
02076 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02077 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
02078 }\
02079 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
02080 uint8_t full[24*17];\
02081 uint8_t halfH[272];\
02082 uint8_t halfHV[256];\
02083 copy_block17(full, src, 24, stride, 17);\
02084 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02085 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
02086 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02087 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
02088 }\
02089 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
02090 uint8_t full[24*17];\
02091 uint8_t halfH[272];\
02092 uint8_t halfV[256];\
02093 uint8_t halfHV[256];\
02094 copy_block17(full, src, 24, stride, 17);\
02095 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02096 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
02097 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02098 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
02099 }\
02100 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
02101 uint8_t full[24*17];\
02102 uint8_t halfH[272];\
02103 uint8_t halfHV[256];\
02104 copy_block17(full, src, 24, stride, 17);\
02105 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02106 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
02107 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02108 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
02109 }\
02110 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
02111 uint8_t full[24*17];\
02112 uint8_t halfH[272];\
02113 uint8_t halfV[256];\
02114 uint8_t halfHV[256];\
02115 copy_block17(full, src, 24, stride, 17);\
02116 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
02117 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
02118 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02119 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
02120 }\
02121 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
02122 uint8_t full[24*17];\
02123 uint8_t halfH[272];\
02124 uint8_t halfHV[256];\
02125 copy_block17(full, src, 24, stride, 17);\
02126 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02127 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
02128 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02129 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
02130 }\
02131 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
02132 uint8_t halfH[272];\
02133 uint8_t halfHV[256];\
02134 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
02135 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02136 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
02137 }\
02138 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
02139 uint8_t halfH[272];\
02140 uint8_t halfHV[256];\
02141 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
02142 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02143 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
02144 }\
02145 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
02146 uint8_t full[24*17];\
02147 uint8_t halfH[272];\
02148 uint8_t halfV[256];\
02149 uint8_t halfHV[256];\
02150 copy_block17(full, src, 24, stride, 17);\
02151 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02152 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
02153 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02154 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
02155 }\
02156 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
02157 uint8_t full[24*17];\
02158 uint8_t halfH[272];\
02159 copy_block17(full, src, 24, stride, 17);\
02160 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02161 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
02162 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
02163 }\
02164 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
02165 uint8_t full[24*17];\
02166 uint8_t halfH[272];\
02167 uint8_t halfV[256];\
02168 uint8_t halfHV[256];\
02169 copy_block17(full, src, 24, stride, 17);\
02170 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02171 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
02172 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02173 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
02174 }\
02175 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
02176 uint8_t full[24*17];\
02177 uint8_t halfH[272];\
02178 copy_block17(full, src, 24, stride, 17);\
02179 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02180 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
02181 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
02182 }\
02183 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
02184 uint8_t halfH[272];\
02185 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
02186 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
02187 }
02188
/* Final store operators plugged into QPEL_MC as its OP argument.
 * b is the raw 6-tap filter sum (32x the pixel value); each operator
 * rounds it (+16 with rounding, +15 for the no-rounding variants),
 * shifts down by 5 and clips to 0..255 through the cm crop table.
 * The avg variants additionally average with the destination pixel a
 * (put = overwrite, avg = bidirectional-style average). */
02189 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
02190 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
02191 #define op_put(a, b) a = cm[((b) + 16)>>5]
02192 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
02193
/* Instantiate the MPEG-4 quarter-pel MC function families:
 * put_ (rounding), put_no_rnd_ (truncating) and avg_ (rounding average).
 * NOTE(review): no avg_no_rnd_ family is generated here — presumably no
 * caller needs it; confirm before adding one. */
02194 QPEL_MC(0, put_ , _ , op_put)
02195 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
02196 QPEL_MC(0, avg_ , _ , op_avg)
02197
/* The operators are only meaningful inside the generated bodies. */
02198 #undef op_avg
02199 #undef op_avg_no_rnd
02200 #undef op_put
02201 #undef op_put_no_rnd
02202
02203 #if 1
/*
 * H264_LOWPASS(OPNAME, OP, OP2)
 *
 * Template macro generating the C reference implementations of the H.264
 * six-tap (1,-5,20,20,-5,1) half-sample interpolation filters for block
 * widths 2, 4, 8 and 16:
 *   - OPNAME##h264_qpelN_h_lowpass:  horizontal filter, row by row;
 *   - OPNAME##h264_qpelN_v_lowpass:  vertical filter, column by column
 *     (reads 2 rows above and 3 rows below the block);
 *   - OPNAME##h264_qpelN_hv_lowpass: horizontal pass over h+5 rows into a
 *     16-bit tmp buffer (intermediate values are NOT clipped, hence int16_t
 *     headroom), then a vertical pass over tmp stored through OP2.
 * OP/OP2 perform the final rounding/clipping store (see the op_* operator
 * macros elsewhere in this file); cm points into ff_cropTbl for saturation
 * to 0..255. The qpel2 variants are av_unused because only some configs
 * instantiate callers for them. The qpel16 functions are composed from four
 * 8x8 calls (top-left, top-right, bottom-left, bottom-right).
 * NOTE(review): generated code reads up to 2 pixels left/above and 3
 * right/below src — callers must guarantee that margin is addressable.
 */
02204 #define H264_LOWPASS(OPNAME, OP, OP2) \
02205 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02206 const int h=2;\
02207 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02208 int i;\
02209 for(i=0; i<h; i++)\
02210 {\
02211 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
02212 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
02213 dst+=dstStride;\
02214 src+=srcStride;\
02215 }\
02216 }\
02217 \
02218 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02219 const int w=2;\
02220 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02221 int i;\
02222 for(i=0; i<w; i++)\
02223 {\
02224 const int srcB= src[-2*srcStride];\
02225 const int srcA= src[-1*srcStride];\
02226 const int src0= src[0 *srcStride];\
02227 const int src1= src[1 *srcStride];\
02228 const int src2= src[2 *srcStride];\
02229 const int src3= src[3 *srcStride];\
02230 const int src4= src[4 *srcStride];\
02231 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
02232 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
02233 dst++;\
02234 src++;\
02235 }\
02236 }\
02237 \
02238 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
02239 const int h=2;\
02240 const int w=2;\
02241 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02242 int i;\
02243 src -= 2*srcStride;\
02244 for(i=0; i<h+5; i++)\
02245 {\
02246 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
02247 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
02248 tmp+=tmpStride;\
02249 src+=srcStride;\
02250 }\
02251 tmp -= tmpStride*(h+5-2);\
02252 for(i=0; i<w; i++)\
02253 {\
02254 const int tmpB= tmp[-2*tmpStride];\
02255 const int tmpA= tmp[-1*tmpStride];\
02256 const int tmp0= tmp[0 *tmpStride];\
02257 const int tmp1= tmp[1 *tmpStride];\
02258 const int tmp2= tmp[2 *tmpStride];\
02259 const int tmp3= tmp[3 *tmpStride];\
02260 const int tmp4= tmp[4 *tmpStride];\
02261 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
02262 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
02263 dst++;\
02264 tmp++;\
02265 }\
02266 }\
02267 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02268 const int h=4;\
02269 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02270 int i;\
02271 for(i=0; i<h; i++)\
02272 {\
02273 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
02274 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
02275 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
02276 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
02277 dst+=dstStride;\
02278 src+=srcStride;\
02279 }\
02280 }\
02281 \
02282 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02283 const int w=4;\
02284 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02285 int i;\
02286 for(i=0; i<w; i++)\
02287 {\
02288 const int srcB= src[-2*srcStride];\
02289 const int srcA= src[-1*srcStride];\
02290 const int src0= src[0 *srcStride];\
02291 const int src1= src[1 *srcStride];\
02292 const int src2= src[2 *srcStride];\
02293 const int src3= src[3 *srcStride];\
02294 const int src4= src[4 *srcStride];\
02295 const int src5= src[5 *srcStride];\
02296 const int src6= src[6 *srcStride];\
02297 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
02298 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
02299 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
02300 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
02301 dst++;\
02302 src++;\
02303 }\
02304 }\
02305 \
02306 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
02307 const int h=4;\
02308 const int w=4;\
02309 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02310 int i;\
02311 src -= 2*srcStride;\
02312 for(i=0; i<h+5; i++)\
02313 {\
02314 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
02315 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
02316 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
02317 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
02318 tmp+=tmpStride;\
02319 src+=srcStride;\
02320 }\
02321 tmp -= tmpStride*(h+5-2);\
02322 for(i=0; i<w; i++)\
02323 {\
02324 const int tmpB= tmp[-2*tmpStride];\
02325 const int tmpA= tmp[-1*tmpStride];\
02326 const int tmp0= tmp[0 *tmpStride];\
02327 const int tmp1= tmp[1 *tmpStride];\
02328 const int tmp2= tmp[2 *tmpStride];\
02329 const int tmp3= tmp[3 *tmpStride];\
02330 const int tmp4= tmp[4 *tmpStride];\
02331 const int tmp5= tmp[5 *tmpStride];\
02332 const int tmp6= tmp[6 *tmpStride];\
02333 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
02334 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
02335 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
02336 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
02337 dst++;\
02338 tmp++;\
02339 }\
02340 }\
02341 \
02342 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02343 const int h=8;\
02344 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02345 int i;\
02346 for(i=0; i<h; i++)\
02347 {\
02348 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
02349 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
02350 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
02351 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
02352 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
02353 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
02354 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
02355 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
02356 dst+=dstStride;\
02357 src+=srcStride;\
02358 }\
02359 }\
02360 \
02361 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02362 const int w=8;\
02363 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02364 int i;\
02365 for(i=0; i<w; i++)\
02366 {\
02367 const int srcB= src[-2*srcStride];\
02368 const int srcA= src[-1*srcStride];\
02369 const int src0= src[0 *srcStride];\
02370 const int src1= src[1 *srcStride];\
02371 const int src2= src[2 *srcStride];\
02372 const int src3= src[3 *srcStride];\
02373 const int src4= src[4 *srcStride];\
02374 const int src5= src[5 *srcStride];\
02375 const int src6= src[6 *srcStride];\
02376 const int src7= src[7 *srcStride];\
02377 const int src8= src[8 *srcStride];\
02378 const int src9= src[9 *srcStride];\
02379 const int src10=src[10*srcStride];\
02380 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
02381 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
02382 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
02383 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
02384 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
02385 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
02386 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
02387 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
02388 dst++;\
02389 src++;\
02390 }\
02391 }\
02392 \
02393 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
02394 const int h=8;\
02395 const int w=8;\
02396 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02397 int i;\
02398 src -= 2*srcStride;\
02399 for(i=0; i<h+5; i++)\
02400 {\
02401 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
02402 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
02403 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
02404 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
02405 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
02406 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
02407 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
02408 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
02409 tmp+=tmpStride;\
02410 src+=srcStride;\
02411 }\
02412 tmp -= tmpStride*(h+5-2);\
02413 for(i=0; i<w; i++)\
02414 {\
02415 const int tmpB= tmp[-2*tmpStride];\
02416 const int tmpA= tmp[-1*tmpStride];\
02417 const int tmp0= tmp[0 *tmpStride];\
02418 const int tmp1= tmp[1 *tmpStride];\
02419 const int tmp2= tmp[2 *tmpStride];\
02420 const int tmp3= tmp[3 *tmpStride];\
02421 const int tmp4= tmp[4 *tmpStride];\
02422 const int tmp5= tmp[5 *tmpStride];\
02423 const int tmp6= tmp[6 *tmpStride];\
02424 const int tmp7= tmp[7 *tmpStride];\
02425 const int tmp8= tmp[8 *tmpStride];\
02426 const int tmp9= tmp[9 *tmpStride];\
02427 const int tmp10=tmp[10*tmpStride];\
02428 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
02429 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
02430 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
02431 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
02432 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
02433 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
02434 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
02435 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
02436 dst++;\
02437 tmp++;\
02438 }\
02439 }\
02440 \
02441 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02442 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
02443 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
02444 src += 8*srcStride;\
02445 dst += 8*dstStride;\
02446 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
02447 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
02448 }\
02449 \
02450 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02451 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
02452 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
02453 src += 8*srcStride;\
02454 dst += 8*dstStride;\
02455 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
02456 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
02457 }\
02458 \
02459 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
02460 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
02461 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
02462 src += 8*srcStride;\
02463 dst += 8*dstStride;\
02464 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
02465 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
02466 }\
02467
02468 #define H264_MC(OPNAME, SIZE) \
02469 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
02470 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
02471 }\
02472 \
02473 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
02474 uint8_t half[SIZE*SIZE];\
02475 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
02476 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
02477 }\
02478 \
02479 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
02480 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
02481 }\
02482 \
02483 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
02484 uint8_t half[SIZE*SIZE];\
02485 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
02486 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
02487 }\
02488 \
02489 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
02490 uint8_t full[SIZE*(SIZE+5)];\
02491 uint8_t * const full_mid= full + SIZE*2;\
02492 uint8_t half[SIZE*SIZE];\
02493 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02494 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
02495 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
02496 }\
02497 \
02498 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
02499 uint8_t full[SIZE*(SIZE+5)];\
02500 uint8_t * const full_mid= full + SIZE*2;\
02501 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02502 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
02503 }\
02504 \
02505 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
02506 uint8_t full[SIZE*(SIZE+5)];\
02507 uint8_t * const full_mid= full + SIZE*2;\
02508 uint8_t half[SIZE*SIZE];\
02509 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02510 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
02511 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
02512 }\
02513 \
02514 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
02515 uint8_t full[SIZE*(SIZE+5)];\
02516 uint8_t * const full_mid= full + SIZE*2;\
02517 uint8_t halfH[SIZE*SIZE];\
02518 uint8_t halfV[SIZE*SIZE];\
02519 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
02520 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02521 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02522 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
02523 }\
02524 \
02525 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
02526 uint8_t full[SIZE*(SIZE+5)];\
02527 uint8_t * const full_mid= full + SIZE*2;\
02528 uint8_t halfH[SIZE*SIZE];\
02529 uint8_t halfV[SIZE*SIZE];\
02530 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
02531 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
02532 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02533 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
02534 }\
02535 \
02536 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
02537 uint8_t full[SIZE*(SIZE+5)];\
02538 uint8_t * const full_mid= full + SIZE*2;\
02539 uint8_t halfH[SIZE*SIZE];\
02540 uint8_t halfV[SIZE*SIZE];\
02541 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
02542 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02543 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02544 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
02545 }\
02546 \
02547 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
02548 uint8_t full[SIZE*(SIZE+5)];\
02549 uint8_t * const full_mid= full + SIZE*2;\
02550 uint8_t halfH[SIZE*SIZE];\
02551 uint8_t halfV[SIZE*SIZE];\
02552 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
02553 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
02554 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02555 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
02556 }\
02557 \
02558 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
02559 int16_t tmp[SIZE*(SIZE+5)];\
02560 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
02561 }\
02562 \
02563 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
02564 int16_t tmp[SIZE*(SIZE+5)];\
02565 uint8_t halfH[SIZE*SIZE];\
02566 uint8_t halfHV[SIZE*SIZE];\
02567 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
02568 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
02569 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
02570 }\
02571 \
02572 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
02573 int16_t tmp[SIZE*(SIZE+5)];\
02574 uint8_t halfH[SIZE*SIZE];\
02575 uint8_t halfHV[SIZE*SIZE];\
02576 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
02577 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
02578 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
02579 }\
02580 \
02581 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
02582 uint8_t full[SIZE*(SIZE+5)];\
02583 uint8_t * const full_mid= full + SIZE*2;\
02584 int16_t tmp[SIZE*(SIZE+5)];\
02585 uint8_t halfV[SIZE*SIZE];\
02586 uint8_t halfHV[SIZE*SIZE];\
02587 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02588 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02589 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
02590 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
02591 }\
02592 \
02593 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
02594 uint8_t full[SIZE*(SIZE+5)];\
02595 uint8_t * const full_mid= full + SIZE*2;\
02596 int16_t tmp[SIZE*(SIZE+5)];\
02597 uint8_t halfV[SIZE*SIZE];\
02598 uint8_t halfHV[SIZE*SIZE];\
02599 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
02600 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02601 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
02602 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
02603 }\
02604
/* Rounding/store primitives plugged into the H264_LOWPASS/H264_MC macro
 * templates above:
 *   op_put/op_avg   round a 1D 6-tap sum  -> (+16)  >> 5
 *   op2_put/op2_avg round a 2D 6-tap sum  -> (+512) >> 10
 * The *_avg variants additionally average with the existing destination
 * pixel (rounded up). 'cm' is the clipping lookup table in scope at the
 * expansion site. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)

#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Instantiate the put/avg lowpass filters and the quarter-pel MC functions
 * for each block size (size 2 is instantiated only for "put"). */
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
02626
/* H.264 explicit weighted prediction primitives:
 *   op_scale1: single-reference  block[x] = clip8((block[x]*weight + offset) >> log2_denom)
 *   op_scale2: bidirectional mix of src into dst with two weights and one
 *              extra bit of shift for the average. */
#define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/* Generates weight_/biweight_ functions for a WxH partition; the unrolled
 * body handles all widths with compile-time 'if(W==n) continue' cuts. */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

/* Instantiate every partition size that needs weighting. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
02696
02697 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
02698 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
02699 int i;
02700
02701 for(i=0; i<h; i++){
02702 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
02703 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
02704 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
02705 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
02706 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
02707 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
02708 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
02709 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
02710 dst+=dstStride;
02711 src+=srcStride;
02712 }
02713 }
02714
#if CONFIG_CAVS_DECODER
/* AVS (Chinese AVS video) dsp init; implemented elsewhere. */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* Fullpel (mc00) CAVS qpel cases: plain copies/averages, no filtering. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif
02732
#if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
/* VC-1/WMV3 dsp init; implemented elsewhere. */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* Fullpel (mc00) VC-1 mspel case: a plain 8x8 copy; 'rnd' is unused here. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif
02741
/* Intra X8 (WMV2/VC-1 intra prediction) dsp init; implemented elsewhere. */
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H.264 encoder dsp init; implemented elsewhere. */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif
02750
#if CONFIG_RV40_DECODER
/* RV40 position (3,3) motion compensation resolves to the plain centred
 * 2x2 average (xy2) of the fullpel data. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif
02767
02768 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
02769 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
02770 int i;
02771
02772 for(i=0; i<w; i++){
02773 const int src_1= src[ -srcStride];
02774 const int src0 = src[0 ];
02775 const int src1 = src[ srcStride];
02776 const int src2 = src[2*srcStride];
02777 const int src3 = src[3*srcStride];
02778 const int src4 = src[4*srcStride];
02779 const int src5 = src[5*srcStride];
02780 const int src6 = src[6*srcStride];
02781 const int src7 = src[7*srcStride];
02782 const int src8 = src[8*srcStride];
02783 const int src9 = src[9*srcStride];
02784 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
02785 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
02786 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
02787 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
02788 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
02789 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
02790 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
02791 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
02792 src++;
02793 dst++;
02794 }
02795 }
02796
/* WMV2 mspel motion-compensation dispatchers.  Naming is mcXY where X is
 * the horizontal and Y the vertical half-pel phase (0 = fullpel, 2 =
 * half-pel filtered, 1/3 = average of fullpel and half-pel). */

/* (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

/* (1,0): average of fullpel and horizontally filtered half-pel. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

/* (2,0): horizontal half-pel only. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* (3,0): like mc10 but averaged with the pixel to the right. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

/* (0,2): vertical half-pel only. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

/* (1,2): average of V-filtered and HV-filtered planes.  halfH holds 11
 * H-filtered rows starting one row above src so the V pass (on halfH+8,
 * i.e. skipping its first row) has its -1..+2 row context. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (3,2): as mc12 but the V-only plane is taken one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (2,2): full HV half-pel: H pass into a temp, then V pass to dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
02844
/**
 * H.263 in-loop deblocking across a horizontal block edge ('src' points at
 * the first row below the edge; the two rows above and below it are
 * touched).  The whole body is compiled out when no H.263 variant is
 * enabled (CONFIG_ANY_H263 is a compile-time constant).
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
        int x;
        const int strength= ff_h263_loop_filter_strength[qscale];

        for(x=0; x<8; x++){
            int d1, d2, ad1;
            int p0= src[x-2*stride];
            int p1= src[x-1*stride];
            int p2= src[x+0*stride];
            int p3= src[x+1*stride];
            int d = (p0 - p3 + 4*(p2 - p1)) / 8;

            /* Ramp d1 up to 'strength' for small d, then back down to 0 for
             * large d, so genuine edges are left unfiltered. */
            if (d<-2*strength) d1= 0;
            else if(d<- strength) d1=-2*strength - d;
            else if(d< strength) d1= d;
            else if(d< 2*strength) d1= 2*strength - d;
            else d1= 0;

            p1 += d1;
            p2 -= d1;
            /* Branchless clip to 0..255: bit 8 set means out of range; the
             * sign bit then selects 0 (underflow) or 0xFF (overflow). */
            if(p1&256) p1= ~(p1>>31);
            if(p2&256) p2= ~(p2>>31);

            src[x-1*stride] = p1;
            src[x+0*stride] = p2;

            ad1= FFABS(d1)>>1;

            /* Outer pixels get a smaller, clipped correction. */
            d2= av_clip((p0-p3)/4, -ad1, ad1);

            src[x-2*stride] = p0 - d2;
            src[x+ stride] = p3 + d2;
        }
    }
}
02881
/**
 * H.263 in-loop deblocking across a vertical block edge ('src' points at
 * the first column right of the edge).  Same filter as
 * h263_v_loop_filter_c, transposed.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
        int y;
        const int strength= ff_h263_loop_filter_strength[qscale];

        for(y=0; y<8; y++){
            int d1, d2, ad1;
            int p0= src[y*stride-2];
            int p1= src[y*stride-1];
            int p2= src[y*stride+0];
            int p3= src[y*stride+1];
            int d = (p0 - p3 + 4*(p2 - p1)) / 8;

            /* Ramp d1 up to 'strength', then back to 0 for large d. */
            if (d<-2*strength) d1= 0;
            else if(d<- strength) d1=-2*strength - d;
            else if(d< strength) d1= d;
            else if(d< 2*strength) d1= 2*strength - d;
            else d1= 0;

            p1 += d1;
            p2 -= d1;
            /* Branchless clip to 0..255 (see h263_v_loop_filter_c). */
            if(p1&256) p1= ~(p1>>31);
            if(p2&256) p2= ~(p2>>31);

            src[y*stride-1] = p1;
            src[y*stride+0] = p2;

            ad1= FFABS(d1)>>1;

            d2= av_clip((p0-p3)/4, -ad1, ad1);

            src[y*stride-2] = p0 - d2;
            src[y*stride+1] = p3 + d2;
        }
    }
}
02918
/* H.261 in-loop filter: separable [1 2 1]/4 smoothing over one 8x8 block,
 * done in place.  Border rows/columns are passed through unchanged (they
 * are carried at 4x scale so the final rounding treats them identically). */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int temp[64];
    int x, y;

    /* Vertical pass into temp[], kept at 4x precision; the top and bottom
     * rows are copied scaled by 4 instead of being filtered. */
    for(x=0; x<8; x++){
        temp[x]      = 4*src[x];
        temp[x + 56] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            temp[y*8 + x] = src[(y-1)*stride + x] + 2*src[y*stride + x]
                          + src[(y+1)*stride + x];
        }
    }

    /* Horizontal pass back into src with rounding; the first and last
     * column only undo the 4x scale. */
    for(y=0; y<8; y++){
        src[y*stride    ] = (temp[y*8    ] + 2)>>2;
        src[y*stride + 7] = (temp[y*8 + 7] + 2)>>2;
        for(x=1; x<7; x++){
            src[y*stride + x] = (temp[y*8 + x - 1] + 2*temp[y*8 + x]
                               + temp[y*8 + x + 1] + 8)>>4;
        }
    }
}
02945
/**
 * H.264 normal-strength (bS < 4) luma deblocking along one 16-sample edge.
 * xstride steps across the edge, ystride along it.  tc0 holds one clipping
 * threshold per group of 4 samples; a negative entry skips that group.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* Filter only where the edge looks like a blocking artifact. */
            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* p1/q1 are additionally corrected when the inner gradient
                 * is small; each correction widens the p0/q0 clip range. */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );
                pix[0] = av_clip_uint8( q0 - i_delta );
            }
            pix += ystride;
        }
    }
}
/* Horizontal edge: samples across the edge are one 'stride' apart. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Vertical edge: samples across the edge are adjacent in memory. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
02994
/**
 * H.264 strong (bS = 4, intra) luma deblocking along one 16-sample edge.
 * Chooses between the strong 4/5-tap filter and the weak 3-tap fallback
 * per sample, depending on how flat the edge neighbourhood is.
 */
static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            /* Strong filtering only when the step across the edge is small
             * relative to alpha; otherwise just nudge p0/q0. */
            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* Strong 3-pixel smoothing on the p side. */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p side not flat: weak filter on p0 only. */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* Strong 3-pixel smoothing on the q side. */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q side not flat: weak filter on q0 only. */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* Large step: weak filter on both edge pixels. */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
/* Horizontal edge variant of the intra luma filter. */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
/* Vertical edge variant of the intra luma filter. */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}
03051
/**
 * H.264 normal-strength chroma deblocking along one 8-sample edge.
 * Only p0/q0 are modified; tc0 holds one threshold per group of 2 samples
 * and a non-positive entry skips that group.
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );
                pix[0] = av_clip_uint8( q0 - delta );
            }
            pix += ystride;
        }
    }
}
/* Horizontal edge variant of the chroma filter. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Vertical edge variant of the chroma filter. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
03088
/**
 * H.264 strong (intra) chroma deblocking along one 8-sample edge: apply a
 * fixed 3-tap smoothing to p0/q0 wherever the edge looks like an artifact.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        /* Skip samples that do not look like blocking artifacts. */
        if( FFABS( p0 - q0 ) >= alpha ||
            FFABS( p1 - p0 ) >= beta  ||
            FFABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;
    }
}
/* Horizontal edge variant of the intra chroma filter. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* Vertical edge variant of the intra chroma filter. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
03116
/* Sum of absolute differences over a 16-wide block of h rows (fullpel SAD).
 * 'v' is the unused context parameter of the me_cmp interface. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
03144
/* SAD of pix1 against the horizontal half-pel interpolation of pix2
 * (rounded average of each pixel and its right neighbour; reads pix2[16]). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
03172
/* SAD of pix1 against the vertical half-pel interpolation of pix2
 * (rounded average of each pixel and the one directly below it). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1 += line_size;
        pix2 += line_size;
        below += line_size;
    }
    return sum;
}
03202
/* SAD of pix1 against the centred half-pel interpolation of pix2
 * (rounded average of the 2x2 neighbourhood; reads one extra column/row). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        below += line_size;
    }
    return sum;
}
03232
/* Sum of absolute differences over an 8-wide block of h rows (fullpel SAD). */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
03252
/* 8-wide SAD against the horizontal half-pel interpolation (reads pix2[8]). */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
03272
/* 8-wide SAD against the vertical half-pel interpolation. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1 += line_size;
        pix2 += line_size;
        below += line_size;
    }
    return sum;
}
03294
/* 8-wide SAD against the centred 2x2 half-pel interpolation. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        below += line_size;
    }
    return sum;
}
03316
/**
 * "Noise shaping" comparison, 16 wide: plain SSE (score1) plus a penalty
 * for how much the local 2x2 gradient structure of the two blocks differs
 * (score2), weighted by avctx->nsse_weight (8 when no context is passed).
 */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                /* Difference of the 2x2 second-order gradients of s1 and s2. */
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
03342
/** 8-wide variant of nsse16_c (see above). */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                /* Difference of the 2x2 second-order gradients of s1 and s2. */
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
03368
/**
 * Estimate the weighted squared error that would remain after adding
 * basis*scale to the residual 'rem'.  BASIS_SHIFT/RECON_SHIFT are
 * fixed-point scaling constants defined elsewhere in this file.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        /* Rounded fixed-point add of the scaled basis function. */
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
03383
/** Add basis*scale (rounded fixed point) into the residual 'rem' in place. */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
03391
03400 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
03401 {
03402 int i;
03403 DCTELEM temp[64];
03404
03405 if(last<=0) return;
03406
03407
03408 for(i=0; i<=last; i++){
03409 const int j= scantable[i];
03410 temp[j]= block[j];
03411 block[j]=0;
03412 }
03413
03414 for(i=0; i<=last; i++){
03415 const int j= scantable[i];
03416 const int perm_j= permutation[j];
03417 block[perm_j]= temp[j];
03418 }
03419 }
03420
/* Comparison function that always reports a perfect match (FF_CMP_ZERO). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
03424
/**
 * Fill the 6-entry cmp[] table with the comparison functions selected by
 * the low byte of 'type' (an FF_CMP_* value), taken from the DSPContext.
 * Unknown types leave the zeroed entries in place and log an error.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
03484
/** Zero one 64-coefficient DCT block. */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}
03489
/** Zero six consecutive 64-coefficient DCT blocks (one macroblock's worth). */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
03497
03498 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
03499 long i;
03500 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
03501 long a = *(long*)(src+i);
03502 long b = *(long*)(dst+i);
03503 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
03504 }
03505 for(; i<w; i++)
03506 dst[i+0] += src[i+0];
03507 }
03508
03509 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
03510 long i;
03511 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
03512 long a = *(long*)(src1+i);
03513 long b = *(long*)(src2+i);
03514 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
03515 }
03516 for(; i<w; i++)
03517 dst[i] = src1[i]+src2[i];
03518 }
03519
03520 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
03521 long i;
03522 #if !HAVE_FAST_UNALIGNED
03523 if((long)src2 & (sizeof(long)-1)){
03524 for(i=0; i+7<w; i+=8){
03525 dst[i+0] = src1[i+0]-src2[i+0];
03526 dst[i+1] = src1[i+1]-src2[i+1];
03527 dst[i+2] = src1[i+2]-src2[i+2];
03528 dst[i+3] = src1[i+3]-src2[i+3];
03529 dst[i+4] = src1[i+4]-src2[i+4];
03530 dst[i+5] = src1[i+5]-src2[i+5];
03531 dst[i+6] = src1[i+6]-src2[i+6];
03532 dst[i+7] = src1[i+7]-src2[i+7];
03533 }
03534 }else
03535 #endif
03536 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
03537 long a = *(long*)(src1+i);
03538 long b = *(long*)(src2+i);
03539 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
03540 }
03541 for(; i<w; i++)
03542 dst[i+0] = src1[i+0]-src2[i+0];
03543 }
03544
/**
 * HuffYUV median-prediction decode: reconstruct w pixels by adding the
 * residual 'diff' to the median of (left, above, left+above-aboveleft).
 * The running left and above-left values are carried in/out via *left and
 * *left_top; src1 is the row above.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t prev    = *left;
    uint8_t topleft = *left_top;

    for(i=0; i<w; i++){
        const int pred = mid_pred(prev, src1[i], (prev + src1[i] - topleft)&0xFF);
        prev    = pred + diff[i];   /* wraps mod 256 via uint8_t */
        topleft = src1[i];
        dst[i]  = prev;
    }

    *left     = prev;
    *left_top = topleft;
}
03561
/**
 * HuffYUV median-prediction encode: the inverse of add_hfyu_median_prediction.
 * Writes the residual src2[i] - median(left, above, left+above-aboveleft)
 * into dst; src1 is the row above, src2 the current row.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t prev    = *left;
    uint8_t topleft = *left_top;

    for(i=0; i<w; i++){
        const int pred = mid_pred(prev, src1[i], (prev + src1[i] - topleft)&0xFF);
        topleft = src1[i];
        prev    = src2[i];
        dst[i]  = prev - pred;      /* residual, wraps mod 256 */
    }

    *left     = prev;
    *left_top = topleft;
}
03579
/* One butterfly stage writing sum/difference of two inputs to two outputs
 * (used for the first pass of the Hadamard transforms below). */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* In-place butterfly: x,y <- x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly folded with the absolute-value accumulation:
 * |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
03594
/**
 * SATD of an 8x8 block: 2-D Hadamard transform of (src - dst) followed by
 * the sum of absolute transform coefficients.
 * Fix: removed a dead "#if 0" debug block (it also declared a static
 * variable after statements, which is invalid C89 if ever enabled).
 */
static int hadamard8_diff8x8_c( void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard of each difference row */
    for(i=0; i<8; i++){
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; the last stage is fused with |.| accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
03646
/**
 * Intra SATD of an 8x8 block: 2-D Hadamard transform of the pixels
 * themselves (no reference), sum of absolute coefficients, minus the DC
 * term so the score is independent of the block's mean level.
 */
static int hadamard8_intra8x8_c( void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard of each pixel row */
    for(i=0; i<8; i++){
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; final stage fused with |.| accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* temp[0]+temp[32] at i==0 is the DC coefficient; subtract its
       contribution so intra scores compare fairly with inter diffs */
    sum -= FFABS(temp[8*0] + temp[8*4]);

    return sum;
}
03694
/**
 * DCT-domain SAD of an 8x8 block: forward-DCT the pixel difference and
 * return the sum of absolute coefficients (via the DSPContext hooks, so
 * optimized fdct/sum implementations are used).
 */
static int dct_sad8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 16-byte aligned scratch block, required by SIMD fdct implementations */
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
03706
#if CONFIG_GPL
/* One 8-point H.264-style integer DCT pass. Reads via SRC(i), writes via
 * DST(i,v); the caller defines those macros to select rows or columns.
 * The >>1 / >>2 shifts implement the scaled integer basis of the H.264
 * 8x8 transform. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * SAD in the H.264 8x8 transform domain: apply DCT8_1D to the rows of the
 * pixel difference, then to the columns, accumulating absolute values in
 * the second (column) pass instead of storing the coefficients.
 */
static int dct264_sad8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
03759
/**
 * Maximum absolute DCT coefficient of the 8x8 pixel difference — a
 * peak-error metric rather than a sum.
 */
static int dct_max8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 8-byte aligned scratch block for the fdct */
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
03776
/**
 * Quantization-noise metric: run the pixel difference through the encoder's
 * quantize/dequantize/IDCT chain and return the summed squared error against
 * the original difference block.
 * NOTE(review): relies on fast_dct_quantize performing the forward transform
 * internally (bak stays in the same domain as the reconstruction) — verify
 * against the quantizer implementation.
 */
static int quant_psnr8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* two 64-coefficient blocks in one aligned buffer: temp and its backup */
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0; /* force inter quantization path */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0]= s->fast_dct_quantize(s, temp, 0, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp);

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
03800
/**
 * Rate-distortion score of coding an 8x8 block: quantize the difference,
 * count the VLC bits the coefficients would cost, reconstruct the block and
 * measure SSE against the source, then combine distortion + lambda*bits.
 */
static int rd8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    /* VLA: stride*8 bytes — holds an 8-row copy of the prediction (src2) */
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* copy the prediction 8 bytes per row, so the idct_add below can
       reconstruct into it without touching the caller's buffer */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0]= last= s->fast_dct_quantize(s, temp, 0, s->qscale, &i);

    bits=0;

    /* intra blocks code the DC separately and start AC scanning at 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256];
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); /* the last coefficient must be non-zero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    /* distortion + lambda*bits; 109/128 approximates the rate weighting */
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
03879
/**
 * Rate-only metric: quantize the 8x8 difference block and return the number
 * of VLC bits its coefficients would cost (same bit counting as rd8x8_c,
 * without reconstruction or distortion).
 */
static int bit8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0]= last= s->fast_dct_quantize(s, temp, 0, s->qscale, &i);

    bits=0;

    /* intra blocks code the DC separately and start AC scanning at 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256];
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); /* the last coefficient must be non-zero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
03939
/* Generates vsad_intra8_c / vsad_intra16_c: sum of absolute differences
 * between vertically adjacent pixels within one block — a measure of
 * vertical activity needing no reference block. Inner loop is unrolled
 * by 4 columns. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c( void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                         \
    int x,y;                                                                             \
                                                                                         \
    for(y=1; y<h; y++){                                                                  \
        for(x=0; x<size; x+=4){                                                          \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])        \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);       \
        }                                                                                \
        s+= stride;                                                                      \
    }                                                                                    \
                                                                                         \
    return score;                                                                        \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
03957
/**
 * Vertical SAD between two 16-wide blocks: sums |d(y,x) - d(y-1,x)| where
 * d = s1 - s2, i.e. how much the prediction error changes from row to row.
 */
static int vsad16_c( void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int row, col;

    for(row = 1; row < h; row++){
        for(col = 0; col < 16; col++)
            sum += FFABS(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
03972
/* Square helper for the SSE-style metrics below. */
#define SQ(a) ((a)*(a))

/* Generates vsse_intra8_c / vsse_intra16_c: squared-error variant of
 * VSAD_INTRA — sums squared differences between vertically adjacent
 * pixels within one block. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c( void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                         \
    int x,y;                                                                             \
                                                                                         \
    for(y=1; y<h; y++){                                                                  \
        for(x=0; x<size; x+=4){                                                          \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])              \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);             \
        }                                                                                \
        s+= stride;                                                                      \
    }                                                                                    \
                                                                                         \
    return score;                                                                        \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
03991
/**
 * Vertical SSE between two 16-wide blocks: sums the squared row-to-row
 * change of the prediction error d = s1 - s2.
 */
static int vsse16_c( void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int row, col;

    for(row = 1; row < h; row++){
        for(col = 0; col < 16; col++){
            const int d = s1[col] - s2[col] - s1[col+stride] + s2[col+stride];
            sum += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
04006
/**
 * Sum of squared differences between an int8 and an int16 array of the
 * given length.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int i, sum = 0;
    for(i=0; i<size; i++){
        const int d = pix1[i] - pix2[i];
        sum += d * d;
    }
    return sum;
}
04015
/* Build 16x16 comparison functions from the 8x8 kernels above by summing
 * the metric over the four 8x8 quadrants (see the WRAPPER8_16_SQ macro
 * definition earlier in this file). */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
04026
/* In-place element-wise product: dst[i] *= src[i] for i in [0, len). */
static void vector_fmul_c(float *dst, const float *src, int len){
    int n;
    for(n = len; n > 0; n--, dst++, src++)
        *dst *= *src;
}
04032
/* dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    const float *rev = src1 + len - 1;
    for(i = 0; i < len; i++)
        dst[i] = src0[i] * rev[-i];
}
04039
/**
 * dst[i*step] = src0[i]*src1[i] + src2[i] + src3 for i in [0, len).
 * dst is written with stride 'step'; src3 is a constant bias.
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    float *out = dst;
    for(i = 0; i < len; i++, out += step)
        *out = src0[i] * src1[i] + src2[i] + src3;
}
04045
/**
 * Overlap-add windowing: combines src0 (read forward) and src1 (read
 * backward) under the window 'win', writing 2*len outputs plus a constant
 * bias. Pointers are advanced by len so negative indices address the first
 * half and non-negative the second.
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i, j;

    dst  += len;
    win  += len;
    src0 += len;

    for(i = -len, j = len-1; i < 0; i++, j--){
        const float a  = src0[i];
        const float b  = src1[j];
        const float wl = win[i];
        const float wr = win[j];
        dst[i] = a*wr - b*wl + add_bias;
        dst[j] = a*wl + b*wr + add_bias;
    }
}
04060
/* Convert int samples to float with a constant gain: dst[i] = src[i] * mul. */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int n;
    for(n = 0; n < len; n++)
        dst[n] = src[n] * mul;
}
04066
/**
 * Convert one float sample to int16 by inspecting its IEEE-754 bit pattern.
 * NOTE(review): assumes the input was pre-biased by the caller so in-range
 * samples map to bit patterns around 0x8000 after masking — verify against
 * the float_to_int16 callers. Out-of-range patterns (any of bits 16..19 set)
 * are clamped via the sign of (0x43c0ffff - tmp).
 * NOTE(review): the pointer cast is a strict-aliasing violation; a memcpy
 * would be the conforming way to type-pun.
 */
static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        /* tmp becomes 0 (clamp low, yielding -0x8000) or -1 / 0xFFFF
           (clamp high, yielding 0x7FFF after the subtraction below) */
    }
    return tmp - 0x8000;
}
04077
/**
 * Convert len float samples to int16 (see float_to_int16_one for the
 * expected input scaling).
 * Fix: index widened from int to long to match 'len' — an int index could
 * overflow for very long buffers on LP64 targets.
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    long i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}
04083
/**
 * Convert planar float samples to interleaved int16 across 'channels'
 * channels; stereo has a dedicated fast path.
 * Fix: indices widened from int to long to match 'len' — int indices could
 * overflow for very long buffers on LP64 targets.
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    long i, j;
    int c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}
04097
/* Element-wise in-place addition: v1[i] += v2[i] for 'order' elements. */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int i;
    for (i = 0; i < order; i++)
        v1[i] += v2[i];
}
04103
/* Element-wise in-place subtraction: v1[i] -= v2[i] for 'order' elements. */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int i;
    for (i = 0; i < order; i++)
        v1[i] -= v2[i];
}
04109
/**
 * Dot product of two int16 vectors; each product is shifted right by
 * 'shift' BEFORE accumulation (not applied to the final sum).
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int i;
    int acc = 0;

    for (i = 0; i < order; i++)
        acc += (v1[i] * v2[i]) >> shift;

    return acc;
}
04119
/* 11-bit fixed-point IDCT weights for the WMV2 IDCT below:
 * W0 = W4 = 2048 (= 2^11); Wk for k = 1..7 is approximately
 * 2048*sqrt(2)*cos(k*pi/16) — the standard scaled DCT-II basis. */
#define W0 2048
#define W1 2841
#define W2 2676
#define W3 2408
#define W4 2048
#define W5 1609
#define W6 1108
#define W7 565
04128
/**
 * 1-D fixed-point inverse DCT over one 8-element row, in place.
 * Even part in a0/a2/a4/a6, odd part in a1/a3/a5/a7; s1/s2 are the
 * rotated odd terms. Results are rounded (+1<<7) and scaled down by 8 bits.
 */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;

    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];

    /* 181/256 ~= 1/sqrt(2): rotate the remaining odd pair */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;

    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
/**
 * 1-D fixed-point inverse DCT over one 8-element column (stride 8),
 * in place. Same structure as wmv2_idct_row, but intermediate products are
 * pre-scaled by >>3 and the final results by >>14 to complete the 2-D
 * normalization.
 */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;

    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;

    /* 181/256 ~= 1/sqrt(2): rotate the remaining odd pair */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;

    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
/**
 * Full 2-D WMV2 inverse DCT of an 8x8 coefficient block, in place:
 * a 1-D pass over each row followed by a 1-D pass over each column.
 */
void ff_wmv2_idct_c(short * block){
    int n;

    for(n = 0; n < 8; n++)
        wmv2_idct_row(block + 8*n);
    for(n = 0; n < 8; n++)
        wmv2_idct_col(block + n);
}
04192
04193
/* WMV2 IDCT + store: inverse-transform the block and write the clamped
 * result over the destination. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* WMV2 IDCT + add: inverse-transform the block and add the clamped result
 * to the existing destination pixels (inter reconstruction). */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/* JPEG reference IDCT (j_rev_dct) + store / add wrappers, plus the reduced
 * 4x4 and 2x2 variants used for lowres decoding (see dsputil_init). */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

/* 4x4 lowres (1/2 resolution) variants */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

/* 2x2 lowres (1/4 resolution) variants */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
04236
/* 1x1 lowres IDCT: only the DC coefficient survives, so the "transform"
 * reduces to a rounded shift and a clamp through ff_cropTbl. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
04249
/* Do-nothing stub matching the (mem, stride, h) hook signature — presumably
 * installed as a safe default for optional hooks (e.g. prefetch); verify
 * against the assignments in dsputil_init. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
04251
04252
04253 void dsputil_static_init(void)
04254 {
04255 int i;
04256
04257 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
04258 for(i=0;i<MAX_NEG_CROP;i++) {
04259 ff_cropTbl[i] = 0;
04260 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
04261 }
04262
04263 for(i=0;i<512;i++) {
04264 ff_squareTbl[i] = (i - 256) * (i - 256);
04265 }
04266
04267 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
04268 }
04269
/**
 * Detect compilers that fail to 16-byte-align stack variables (which breaks
 * SIMD code paths). Returns 0 when alignment is fine, -1 otherwise; the
 * warning is logged only once per process.
 */
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED_16(int, aligned);

    if((long)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
04289
04290 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
04291 {
04292 int i;
04293
04294 ff_check_alignment();
04295
04296 #if CONFIG_ENCODERS
04297 if(avctx->dct_algo==FF_DCT_FASTINT) {
04298 c->fdct = fdct_ifast;
04299 c->fdct248 = fdct_ifast248;
04300 }
04301 else if(avctx->dct_algo==FF_DCT_FAAN) {
04302 c->fdct = ff_faandct;
04303 c->fdct248 = ff_faandct248;
04304 }
04305 else {
04306 c->fdct = ff_jpeg_fdct_islow;
04307 c->fdct248 = ff_fdct248_islow;
04308 }
04309 #endif //CONFIG_ENCODERS
04310
04311 if(avctx->lowres==1){
04312 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
04313 c->idct_put= ff_jref_idct4_put;
04314 c->idct_add= ff_jref_idct4_add;
04315 }else{
04316 c->idct_put= ff_h264_lowres_idct_put_c;
04317 c->idct_add= ff_h264_lowres_idct_add_c;
04318 }
04319 c->idct = j_rev_dct4;
04320 c->idct_permutation_type= FF_NO_IDCT_PERM;
04321 }else if(avctx->lowres==2){
04322 c->idct_put= ff_jref_idct2_put;
04323 c->idct_add= ff_jref_idct2_add;
04324 c->idct = j_rev_dct2;
04325 c->idct_permutation_type= FF_NO_IDCT_PERM;
04326 }else if(avctx->lowres==3){
04327 c->idct_put= ff_jref_idct1_put;
04328 c->idct_add= ff_jref_idct1_add;
04329 c->idct = j_rev_dct1;
04330 c->idct_permutation_type= FF_NO_IDCT_PERM;
04331 }else{
04332 if(avctx->idct_algo==FF_IDCT_INT){
04333 c->idct_put= ff_jref_idct_put;
04334 c->idct_add= ff_jref_idct_add;
04335 c->idct = j_rev_dct;
04336 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
04337 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_THEORA_DECODER ) &&
04338 avctx->idct_algo==FF_IDCT_VP3){
04339 c->idct_put= ff_vp3_idct_put_c;
04340 c->idct_add= ff_vp3_idct_add_c;
04341 c->idct = ff_vp3_idct_c;
04342 c->idct_permutation_type= FF_NO_IDCT_PERM;
04343 }else if(avctx->idct_algo==FF_IDCT_WMV2){
04344 c->idct_put= ff_wmv2_idct_put_c;
04345 c->idct_add= ff_wmv2_idct_add_c;
04346 c->idct = ff_wmv2_idct_c;
04347 c->idct_permutation_type= FF_NO_IDCT_PERM;
04348 }else if(avctx->idct_algo==FF_IDCT_FAAN){
04349 c->idct_put= ff_faanidct_put;
04350 c->idct_add= ff_faanidct_add;
04351 c->idct = ff_faanidct;
04352 c->idct_permutation_type= FF_NO_IDCT_PERM;
04353 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
04354 c->idct_put= ff_ea_idct_put_c;
04355 c->idct_permutation_type= FF_NO_IDCT_PERM;
04356 }else{
04357 c->idct_put= ff_simple_idct_put;
04358 c->idct_add= ff_simple_idct_add;
04359 c->idct = ff_simple_idct;
04360 c->idct_permutation_type= FF_NO_IDCT_PERM;
04361 }
04362 }
04363
04364 if (CONFIG_H264_DECODER) {
04365 c->h264_idct_add= ff_h264_idct_add_c;
04366 c->h264_idct8_add= ff_h264_idct8_add_c;
04367 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
04368 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
04369 c->h264_idct_add16 = ff_h264_idct_add16_c;
04370 c->h264_idct8_add4 = ff_h264_idct8_add4_c;
04371 c->h264_idct_add8 = ff_h264_idct_add8_c;
04372 c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
04373 }
04374
04375 c->get_pixels = get_pixels_c;
04376 c->diff_pixels = diff_pixels_c;
04377 c->put_pixels_clamped = put_pixels_clamped_c;
04378 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
04379 c->add_pixels_clamped = add_pixels_clamped_c;
04380 c->add_pixels8 = add_pixels8_c;
04381 c->add_pixels4 = add_pixels4_c;
04382 c->sum_abs_dctelem = sum_abs_dctelem_c;
04383 c->gmc1 = gmc1_c;
04384 c->gmc = ff_gmc_c;
04385 c->clear_block = clear_block_c;
04386 c->clear_blocks = clear_blocks_c;
04387 c->pix_sum = pix_sum_c;
04388 c->pix_norm1 = pix_norm1_c;
04389
04390
04391 c->pix_abs[0][0] = pix_abs16_c;
04392 c->pix_abs[0][1] = pix_abs16_x2_c;
04393 c->pix_abs[0][2] = pix_abs16_y2_c;
04394 c->pix_abs[0][3] = pix_abs16_xy2_c;
04395 c->pix_abs[1][0] = pix_abs8_c;
04396 c->pix_abs[1][1] = pix_abs8_x2_c;
04397 c->pix_abs[1][2] = pix_abs8_y2_c;
04398 c->pix_abs[1][3] = pix_abs8_xy2_c;
04399
04400 #define dspfunc(PFX, IDX, NUM) \
04401 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
04402 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
04403 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
04404 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
04405
04406 dspfunc(put, 0, 16);
04407 dspfunc(put_no_rnd, 0, 16);
04408 dspfunc(put, 1, 8);
04409 dspfunc(put_no_rnd, 1, 8);
04410 dspfunc(put, 2, 4);
04411 dspfunc(put, 3, 2);
04412
04413 dspfunc(avg, 0, 16);
04414 dspfunc(avg_no_rnd, 0, 16);
04415 dspfunc(avg, 1, 8);
04416 dspfunc(avg_no_rnd, 1, 8);
04417 dspfunc(avg, 2, 4);
04418 dspfunc(avg, 3, 2);
04419 #undef dspfunc
04420
04421 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
04422 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
04423
04424 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
04425 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
04426 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
04427 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
04428 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
04429 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
04430 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
04431 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
04432 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
04433
04434 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
04435 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
04436 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
04437 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
04438 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
04439 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
04440 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
04441 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
04442 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
04443
04444 #define dspfunc(PFX, IDX, NUM) \
04445 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
04446 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
04447 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
04448 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
04449 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
04450 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
04451 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
04452 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
04453 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
04454 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
04455 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
04456 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
04457 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
04458 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
04459 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
04460 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
04461
04462 dspfunc(put_qpel, 0, 16);
04463 dspfunc(put_no_rnd_qpel, 0, 16);
04464
04465 dspfunc(avg_qpel, 0, 16);
04466
04467
04468 dspfunc(put_qpel, 1, 8);
04469 dspfunc(put_no_rnd_qpel, 1, 8);
04470
04471 dspfunc(avg_qpel, 1, 8);
04472
04473
/* NOTE(review): this span is the tail of the DSPContext initialization
 * function (dsputil_init); its signature and earlier body are above this
 * view — confirm against the full file.  Every statement below installs a
 * C reference implementation into a function-pointer table of the
 * DSPContext *c; arch-specific init functions near the end may override
 * these pointers with optimized versions. */

/* H.264 quarter-pel MC: dspfunc (defined earlier, out of view) expands to
 * the 16 sub-pel position assignments per size.  put gets sizes
 * 16/8/4/2; avg gets only 16/8/4 — no 2x2 avg variant is registered. */
04474 dspfunc(put_h264_qpel, 0, 16);
04475 dspfunc(put_h264_qpel, 1, 8);
04476 dspfunc(put_h264_qpel, 2, 4);
04477 dspfunc(put_h264_qpel, 3, 2);
04478 dspfunc(avg_h264_qpel, 0, 16);
04479 dspfunc(avg_h264_qpel, 1, 8);
04480 dspfunc(avg_h264_qpel, 2, 4);
04481
04482 #undef dspfunc
/* H.264 chroma MC, indexed by block size: [0]=8x8, [1]=4x4, [2]=2x2.
 * Only the 8x8 no-rounding variant is provided. */
04483 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
04484 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
04485 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
04486 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
04487 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
04488 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
04489 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
04490
/* H.264 weighted (uni-) and bi-weighted prediction, one entry per block
 * geometry from 16x16 down to 2x2 (same index order in both tables). */
04491 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
04492 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
04493 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
04494 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
04495 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
04496 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
04497 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
04498 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
04499 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
04500 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
04501 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
04502 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
04503 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
04504 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
04505 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
04506 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
04507 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
04508 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
04509 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
04510 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
04511
04512 c->draw_edges = draw_edges_c;
04513
/* Per-codec DSP sub-initializers, compiled in only when the matching
 * decoder is enabled at configure time. */
04514 #if CONFIG_CAVS_DECODER
04515 ff_cavsdsp_init(c,avctx);
04516 #endif
04517 #if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
04518 ff_vc1dsp_init(c,avctx);
04519 #endif
04520 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
04521 ff_intrax8dsp_init(c,avctx);
04522 #endif
04523 #if CONFIG_RV30_DECODER
04524 ff_rv30dsp_init(c,avctx);
04525 #endif
04526 #if CONFIG_RV40_DECODER
04527 ff_rv40dsp_init(c,avctx);
/* RV40 needs special (3,3) sub-pel positions on top of the generic
 * qpel tables set up by ff_rv40dsp_init. */
04528 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
04529 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
04530 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
04531 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
04532 #endif
04533
/* WMV2 "mspel" 8x8 MC: 8 half-pel positions (mc00..mc32). */
04534 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
04535 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
04536 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
04537 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
04538 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
04539 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
04540 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
04541 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
04542
/* Helper: register a comparison function pair — [0] = 16-wide variant,
 * [1] = 8x8 variant — under the common naming scheme <name>16_c /
 * <name>8x8_c.  (Multi-statement macro; callers invoke it at statement
 * position without a trailing semicolon.) */
04543 #define SET_CMP_FUNC(name) \
04544 c->name[0]= name ## 16_c;\
04545 c->name[1]= name ## 8x8_c;
04546
/* Block comparison / motion-estimation metrics. */
04547 SET_CMP_FUNC(hadamard8_diff)
04548 c->hadamard8_diff[4]= hadamard8_intra16_c;
04549 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
04550 SET_CMP_FUNC(dct_sad)
04551 SET_CMP_FUNC(dct_max)
04552 #if CONFIG_GPL
04553 SET_CMP_FUNC(dct264_sad)
04554 #endif
04555 c->sad[0]= pix_abs16_c;
04556 c->sad[1]= pix_abs8_c;
04557 c->sse[0]= sse16_c;
04558 c->sse[1]= sse8_c;
04559 c->sse[2]= sse4_c;
04560 SET_CMP_FUNC(quant_psnr)
04561 SET_CMP_FUNC(rd)
04562 SET_CMP_FUNC(bit)
04563 c->vsad[0]= vsad16_c;
04564 c->vsad[4]= vsad_intra16_c;
04565 c->vsad[5]= vsad_intra8_c;
04566 c->vsse[0]= vsse16_c;
04567 c->vsse[4]= vsse_intra16_c;
04568 c->vsse[5]= vsse_intra8_c;
04569 c->nsse[0]= nsse16_c;
04570 c->nsse[1]= nsse8_c;
/* Snow wavelet rate-distortion metrics (5/3 and 9/7 filters) — encoder only. */
04571 #if CONFIG_SNOW_ENCODER
04572 c->w53[0]= w53_16_c;
04573 c->w53[1]= w53_8_c;
04574 c->w97[0]= w97_16_c;
04575 c->w97[1]= w97_8_c;
04576 #endif
04577
04578 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
04579
/* Byte-wise helpers used by lossless codecs (e.g. HuffYUV, PNG). */
04580 c->add_bytes= add_bytes_c;
04581 c->add_bytes_l2= add_bytes_l2_c;
04582 c->diff_bytes= diff_bytes_c;
04583 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
04584 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
04585 c->bswap_buf= bswap_buf;
04586 #if CONFIG_PNG_DECODER
04587 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
04588 #endif
04589
/* In-loop deblocking filters.  h264_loop_filter_strength has no C
 * reference here; it stays NULL unless an arch init below provides one,
 * so callers must check it before use. */
04590 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
04591 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
04592 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
04593 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
04594 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
04595 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
04596 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
04597 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
04598 c->h264_loop_filter_strength= NULL;
04599
/* Compile-time constants in if() rather than #if: dead branches are
 * eliminated by the compiler while all code stays syntax-checked. */
04600 if (CONFIG_ANY_H263) {
04601 c->h263_h_loop_filter= h263_h_loop_filter_c;
04602 c->h263_v_loop_filter= h263_v_loop_filter_c;
04603 }
04604
04605 if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) {
04606 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
04607 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
04608 }
04609 if (CONFIG_VP6_DECODER) {
04610 c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
04611 }
04612
04613 c->h261_loop_filter= h261_loop_filter_c;
04614
/* Trellis/basis search helpers used by the MPEG encoder rate distortion code. */
04615 c->try_8x8basis= try_8x8basis_c;
04616 c->add_8x8basis= add_8x8basis_c;
04617
04618 #if CONFIG_SNOW_DECODER
04619 c->vertical_compose97i = ff_snow_vertical_compose97i;
04620 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
04621 c->inner_add_yblock = ff_snow_inner_add_yblock;
04622 #endif
04623
/* Audio-codec specific float/int kernels. */
04624 #if CONFIG_VORBIS_DECODER
04625 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
04626 #endif
04627 #if CONFIG_AC3_DECODER
04628 c->ac3_downmix = ff_ac3_downmix_c;
04629 #endif
04630 #if CONFIG_FLAC_ENCODER
04631 c->flac_compute_autocorr = ff_flac_compute_autocorr;
04632 #endif
/* Generic float/int vector primitives (windowing, scaling, conversion). */
04633 c->vector_fmul = vector_fmul_c;
04634 c->vector_fmul_reverse = vector_fmul_reverse_c;
04635 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
04636 c->vector_fmul_window = ff_vector_fmul_window_c;
04637 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
04638 c->float_to_int16 = ff_float_to_int16_c;
04639 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
04640 c->add_int16 = add_int16_c;
04641 c->sub_int16 = sub_int16_c;
04642 c->scalarproduct_int16 = scalarproduct_int16_c;
04643
/* Image plane downscalers: shrink[n] halves each dimension n times
 * (shrink[0] is a plain copy). */
04644 c->shrink[0]= ff_img_copy_plane;
04645 c->shrink[1]= ff_shrink22;
04646 c->shrink[2]= ff_shrink44;
04647 c->shrink[3]= ff_shrink88;
04648
/* Default prefetch is a no-op stub; arch inits may install a real one. */
04649 c->prefetch= just_return;
04650
/* Clear the 2-tap tables first so the fallback loop below can detect
 * which entries the arch-specific inits actually filled in. */
04651 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
04652 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
04653
/* Architecture-specific overrides; each replaces pointers it has
 * optimized versions for and leaves the rest as the C defaults. */
04654 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
04655 if (ARCH_ARM) dsputil_init_arm (c, avctx);
04656 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
04657 if (HAVE_VIS) dsputil_init_vis (c, avctx);
04658 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
04659 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
04660 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
04661 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
04662 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
04663
/* Any 2-tap qpel entry not provided by an arch init falls back to the
 * corresponding H.264 qpel function. */
04664 for(i=0; i<64; i++){
04665 if(!c->put_2tap_qpel_pixels_tab[0][i])
04666 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
04667 if(!c->avg_2tap_qpel_pixels_tab[0][i])
04668 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
04669 }
04670
/* Build the 8x8 coefficient permutation matching the IDCT chosen above
 * (idct_permutation_type was set earlier in this function, out of view),
 * so the decoder can store coefficients in the order the IDCT expects. */
04671 switch(c->idct_permutation_type){
04672 case FF_NO_IDCT_PERM:
04673 for(i=0; i<64; i++)
04674 c->idct_permutation[i]= i;
04675 break;
04676 case FF_LIBMPEG2_IDCT_PERM:
04677 for(i=0; i<64; i++)
04678 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
04679 break;
04680 case FF_SIMPLE_IDCT_PERM:
04681 for(i=0; i<64; i++)
04682 c->idct_permutation[i]= simple_mmx_permutation[i];
04683 break;
04684 case FF_TRANSPOSE_IDCT_PERM:
04685 for(i=0; i<64; i++)
04686 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
04687 break;
04688 case FF_PARTTRANS_IDCT_PERM:
04689 for(i=0; i<64; i++)
04690 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
04691 break;
04692 case FF_SSE2_IDCT_PERM:
04693 for(i=0; i<64; i++)
04694 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
04695 break;
04696 default:
04697 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
04698 }
04699 }
04700