FFmpeg
postprocess_template.c
1 /*
2  * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 /**
22  * @file
23  * mmx/mmx2/sse2 postprocess code.
24  */
25 #include "config.h"
26 
27 #include "libavutil/mem_internal.h"
28 #if ARCH_X86
29 #include "libavutil/x86/asm.h"
30 #endif
31 
32 /* A single TEMPLATE_PP_* should be defined (to 1) when this template is
33  * included. The following macros will define its dependencies to 1 as well
34  * (like MMX2 depending on MMX), and will define to 0 all the others. Every
35  * TEMPLATE_PP_* needs to be #undef'd at the end. */
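/* Illustrative sketch only (the includer lives outside this file, e.g.
 * libpostproc/postprocess.c): exactly one variant macro is defined to 1 before
 * this template is pulled in, and RENAME() then suffixes every function
 * accordingly:
 *
 *     #define TEMPLATE_PP_SSE2 1
 *     #include "postprocess_template.c"   // emits vertClassify_SSE2, dering_SSE2, ...
 */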
36 
37 #ifdef TEMPLATE_PP_C
38 # define RENAME(a) a ## _C
39 #else
40 # define TEMPLATE_PP_C 0
41 #endif
42 
43 #ifdef TEMPLATE_PP_ALTIVEC
44 # define RENAME(a) a ## _altivec
45 #else
46 # define TEMPLATE_PP_ALTIVEC 0
47 #endif
48 
49 #ifdef TEMPLATE_PP_MMX
50 # define RENAME(a) a ## _MMX
51 #else
52 # define TEMPLATE_PP_MMX 0
53 #endif
54 
55 #ifdef TEMPLATE_PP_MMXEXT
56 # undef TEMPLATE_PP_MMX
57 # define TEMPLATE_PP_MMX 1
58 # define RENAME(a) a ## _MMX2
59 #else
60 # define TEMPLATE_PP_MMXEXT 0
61 #endif
62 
63 #ifdef TEMPLATE_PP_SSE2
64 # undef TEMPLATE_PP_MMX
65 # define TEMPLATE_PP_MMX 1
66 # undef TEMPLATE_PP_MMXEXT
67 # define TEMPLATE_PP_MMXEXT 1
68 # define RENAME(a) a ## _SSE2
69 #else
70 # define TEMPLATE_PP_SSE2 0
71 #endif
72 
73 #undef REAL_PAVGB
74 #undef PAVGB
75 #undef PMINUB
76 #undef PMAXUB
77 
78 #if TEMPLATE_PP_MMXEXT
79 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
80 #endif
81 #define PAVGB(a,b) REAL_PAVGB(a,b)
82 
83 #if TEMPLATE_PP_MMXEXT
84 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
85 #endif
86 
87 #if TEMPLATE_PP_MMXEXT
88 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
89 #endif
90 
91 //FIXME? |255-0| = 1 (should not be a problem ...)
92 #if TEMPLATE_PP_MMXEXT
93 /**
94  * Check if the middle 8x8 block in the given 8x16 block is flat.
95  */
96 static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c){
97  int numEq= 0, dcOk;
98  src+= stride*4; // src points to begin of the 8x8 Block
99  __asm__ volatile(
100  "movq %0, %%mm7 \n\t"
101  "movq %1, %%mm6 \n\t"
102  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
103  );
104 
105  __asm__ volatile(
106  "lea (%2, %3), %%"FF_REG_a" \n\t"
107 // 0 1 2 3 4 5 6 7 8 9
108 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
109 
110  "movq (%2), %%mm0 \n\t"
111  "movq (%%"FF_REG_a"), %%mm1 \n\t"
112  "movq %%mm0, %%mm3 \n\t"
113  "movq %%mm0, %%mm4 \n\t"
114  PMAXUB(%%mm1, %%mm4)
115  PMINUB(%%mm1, %%mm3, %%mm5)
116  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
117  "paddb %%mm7, %%mm0 \n\t"
118  "pcmpgtb %%mm6, %%mm0 \n\t"
119 
120  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
121  PMAXUB(%%mm2, %%mm4)
122  PMINUB(%%mm2, %%mm3, %%mm5)
123  "psubb %%mm2, %%mm1 \n\t"
124  "paddb %%mm7, %%mm1 \n\t"
125  "pcmpgtb %%mm6, %%mm1 \n\t"
126  "paddb %%mm1, %%mm0 \n\t"
127 
128  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
129  PMAXUB(%%mm1, %%mm4)
130  PMINUB(%%mm1, %%mm3, %%mm5)
131  "psubb %%mm1, %%mm2 \n\t"
132  "paddb %%mm7, %%mm2 \n\t"
133  "pcmpgtb %%mm6, %%mm2 \n\t"
134  "paddb %%mm2, %%mm0 \n\t"
135 
136  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
137 
138  "movq (%2, %3, 4), %%mm2 \n\t"
139  PMAXUB(%%mm2, %%mm4)
140  PMINUB(%%mm2, %%mm3, %%mm5)
141  "psubb %%mm2, %%mm1 \n\t"
142  "paddb %%mm7, %%mm1 \n\t"
143  "pcmpgtb %%mm6, %%mm1 \n\t"
144  "paddb %%mm1, %%mm0 \n\t"
145 
146  "movq (%%"FF_REG_a"), %%mm1 \n\t"
147  PMAXUB(%%mm1, %%mm4)
148  PMINUB(%%mm1, %%mm3, %%mm5)
149  "psubb %%mm1, %%mm2 \n\t"
150  "paddb %%mm7, %%mm2 \n\t"
151  "pcmpgtb %%mm6, %%mm2 \n\t"
152  "paddb %%mm2, %%mm0 \n\t"
153 
154  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
155  PMAXUB(%%mm2, %%mm4)
156  PMINUB(%%mm2, %%mm3, %%mm5)
157  "psubb %%mm2, %%mm1 \n\t"
158  "paddb %%mm7, %%mm1 \n\t"
159  "pcmpgtb %%mm6, %%mm1 \n\t"
160  "paddb %%mm1, %%mm0 \n\t"
161 
162  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
163  PMAXUB(%%mm1, %%mm4)
164  PMINUB(%%mm1, %%mm3, %%mm5)
165  "psubb %%mm1, %%mm2 \n\t"
166  "paddb %%mm7, %%mm2 \n\t"
167  "pcmpgtb %%mm6, %%mm2 \n\t"
168  "paddb %%mm2, %%mm0 \n\t"
169  "psubusb %%mm3, %%mm4 \n\t"
170 
171  " \n\t"
172  "pxor %%mm7, %%mm7 \n\t"
173  "psadbw %%mm7, %%mm0 \n\t"
174  "movq %4, %%mm7 \n\t" // QP,..., QP
175  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
176  "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
177  "packssdw %%mm4, %%mm4 \n\t"
178  "movd %%mm0, %0 \n\t"
179  "movd %%mm4, %1 \n\t"
180 
181  : "=r" (numEq), "=r" (dcOk)
182  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
183  : "%"FF_REG_a
184  );
185 
186  numEq= (-numEq) &0xFF;
187  if(numEq > c->ppMode.flatnessThreshold){
188  if(dcOk) return 0;
189  else return 1;
190  }else{
191  return 2;
192  }
193 }
194 #endif //TEMPLATE_PP_MMXEXT
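/* Scalar sketch of the classification above, for readability only (it is never
 * compiled; the packed thresholds really come from c->mmxDcOffset /
 * c->mmxDcThreshold and are approximated here by a plain `dcThreshold`):
 *
 *     int numEq = 0, rangeTooBig = 0;
 *     for (int y = 0; y < 7; y++)                      // 7 adjacent-line pairs
 *         for (int x = 0; x < 8; x++)
 *             numEq += FFABS(src[x + y*stride] - src[x + (y+1)*stride]) <= dcThreshold;
 *     for (int x = 0; x < 8; x++) {                    // per-column range test
 *         int mn = 255, mx = 0;
 *         for (int y = 0; y < 8; y++) {
 *             mn = FFMIN(mn, src[x + y*stride]);
 *             mx = FFMAX(mx, src[x + y*stride]);
 *         }
 *         rangeTooBig |= (mx - mn) > 2*c->QP;
 *     }
 *     if (numEq <= c->ppMode.flatnessThreshold) return 2;   // not flat
 *     return rangeTooBig ? 0 : 1;                           // mirrors the dcOk branch
 */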
195 
196 /**
197  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
198  * using the 9-tap filter (1,1,2,2,4,2,2,1,1)/16.
199  */
200 #if !TEMPLATE_PP_ALTIVEC
201 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
202 {
203 #if TEMPLATE_PP_MMXEXT
204  src+= stride*3;
205  __asm__ volatile( //"movv %0 %1 %2\n\t"
206  "movq %2, %%mm0 \n\t" // QP,..., QP
207  "pxor %%mm4, %%mm4 \n\t"
208 
209  "movq (%0), %%mm6 \n\t"
210  "movq (%0, %1), %%mm5 \n\t"
211  "movq %%mm5, %%mm1 \n\t"
212  "movq %%mm6, %%mm2 \n\t"
213  "psubusb %%mm6, %%mm5 \n\t"
214  "psubusb %%mm1, %%mm2 \n\t"
215  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
216  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
217  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
218 
219  "pand %%mm2, %%mm6 \n\t"
220  "pandn %%mm1, %%mm2 \n\t"
221  "por %%mm2, %%mm6 \n\t"// First Line to Filter
222 
223  "movq (%0, %1, 8), %%mm5 \n\t"
224  "lea (%0, %1, 4), %%"FF_REG_a" \n\t"
225  "lea (%0, %1, 8), %%"FF_REG_c" \n\t"
226  "sub %1, %%"FF_REG_c" \n\t"
227  "add %1, %0 \n\t" // %0 points to line 1 not 0
228  "movq (%0, %1, 8), %%mm7 \n\t"
229  "movq %%mm5, %%mm1 \n\t"
230  "movq %%mm7, %%mm2 \n\t"
231  "psubusb %%mm7, %%mm5 \n\t"
232  "psubusb %%mm1, %%mm2 \n\t"
233  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
234  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
235  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
236 
237  "pand %%mm2, %%mm7 \n\t"
238  "pandn %%mm1, %%mm2 \n\t"
239  "por %%mm2, %%mm7 \n\t" // First Line to Filter
240 
241 
242  // 1 2 3 4 5 6 7 8
243  // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
244  // 6 4 2 2 1 1
245  // 6 4 4 2
246  // 6 8 2
247 
248  "movq (%0, %1), %%mm0 \n\t" // 1
249  "movq %%mm0, %%mm1 \n\t" // 1
250  PAVGB(%%mm6, %%mm0) //1 1 /2
251  PAVGB(%%mm6, %%mm0) //3 1 /4
252 
253  "movq (%0, %1, 4), %%mm2 \n\t" // 1
254  "movq %%mm2, %%mm5 \n\t" // 1
255  PAVGB((%%FF_REGa), %%mm2) // 11 /2
256  PAVGB((%0, %1, 2), %%mm2) // 211 /4
257  "movq %%mm2, %%mm3 \n\t" // 211 /4
258  "movq (%0), %%mm4 \n\t" // 1
259  PAVGB(%%mm4, %%mm3) // 4 211 /8
260  PAVGB(%%mm0, %%mm3) //642211 /16
261  "movq %%mm3, (%0) \n\t" // X
262  // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
263  "movq %%mm1, %%mm0 \n\t" // 1
264  PAVGB(%%mm6, %%mm0) //1 1 /2
265  "movq %%mm4, %%mm3 \n\t" // 1
266  PAVGB((%0,%1,2), %%mm3) // 1 1 /2
267  PAVGB((%%FF_REGa,%1,2), %%mm5) // 11 /2
268  PAVGB((%%FF_REGa), %%mm5) // 211 /4
269  PAVGB(%%mm5, %%mm3) // 2 2211 /8
270  PAVGB(%%mm0, %%mm3) //4242211 /16
271  "movq %%mm3, (%0,%1) \n\t" // X
272  // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
273  PAVGB(%%mm4, %%mm6) //11 /2
274  "movq (%%"FF_REG_c"), %%mm0 \n\t" // 1
275  PAVGB((%%FF_REGa, %1, 2), %%mm0) // 11/2
276  "movq %%mm0, %%mm3 \n\t" // 11/2
277  PAVGB(%%mm1, %%mm0) // 2 11/4
278  PAVGB(%%mm6, %%mm0) //222 11/8
279  PAVGB(%%mm2, %%mm0) //22242211/16
280  "movq (%0, %1, 2), %%mm2 \n\t" // 1
281  "movq %%mm0, (%0, %1, 2) \n\t" // X
282  // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
283  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
284  PAVGB((%%FF_REGc), %%mm0) // 11 /2
285  PAVGB(%%mm0, %%mm6) //11 11 /4
286  PAVGB(%%mm1, %%mm4) // 11 /2
287  PAVGB(%%mm2, %%mm1) // 11 /2
288  PAVGB(%%mm1, %%mm6) //1122 11 /8
289  PAVGB(%%mm5, %%mm6) //112242211 /16
290  "movq (%%"FF_REG_a"), %%mm5 \n\t" // 1
291  "movq %%mm6, (%%"FF_REG_a") \n\t" // X
292  // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
293  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t" // 1
294  PAVGB(%%mm7, %%mm6) // 11 /2
295  PAVGB(%%mm4, %%mm6) // 11 11 /4
296  PAVGB(%%mm3, %%mm6) // 11 2211 /8
297  PAVGB(%%mm5, %%mm2) // 11 /2
298  "movq (%0, %1, 4), %%mm4 \n\t" // 1
299  PAVGB(%%mm4, %%mm2) // 112 /4
300  PAVGB(%%mm2, %%mm6) // 112242211 /16
301  "movq %%mm6, (%0, %1, 4) \n\t" // X
302  // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
303  PAVGB(%%mm7, %%mm1) // 11 2 /4
304  PAVGB(%%mm4, %%mm5) // 11 /2
305  PAVGB(%%mm5, %%mm0) // 11 11 /4
306  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" // 1
307  PAVGB(%%mm6, %%mm1) // 11 4 2 /8
308  PAVGB(%%mm0, %%mm1) // 11224222 /16
309  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t" // X
310  // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
311  PAVGB((%%FF_REGc), %%mm2) // 112 4 /8
312  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
313  PAVGB(%%mm0, %%mm6) // 1 1 /2
314  PAVGB(%%mm7, %%mm6) // 1 12 /4
315  PAVGB(%%mm2, %%mm6) // 1122424 /4
316  "movq %%mm6, (%%"FF_REG_c") \n\t" // X
317  // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
318  PAVGB(%%mm7, %%mm5) // 11 2 /4
319  PAVGB(%%mm7, %%mm5) // 11 6 /8
320 
321  PAVGB(%%mm3, %%mm0) // 112 /4
322  PAVGB(%%mm0, %%mm5) // 112246 /16
323  "movq %%mm5, (%%"FF_REG_a", %1, 4) \n\t" // X
324  "sub %1, %0 \n\t"
325 
326  :
327  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
328  : "%"FF_REG_a, "%"FF_REG_c
329  );
330 #else //TEMPLATE_PP_MMXEXT
331  const int l1= stride;
332  const int l2= stride + l1;
333  const int l3= stride + l2;
334  const int l4= stride + l3;
335  const int l5= stride + l4;
336  const int l6= stride + l5;
337  const int l7= stride + l6;
338  const int l8= stride + l7;
339  const int l9= stride + l8;
340  int x;
341  src+= stride*3;
342  for(x=0; x<BLOCK_SIZE; x++){
343  const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
344  const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
345 
346  int sums[10];
347  sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
348  sums[1] = sums[0] - first + src[l4];
349  sums[2] = sums[1] - first + src[l5];
350  sums[3] = sums[2] - first + src[l6];
351  sums[4] = sums[3] - first + src[l7];
352  sums[5] = sums[4] - src[l1] + src[l8];
353  sums[6] = sums[5] - src[l2] + last;
354  sums[7] = sums[6] - src[l3] + last;
355  sums[8] = sums[7] - src[l4] + last;
356  sums[9] = sums[8] - src[l5] + last;
357 
358  src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
359  src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
360  src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
361  src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
362  src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
363  src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
364  src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
365  src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
366 
367  src++;
368  }
369 #endif //TEMPLATE_PP_MMXEXT
370 }
371 #endif //TEMPLATE_PP_ALTIVEC
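/* Worked check of the sliding-sum form used in the C fallback above (writing
 * lk for src[lk], and with `first` = the line just above the 8x8 block when it
 * differs from the block's first line by less than QP, else that first line):
 *
 *     sums[0] = 4*first + l1 + l2 + l3 + 4
 *     sums[2] = 2*first + l1 + l2 + l3 + l4 + l5 + 4
 *     src[l1] = (sums[0] + sums[2] + 2*l1) >> 4
 *             = (6*first + 4*l1 + 2*l2 + 2*l3 + l4 + l5 + 8) >> 4
 *
 * which is exactly the (1,1,2,2,4,2,2,1,1)/16 kernel centred on l1 with the
 * out-of-block taps folded into `first`, plus rounding; the remaining lines
 * follow the same pattern.
 */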
372 
373 /**
374  * Experimental Filter 1
375  * Will not damage linear gradients.
376  * Flat blocks should look as if they were passed through the (1,1,2,2,4,2,2,1,1) 9-tap filter.
377  * Can only smooth blocks at the expected locations (it cannot smooth them if they have moved).
378  * The MMX2 version does correct clipping, the C version does not.
379  */
380 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
381 {
382 #if TEMPLATE_PP_MMXEXT
383  src+= stride*3;
384 
385  __asm__ volatile(
386  "pxor %%mm7, %%mm7 \n\t" // 0
387  "lea (%0, %1), %%"FF_REG_a" \n\t"
388  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
389 // 0 1 2 3 4 5 6 7 8 9
390 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
391  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
392  "movq (%0, %1, 4), %%mm1 \n\t" // line 4
393  "movq %%mm1, %%mm2 \n\t" // line 4
394  "psubusb %%mm0, %%mm1 \n\t"
395  "psubusb %%mm2, %%mm0 \n\t"
396  "por %%mm1, %%mm0 \n\t" // |l2 - l3|
397  "movq (%%"FF_REG_c"), %%mm3 \n\t" // line 5
398  "movq (%%"FF_REG_c", %1), %%mm4 \n\t" // line 6
399  "movq %%mm3, %%mm5 \n\t" // line 5
400  "psubusb %%mm4, %%mm3 \n\t"
401  "psubusb %%mm5, %%mm4 \n\t"
402  "por %%mm4, %%mm3 \n\t" // |l5 - l6|
403  PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
404  "movq %%mm2, %%mm1 \n\t" // line 4
405  "psubusb %%mm5, %%mm2 \n\t"
406  "movq %%mm2, %%mm4 \n\t"
407  "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
408  "psubusb %%mm1, %%mm5 \n\t"
409  "por %%mm5, %%mm4 \n\t" // |l4 - l5|
410  "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
411  "movq %%mm4, %%mm3 \n\t" // d
412  "movq %2, %%mm0 \n\t"
413  "paddusb %%mm0, %%mm0 \n\t"
414  "psubusb %%mm0, %%mm4 \n\t"
415  "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
416  "psubusb "MANGLE(b01)", %%mm3 \n\t"
417  "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
418 
419  PAVGB(%%mm7, %%mm3) // d/2
420  "movq %%mm3, %%mm1 \n\t" // d/2
421  PAVGB(%%mm7, %%mm3) // d/4
422  PAVGB(%%mm1, %%mm3) // 3*d/8
423 
424  "movq (%0, %1, 4), %%mm0 \n\t" // line 4
425  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
426  "psubusb %%mm3, %%mm0 \n\t"
427  "pxor %%mm2, %%mm0 \n\t"
428  "movq %%mm0, (%0, %1, 4) \n\t" // line 4
429 
430  "movq (%%"FF_REG_c"), %%mm0 \n\t" // line 5
431  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
432  "paddusb %%mm3, %%mm0 \n\t"
433  "pxor %%mm2, %%mm0 \n\t"
434  "movq %%mm0, (%%"FF_REG_c") \n\t" // line 5
435 
436  PAVGB(%%mm7, %%mm1) // d/4
437 
438  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
439  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
440  "psubusb %%mm1, %%mm0 \n\t"
441  "pxor %%mm2, %%mm0 \n\t"
442  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" // line 3
443 
444  "movq (%%"FF_REG_c", %1), %%mm0 \n\t" // line 6
445  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
446  "paddusb %%mm1, %%mm0 \n\t"
447  "pxor %%mm2, %%mm0 \n\t"
448  "movq %%mm0, (%%"FF_REG_c", %1) \n\t" // line 6
449 
450  PAVGB(%%mm7, %%mm1) // d/8
451 
452  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // line 2
453  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
454  "psubusb %%mm1, %%mm0 \n\t"
455  "pxor %%mm2, %%mm0 \n\t"
456  "movq %%mm0, (%%"FF_REG_a", %1) \n\t" // line 2
457 
458  "movq (%%"FF_REG_c", %1, 2), %%mm0 \n\t" // line 7
459  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
460  "paddusb %%mm1, %%mm0 \n\t"
461  "pxor %%mm2, %%mm0 \n\t"
462  "movq %%mm0, (%%"FF_REG_c", %1, 2) \n\t" // line 7
463 
464  :
465  : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
466  NAMED_CONSTRAINTS_ADD(b01)
467  : "%"FF_REG_a, "%"FF_REG_c
468  );
469 #else //TEMPLATE_PP_MMXEXT
470 
471  const int l1= stride;
472  const int l2= stride + l1;
473  const int l3= stride + l2;
474  const int l4= stride + l3;
475  const int l5= stride + l4;
476  const int l6= stride + l5;
477  const int l7= stride + l6;
478 // const int l8= stride + l7;
479 // const int l9= stride + l8;
480  int x;
481 
482  src+= stride*3;
483  for(x=0; x<BLOCK_SIZE; x++){
484  int a= src[l3] - src[l4];
485  int b= src[l4] - src[l5];
486  int c= src[l5] - src[l6];
487 
488  int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
489  d= FFMAX(d, 0);
490 
491  if(d < co->QP*2){
492  int v = d * FFSIGN(-b);
493 
494  src[l2] +=v>>3;
495  src[l3] +=v>>2;
496  src[l4] +=(3*v)>>3;
497  src[l5] -=(3*v)>>3;
498  src[l6] -=v>>2;
499  src[l7] -=v>>3;
500  }
501  src++;
502  }
503 #endif //TEMPLATE_PP_MMXEXT
504 }
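/* Reading aid for the filter above: d = max(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
 * (using the C fallback's lk = src[lk]) is applied only when d < 2*QP, and the
 * correction v, carrying the sign of l5-l4, tapers outwards as v/8, v/4, 3v/8
 * on l2..l4 and -3v/8, -v/4, -v/8 on l5..l7. The MMX2 path uses saturating
 * byte ops and therefore clips correctly; the C path can wrap, as the comment
 * above the function notes.
 */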
505 
506 #if !TEMPLATE_PP_ALTIVEC
507 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
508 {
509 #if TEMPLATE_PP_MMXEXT
510 /*
511  uint8_t tmp[16];
512  const int l1= stride;
513  const int l2= stride + l1;
514  const int l3= stride + l2;
515  const int l4= (int)tmp - (int)src - stride*3;
516  const int l5= (int)tmp - (int)src - stride*3 + 8;
517  const int l6= stride*3 + l3;
518  const int l7= stride + l6;
519  const int l8= stride + l7;
520 
521  memcpy(tmp, src+stride*7, 8);
522  memcpy(tmp+8, src+stride*8, 8);
523 */
524  src+= stride*4;
525  __asm__ volatile(
526 
527 #if 0 //slightly more accurate and slightly slower
528  "pxor %%mm7, %%mm7 \n\t" // 0
529  "lea (%0, %1), %%"FF_REG_a" \n\t"
530  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
531 // 0 1 2 3 4 5 6 7
532 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
533 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
534 
535 
536  "movq (%0, %1, 2), %%mm0 \n\t" // l2
537  "movq (%0), %%mm1 \n\t" // l0
538  "movq %%mm0, %%mm2 \n\t" // l2
539  PAVGB(%%mm7, %%mm0) // ~l2/2
540  PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
541  PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
542 
543  "movq (%%"FF_REG_a"), %%mm1 \n\t" // l1
544  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t" // l3
545  "movq %%mm1, %%mm4 \n\t" // l1
546  PAVGB(%%mm7, %%mm1) // ~l1/2
547  PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
548  PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
549 
550  "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
551  "psubusb %%mm1, %%mm0 \n\t"
552  "psubusb %%mm4, %%mm1 \n\t"
553  "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
554 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
555 
556  "movq (%0, %1, 4), %%mm0 \n\t" // l4
557  "movq %%mm0, %%mm4 \n\t" // l4
558  PAVGB(%%mm7, %%mm0) // ~l4/2
559  PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
560  PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
561 
562  "movq (%%"FF_REG_c"), %%mm2 \n\t" // l5
563  "movq %%mm3, %%mm5 \n\t" // l3
564  PAVGB(%%mm7, %%mm3) // ~l3/2
565  PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
566  PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
567 
568  "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
569  "psubusb %%mm3, %%mm0 \n\t"
570  "psubusb %%mm6, %%mm3 \n\t"
571  "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
572  "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
573 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
574 
575  "movq (%%"FF_REG_c", %1), %%mm6 \n\t" // l6
576  "movq %%mm6, %%mm5 \n\t" // l6
577  PAVGB(%%mm7, %%mm6) // ~l6/2
578  PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
579  PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
580 
581  "movq (%%"FF_REG_c", %1, 2), %%mm5 \n\t" // l7
582  "movq %%mm2, %%mm4 \n\t" // l5
583  PAVGB(%%mm7, %%mm2) // ~l5/2
584  PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
585  PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
586 
587  "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
588  "psubusb %%mm2, %%mm6 \n\t"
589  "psubusb %%mm4, %%mm2 \n\t"
590  "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
591 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
592 
593 
594  PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
595  "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
596  "paddusb "MANGLE(b01)", %%mm4 \n\t"
597  "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
598  "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
599  "pand %%mm4, %%mm3 \n\t"
600 
601  "movq %%mm3, %%mm1 \n\t"
602 // "psubusb "MANGLE(b01)", %%mm3 \n\t"
603  PAVGB(%%mm7, %%mm3)
604  PAVGB(%%mm7, %%mm3)
605  "paddusb %%mm1, %%mm3 \n\t"
606 // "paddusb "MANGLE(b01)", %%mm3 \n\t"
607 
608  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" //l3
609  "movq (%0, %1, 4), %%mm5 \n\t" //l4
610  "movq (%0, %1, 4), %%mm4 \n\t" //l4
611  "psubusb %%mm6, %%mm5 \n\t"
612  "psubusb %%mm4, %%mm6 \n\t"
613  "por %%mm6, %%mm5 \n\t" // |l3-l4|
614  "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
615  "pxor %%mm6, %%mm0 \n\t"
616  "pand %%mm0, %%mm3 \n\t"
617  PMINUB(%%mm5, %%mm3, %%mm0)
618 
619  "psubusb "MANGLE(b01)", %%mm3 \n\t"
620  PAVGB(%%mm7, %%mm3)
621 
622  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
623  "movq (%0, %1, 4), %%mm2 \n\t"
624  "pxor %%mm6, %%mm0 \n\t"
625  "pxor %%mm6, %%mm2 \n\t"
626  "psubb %%mm3, %%mm0 \n\t"
627  "paddb %%mm3, %%mm2 \n\t"
628  "pxor %%mm6, %%mm0 \n\t"
629  "pxor %%mm6, %%mm2 \n\t"
630  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
631  "movq %%mm2, (%0, %1, 4) \n\t"
632 #endif //0
633 
634  "lea (%0, %1), %%"FF_REG_a" \n\t"
635  "pcmpeqb %%mm6, %%mm6 \n\t" // -1
636 // 0 1 2 3 4 5 6 7
637 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
638 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
639 
640 
641  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t" // l3
642  "movq (%0, %1, 4), %%mm0 \n\t" // l4
643  "pxor %%mm6, %%mm1 \n\t" // -l3-1
644  PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
645 // mm1=-l3-1, mm0=128-q
646 
647  "movq (%%"FF_REG_a", %1, 4), %%mm2 \n\t" // l5
648  "movq (%%"FF_REG_a", %1), %%mm3 \n\t" // l2
649  "pxor %%mm6, %%mm2 \n\t" // -l5-1
650  "movq %%mm2, %%mm5 \n\t" // -l5-1
651  "movq "MANGLE(b80)", %%mm4 \n\t" // 128
652  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
653  PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
654  PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
655  PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
656  PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
657 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
658 
659  "movq (%%"FF_REG_a"), %%mm2 \n\t" // l1
660  "pxor %%mm6, %%mm2 \n\t" // -l1-1
661  PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
662  PAVGB((%0), %%mm1) // (l0-l3+256)/2
663  "movq "MANGLE(b80)", %%mm3 \n\t" // 128
664  PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
665  PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
666  PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
667 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
668 
669  PAVGB((%%FF_REGc, %1), %%mm5) // (l6-l5+256)/2
670  "movq (%%"FF_REG_c", %1, 2), %%mm1 \n\t" // l7
671  "pxor %%mm6, %%mm1 \n\t" // -l7-1
672  PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
673  "movq "MANGLE(b80)", %%mm2 \n\t" // 128
674  PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
675  PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
676  PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
677 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
678 
679  "movq "MANGLE(b00)", %%mm1 \n\t" // 0
680  "movq "MANGLE(b00)", %%mm5 \n\t" // 0
681  "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
682  "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
683  PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
684  PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
685  PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
686 
687 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
688 
689  "movq "MANGLE(b00)", %%mm7 \n\t" // 0
690  "movq %2, %%mm2 \n\t" // QP
691  PAVGB(%%mm6, %%mm2) // 128 + QP/2
692  "psubb %%mm6, %%mm2 \n\t"
693 
694  "movq %%mm4, %%mm1 \n\t"
695  "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
696  "pxor %%mm1, %%mm4 \n\t"
697  "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
698  "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
699  "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
700 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
701 
702  "movq %%mm4, %%mm3 \n\t" // d
703  "psubusb "MANGLE(b01)", %%mm4 \n\t"
704  PAVGB(%%mm7, %%mm4) // d/32
705  PAVGB(%%mm7, %%mm4) // (d + 32)/64
706  "paddb %%mm3, %%mm4 \n\t" // 5d/64
707  "pand %%mm2, %%mm4 \n\t"
708 
709  "movq "MANGLE(b80)", %%mm5 \n\t" // 128
710  "psubb %%mm0, %%mm5 \n\t" // q
711  "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
712  "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
713  "pxor %%mm7, %%mm5 \n\t"
714 
715  PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
716  "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
717 
718  "pand %%mm7, %%mm4 \n\t"
719  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
720  "movq (%0, %1, 4), %%mm2 \n\t"
721  "pxor %%mm1, %%mm0 \n\t"
722  "pxor %%mm1, %%mm2 \n\t"
723  "paddb %%mm4, %%mm0 \n\t"
724  "psubb %%mm4, %%mm2 \n\t"
725  "pxor %%mm1, %%mm0 \n\t"
726  "pxor %%mm1, %%mm2 \n\t"
727  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
728  "movq %%mm2, (%0, %1, 4) \n\t"
729 
730  :
731  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
732  NAMED_CONSTRAINTS_ADD(b80,b00,b01)
733  : "%"FF_REG_a, "%"FF_REG_c
734  );
735 
736 /*
737  {
738  int x;
739  src-= stride;
740  for(x=0; x<BLOCK_SIZE; x++){
741  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
742  if(FFABS(middleEnergy)< 8*QP){
743  const int q=(src[l4] - src[l5])/2;
744  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
745  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
746 
747  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
748  d= FFMAX(d, 0);
749 
750  d= (5*d + 32) >> 6;
751  d*= FFSIGN(-middleEnergy);
752 
753  if(q>0){
754  d= d<0 ? 0 : d;
755  d= d>q ? q : d;
756  }else{
757  d= d>0 ? 0 : d;
758  d= d<q ? q : d;
759  }
760 
761  src[l4]-= d;
762  src[l5]+= d;
763  }
764  src++;
765  }
766  src-=8;
767  for(x=0; x<8; x++){
768  int y;
769  for(y=4; y<6; y++){
770  int d= src[x+y*stride] - tmp[x+(y-4)*8];
771  int ad= FFABS(d);
772  static int max=0;
773  static int sum=0;
774  static int num=0;
775  static int bias=0;
776 
777  if(max<ad) max=ad;
778  sum+= ad>3 ? 1 : 0;
779  if(ad>3){
780  src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
781  }
782  if(y==4) bias+=d;
783  num++;
784  if(num%1000000 == 0){
785  av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
786  }
787  }
788  }
789 }
790 */
791 #else //TEMPLATE_PP_MMXEXT
792  const int l1= stride;
793  const int l2= stride + l1;
794  const int l3= stride + l2;
795  const int l4= stride + l3;
796  const int l5= stride + l4;
797  const int l6= stride + l5;
798  const int l7= stride + l6;
799  const int l8= stride + l7;
800 // const int l9= stride + l8;
801  int x;
802  src+= stride*3;
803  for(x=0; x<BLOCK_SIZE; x++){
804  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
805  if(FFABS(middleEnergy) < 8*c->QP){
806  const int q=(src[l4] - src[l5])/2;
807  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
808  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
809 
810  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
811  d= FFMAX(d, 0);
812 
813  d= (5*d + 32) >> 6;
814  d*= FFSIGN(-middleEnergy);
815 
816  if(q>0){
817  d = FFMAX(d, 0);
818  d = FFMIN(d, q);
819  }else{
820  d = FFMIN(d, 0);
821  d = FFMAX(d, q);
822  }
823 
824  src[l4]-= d;
825  src[l5]+= d;
826  }
827  src++;
828  }
829 #endif //TEMPLATE_PP_MMXEXT
830 }
831 #endif //TEMPLATE_PP_ALTIVEC
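/* Note on the MMXEXT path above: mm4 holds d/16, and the PAVGB chain
 * (d/16 -> d/32 -> (d+32)/64, then adding d/16 back) approximates the C
 * fallback's (5*d + 32) >> 6 before the result is clipped against |q|; both
 * paths therefore move l4 and l5 towards each other by at most |q| = |l4-l5|/2.
 */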
832 
833 #if !TEMPLATE_PP_ALTIVEC
834 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c, int leftborder, int rightborder, int topborder)
835 {
836 #if TEMPLATE_PP_MMXEXT && HAVE_7REGS
837  if (topborder)
838  return;
839  DECLARE_ALIGNED(8, uint64_t, tmp)[3];
840  __asm__ volatile(
841  "pxor %%mm6, %%mm6 \n\t"
842  "pcmpeqb %%mm7, %%mm7 \n\t"
843  "movq %2, %%mm0 \n\t"
844  "punpcklbw %%mm6, %%mm0 \n\t"
845  "psrlw $1, %%mm0 \n\t"
846  "psubw %%mm7, %%mm0 \n\t"
847  "packuswb %%mm0, %%mm0 \n\t"
848  "movq %%mm0, %3 \n\t"
849 
850  "lea (%0, %1), %%"FF_REG_a" \n\t"
851  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
852 
853 // 0 1 2 3 4 5 6 7 8 9
854 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
855 
856 #undef REAL_FIND_MIN_MAX
857 #undef FIND_MIN_MAX
858 #define REAL_FIND_MIN_MAX(addr)\
859  "movq " #addr ", %%mm0 \n\t"\
860  "pminub %%mm0, %%mm7 \n\t"\
861  "pmaxub %%mm0, %%mm6 \n\t"
862 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
863 
864 FIND_MIN_MAX((%%FF_REGa))
865 FIND_MIN_MAX((%%FF_REGa, %1))
866 FIND_MIN_MAX((%%FF_REGa, %1, 2))
867 FIND_MIN_MAX((%0, %1, 4))
868 FIND_MIN_MAX((%%FF_REGd))
869 FIND_MIN_MAX((%%FF_REGd, %1))
870 FIND_MIN_MAX((%%FF_REGd, %1, 2))
871 FIND_MIN_MAX((%0, %1, 8))
872 
873  "movq %%mm7, %%mm4 \n\t"
874  "psrlq $8, %%mm7 \n\t"
875  "pminub %%mm4, %%mm7 \n\t" // min of pixels
876  "pshufw $0xF9, %%mm7, %%mm4 \n\t"
877  "pminub %%mm4, %%mm7 \n\t" // min of pixels
878  "pshufw $0xFE, %%mm7, %%mm4 \n\t"
879  "pminub %%mm4, %%mm7 \n\t"
880 
881 
882  "movq %%mm6, %%mm4 \n\t"
883  "psrlq $8, %%mm6 \n\t"
884  "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
885  "pshufw $0xF9, %%mm6, %%mm4 \n\t"
886  "pmaxub %%mm4, %%mm6 \n\t"
887  "pshufw $0xFE, %%mm6, %%mm4 \n\t"
888  "pmaxub %%mm4, %%mm6 \n\t"
889  "movq %%mm6, %%mm0 \n\t" // max
890  "psubb %%mm7, %%mm6 \n\t" // max - min
891  "push %%"FF_REG_a" \n\t"
892  "movd %%mm6, %%eax \n\t"
893  "cmpb "MANGLE(deringThreshold)", %%al \n\t"
894  "pop %%"FF_REG_a" \n\t"
895  " jb 1f \n\t"
896  PAVGB(%%mm0, %%mm7) // a=(max + min)/2
897  "punpcklbw %%mm7, %%mm7 \n\t"
898  "punpcklbw %%mm7, %%mm7 \n\t"
899  "punpcklbw %%mm7, %%mm7 \n\t"
900  "movq %%mm7, (%4) \n\t"
901 
902  "movq (%0), %%mm0 \n\t" // L10
903  "movq %%mm0, %%mm1 \n\t" // L10
904  "movq %%mm0, %%mm2 \n\t" // L10
905  "psllq $8, %%mm1 \n\t"
906  "psrlq $8, %%mm2 \n\t"
907  "movd -4(%0), %%mm3 \n\t"
908  "movd 8(%0), %%mm4 \n\t"
909  "psrlq $24, %%mm3 \n\t"
910  "psllq $56, %%mm4 \n\t"
911  "por %%mm3, %%mm1 \n\t" // L00
912  "por %%mm4, %%mm2 \n\t" // L20
913  "movq %%mm1, %%mm3 \n\t" // L00
914  PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
915  PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
916  "psubusb %%mm7, %%mm0 \n\t"
917  "psubusb %%mm7, %%mm2 \n\t"
918  "psubusb %%mm7, %%mm3 \n\t"
919  "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
920  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
921  "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
922  "paddb %%mm2, %%mm0 \n\t"
923  "paddb %%mm3, %%mm0 \n\t"
924 
925  "movq (%%"FF_REG_a"), %%mm2 \n\t" // L11
926  "movq %%mm2, %%mm3 \n\t" // L11
927  "movq %%mm2, %%mm4 \n\t" // L11
928  "psllq $8, %%mm3 \n\t"
929  "psrlq $8, %%mm4 \n\t"
930  "movd -4(%%"FF_REG_a"), %%mm5 \n\t"
931  "movd 8(%%"FF_REG_a"), %%mm6 \n\t"
932  "psrlq $24, %%mm5 \n\t"
933  "psllq $56, %%mm6 \n\t"
934  "por %%mm5, %%mm3 \n\t" // L01
935  "por %%mm6, %%mm4 \n\t" // L21
936  "movq %%mm3, %%mm5 \n\t" // L01
937  PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
938  PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
939  "psubusb %%mm7, %%mm2 \n\t"
940  "psubusb %%mm7, %%mm4 \n\t"
941  "psubusb %%mm7, %%mm5 \n\t"
942  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
943  "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
944  "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
945  "paddb %%mm4, %%mm2 \n\t"
946  "paddb %%mm5, %%mm2 \n\t"
947 // 0, 2, 3, 1
948 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
949  "movq " #src ", " #sx " \n\t" /* src[0] */\
950  "movq " #sx ", " #lx " \n\t" /* src[0] */\
951  "movq " #sx ", " #t0 " \n\t" /* src[0] */\
952  "psllq $8, " #lx " \n\t"\
953  "psrlq $8, " #t0 " \n\t"\
954  "movd -4" #src ", " #t1 " \n\t"\
955  "psrlq $24, " #t1 " \n\t"\
956  "por " #t1 ", " #lx " \n\t" /* src[-1] */\
957  "movd 8" #src ", " #t1 " \n\t"\
958  "psllq $56, " #t1 " \n\t"\
959  "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
960  "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
961  PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
962  PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
963  PAVGB(lx, pplx) \
964  "movq " #lx ", 8(%4) \n\t"\
965  "movq (%4), " #lx " \n\t"\
966  "psubusb " #lx ", " #t1 " \n\t"\
967  "psubusb " #lx ", " #t0 " \n\t"\
968  "psubusb " #lx ", " #sx " \n\t"\
969  "movq "MANGLE(b00)", " #lx " \n\t"\
970  "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
971  "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
972  "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
973  "paddb " #t1 ", " #t0 " \n\t"\
974  "paddb " #t0 ", " #sx " \n\t"\
975 \
976  PAVGB(plx, pplx) /* filtered */\
977  "movq " #dst ", " #t0 " \n\t" /* dst */\
978  "movq " #t0 ", " #t1 " \n\t" /* dst */\
979  "psubusb %3, " #t0 " \n\t"\
980  "paddusb %3, " #t1 " \n\t"\
981  PMAXUB(t0, pplx)\
982  PMINUB(t1, pplx, t0)\
983  "paddb " #sx ", " #ppsx " \n\t"\
984  "paddb " #psx ", " #ppsx " \n\t"\
985  "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
986  "pand "MANGLE(b08)", " #ppsx " \n\t"\
987  "pcmpeqb " #lx ", " #ppsx " \n\t"\
988  "pand " #ppsx ", " #pplx " \n\t"\
989  "pandn " #dst ", " #ppsx " \n\t"\
990  "por " #pplx ", " #ppsx " \n\t"\
991  "movq " #ppsx ", " #dst " \n\t"\
992  "movq 8(%4), " #lx " \n\t"
993 
994 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
995  REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
996 /*
997 0000000
998 1111111
999 
1000 1111110
1001 1111101
1002 1111100
1003 1111011
1004 1111010
1005 1111001
1006 
1007 1111000
1008 1110111
1009 
1010 */
1011 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1012 DERING_CORE((%%FF_REGa) ,(%%FF_REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1013 DERING_CORE((%%FF_REGa, %1) ,(%%FF_REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1014 DERING_CORE((%%FF_REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1015 DERING_CORE((%0, %1, 4) ,(%%FF_REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1016 DERING_CORE((%%FF_REGd) ,(%%FF_REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1017 DERING_CORE((%%FF_REGd, %1) ,(%%FF_REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1018 DERING_CORE((%%FF_REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1019 DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1020 
1021  "1: \n\t"
1022  : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
1023  NAMED_CONSTRAINTS_ADD(deringThreshold,b00,b02,b08)
1024  : "%"FF_REG_a, "%"FF_REG_d
1025  );
1026 #else // HAVE_7REGS && TEMPLATE_PP_MMXEXT
1027  int y;
1028  int min=255;
1029  int max=0;
1030  int avg;
1031  uint8_t *p;
1032  int s[10];
1033  const int QP2= c->QP/2 + 1;
1034 
1035  src --;
1036  for(y=1; y<9; y++){
1037  int x;
1038  p= src + stride*y;
1039  for(x=1; x<9; x++){
1040  p++;
1041  if(*p > max) max= *p;
1042  if(*p < min) min= *p;
1043  }
1044  }
1045  avg= (min + max + 1)>>1;
1046 
1047  if(max - min <deringThreshold) return;
1048 
1049  s[0] = 0;
1050  for(y=topborder; y<10; y++){
1051  int t = 0;
1052 
1053  if(!leftborder && src[stride*y + 0] > avg) t+= 1;
1054  if(src[stride*y + 1] > avg) t+= 2;
1055  if(src[stride*y + 2] > avg) t+= 4;
1056  if(src[stride*y + 3] > avg) t+= 8;
1057  if(src[stride*y + 4] > avg) t+= 16;
1058  if(src[stride*y + 5] > avg) t+= 32;
1059  if(src[stride*y + 6] > avg) t+= 64;
1060  if(src[stride*y + 7] > avg) t+= 128;
1061  if(src[stride*y + 8] > avg) t+= 256;
1062  if(!rightborder && src[stride*y + 9] > avg) t+= 512;
1063 
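 /* Bits 0-9 of t now mark the pixels of this row that lie above avg. Mirroring
  * the complement into bits 16-25 lets the same mask logic handle the "below
  * avg" case, and t &= (t<<1) & (t>>1) keeps only pixels whose left and right
  * neighbours fall on the same side of avg as the pixel itself. Together with
  * the row-wise AND a few lines below, a bit survives only if the whole 3x3
  * neighbourhood sits on one side of avg; only those pixels get smoothed. */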
1064  t |= (~t)<<16;
1065  t &= (t<<1) & (t>>1);
1066  s[y] = t;
1067  }
1068 
1069  for(y=1; y<9; y++){
1070  int t = s[y-1] & s[y] & s[y+1];
1071  t|= t>>16;
1072  s[y-1]= t;
1073  }
1074 
1075  for(y=1; y<9; y++){
1076  int x;
1077  int t = s[y-1];
1078 
1079  p= src + stride*y + leftborder;
1080  for(x=1+leftborder; x<9-rightborder; x++){
1081  p++;
1082  if(t & (1<<x)){
1083  int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1084  +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1085  +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1086  f= (f + 8)>>4;
1087 
1088 #ifdef DEBUG_DERING_THRESHOLD
1089  __asm__ volatile("emms\n\t":);
1090  {
1091  static uint64_t numPixels=0;
1092  if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1093 // if((max-min)<20 || (max-min)*QP<200)
1094 // if((max-min)*QP < 500)
1095 // if(max-min<QP/2)
1096  if(max-min < 20){
1097  static int numSkipped=0;
1098  static int errorSum=0;
1099  static int worstQP=0;
1100  static int worstRange=0;
1101  static int worstDiff=0;
1102  int diff= (f - *p);
1103  int absDiff= FFABS(diff);
1104  int error= diff*diff;
1105 
1106  if(x==1 || x==8 || y==1 || y==8) continue;
1107 
1108  numSkipped++;
1109  if(absDiff > worstDiff){
1110  worstDiff= absDiff;
1111  worstQP= QP;
1112  worstRange= max-min;
1113  }
1114  errorSum+= error;
1115 
1116  if(1024LL*1024LL*1024LL % numSkipped == 0){
1117  av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1118  "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1119  (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
1120  worstDiff, (float)numSkipped/numPixels);
1121  }
1122  }
1123  }
1124 #endif
1125  if (*p + QP2 < f) *p= *p + QP2;
1126  else if(*p - QP2 > f) *p= *p - QP2;
1127  else *p=f;
1128  }
1129  }
1130  }
1131 #ifdef DEBUG_DERING_THRESHOLD
1132  if(max-min < 20){
1133  for(y=1; y<9; y++){
1134  int x;
1135  int t = 0;
1136  p= src + stride*y;
1137  for(x=1; x<9; x++){
1138  p++;
1139  *p = FFMIN(*p + 20, 255);
1140  }
1141  }
1142 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1143  }
1144 #endif
1145 #endif //TEMPLATE_PP_MMXEXT
1146 }
1147 #endif //TEMPLATE_PP_ALTIVEC
1148 
1149 /**
1150  * Deinterlace the given block by linearly interpolating every second line.
1151  * Will be called for every 8x8 block and can read & write from lines 4-15.
1152  * Lines 0-3 have already been passed through the deblock / dering filters, but can be read, too.
1153  * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
1154  */
1155 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1156 {
1157 #if TEMPLATE_PP_MMXEXT
1158  src+= 4*stride;
1159  __asm__ volatile(
1160  "lea (%0, %1), %%"FF_REG_a" \n\t"
1161  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
1162 // 0 1 2 3 4 5 6 7 8 9
1163 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
1164 
1165  "movq (%0), %%mm0 \n\t"
1166  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1167  PAVGB(%%mm1, %%mm0)
1168  "movq %%mm0, (%%"FF_REG_a") \n\t"
1169  "movq (%0, %1, 4), %%mm0 \n\t"
1170  PAVGB(%%mm0, %%mm1)
1171  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t"
1172  "movq (%%"FF_REG_c", %1), %%mm1 \n\t"
1173  PAVGB(%%mm1, %%mm0)
1174  "movq %%mm0, (%%"FF_REG_c") \n\t"
1175  "movq (%0, %1, 8), %%mm0 \n\t"
1176  PAVGB(%%mm0, %%mm1)
1177  "movq %%mm1, (%%"FF_REG_c", %1, 2) \n\t"
1178 
1179  : : "r" (src), "r" ((x86_reg)stride)
1180  : "%"FF_REG_a, "%"FF_REG_c
1181  );
1182 #else
1183  int a, b, x;
1184  src+= 4*stride;
1185 
1186  for(x=0; x<2; x++){
1187  a= *(uint32_t*)&src[stride*0];
1188  b= *(uint32_t*)&src[stride*2];
1189  *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1190  a= *(uint32_t*)&src[stride*4];
1191  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1192  b= *(uint32_t*)&src[stride*6];
1193  *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1194  a= *(uint32_t*)&src[stride*8];
1195  *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1196  src += 4;
1197  }
1198 #endif
1199 }
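/* The C fallback above averages four packed bytes at a time without overflow:
 * for 32-bit packed bytes x and y,
 *     (x | y) - (((x ^ y) & 0xFEFEFEFEUL) >> 1)   == per-byte (x + y + 1) >> 1
 *     (x & y) + (((x ^ y) & 0xFEFEFEFEUL) >> 1)   == per-byte (x + y) >> 1
 * (from x + y == 2*(x|y) - (x^y) == 2*(x&y) + (x^y)); the 0xFE mask keeps the
 * shifted bit of one byte from leaking into its neighbour. The round-down form
 * is used by deInterlaceBlendLinear() further below.
 */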
1200 
1201 /**
1202  * Deinterlace the given block by cubically interpolating every second line.
1203  * Will be called for every 8x8 block and can read & write from lines 4-15.
1204  * Lines 0-3 have already been passed through the deblock / dering filters, but can be read, too.
1205  * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
1206  * This filter will read lines 3-15 and write 7-13.
1207  */
1208 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1209 {
1210 #if TEMPLATE_PP_SSE2
1211  src+= stride*3;
1212  __asm__ volatile(
1213  "lea (%0, %1), %%"FF_REG_a" \n\t"
1214  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1215  "lea (%%"FF_REG_d", %1, 4), %%"FF_REG_c"\n\t"
1216  "add %1, %%"FF_REG_c" \n\t"
1217  "pxor %%xmm7, %%xmm7 \n\t"
1218 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1219  "movq " #a ", %%xmm0 \n\t"\
1220  "movq " #b ", %%xmm1 \n\t"\
1221  "movq " #d ", %%xmm2 \n\t"\
1222  "movq " #e ", %%xmm3 \n\t"\
1223  "pavgb %%xmm2, %%xmm1 \n\t"\
1224  "pavgb %%xmm3, %%xmm0 \n\t"\
1225  "punpcklbw %%xmm7, %%xmm0 \n\t"\
1226  "punpcklbw %%xmm7, %%xmm1 \n\t"\
1227  "psubw %%xmm1, %%xmm0 \n\t"\
1228  "psraw $3, %%xmm0 \n\t"\
1229  "psubw %%xmm0, %%xmm1 \n\t"\
1230  "packuswb %%xmm1, %%xmm1 \n\t"\
1231  "movlps %%xmm1, " #c " \n\t"
1232 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
1233 
1234 DEINT_CUBIC((%0) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd, %1))
1235 DEINT_CUBIC((%%FF_REGa, %1), (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%0, %1, 8))
1236 DEINT_CUBIC((%0, %1, 4) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGc))
1237 DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8) , (%%FF_REGd, %1, 4), (%%FF_REGc) , (%%FF_REGc, %1, 2))
1238 
1239  : : "r" (src), "r" ((x86_reg)stride)
1240  :
1241  XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",)
1242  "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c
1243  );
1244 #undef REAL_DEINT_CUBIC
1245 #else //TEMPLATE_PP_SSE2
1246  int x;
1247  src+= stride*3;
1248  for(x=0; x<8; x++){
1249  src[stride*3] = av_clip_uint8((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1250  src[stride*5] = av_clip_uint8((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1251  src[stride*7] = av_clip_uint8((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1252  src[stride*9] = av_clip_uint8((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1253  src++;
1254  }
1255 #endif //TEMPLATE_PP_SSE2
1256 }
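/* Equivalence note for the SSE2 path above: with b,d the two neighbouring
 * source lines and a,e the two outer ones, it computes
 *     avg(b,d) - ((avg(a,e) - avg(b,d)) >> 3)  ~=  (9*(b+d) - (a+e)) / 16
 * (up to pavgb rounding), i.e. the same cubic (-1,9,9,-1)/16 kernel as the C
 * fallback's av_clip_uint8((-a + 9*b + 9*d - e) >> 4); the final packuswb
 * supplies the clip.
 */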
1257 
1258 /**
1259  * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1260  * Will be called for every 8x8 block and can read & write from lines 4-15.
1261  * Lines 0-3 have already been passed through the deblock / dering filters, but can be read, too.
1262  * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
1263  * This filter will read lines 4-13 and write 5-11.
1264  */
1265 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1266 {
1267 #if TEMPLATE_PP_MMXEXT
1268  src+= stride*4;
1269  __asm__ volatile(
1270  "lea (%0, %1), %%"FF_REG_a" \n\t"
1271  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1272  "pxor %%mm7, %%mm7 \n\t"
1273  "movq (%2), %%mm0 \n\t"
1274 // 0 1 2 3 4 5 6 7 8 9 10
1275 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1276 
1277 #define REAL_DEINT_FF(a,b,c,d)\
1278  "movq " #a ", %%mm1 \n\t"\
1279  "movq " #b ", %%mm2 \n\t"\
1280  "movq " #c ", %%mm3 \n\t"\
1281  "movq " #d ", %%mm4 \n\t"\
1282  PAVGB(%%mm3, %%mm1) \
1283  PAVGB(%%mm4, %%mm0) \
1284  "movq %%mm0, %%mm3 \n\t"\
1285  "punpcklbw %%mm7, %%mm0 \n\t"\
1286  "punpckhbw %%mm7, %%mm3 \n\t"\
1287  "movq %%mm1, %%mm4 \n\t"\
1288  "punpcklbw %%mm7, %%mm1 \n\t"\
1289  "punpckhbw %%mm7, %%mm4 \n\t"\
1290  "psllw $2, %%mm1 \n\t"\
1291  "psllw $2, %%mm4 \n\t"\
1292  "psubw %%mm0, %%mm1 \n\t"\
1293  "psubw %%mm3, %%mm4 \n\t"\
1294  "movq %%mm2, %%mm5 \n\t"\
1295  "movq %%mm2, %%mm0 \n\t"\
1296  "punpcklbw %%mm7, %%mm2 \n\t"\
1297  "punpckhbw %%mm7, %%mm5 \n\t"\
1298  "paddw %%mm2, %%mm1 \n\t"\
1299  "paddw %%mm5, %%mm4 \n\t"\
1300  "psraw $2, %%mm1 \n\t"\
1301  "psraw $2, %%mm4 \n\t"\
1302  "packuswb %%mm4, %%mm1 \n\t"\
1303  "movq %%mm1, " #b " \n\t"\
1304 
1305 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
1306 
1307 DEINT_FF((%0) , (%%FF_REGa) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2))
1308 DEINT_FF((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1309 DEINT_FF((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2))
1310 DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1311 
1312  "movq %%mm0, (%2) \n\t"
1313  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
1314  : "%"FF_REG_a, "%"FF_REG_d
1315  );
1316 #else //TEMPLATE_PP_MMXEXT
1317  int x;
1318  src+= stride*4;
1319  for(x=0; x<8; x++){
1320  int t1= tmp[x];
1321  int t2= src[stride*1];
1322 
1323  src[stride*1]= av_clip_uint8((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1324  t1= src[stride*4];
1325  src[stride*3]= av_clip_uint8((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1326  t2= src[stride*6];
1327  src[stride*5]= av_clip_uint8((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1328  t1= src[stride*8];
1329  src[stride*7]= av_clip_uint8((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1330  tmp[x]= t1;
1331 
1332  src++;
1333  }
1334 #endif //TEMPLATE_PP_MMXEXT
1335 }
1336 
1337 /**
1338  * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
1339  * Will be called for every 8x8 block and can read & write from lines 4-15.
1340  * Lines 0-3 have already been passed through the deblock / dering filters, but can be read, too.
1341  * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
1342  * This filter will read lines 4-13 and write 4-11.
1343  */
1344 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1345 {
1346 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
1347  src+= stride*4;
1348  __asm__ volatile(
1349  "lea (%0, %1), %%"FF_REG_a" \n\t"
1350  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1351  "pxor %%mm7, %%mm7 \n\t"
1352  "movq (%2), %%mm0 \n\t"
1353  "movq (%3), %%mm1 \n\t"
1354 // 0 1 2 3 4 5 6 7 8 9 10
1355 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1356 
1357 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1358  "movq " #a ", %%mm2 \n\t"\
1359  "movq " #b ", %%mm3 \n\t"\
1360  "movq " #c ", %%mm4 \n\t"\
1361  PAVGB(t2, %%mm3) \
1362  PAVGB(t1, %%mm4) \
1363  "movq %%mm2, %%mm5 \n\t"\
1364  "movq %%mm2, " #t1 " \n\t"\
1365  "punpcklbw %%mm7, %%mm2 \n\t"\
1366  "punpckhbw %%mm7, %%mm5 \n\t"\
1367  "movq %%mm2, %%mm6 \n\t"\
1368  "paddw %%mm2, %%mm2 \n\t"\
1369  "paddw %%mm6, %%mm2 \n\t"\
1370  "movq %%mm5, %%mm6 \n\t"\
1371  "paddw %%mm5, %%mm5 \n\t"\
1372  "paddw %%mm6, %%mm5 \n\t"\
1373  "movq %%mm3, %%mm6 \n\t"\
1374  "punpcklbw %%mm7, %%mm3 \n\t"\
1375  "punpckhbw %%mm7, %%mm6 \n\t"\
1376  "paddw %%mm3, %%mm3 \n\t"\
1377  "paddw %%mm6, %%mm6 \n\t"\
1378  "paddw %%mm3, %%mm2 \n\t"\
1379  "paddw %%mm6, %%mm5 \n\t"\
1380  "movq %%mm4, %%mm6 \n\t"\
1381  "punpcklbw %%mm7, %%mm4 \n\t"\
1382  "punpckhbw %%mm7, %%mm6 \n\t"\
1383  "psubw %%mm4, %%mm2 \n\t"\
1384  "psubw %%mm6, %%mm5 \n\t"\
1385  "psraw $2, %%mm2 \n\t"\
1386  "psraw $2, %%mm5 \n\t"\
1387  "packuswb %%mm5, %%mm2 \n\t"\
1388  "movq %%mm2, " #a " \n\t"\
1389 
1390 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
1391 
1392 DEINT_L5(%%mm0, %%mm1, (%0) , (%%FF_REGa) , (%%FF_REGa, %1) )
1393 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa) , (%%FF_REGa, %1) , (%%FF_REGa, %1, 2))
1394 DEINT_L5(%%mm0, %%mm1, (%%FF_REGa, %1) , (%%FF_REGa, %1, 2), (%0, %1, 4) )
1395 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1396 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1) )
1397 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd) , (%%FF_REGd, %1) , (%%FF_REGd, %1, 2))
1398 DEINT_L5(%%mm0, %%mm1, (%%FF_REGd, %1) , (%%FF_REGd, %1, 2), (%0, %1, 8) )
1399 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1400 
1401  "movq %%mm0, (%2) \n\t"
1402  "movq %%mm1, (%3) \n\t"
1403  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1404  : "%"FF_REG_a, "%"FF_REG_d
1405  );
1406 #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
1407  int x;
1408  src+= stride*4;
1409  for(x=0; x<8; x++){
1410  int t1= tmp[x];
1411  int t2= tmp2[x];
1412  int t3= src[0];
1413 
1414  src[stride*0]= av_clip_uint8((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1415  t1= src[stride*1];
1416  src[stride*1]= av_clip_uint8((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1417  t2= src[stride*2];
1418  src[stride*2]= av_clip_uint8((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1419  t3= src[stride*3];
1420  src[stride*3]= av_clip_uint8((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1421  t1= src[stride*4];
1422  src[stride*4]= av_clip_uint8((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1423  t2= src[stride*5];
1424  src[stride*5]= av_clip_uint8((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1425  t3= src[stride*6];
1426  src[stride*6]= av_clip_uint8((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1427  t1= src[stride*7];
1428  src[stride*7]= av_clip_uint8((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1429 
1430  tmp[x]= t3;
1431  tmp2[x]= t1;
1432 
1433  src++;
1434  }
1435 #endif // TEMPLATE_PP_MMXEXT && HAVE_6REGS
1436 }
1437 
1438 /**
1439  * Deinterlace the given block by filtering all lines with a (1 2 1) filter.
1440  * Will be called for every 8x8 block and can read & write from lines 4-15.
1441  * Lines 0-3 have already been passed through the deblock / dering filters, but can be read, too.
1442  * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
1443  * This filter will read lines 4-13 and write 4-11.
1444  */
1445 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1446 {
1447 #if TEMPLATE_PP_MMXEXT
1448  src+= 4*stride;
1449  __asm__ volatile(
1450  "lea (%0, %1), %%"FF_REG_a" \n\t"
1451  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1452 // 0 1 2 3 4 5 6 7 8 9
1453 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1454 
1455  "movq (%2), %%mm0 \n\t" // L0
1456  "movq (%%"FF_REG_a"), %%mm1 \n\t" // L2
1457  PAVGB(%%mm1, %%mm0) // L0+L2
1458  "movq (%0), %%mm2 \n\t" // L1
1459  PAVGB(%%mm2, %%mm0)
1460  "movq %%mm0, (%0) \n\t"
1461  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // L3
1462  PAVGB(%%mm0, %%mm2) // L1+L3
1463  PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1464  "movq %%mm2, (%%"FF_REG_a") \n\t"
1465  "movq (%%"FF_REG_a", %1, 2), %%mm2 \n\t" // L4
1466  PAVGB(%%mm2, %%mm1) // L2+L4
1467  PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1468  "movq %%mm1, (%%"FF_REG_a", %1) \n\t"
1469  "movq (%0, %1, 4), %%mm1 \n\t" // L5
1470  PAVGB(%%mm1, %%mm0) // L3+L5
1471  PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1472  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
1473  "movq (%%"FF_REG_d"), %%mm0 \n\t" // L6
1474  PAVGB(%%mm0, %%mm2) // L4+L6
1475  PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1476  "movq %%mm2, (%0, %1, 4) \n\t"
1477  "movq (%%"FF_REG_d", %1), %%mm2 \n\t" // L7
1478  PAVGB(%%mm2, %%mm1) // L5+L7
1479  PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1480  "movq %%mm1, (%%"FF_REG_d") \n\t"
1481  "movq (%%"FF_REG_d", %1, 2), %%mm1 \n\t" // L8
1482  PAVGB(%%mm1, %%mm0) // L6+L8
1483  PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1484  "movq %%mm0, (%%"FF_REG_d", %1) \n\t"
1485  "movq (%0, %1, 8), %%mm0 \n\t" // L9
1486  PAVGB(%%mm0, %%mm2) // L7+L9
1487  PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1488  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
1489  "movq %%mm1, (%2) \n\t"
1490 
1491  : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
1492  : "%"FF_REG_a, "%"FF_REG_d
1493  );
1494 #else //TEMPLATE_PP_MMXEXT
1495  int a, b, c, x;
1496  src+= 4*stride;
1497 
1498  for(x=0; x<2; x++){
1499  a= *(uint32_t*)&tmp[stride*0];
1500  b= *(uint32_t*)&src[stride*0];
1501  c= *(uint32_t*)&src[stride*1];
1502  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1503  *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1504 
1505  a= *(uint32_t*)&src[stride*2];
1506  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1507  *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1508 
1509  b= *(uint32_t*)&src[stride*3];
1510  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1511  *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1512 
1513  c= *(uint32_t*)&src[stride*4];
1514  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1515  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1516 
1517  a= *(uint32_t*)&src[stride*5];
1518  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1519  *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1520 
1521  b= *(uint32_t*)&src[stride*6];
1522  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1523  *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1524 
1525  c= *(uint32_t*)&src[stride*7];
1526  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1527  *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1528 
1529  a= *(uint32_t*)&src[stride*8];
1530  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1531  *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1532 
1533  *(uint32_t*)&tmp[stride*0]= c;
1534  src += 4;
1535  tmp += 4;
1536  }
1537 #endif //TEMPLATE_PP_MMXEXT
1538 }
1539 
1540 /**
1541  * Deinterlace the given block by applying a median filter to every second line.
1542  * Will be called for every 8x8 block and can read & write from lines 4-15.
1543  * Lines 0-3 have already been passed through the deblock / dering filters, but can be read, too.
1544  * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
1545  */
1546 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1547 {
1548 #if TEMPLATE_PP_MMXEXT
1549  src+= 4*stride;
1550  __asm__ volatile(
1551  "lea (%0, %1), %%"FF_REG_a" \n\t"
1552  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1553 // 0 1 2 3 4 5 6 7 8 9
1554 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1555 
1556  "movq (%0), %%mm0 \n\t"
1557  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
1558  "movq (%%"FF_REG_a"), %%mm1 \n\t"
1559  "movq %%mm0, %%mm3 \n\t"
1560  "pmaxub %%mm1, %%mm0 \n\t"
1561  "pminub %%mm3, %%mm1 \n\t"
1562  "pmaxub %%mm2, %%mm1 \n\t"
1563  "pminub %%mm1, %%mm0 \n\t"
1564  "movq %%mm0, (%%"FF_REG_a") \n\t"
1565 
1566  "movq (%0, %1, 4), %%mm0 \n\t"
1567  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t"
1568  "movq %%mm2, %%mm3 \n\t"
1569  "pmaxub %%mm1, %%mm2 \n\t"
1570  "pminub %%mm3, %%mm1 \n\t"
1571  "pmaxub %%mm0, %%mm1 \n\t"
1572  "pminub %%mm1, %%mm2 \n\t"
1573  "movq %%mm2, (%%"FF_REG_a", %1, 2) \n\t"
1574 
1575  "movq (%%"FF_REG_d"), %%mm2 \n\t"
1576  "movq (%%"FF_REG_d", %1), %%mm1 \n\t"
1577  "movq %%mm2, %%mm3 \n\t"
1578  "pmaxub %%mm0, %%mm2 \n\t"
1579  "pminub %%mm3, %%mm0 \n\t"
1580  "pmaxub %%mm1, %%mm0 \n\t"
1581  "pminub %%mm0, %%mm2 \n\t"
1582  "movq %%mm2, (%%"FF_REG_d") \n\t"
1583 
1584  "movq (%%"FF_REG_d", %1, 2), %%mm2 \n\t"
1585  "movq (%0, %1, 8), %%mm0 \n\t"
1586  "movq %%mm2, %%mm3 \n\t"
1587  "pmaxub %%mm0, %%mm2 \n\t"
1588  "pminub %%mm3, %%mm0 \n\t"
1589  "pmaxub %%mm1, %%mm0 \n\t"
1590  "pminub %%mm0, %%mm2 \n\t"
1591  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
1592 
1593 
1594  : : "r" (src), "r" ((x86_reg)stride)
1595  : "%"FF_REG_a, "%"FF_REG_d
1596  );
1597 
1598 #else //TEMPLATE_PP_MMXEXT
1599  int x, y;
1600  src+= 4*stride;
1601  // FIXME - there should be a way to do a few columns in parallel like w/mmx
1602  for(x=0; x<8; x++){
1603  uint8_t *colsrc = src;
1604  for (y=0; y<4; y++){
1605  int a, b, c, d, e, f;
1606  a = colsrc[0 ];
1607  b = colsrc[stride ];
1608  c = colsrc[stride*2];
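 // The next four lines compute a branchless median of a, b and c: d, e, f are
 // all-ones / all-zeros sign masks of the pairwise differences, and the AND/OR
 // combination selects whichever of the three values is the median, which then
 // replaces the middle pixel.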
1609  d = (a-b)>>31;
1610  e = (b-c)>>31;
1611  f = (c-a)>>31;
1612  colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
1613  colsrc += stride*2;
1614  }
1615  src++;
1616  }
1617 #endif //TEMPLATE_PP_MMXEXT
1618 }
1619 
1620 #if TEMPLATE_PP_MMX
1621 /**
1622  * Transpose and shift the given 8x8 Block into dst1 and dst2.
1623  */
1624 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, const uint8_t *src, int srcStride)
1625 {
1626  __asm__(
1627  "lea (%0, %1), %%"FF_REG_a" \n\t"
1628 // 0 1 2 3 4 5 6 7 8 9
1629 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1630  "movq (%0), %%mm0 \n\t" // 12345678
1631  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
1632  "movq %%mm0, %%mm2 \n\t" // 12345678
1633  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1634  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1635 
1636  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1637  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
1638  "movq %%mm1, %%mm4 \n\t"
1639  "punpcklbw %%mm3, %%mm1 \n\t"
1640  "punpckhbw %%mm3, %%mm4 \n\t"
1641 
1642  "movq %%mm0, %%mm3 \n\t"
1643  "punpcklwd %%mm1, %%mm0 \n\t"
1644  "punpckhwd %%mm1, %%mm3 \n\t"
1645  "movq %%mm2, %%mm1 \n\t"
1646  "punpcklwd %%mm4, %%mm2 \n\t"
1647  "punpckhwd %%mm4, %%mm1 \n\t"
1648 
1649  "movd %%mm0, 128(%2) \n\t"
1650  "psrlq $32, %%mm0 \n\t"
1651  "movd %%mm0, 144(%2) \n\t"
1652  "movd %%mm3, 160(%2) \n\t"
1653  "psrlq $32, %%mm3 \n\t"
1654  "movd %%mm3, 176(%2) \n\t"
1655  "movd %%mm3, 48(%3) \n\t"
1656  "movd %%mm2, 192(%2) \n\t"
1657  "movd %%mm2, 64(%3) \n\t"
1658  "psrlq $32, %%mm2 \n\t"
1659  "movd %%mm2, 80(%3) \n\t"
1660  "movd %%mm1, 96(%3) \n\t"
1661  "psrlq $32, %%mm1 \n\t"
1662  "movd %%mm1, 112(%3) \n\t"
1663 
1664  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_a"\n\t"
1665 
1666  "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
1667  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
1668  "movq %%mm0, %%mm2 \n\t" // 12345678
1669  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1670  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1671 
1672  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1673  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
1674  "movq %%mm1, %%mm4 \n\t"
1675  "punpcklbw %%mm3, %%mm1 \n\t"
1676  "punpckhbw %%mm3, %%mm4 \n\t"
1677 
1678  "movq %%mm0, %%mm3 \n\t"
1679  "punpcklwd %%mm1, %%mm0 \n\t"
1680  "punpckhwd %%mm1, %%mm3 \n\t"
1681  "movq %%mm2, %%mm1 \n\t"
1682  "punpcklwd %%mm4, %%mm2 \n\t"
1683  "punpckhwd %%mm4, %%mm1 \n\t"
1684 
1685  "movd %%mm0, 132(%2) \n\t"
1686  "psrlq $32, %%mm0 \n\t"
1687  "movd %%mm0, 148(%2) \n\t"
1688  "movd %%mm3, 164(%2) \n\t"
1689  "psrlq $32, %%mm3 \n\t"
1690  "movd %%mm3, 180(%2) \n\t"
1691  "movd %%mm3, 52(%3) \n\t"
1692  "movd %%mm2, 196(%2) \n\t"
1693  "movd %%mm2, 68(%3) \n\t"
1694  "psrlq $32, %%mm2 \n\t"
1695  "movd %%mm2, 84(%3) \n\t"
1696  "movd %%mm1, 100(%3) \n\t"
1697  "psrlq $32, %%mm1 \n\t"
1698  "movd %%mm1, 116(%3) \n\t"
1699 
1700 
1701  :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
1702  : "%"FF_REG_a
1703  );
1704 }
1705 
1706 /**
1707  * Transpose the given 8x8 block.
1708  */
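// Note: this undoes the step above - it reads the 16-byte-stride temp buffer,
// transposes it with the same interleave pattern, and writes each output line
// back to dst (dstStride apart) as two 4-byte halves.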
1709 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, const uint8_t *src)
1710 {
1711  __asm__(
1712  "lea (%0, %1), %%"FF_REG_a" \n\t"
1713  "lea (%%"FF_REG_a",%1,4), %%"FF_REG_d" \n\t"
1714 // 0 1 2 3 4 5 6 7 8 9
1715 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1716  "movq (%2), %%mm0 \n\t" // 12345678
1717  "movq 16(%2), %%mm1 \n\t" // abcdefgh
1718  "movq %%mm0, %%mm2 \n\t" // 12345678
1719  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1720  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1721 
1722  "movq 32(%2), %%mm1 \n\t"
1723  "movq 48(%2), %%mm3 \n\t"
1724  "movq %%mm1, %%mm4 \n\t"
1725  "punpcklbw %%mm3, %%mm1 \n\t"
1726  "punpckhbw %%mm3, %%mm4 \n\t"
1727 
1728  "movq %%mm0, %%mm3 \n\t"
1729  "punpcklwd %%mm1, %%mm0 \n\t"
1730  "punpckhwd %%mm1, %%mm3 \n\t"
1731  "movq %%mm2, %%mm1 \n\t"
1732  "punpcklwd %%mm4, %%mm2 \n\t"
1733  "punpckhwd %%mm4, %%mm1 \n\t"
1734 
1735  "movd %%mm0, (%0) \n\t"
1736  "psrlq $32, %%mm0 \n\t"
1737  "movd %%mm0, (%%"FF_REG_a") \n\t"
1738  "movd %%mm3, (%%"FF_REG_a", %1) \n\t"
1739  "psrlq $32, %%mm3 \n\t"
1740  "movd %%mm3, (%%"FF_REG_a", %1, 2) \n\t"
1741  "movd %%mm2, (%0, %1, 4) \n\t"
1742  "psrlq $32, %%mm2 \n\t"
1743  "movd %%mm2, (%%"FF_REG_d") \n\t"
1744  "movd %%mm1, (%%"FF_REG_d", %1) \n\t"
1745  "psrlq $32, %%mm1 \n\t"
1746  "movd %%mm1, (%%"FF_REG_d", %1, 2) \n\t"
1747 
1748 
1749  "movq 64(%2), %%mm0 \n\t" // 12345678
1750  "movq 80(%2), %%mm1 \n\t" // abcdefgh
1751  "movq %%mm0, %%mm2 \n\t" // 12345678
1752  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1753  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1754 
1755  "movq 96(%2), %%mm1 \n\t"
1756  "movq 112(%2), %%mm3 \n\t"
1757  "movq %%mm1, %%mm4 \n\t"
1758  "punpcklbw %%mm3, %%mm1 \n\t"
1759  "punpckhbw %%mm3, %%mm4 \n\t"
1760 
1761  "movq %%mm0, %%mm3 \n\t"
1762  "punpcklwd %%mm1, %%mm0 \n\t"
1763  "punpckhwd %%mm1, %%mm3 \n\t"
1764  "movq %%mm2, %%mm1 \n\t"
1765  "punpcklwd %%mm4, %%mm2 \n\t"
1766  "punpckhwd %%mm4, %%mm1 \n\t"
1767 
1768  "movd %%mm0, 4(%0) \n\t"
1769  "psrlq $32, %%mm0 \n\t"
1770  "movd %%mm0, 4(%%"FF_REG_a") \n\t"
1771  "movd %%mm3, 4(%%"FF_REG_a", %1) \n\t"
1772  "psrlq $32, %%mm3 \n\t"
1773  "movd %%mm3, 4(%%"FF_REG_a", %1, 2) \n\t"
1774  "movd %%mm2, 4(%0, %1, 4) \n\t"
1775  "psrlq $32, %%mm2 \n\t"
1776  "movd %%mm2, 4(%%"FF_REG_d") \n\t"
1777  "movd %%mm1, 4(%%"FF_REG_d", %1) \n\t"
1778  "psrlq $32, %%mm1 \n\t"
1779  "movd %%mm1, 4(%%"FF_REG_d", %1, 2) \n\t"
1780 
1781  :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
1782  : "%"FF_REG_a, "%"FF_REG_d
1783  );
1784 }
1785 #endif //TEMPLATE_PP_MMX
1786 //static long test=0;
1787 
1788 #if !TEMPLATE_PP_ALTIVEC
1789 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
1790  uint8_t *tempBlurred, uint32_t *tempBlurredPast, const int *maxNoise)
1791 {
1792  // to save a register (FIXME do this outside of the loops)
1793  tempBlurredPast[127]= maxNoise[0];
1794  tempBlurredPast[128]= maxNoise[1];
1795  tempBlurredPast[129]= maxNoise[2];
1796 
1797 #define FAST_L2_DIFF
1798 //#define L1_DIFF // you should change the thresholds too if you try that one
1799 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
1800  __asm__ volatile(
1801  "lea (%2, %2, 2), %%"FF_REG_a" \n\t" // 3*stride
1802  "lea (%2, %2, 4), %%"FF_REG_d" \n\t" // 5*stride
1803  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1804 // 0 1 2 3 4 5 6 7 8 9
1805 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
1806 //FIXME reorder?
1807 #ifdef L1_DIFF //needs mmx2
1808  "movq (%0), %%mm0 \n\t" // L0
1809  "psadbw (%1), %%mm0 \n\t" // |L0-R0|
1810  "movq (%0, %2), %%mm1 \n\t" // L1
1811  "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
1812  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1813  "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
1814  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1815  "psadbw (%1, %%"FF_REG_a"), %%mm3 \n\t" // |L3-R3|
1816 
1817  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1818  "paddw %%mm1, %%mm0 \n\t"
1819  "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
1820  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1821  "paddw %%mm2, %%mm0 \n\t"
1822  "psadbw (%1, %%"FF_REG_d"), %%mm5 \n\t" // |L5-R5|
1823  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1824  "paddw %%mm3, %%mm0 \n\t"
1825  "psadbw (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // |L6-R6|
1826  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1827  "paddw %%mm4, %%mm0 \n\t"
1828  "psadbw (%1, %%"FF_REG_c"), %%mm7 \n\t" // |L7-R7|
1829  "paddw %%mm5, %%mm6 \n\t"
1830  "paddw %%mm7, %%mm6 \n\t"
1831  "paddw %%mm6, %%mm0 \n\t"
1832 #else //L1_DIFF
1833 #if defined (FAST_L2_DIFF)
1834  "pcmpeqb %%mm7, %%mm7 \n\t"
1835  "movq "MANGLE(b80)", %%mm6 \n\t"
1836  "pxor %%mm0, %%mm0 \n\t"
1837 #define REAL_L2_DIFF_CORE(a, b)\
1838  "movq " #a ", %%mm5 \n\t"\
1839  "movq " #b ", %%mm2 \n\t"\
1840  "pxor %%mm7, %%mm2 \n\t"\
1841  PAVGB(%%mm2, %%mm5)\
1842  "paddb %%mm6, %%mm5 \n\t"\
1843  "movq %%mm5, %%mm2 \n\t"\
1844  "psllw $8, %%mm5 \n\t"\
1845  "pmaddwd %%mm5, %%mm5 \n\t"\
1846  "pmaddwd %%mm2, %%mm2 \n\t"\
1847  "paddd %%mm2, %%mm5 \n\t"\
1848  "psrld $14, %%mm5 \n\t"\
1849  "paddd %%mm5, %%mm0 \n\t"
1850 
1851 #else //defined (FAST_L2_DIFF)
1852  "pxor %%mm7, %%mm7 \n\t"
1853  "pxor %%mm0, %%mm0 \n\t"
1854 #define REAL_L2_DIFF_CORE(a, b)\
1855  "movq " #a ", %%mm5 \n\t"\
1856  "movq " #b ", %%mm2 \n\t"\
1857  "movq %%mm5, %%mm1 \n\t"\
1858  "movq %%mm2, %%mm3 \n\t"\
1859  "punpcklbw %%mm7, %%mm5 \n\t"\
1860  "punpckhbw %%mm7, %%mm1 \n\t"\
1861  "punpcklbw %%mm7, %%mm2 \n\t"\
1862  "punpckhbw %%mm7, %%mm3 \n\t"\
1863  "psubw %%mm2, %%mm5 \n\t"\
1864  "psubw %%mm3, %%mm1 \n\t"\
1865  "pmaddwd %%mm5, %%mm5 \n\t"\
1866  "pmaddwd %%mm1, %%mm1 \n\t"\
1867  "paddd %%mm1, %%mm5 \n\t"\
1868  "paddd %%mm5, %%mm0 \n\t"
1869 
1870 #endif //defined (FAST_L2_DIFF)
1871 
1872 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
1873 
1874 L2_DIFF_CORE((%0) , (%1))
1875 L2_DIFF_CORE((%0, %2) , (%1, %2))
1876 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
1877 L2_DIFF_CORE((%0, %%FF_REGa) , (%1, %%FF_REGa))
1878 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
1879 L2_DIFF_CORE((%0, %%FF_REGd) , (%1, %%FF_REGd))
1880 L2_DIFF_CORE((%0, %%FF_REGa,2), (%1, %%FF_REGa,2))
1881 L2_DIFF_CORE((%0, %%FF_REGc) , (%1, %%FF_REGc))
1882 
1883 #endif //L1_DIFF
1884 
1885  "movq %%mm0, %%mm4 \n\t"
1886  "psrlq $32, %%mm0 \n\t"
1887  "paddd %%mm0, %%mm4 \n\t"
1888  "movd %%mm4, %%ecx \n\t"
1889  "shll $2, %%ecx \n\t"
1890  "mov %3, %%"FF_REG_d" \n\t"
1891  "addl -4(%%"FF_REG_d"), %%ecx \n\t"
1892  "addl 4(%%"FF_REG_d"), %%ecx \n\t"
1893  "addl -1024(%%"FF_REG_d"), %%ecx \n\t"
1894  "addl $4, %%ecx \n\t"
1895  "addl 1024(%%"FF_REG_d"), %%ecx \n\t"
1896  "shrl $3, %%ecx \n\t"
1897  "movl %%ecx, (%%"FF_REG_d") \n\t"
1898 
1899 // "mov %3, %%"FF_REG_c" \n\t"
1900 // "mov %%"FF_REG_c", test \n\t"
1901 // "jmp 4f \n\t"
1902  "cmpl 512(%%"FF_REG_d"), %%ecx \n\t"
1903  " jb 2f \n\t"
1904  "cmpl 516(%%"FF_REG_d"), %%ecx \n\t"
1905  " jb 1f \n\t"
1906 
1907  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1908  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1909  "movq (%0), %%mm0 \n\t" // L0
1910  "movq (%0, %2), %%mm1 \n\t" // L1
1911  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1912  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1913  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1914  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1915  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1916  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1917  "movq %%mm0, (%1) \n\t" // L0
1918  "movq %%mm1, (%1, %2) \n\t" // L1
1919  "movq %%mm2, (%1, %2, 2) \n\t" // L2
1920  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // L3
1921  "movq %%mm4, (%1, %2, 4) \n\t" // L4
1922  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // L5
1923  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // L6
1924  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // L7
1925  "jmp 4f \n\t"
1926 
1927  "1: \n\t"
1928  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1929  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1930  "movq (%0), %%mm0 \n\t" // L0
1931  PAVGB((%1), %%mm0) // L0
1932  "movq (%0, %2), %%mm1 \n\t" // L1
1933  PAVGB((%1, %2), %%mm1) // L1
1934  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1935  PAVGB((%1, %2, 2), %%mm2) // L2
1936  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1937  PAVGB((%1, %%FF_REGa), %%mm3) // L3
1938  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1939  PAVGB((%1, %2, 4), %%mm4) // L4
1940  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1941  PAVGB((%1, %%FF_REGd), %%mm5) // L5
1942  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1943  PAVGB((%1, %%FF_REGa, 2), %%mm6) // L6
1944  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1945  PAVGB((%1, %%FF_REGc), %%mm7) // L7
1946  "movq %%mm0, (%1) \n\t" // R0
1947  "movq %%mm1, (%1, %2) \n\t" // R1
1948  "movq %%mm2, (%1, %2, 2) \n\t" // R2
1949  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
1950  "movq %%mm4, (%1, %2, 4) \n\t" // R4
1951  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // R5
1952  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // R6
1953  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // R7
1954  "movq %%mm0, (%0) \n\t" // L0
1955  "movq %%mm1, (%0, %2) \n\t" // L1
1956  "movq %%mm2, (%0, %2, 2) \n\t" // L2
1957  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
1958  "movq %%mm4, (%0, %2, 4) \n\t" // L4
1959  "movq %%mm5, (%0, %%"FF_REG_d") \n\t" // L5
1960  "movq %%mm6, (%0, %%"FF_REG_a", 2) \n\t" // L6
1961  "movq %%mm7, (%0, %%"FF_REG_c") \n\t" // L7
1962  "jmp 4f \n\t"
1963 
1964  "2: \n\t"
1965  "cmpl 508(%%"FF_REG_d"), %%ecx \n\t"
1966  " jb 3f \n\t"
1967 
1968  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1969  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1970  "movq (%0), %%mm0 \n\t" // L0
1971  "movq (%0, %2), %%mm1 \n\t" // L1
1972  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1973  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1974  "movq (%1), %%mm4 \n\t" // R0
1975  "movq (%1, %2), %%mm5 \n\t" // R1
1976  "movq (%1, %2, 2), %%mm6 \n\t" // R2
1977  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
1978  PAVGB(%%mm4, %%mm0)
1979  PAVGB(%%mm5, %%mm1)
1980  PAVGB(%%mm6, %%mm2)
1981  PAVGB(%%mm7, %%mm3)
1982  PAVGB(%%mm4, %%mm0)
1983  PAVGB(%%mm5, %%mm1)
1984  PAVGB(%%mm6, %%mm2)
1985  PAVGB(%%mm7, %%mm3)
1986  "movq %%mm0, (%1) \n\t" // R0
1987  "movq %%mm1, (%1, %2) \n\t" // R1
1988  "movq %%mm2, (%1, %2, 2) \n\t" // R2
1989  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
1990  "movq %%mm0, (%0) \n\t" // L0
1991  "movq %%mm1, (%0, %2) \n\t" // L1
1992  "movq %%mm2, (%0, %2, 2) \n\t" // L2
1993  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
1994 
1995  "movq (%0, %2, 4), %%mm0 \n\t" // L4
1996  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
1997  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
1998  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
1999  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2000  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2001  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2002  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2003  PAVGB(%%mm4, %%mm0)
2004  PAVGB(%%mm5, %%mm1)
2005  PAVGB(%%mm6, %%mm2)
2006  PAVGB(%%mm7, %%mm3)
2007  PAVGB(%%mm4, %%mm0)
2008  PAVGB(%%mm5, %%mm1)
2009  PAVGB(%%mm6, %%mm2)
2010  PAVGB(%%mm7, %%mm3)
2011  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2012  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2013  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2014  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2015  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2016  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2017  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2018  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2019  "jmp 4f \n\t"
2020 
2021  "3: \n\t"
2022  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2023  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2024  "movq (%0), %%mm0 \n\t" // L0
2025  "movq (%0, %2), %%mm1 \n\t" // L1
2026  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2027  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2028  "movq (%1), %%mm4 \n\t" // R0
2029  "movq (%1, %2), %%mm5 \n\t" // R1
2030  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2031  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
2032  PAVGB(%%mm4, %%mm0)
2033  PAVGB(%%mm5, %%mm1)
2034  PAVGB(%%mm6, %%mm2)
2035  PAVGB(%%mm7, %%mm3)
2036  PAVGB(%%mm4, %%mm0)
2037  PAVGB(%%mm5, %%mm1)
2038  PAVGB(%%mm6, %%mm2)
2039  PAVGB(%%mm7, %%mm3)
2040  PAVGB(%%mm4, %%mm0)
2041  PAVGB(%%mm5, %%mm1)
2042  PAVGB(%%mm6, %%mm2)
2043  PAVGB(%%mm7, %%mm3)
2044  "movq %%mm0, (%1) \n\t" // R0
2045  "movq %%mm1, (%1, %2) \n\t" // R1
2046  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2047  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2048  "movq %%mm0, (%0) \n\t" // L0
2049  "movq %%mm1, (%0, %2) \n\t" // L1
2050  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2051  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2052 
2053  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2054  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
2055  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
2056  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
2057  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2058  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2059  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2060  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2061  PAVGB(%%mm4, %%mm0)
2062  PAVGB(%%mm5, %%mm1)
2063  PAVGB(%%mm6, %%mm2)
2064  PAVGB(%%mm7, %%mm3)
2065  PAVGB(%%mm4, %%mm0)
2066  PAVGB(%%mm5, %%mm1)
2067  PAVGB(%%mm6, %%mm2)
2068  PAVGB(%%mm7, %%mm3)
2069  PAVGB(%%mm4, %%mm0)
2070  PAVGB(%%mm5, %%mm1)
2071  PAVGB(%%mm6, %%mm2)
2072  PAVGB(%%mm7, %%mm3)
2073  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2074  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2075  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2076  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2077  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2078  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2079  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2080  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2081 
2082  "4: \n\t"
2083 
2084  :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
2086  : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c, "memory"
2087  );
2088 #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2089 {
2090  int y;
2091  int d=0;
2092 // int sysd=0;
2093  int i;
2094 
2095  for(y=0; y<8; y++){
2096  int x;
2097  for(x=0; x<8; x++){
2098  int ref= tempBlurred[ x + y*stride ];
2099  int cur= src[ x + y*stride ];
2100  int d1=ref - cur;
2101 // if(x==0 || x==7) d1+= d1>>1;
2102 // if(y==0 || y==7) d1+= d1>>1;
2103 // d+= FFABS(d1);
2104  d+= d1*d1;
2105 // sysd+= d1;
2106  }
2107  }
2108  i=d;
2109  d= (
2110  4*d
2111  +(*(tempBlurredPast-256))
2112  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
2113  +(*(tempBlurredPast+256))
2114  +4)>>3;
2115  *tempBlurredPast=i;
2116 // ((*tempBlurredPast)*3 + d + 2)>>2;
2117 
2118 /*
2119 Switch between
2120  1 0 0 0 0 0 0 (0)
2121 64 32 16 8 4 2 1 (1)
2122 64 48 36 27 20 15 11 (33) (approx)
2123 64 56 49 43 37 33 29 (200) (approx)
2124 */
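 // Note: d is a smoothed noise measure - the squared difference between the
 // current block and its temporally blurred copy, averaged with the values of
 // the four neighbouring blocks stored in tempBlurredPast. Comparing it with
 // maxNoise[0..2] picks one of the blend weights listed above, from mostly
 // keeping the blurred history (little noise) to copying the new block
 // unchanged (too much noise).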
2125  if(d > maxNoise[1]){
2126  if(d < maxNoise[2]){
2127  for(y=0; y<8; y++){
2128  int x;
2129  for(x=0; x<8; x++){
2130  int ref= tempBlurred[ x + y*stride ];
2131  int cur= src[ x + y*stride ];
2132  tempBlurred[ x + y*stride ]=
2133  src[ x + y*stride ]=
2134  (ref + cur + 1)>>1;
2135  }
2136  }
2137  }else{
2138  for(y=0; y<8; y++){
2139  int x;
2140  for(x=0; x<8; x++){
2141  tempBlurred[ x + y*stride ]= src[ x + y*stride ];
2142  }
2143  }
2144  }
2145  }else{
2146  if(d < maxNoise[0]){
2147  for(y=0; y<8; y++){
2148  int x;
2149  for(x=0; x<8; x++){
2150  int ref= tempBlurred[ x + y*stride ];
2151  int cur= src[ x + y*stride ];
2152  tempBlurred[ x + y*stride ]=
2153  src[ x + y*stride ]=
2154  (ref*7 + cur + 4)>>3;
2155  }
2156  }
2157  }else{
2158  for(y=0; y<8; y++){
2159  int x;
2160  for(x=0; x<8; x++){
2161  int ref= tempBlurred[ x + y*stride ];
2162  int cur= src[ x + y*stride ];
2163  tempBlurred[ x + y*stride ]=
2164  src[ x + y*stride ]=
2165  (ref*3 + cur + 2)>>2;
2166  }
2167  }
2168  }
2169  }
2170 }
2171 #endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2172 }
2173 #endif //TEMPLATE_PP_ALTIVEC
2174 
2175 #if TEMPLATE_PP_MMXEXT
2176 /**
2177  * accurate deblock filter
2178  */
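// Note (a reading of the code, not original documentation): the first asm block
// builds two per-column byte masks - one from the min/max spread of the column
// against 2*QP, the other from counting near-equal vertical neighbours against
// the flatness threshold. Columns flagged by both get the sliding-sum low-pass
// driven by the sums[] table; columns not flagged by the flatness mask get the
// default-filter style correction of the two centre lines instead.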
2179 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, const PPContext *c, int mode){
2180  int64_t dc_mask, eq_mask, both_masks;
2181  int64_t sums[10*8*2];
2182  src+= step*3; // src points to begin of the 8x8 Block
2183 
2184  __asm__ volatile(
2185  "movq %0, %%mm7 \n\t"
2186  "movq %1, %%mm6 \n\t"
2187  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2188  );
2189 
2190  __asm__ volatile(
2191  "lea (%2, %3), %%"FF_REG_a" \n\t"
2192 // 0 1 2 3 4 5 6 7 8 9
2193 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
2194 
2195  "movq (%2), %%mm0 \n\t"
2196  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2197  "movq %%mm1, %%mm3 \n\t"
2198  "movq %%mm1, %%mm4 \n\t"
2199  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
2200  "paddb %%mm7, %%mm0 \n\t"
2201  "pcmpgtb %%mm6, %%mm0 \n\t"
2202 
2203  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
2204  PMAXUB(%%mm2, %%mm4)
2205  PMINUB(%%mm2, %%mm3, %%mm5)
2206  "psubb %%mm2, %%mm1 \n\t"
2207  "paddb %%mm7, %%mm1 \n\t"
2208  "pcmpgtb %%mm6, %%mm1 \n\t"
2209  "paddb %%mm1, %%mm0 \n\t"
2210 
2211  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2212  PMAXUB(%%mm1, %%mm4)
2213  PMINUB(%%mm1, %%mm3, %%mm5)
2214  "psubb %%mm1, %%mm2 \n\t"
2215  "paddb %%mm7, %%mm2 \n\t"
2216  "pcmpgtb %%mm6, %%mm2 \n\t"
2217  "paddb %%mm2, %%mm0 \n\t"
2218 
2219  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
2220 
2221  "movq (%2, %3, 4), %%mm2 \n\t"
2222  PMAXUB(%%mm2, %%mm4)
2223  PMINUB(%%mm2, %%mm3, %%mm5)
2224  "psubb %%mm2, %%mm1 \n\t"
2225  "paddb %%mm7, %%mm1 \n\t"
2226  "pcmpgtb %%mm6, %%mm1 \n\t"
2227  "paddb %%mm1, %%mm0 \n\t"
2228 
2229  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2230  PMAXUB(%%mm1, %%mm4)
2231  PMINUB(%%mm1, %%mm3, %%mm5)
2232  "psubb %%mm1, %%mm2 \n\t"
2233  "paddb %%mm7, %%mm2 \n\t"
2234  "pcmpgtb %%mm6, %%mm2 \n\t"
2235  "paddb %%mm2, %%mm0 \n\t"
2236 
2237  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
2238  PMAXUB(%%mm2, %%mm4)
2239  PMINUB(%%mm2, %%mm3, %%mm5)
2240  "psubb %%mm2, %%mm1 \n\t"
2241  "paddb %%mm7, %%mm1 \n\t"
2242  "pcmpgtb %%mm6, %%mm1 \n\t"
2243  "paddb %%mm1, %%mm0 \n\t"
2244 
2245  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2246  PMAXUB(%%mm1, %%mm4)
2247  PMINUB(%%mm1, %%mm3, %%mm5)
2248  "psubb %%mm1, %%mm2 \n\t"
2249  "paddb %%mm7, %%mm2 \n\t"
2250  "pcmpgtb %%mm6, %%mm2 \n\t"
2251  "paddb %%mm2, %%mm0 \n\t"
2252 
2253  "movq (%2, %3, 8), %%mm2 \n\t"
2254  PMAXUB(%%mm2, %%mm4)
2255  PMINUB(%%mm2, %%mm3, %%mm5)
2256  "psubb %%mm2, %%mm1 \n\t"
2257  "paddb %%mm7, %%mm1 \n\t"
2258  "pcmpgtb %%mm6, %%mm1 \n\t"
2259  "paddb %%mm1, %%mm0 \n\t"
2260 
2261  "movq (%%"FF_REG_a", %3, 4), %%mm1 \n\t"
2262  "psubb %%mm1, %%mm2 \n\t"
2263  "paddb %%mm7, %%mm2 \n\t"
2264  "pcmpgtb %%mm6, %%mm2 \n\t"
2265  "paddb %%mm2, %%mm0 \n\t"
2266  "psubusb %%mm3, %%mm4 \n\t"
2267 
2268  "pxor %%mm6, %%mm6 \n\t"
2269  "movq %4, %%mm7 \n\t" // QP,..., QP
2270  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2271  "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2272  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2273  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2274  "movq %%mm7, %1 \n\t"
2275 
2276  "movq %5, %%mm7 \n\t"
2277  "punpcklbw %%mm7, %%mm7 \n\t"
2278  "punpcklbw %%mm7, %%mm7 \n\t"
2279  "punpcklbw %%mm7, %%mm7 \n\t"
2280  "psubb %%mm0, %%mm6 \n\t"
2281  "pcmpgtb %%mm7, %%mm6 \n\t"
2282  "movq %%mm6, %0 \n\t"
2283 
2284  : "=m" (eq_mask), "=m" (dc_mask)
2285  : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2286  : "%"FF_REG_a
2287  );
2288 
2289  both_masks = dc_mask & eq_mask;
2290 
2291  if(both_masks){
2292  x86_reg offset= -8*step;
2293  int64_t *temp_sums= sums;
2294 
2295  __asm__ volatile(
2296  "movq %2, %%mm0 \n\t" // QP,..., QP
2297  "pxor %%mm4, %%mm4 \n\t"
2298 
2299  "movq (%0), %%mm6 \n\t"
2300  "movq (%0, %1), %%mm5 \n\t"
2301  "movq %%mm5, %%mm1 \n\t"
2302  "movq %%mm6, %%mm2 \n\t"
2303  "psubusb %%mm6, %%mm5 \n\t"
2304  "psubusb %%mm1, %%mm2 \n\t"
2305  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2306  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2307  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2308 
2309  "pxor %%mm6, %%mm1 \n\t"
2310  "pand %%mm0, %%mm1 \n\t"
2311  "pxor %%mm1, %%mm6 \n\t"
2312  // 0:QP 6:First
2313 
2314  "movq (%0, %1, 8), %%mm5 \n\t"
2315  "add %1, %0 \n\t" // %0 points to line 1 not 0
2316  "movq (%0, %1, 8), %%mm7 \n\t"
2317  "movq %%mm5, %%mm1 \n\t"
2318  "movq %%mm7, %%mm2 \n\t"
2319  "psubusb %%mm7, %%mm5 \n\t"
2320  "psubusb %%mm1, %%mm2 \n\t"
2321  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2322  "movq %2, %%mm0 \n\t" // QP,..., QP
2323  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2324  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2325 
2326  "pxor %%mm7, %%mm1 \n\t"
2327  "pand %%mm0, %%mm1 \n\t"
2328  "pxor %%mm1, %%mm7 \n\t"
2329 
2330  "movq %%mm6, %%mm5 \n\t"
2331  "punpckhbw %%mm4, %%mm6 \n\t"
2332  "punpcklbw %%mm4, %%mm5 \n\t"
2333  // 4:0 5/6:First 7:Last
2334 
2335  "movq %%mm5, %%mm0 \n\t"
2336  "movq %%mm6, %%mm1 \n\t"
2337  "psllw $2, %%mm0 \n\t"
2338  "psllw $2, %%mm1 \n\t"
2339  "paddw "MANGLE(w04)", %%mm0 \n\t"
2340  "paddw "MANGLE(w04)", %%mm1 \n\t"
2341 
2342 #define NEXT\
2343  "movq (%0), %%mm2 \n\t"\
2344  "movq (%0), %%mm3 \n\t"\
2345  "add %1, %0 \n\t"\
2346  "punpcklbw %%mm4, %%mm2 \n\t"\
2347  "punpckhbw %%mm4, %%mm3 \n\t"\
2348  "paddw %%mm2, %%mm0 \n\t"\
2349  "paddw %%mm3, %%mm1 \n\t"
2350 
2351 #define PREV\
2352  "movq (%0), %%mm2 \n\t"\
2353  "movq (%0), %%mm3 \n\t"\
2354  "add %1, %0 \n\t"\
2355  "punpcklbw %%mm4, %%mm2 \n\t"\
2356  "punpckhbw %%mm4, %%mm3 \n\t"\
2357  "psubw %%mm2, %%mm0 \n\t"\
2358  "psubw %%mm3, %%mm1 \n\t"
2359 
2360 
2361  NEXT //0
2362  NEXT //1
2363  NEXT //2
2364  "movq %%mm0, (%3) \n\t"
2365  "movq %%mm1, 8(%3) \n\t"
2366 
2367  NEXT //3
2368  "psubw %%mm5, %%mm0 \n\t"
2369  "psubw %%mm6, %%mm1 \n\t"
2370  "movq %%mm0, 16(%3) \n\t"
2371  "movq %%mm1, 24(%3) \n\t"
2372 
2373  NEXT //4
2374  "psubw %%mm5, %%mm0 \n\t"
2375  "psubw %%mm6, %%mm1 \n\t"
2376  "movq %%mm0, 32(%3) \n\t"
2377  "movq %%mm1, 40(%3) \n\t"
2378 
2379  NEXT //5
2380  "psubw %%mm5, %%mm0 \n\t"
2381  "psubw %%mm6, %%mm1 \n\t"
2382  "movq %%mm0, 48(%3) \n\t"
2383  "movq %%mm1, 56(%3) \n\t"
2384 
2385  NEXT //6
2386  "psubw %%mm5, %%mm0 \n\t"
2387  "psubw %%mm6, %%mm1 \n\t"
2388  "movq %%mm0, 64(%3) \n\t"
2389  "movq %%mm1, 72(%3) \n\t"
2390 
2391  "movq %%mm7, %%mm6 \n\t"
2392  "punpckhbw %%mm4, %%mm7 \n\t"
2393  "punpcklbw %%mm4, %%mm6 \n\t"
2394 
2395  NEXT //7
2396  "mov %4, %0 \n\t"
2397  "add %1, %0 \n\t"
2398  PREV //0
2399  "movq %%mm0, 80(%3) \n\t"
2400  "movq %%mm1, 88(%3) \n\t"
2401 
2402  PREV //1
2403  "paddw %%mm6, %%mm0 \n\t"
2404  "paddw %%mm7, %%mm1 \n\t"
2405  "movq %%mm0, 96(%3) \n\t"
2406  "movq %%mm1, 104(%3) \n\t"
2407 
2408  PREV //2
2409  "paddw %%mm6, %%mm0 \n\t"
2410  "paddw %%mm7, %%mm1 \n\t"
2411  "movq %%mm0, 112(%3) \n\t"
2412  "movq %%mm1, 120(%3) \n\t"
2413 
2414  PREV //3
2415  "paddw %%mm6, %%mm0 \n\t"
2416  "paddw %%mm7, %%mm1 \n\t"
2417  "movq %%mm0, 128(%3) \n\t"
2418  "movq %%mm1, 136(%3) \n\t"
2419 
2420  PREV //4
2421  "paddw %%mm6, %%mm0 \n\t"
2422  "paddw %%mm7, %%mm1 \n\t"
2423  "movq %%mm0, 144(%3) \n\t"
2424  "movq %%mm1, 152(%3) \n\t"
2425 
2426  "mov %4, %0 \n\t" //FIXME
2427 
2428  : "+&r"(src)
2429  : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
2430  NAMED_CONSTRAINTS_ADD(w04)
2431  );
2432 
2433  src+= step; // src points to begin of the 8x8 Block
2434 
2435  __asm__ volatile(
2436  "movq %4, %%mm6 \n\t"
2437  "pcmpeqb %%mm5, %%mm5 \n\t"
2438  "pxor %%mm6, %%mm5 \n\t"
2439  "pxor %%mm7, %%mm7 \n\t"
2440 
2441  "1: \n\t"
2442  "movq (%1), %%mm0 \n\t"
2443  "movq 8(%1), %%mm1 \n\t"
2444  "paddw 32(%1), %%mm0 \n\t"
2445  "paddw 40(%1), %%mm1 \n\t"
2446  "movq (%0, %3), %%mm2 \n\t"
2447  "movq %%mm2, %%mm3 \n\t"
2448  "movq %%mm2, %%mm4 \n\t"
2449  "punpcklbw %%mm7, %%mm2 \n\t"
2450  "punpckhbw %%mm7, %%mm3 \n\t"
2451  "paddw %%mm2, %%mm0 \n\t"
2452  "paddw %%mm3, %%mm1 \n\t"
2453  "paddw %%mm2, %%mm0 \n\t"
2454  "paddw %%mm3, %%mm1 \n\t"
2455  "psrlw $4, %%mm0 \n\t"
2456  "psrlw $4, %%mm1 \n\t"
2457  "packuswb %%mm1, %%mm0 \n\t"
2458  "pand %%mm6, %%mm0 \n\t"
2459  "pand %%mm5, %%mm4 \n\t"
2460  "por %%mm4, %%mm0 \n\t"
2461  "movq %%mm0, (%0, %3) \n\t"
2462  "add $16, %1 \n\t"
2463  "add %2, %0 \n\t"
2464  " js 1b \n\t"
2465 
2466  : "+r"(offset), "+r"(temp_sums)
2467  : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
2468  );
2469  }else
2470  src+= step; // src points to begin of the 8x8 Block
2471 
2472  if(eq_mask != -1LL){
2473  uint8_t *temp_src= src;
2474  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
2475  __asm__ volatile(
2476  "pxor %%mm7, %%mm7 \n\t"
2477 // 0 1 2 3 4 5 6 7 8 9
2478 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
2479 
2480  "movq (%0), %%mm0 \n\t"
2481  "movq %%mm0, %%mm1 \n\t"
2482  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2483  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2484 
2485  "movq (%0, %1), %%mm2 \n\t"
2486  "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
2487  "movq %%mm2, %%mm3 \n\t"
2488  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2489  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2490 
2491  "movq (%%"FF_REG_a"), %%mm4 \n\t"
2492  "movq %%mm4, %%mm5 \n\t"
2493  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2494  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2495 
2496  "paddw %%mm0, %%mm0 \n\t" // 2L0
2497  "paddw %%mm1, %%mm1 \n\t" // 2H0
2498  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
2499  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
2500  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2501  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2502 
2503  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2504  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2505  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2506  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2507 
2508  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
2509  "movq %%mm2, %%mm3 \n\t"
2510  "punpcklbw %%mm7, %%mm2 \n\t" // L3
2511  "punpckhbw %%mm7, %%mm3 \n\t" // H3
2512 
2513  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2514  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2515  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2516  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2517  "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2518  "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2519 
2520  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
2521  "movq %%mm0, %%mm1 \n\t"
2522  "punpcklbw %%mm7, %%mm0 \n\t" // L4
2523  "punpckhbw %%mm7, %%mm1 \n\t" // H4
2524 
2525  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
2526  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
2527  "movq %%mm2, 16(%4) \n\t" // L3 - L4
2528  "movq %%mm3, 24(%4) \n\t" // H3 - H4
2529  "paddw %%mm4, %%mm4 \n\t" // 2L2
2530  "paddw %%mm5, %%mm5 \n\t" // 2H2
2531  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
2532  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
2533 
2534  "lea (%%"FF_REG_a", %1), %0 \n\t"
2535  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
2536  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
2537  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
2538  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
2539 //50 opcodes so far
2540  "movq (%0, %1, 2), %%mm2 \n\t"
2541  "movq %%mm2, %%mm3 \n\t"
2542  "punpcklbw %%mm7, %%mm2 \n\t" // L5
2543  "punpckhbw %%mm7, %%mm3 \n\t" // H5
2544  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
2545  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
2546  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2547  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
2548 
2549  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2550  "punpcklbw %%mm7, %%mm6 \n\t" // L6
2551  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
2552  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2553  "punpckhbw %%mm7, %%mm6 \n\t" // H6
2554  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
2555 
2556  "paddw %%mm0, %%mm0 \n\t" // 2L4
2557  "paddw %%mm1, %%mm1 \n\t" // 2H4
2558  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
2559  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
2560 
2561  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
2562  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
2563  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
2564  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
2565 
2566  "movq (%0, %1, 4), %%mm2 \n\t"
2567  "movq %%mm2, %%mm3 \n\t"
2568  "punpcklbw %%mm7, %%mm2 \n\t" // L7
2569  "punpckhbw %%mm7, %%mm3 \n\t" // H7
2570 
2571  "paddw %%mm2, %%mm2 \n\t" // 2L7
2572  "paddw %%mm3, %%mm3 \n\t" // 2H7
2573  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
2574  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
2575 
2576  "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2577  "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2578 
2579  "movq %%mm7, %%mm6 \n\t" // 0
2580  "psubw %%mm0, %%mm6 \n\t"
2581  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2582  "movq %%mm7, %%mm6 \n\t" // 0
2583  "psubw %%mm1, %%mm6 \n\t"
2584  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2585  "movq %%mm7, %%mm6 \n\t" // 0
2586  "psubw %%mm2, %%mm6 \n\t"
2587  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2588  "movq %%mm7, %%mm6 \n\t" // 0
2589  "psubw %%mm3, %%mm6 \n\t"
2590  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2591 
2592  "pminsw %%mm2, %%mm0 \n\t"
2593  "pminsw %%mm3, %%mm1 \n\t"
2594 
2595  "movd %2, %%mm2 \n\t" // QP
2596  "punpcklbw %%mm7, %%mm2 \n\t"
2597 
2598  "movq %%mm7, %%mm6 \n\t" // 0
2599  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
2600  "pxor %%mm6, %%mm4 \n\t"
2601  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
2602  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
2603  "pxor %%mm7, %%mm5 \n\t"
2604  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
2605 // 100 opcodes
2606  "psllw $3, %%mm2 \n\t" // 8QP
2607  "movq %%mm2, %%mm3 \n\t" // 8QP
2608  "pcmpgtw %%mm4, %%mm2 \n\t"
2609  "pcmpgtw %%mm5, %%mm3 \n\t"
2610  "pand %%mm2, %%mm4 \n\t"
2611  "pand %%mm3, %%mm5 \n\t"
2612 
2613 
2614  "psubusw %%mm0, %%mm4 \n\t" // hd
2615  "psubusw %%mm1, %%mm5 \n\t" // ld
2616 
2617 
2618  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
2619  "pmullw %%mm2, %%mm4 \n\t"
2620  "pmullw %%mm2, %%mm5 \n\t"
2621  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
2622  "paddw %%mm2, %%mm4 \n\t"
2623  "paddw %%mm2, %%mm5 \n\t"
2624  "psrlw $6, %%mm4 \n\t"
2625  "psrlw $6, %%mm5 \n\t"
2626 
2627  "movq 16(%4), %%mm0 \n\t" // L3 - L4
2628  "movq 24(%4), %%mm1 \n\t" // H3 - H4
2629 
2630  "pxor %%mm2, %%mm2 \n\t"
2631  "pxor %%mm3, %%mm3 \n\t"
2632 
2633  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
2634  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
2635  "pxor %%mm2, %%mm0 \n\t"
2636  "pxor %%mm3, %%mm1 \n\t"
2637  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
2638  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
2639  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
2640  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
2641 
2642  "pxor %%mm6, %%mm2 \n\t"
2643  "pxor %%mm7, %%mm3 \n\t"
2644  "pand %%mm2, %%mm4 \n\t"
2645  "pand %%mm3, %%mm5 \n\t"
2646 
2647  "pminsw %%mm0, %%mm4 \n\t"
2648  "pminsw %%mm1, %%mm5 \n\t"
2649  "pxor %%mm6, %%mm4 \n\t"
2650  "pxor %%mm7, %%mm5 \n\t"
2651  "psubw %%mm6, %%mm4 \n\t"
2652  "psubw %%mm7, %%mm5 \n\t"
2653  "packsswb %%mm5, %%mm4 \n\t"
2654  "movq %3, %%mm1 \n\t"
2655  "pandn %%mm4, %%mm1 \n\t"
2656  "movq (%0), %%mm0 \n\t"
2657  "paddb %%mm1, %%mm0 \n\t"
2658  "movq %%mm0, (%0) \n\t"
2659  "movq (%0, %1), %%mm0 \n\t"
2660  "psubb %%mm1, %%mm0 \n\t"
2661  "movq %%mm0, (%0, %1) \n\t"
2662 
2663  : "+r" (temp_src)
2664  : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
2665  NAMED_CONSTRAINTS_ADD(w05,w20)
2666  : "%"FF_REG_a
2667  );
2668  }
2669 }
2670 #endif //TEMPLATE_PP_MMXEXT
2671 
2672 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2673  const int8_t QPs[], int QPStride, int isColor, PPContext *c);
2674 
2675 /**
2676  * Copy a block from src to dst and fix the black level.
2677  * levelFix == 0 -> do not touch the brightness & contrast
2678  */
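// Note: in the SIMD levelFix path each source byte is widened to a 16-bit word
// (punpck with itself), scaled with pmulhuw by packedYScale, shifted by
// packedYOffset and repacked with unsigned saturation - an 8x8 copy with a
// linear luminance remap applied on the fly. The plain C fallback only copies.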
2679 #undef REAL_SCALED_CPY
2680 #undef SCALED_CPY
2681 
2682 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
2683  int levelFix, int64_t *packedOffsetAndScale)
2684 {
2685  if(levelFix){
2686 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
2687  __asm__ volatile(
2688  "movq (%%"FF_REG_a"), %%mm2 \n\t" // packedYOffset
2689  "movq 8(%%"FF_REG_a"), %%mm3 \n\t" // packedYScale
2690  "lea (%2,%4), %%"FF_REG_a" \n\t"
2691  "lea (%3,%5), %%"FF_REG_d" \n\t"
2692  "pxor %%mm4, %%mm4 \n\t"
2693 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
2694  "movq " #src1 ", %%mm0 \n\t"\
2695  "movq " #src1 ", %%mm5 \n\t"\
2696  "movq " #src2 ", %%mm1 \n\t"\
2697  "movq " #src2 ", %%mm6 \n\t"\
2698  "punpcklbw %%mm0, %%mm0 \n\t"\
2699  "punpckhbw %%mm5, %%mm5 \n\t"\
2700  "punpcklbw %%mm1, %%mm1 \n\t"\
2701  "punpckhbw %%mm6, %%mm6 \n\t"\
2702  "pmulhuw %%mm3, %%mm0 \n\t"\
2703  "pmulhuw %%mm3, %%mm5 \n\t"\
2704  "pmulhuw %%mm3, %%mm1 \n\t"\
2705  "pmulhuw %%mm3, %%mm6 \n\t"\
2706  "psubw %%mm2, %%mm0 \n\t"\
2707  "psubw %%mm2, %%mm5 \n\t"\
2708  "psubw %%mm2, %%mm1 \n\t"\
2709  "psubw %%mm2, %%mm6 \n\t"\
2710  "packuswb %%mm5, %%mm0 \n\t"\
2711  "packuswb %%mm6, %%mm1 \n\t"\
2712  "movq %%mm0, " #dst1 " \n\t"\
2713  "movq %%mm1, " #dst2 " \n\t"\
2714 
2715 #define SCALED_CPY(src1, src2, dst1, dst2)\
2716  REAL_SCALED_CPY(src1, src2, dst1, dst2)
2717 
2718 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
2719 SCALED_CPY((%2, %4, 2), (%%FF_REGa, %4, 2), (%3, %5, 2), (%%FF_REGd, %5, 2))
2720 SCALED_CPY((%2, %4, 4), (%%FF_REGa, %4, 4), (%3, %5, 4), (%%FF_REGd, %5, 4))
2721  "lea (%%"FF_REG_a",%4,4), %%"FF_REG_a" \n\t"
2722  "lea (%%"FF_REG_d",%5,4), %%"FF_REG_d" \n\t"
2723 SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5, 2))
2724 
2725 
2726  : "=&a" (packedOffsetAndScale)
2727  : "0" (packedOffsetAndScale),
2728  "r"(src),
2729  "r"(dst),
2730  "r" ((x86_reg)srcStride),
2731  "r" ((x86_reg)dstStride)
2732  : "%"FF_REG_d
2733  );
2734 #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2735  for (int i = 0; i < 8; i++)
2736  memcpy( &(dst[dstStride*i]),
2737  &(src[srcStride*i]), BLOCK_SIZE);
2738 #endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2739  }else{
2740 #if TEMPLATE_PP_MMX && HAVE_6REGS
2741  __asm__ volatile(
2742  "lea (%0,%2), %%"FF_REG_a" \n\t"
2743  "lea (%1,%3), %%"FF_REG_d" \n\t"
2744 
2745 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
2746  "movq " #src1 ", %%mm0 \n\t"\
2747  "movq " #src2 ", %%mm1 \n\t"\
2748  "movq %%mm0, " #dst1 " \n\t"\
2749  "movq %%mm1, " #dst2 " \n\t"\
2750 
2751 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
2752  REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
2753 
2754 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
2755 SIMPLE_CPY((%0, %2, 2), (%%FF_REGa, %2, 2), (%1, %3, 2), (%%FF_REGd, %3, 2))
2756 SIMPLE_CPY((%0, %2, 4), (%%FF_REGa, %2, 4), (%1, %3, 4), (%%FF_REGd, %3, 4))
2757  "lea (%%"FF_REG_a",%2,4), %%"FF_REG_a" \n\t"
2758  "lea (%%"FF_REG_d",%3,4), %%"FF_REG_d" \n\t"
2759 SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3, 2))
2760 
2761  : : "r" (src),
2762  "r" (dst),
2763  "r" ((x86_reg)srcStride),
2764  "r" ((x86_reg)dstStride)
2765  : "%"FF_REG_a, "%"FF_REG_d
2766  );
2767 #else //TEMPLATE_PP_MMX && HAVE_6REGS
2768  for (int i = 0; i < 8; i++)
2769  memcpy( &(dst[dstStride*i]),
2770  &(src[srcStride*i]), BLOCK_SIZE);
2771 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
2772  }
2773 }
2774 
2775 /**
2776  * Duplicate the given 8 src pixels 5 times upward
2777  */
2778 static inline void RENAME(duplicate)(uint8_t src[], int stride)
2779 {
2780 #if TEMPLATE_PP_MMX
2781  __asm__ volatile(
2782  "movq (%0), %%mm0 \n\t"
2783  "movq %%mm0, (%0, %1, 4) \n\t"
2784  "add %1, %0 \n\t"
2785  "movq %%mm0, (%0) \n\t"
2786  "movq %%mm0, (%0, %1) \n\t"
2787  "movq %%mm0, (%0, %1, 2) \n\t"
2788  "movq %%mm0, (%0, %1, 4) \n\t"
2789  : "+r" (src)
2790  : "r" ((x86_reg)-stride)
2791  );
2792 #else
2793  int i;
2794  uint8_t *p=src;
2795  for(i=0; i<5; i++){
2796  p-= stride;
2797  memcpy(p, src, 8);
2798  }
2799 #endif
2800 }
2801 
2802 #if ARCH_X86 && TEMPLATE_PP_MMXEXT
2803 static inline void RENAME(prefetchnta)(const void *p)
2804 {
2805  __asm__ volatile( "prefetchnta (%0)\n\t"
2806  : : "r" (p)
2807  );
2808 }
2809 
2810 static inline void RENAME(prefetcht0)(const void *p)
2811 {
2812  __asm__ volatile( "prefetcht0 (%0)\n\t"
2813  : : "r" (p)
2814  );
2815 }
2816 
2817 static inline void RENAME(prefetcht1)(const void *p)
2818 {
2819  __asm__ volatile( "prefetcht1 (%0)\n\t"
2820  : : "r" (p)
2821  );
2822 }
2823 
2824 static inline void RENAME(prefetcht2)(const void *p)
2825 {
2826  __asm__ volatile( "prefetcht2 (%0)\n\t"
2827  : : "r" (p)
2828  );
2829 }
2830 #elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
2831 static inline void RENAME(prefetchnta)(const void *p)
2832 {
2833  __builtin_prefetch(p,0,0);
2834 }
2835 static inline void RENAME(prefetcht0)(const void *p)
2836 {
2837  __builtin_prefetch(p,0,1);
2838 }
2839 static inline void RENAME(prefetcht1)(const void *p)
2840 {
2841  __builtin_prefetch(p,0,2);
2842 }
2843 static inline void RENAME(prefetcht2)(const void *p)
2844 {
2845  __builtin_prefetch(p,0,3);
2846 }
2847 #else
2848 static inline void RENAME(prefetchnta)(const void *p)
2849 {
2850  return;
2851 }
2852 static inline void RENAME(prefetcht0)(const void *p)
2853 {
2854  return;
2855 }
2856 static inline void RENAME(prefetcht1)(const void *p)
2857 {
2858  return;
2859 }
2860 static inline void RENAME(prefetcht2)(const void *p)
2861 {
2862  return;
2863 }
2864 #endif
2865 /**
2866  * Filter array of bytes (Y or U or V values)
2867  */
2868 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2869  const int8_t QPs[], int QPStride, int isColor, PPContext *c)
2870 {
2871  int x,y;
2872 #ifdef TEMPLATE_PP_TIME_MODE
2873  const int mode= TEMPLATE_PP_TIME_MODE;
2874 #else
2875  const int mode = isColor ? c->ppMode.chromMode : c->ppMode.lumMode;
2876 #endif
2877  int black=0, white=255; // blackest black and whitest white in the picture
2878  int QPCorrecture= 256*256;
2879 
2880  int copyAhead;
2881 #if TEMPLATE_PP_MMX
2882  int i;
2883 #endif
2884 
2885  const int qpHShift = isColor ? 4 - c->hChromaSubSample : 4;
2886  const int qpVShift = isColor ? 4 - c->vChromaSubSample : 4;
2887 
2888  //FIXME remove
2889  uint64_t * const yHistogram= c->yHistogram;
2890  uint8_t * const tempSrc = srcStride > 0 ? c->tempSrc : c->tempSrc - 23*srcStride;
2891  uint8_t * const tempDst = (dstStride > 0 ? c->tempDst : c->tempDst - 23*dstStride) + 32;
2892  //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
2893 
2894  if (mode & VISUALIZE){
2895  if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || TEMPLATE_PP_MMX) {
2896  av_log(c, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n");
2897  }
2898  }
2899 
2900 #if TEMPLATE_PP_MMX
2901  for(i=0; i<57; i++){
2902  int offset = ((i * c->ppMode.baseDcDiff) >> 8) + 1;
2903  int threshold= offset*2 + 1;
2904  c->mmxDcOffset[i] = 0x7F - offset;
2905  c->mmxDcThreshold[i] = 0x7F - threshold;
2906  c->mmxDcOffset[i] *= 0x0101010101010101LL;
2907  c->mmxDcThreshold[i] *= 0x0101010101010101LL;
2908  }
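 // Note: these per-QP constants are biased around 0x7F and broadcast to all 8
 // bytes so that a single signed pcmpgtb in the block classifiers roughly
 // implements a |difference| <= offset test: differences inside the window end
 // up above the threshold constant, larger ones wrap below it.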
2909 #endif
2910 
2911  if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
2912  else if( (mode & LINEAR_BLEND_DEINT_FILTER)
2913  || (mode & FFMPEG_DEINT_FILTER)
2914  || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
2915  else if( (mode & V_DEBLOCK)
2916  || (mode & LINEAR_IPOL_DEINT_FILTER)
2917  || (mode & MEDIAN_DEINT_FILTER)
2918  || (mode & V_A_DEBLOCK)) copyAhead=13;
2919  else if(mode & V_X1_FILTER) copyAhead=11;
2920 // else if(mode & V_RK1_FILTER) copyAhead=10;
2921  else if(mode & DERING) copyAhead=9;
2922  else copyAhead=8;
2923 
2924  copyAhead-= 8;
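 // Note: copyAhead appears to be the number of lines the selected filters need
 // per 8-line block row; after subtracting the block height it is the extra
 // look-ahead used for copying, prefetching and bottom-edge handling below.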
2925 
2926  if(!isColor){
2927  uint64_t sum= 0;
2928  int i;
2929  uint64_t maxClipped;
2930  uint64_t clipped;
2931  AVRational scale;
2932 
2933  c->frameNum++;
2934  // the first frame is unreliable, so we ignore it
2935  if (c->frameNum == 1)
2936  yHistogram[0] = width * (uint64_t)height/64*15/256;
2937 
2938  for(i=0; i<256; i++){
2939  sum+= yHistogram[i];
2940  }
2941 
2942  /* We always get a completely black picture first. */
2943  maxClipped = av_rescale(sum, c->ppMode.maxClippedThreshold.num,
2944  c->ppMode.maxClippedThreshold.den);
2945 
2946  clipped= sum;
2947  for(black=255; black>0; black--){
2948  if(clipped < maxClipped) break;
2949  clipped-= yHistogram[black];
2950  }
2951 
2952  clipped= sum;
2953  for(white=0; white<256; white++){
2954  if(clipped < maxClipped) break;
2955  clipped-= yHistogram[white];
2956  }
2957 
2958  scale = (AVRational){c->ppMode.maxAllowedY - c->ppMode.minAllowedY, white - black};
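 // Note: black/white are the darkest and brightest luma levels whose clipped
 // histogram tail stays below maxClippedThreshold, and scale is the ratio that
 // stretches [black, white] onto [minAllowedY, maxAllowedY]; packedYScale and
 // packedYOffset below are this mapping in packed fixed point for blockCopy.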
2959 
2960 #if TEMPLATE_PP_MMXEXT
2961  c->packedYScale = (uint16_t)av_rescale(scale.num, 256, scale.den);
2962  c->packedYOffset = (((black*c->packedYScale)>>8) - c->ppMode.minAllowedY) & 0xFFFF;
2963 #else
2964  c->packedYScale = (uint16_t)av_rescale(scale.num, 1024, scale.den);
2965  c->packedYOffset = (black - c->ppMode.minAllowedY) & 0xFFFF;
2966 #endif
2967 
2968  c->packedYOffset |= c->packedYOffset<<32;
2969  c->packedYOffset |= c->packedYOffset<<16;
2970 
2971  c->packedYScale |= c->packedYScale<<32;
2972  c->packedYScale |= c->packedYScale<<16;
2973 
2974  if(mode & LEVEL_FIX) QPCorrecture= (int)av_rescale(scale.num, 256*256, scale.den);
2975  else QPCorrecture= 256*256;
2976  }else{
2977  c->packedYScale = 0x0100010001000100LL;
2978  c->packedYOffset = 0;
2979  QPCorrecture= 256*256;
2980  }
2981 
2982  /* copy & deinterlace first row of blocks */
2983  y=-BLOCK_SIZE;
2984  {
2985  const uint8_t *srcBlock= &(src[y*srcStride]);
2986  uint8_t *dstBlock= tempDst + dstStride;
2987 
2988  // From this point on it is guaranteed that we can read and write 16 lines downward
2989  // finish one block before starting the next, otherwise we might run into problems
2990  // with the L1 cache of the P4, or alternatively process only a few blocks at a time
2991  for(x=0; x<width; x+=BLOCK_SIZE){
2992  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
2993  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
2994  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
2995  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
2996 
2997  RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
2998  srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c->packedYOffset);
2999 
3000  RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3001 
3002  if(mode & LINEAR_IPOL_DEINT_FILTER)
3003  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3004  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3005  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x);
3006  else if(mode & MEDIAN_DEINT_FILTER)
3007  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3008  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3009  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3010  else if(mode & FFMPEG_DEINT_FILTER)
3011  RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x);
3012  else if(mode & LOWPASS5_DEINT_FILTER)
3013  RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x);
3014 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3015  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3016 */
3017  dstBlock+=8;
3018  srcBlock+=8;
3019  }
3020  if(width==FFABS(dstStride))
3021  linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3022  else{
3023  int i;
3024  for(i=0; i<copyAhead; i++){
3025  memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3026  }
3027  }
3028  }
3029 
3030  for(y=0; y<height; y+=BLOCK_SIZE){
3031  //1% speedup if these are here instead of the inner loop
3032  const uint8_t *srcBlock= &(src[y*srcStride]);
3033  uint8_t *dstBlock= &(dst[y*dstStride]);
3034 #if TEMPLATE_PP_MMX
3035  uint8_t *tempBlock1 = c->tempBlocks;
3036  uint8_t *tempBlock2 = c->tempBlocks + 8;
3037 #endif
3038  const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3039  int8_t *nonBQPptr = &c->nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3040  int QP=0, nonBQP=0;
3041  /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards?
3042  if not then use a temporary buffer */
3043  if(y+15 >= height){
3044  int i;
3045  /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3046  blockcopy to dst later */
3047  linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3048  FFMAX(height-y-copyAhead, 0), srcStride);
3049 
3050  /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3051  for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3052  memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3053 
3054  /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3055  linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3056 
3057  /* duplicate last line of dst to fill the void up to line (copyAhead) */
3058  for(i=height-y+1; i<=copyAhead; i++)
3059  memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3060 
3061  dstBlock= tempDst + dstStride;
3062  srcBlock= tempSrc;
3063  }
3064 
3065  // From this point on it is guaranteed that we can read and write 16 lines downward
3066  // finish one block before starting the next, otherwise we might run into problems
3067  // with the L1 cache of the P4, or alternatively process only a few blocks at a time
3068  for(x=0; x<width; ){
3069  int startx = x;
3070  int endx = FFMIN(width, x+32);
3071  uint8_t *dstBlockStart = dstBlock;
3072  const uint8_t *srcBlockStart = srcBlock;
3073  int qp_index = 0;
3074  for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
3075  QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3076  nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3077  if(!isColor){
3078  QP= (QP* QPCorrecture + 256*128)>>16;
3079  nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
3080  yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++;
3081  }
3082  c->QP_block[qp_index] = QP;
3083  c->nonBQP_block[qp_index] = nonBQP;
3084 #if TEMPLATE_PP_MMX
3085  __asm__ volatile(
3086  "movd %1, %%mm7 \n\t"
3087  "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3088  "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3089  "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3090  "movq %%mm7, %0 \n\t"
3091  : "=m" (c->pQPb_block[qp_index])
3092  : "r" (QP)
3093  );
3094 #endif
3095  }
3096  for(; x < endx; x+=BLOCK_SIZE){
3097  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
3098  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
3099  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
3100  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
3101 
3102  RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3103  srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c->packedYOffset);
3104 
3104 
3105  if(mode & LINEAR_IPOL_DEINT_FILTER)
3106  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3107  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3108  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x);
3109  else if(mode & MEDIAN_DEINT_FILTER)
3110  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3111  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3112  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3113  else if(mode & FFMPEG_DEINT_FILTER)
3114  RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x);
3115  else if(mode & LOWPASS5_DEINT_FILTER)
3116  RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x);
3117 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3118  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3119 */
3120  dstBlock+=8;
3121  srcBlock+=8;
3122  }
3123 
3124  dstBlock = dstBlockStart;
3125  srcBlock = srcBlockStart;
3126 
3127  for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
3128  const int stride= dstStride;
3129  //temporary while changing QP stuff to make things continue to work
3130  //eventually QP,nonBQP,etc will be arrays and this will be unnecessary
3131  c->QP = c->QP_block[qp_index];
3132  c->nonBQP = c->nonBQP_block[qp_index];
3133  c->pQPb = c->pQPb_block[qp_index];
3134  c->pQPb2 = c->pQPb2_block[qp_index];
3135 
3136  /* only deblock if we have 2 blocks */
3137  if(y + 8 < height){
3138  if(mode & V_X1_FILTER)
3139  RENAME(vertX1Filter)(dstBlock, stride, c);
3140  else if(mode & V_DEBLOCK){
3141  const int t = RENAME(vertClassify)(dstBlock, stride, c);
3142 
3143  if(t==1)
3144  RENAME(doVertLowPass)(dstBlock, stride, c);
3145  else if(t==2)
3146  RENAME(doVertDefFilter)(dstBlock, stride, c);
3147  }else if(mode & V_A_DEBLOCK){
3148  RENAME(do_a_deblock)(dstBlock, stride, 1, c, mode);
3149  }
3150  }
3151 
3152  dstBlock+=8;
3153  srcBlock+=8;
3154  }
3155 
3156  dstBlock = dstBlockStart;
3157  srcBlock = srcBlockStart;
3158 
3159  for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){
3160  const int stride= dstStride;
3161  c->QP = c->QP_block[qp_index];
3162  c->nonBQP = c->nonBQP_block[qp_index];
3163  c->pQPb = c->pQPb_block[qp_index];
3164  c->pQPb2 = c->pQPb2_block[qp_index];
3165 #if TEMPLATE_PP_MMX
3166  RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3167 #endif
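 // Note: on the MMX path the horizontal deblocking below reuses the vertical
 // filters on a transposed copy of the block (tempBlock1, 16-byte stride);
 // transpose2 then writes the filtered middle columns back into dst.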
3168  /* check if we have a previous block to deblock it with dstBlock */
3169  if(x - 8 >= 0){
3170 #if TEMPLATE_PP_MMX
3171  if(mode & H_X1_FILTER)
3172  RENAME(vertX1Filter)(tempBlock1, 16, c);
3173  else if(mode & H_DEBLOCK){
3174  const int t= RENAME(vertClassify)(tempBlock1, 16, c);
3175  if(t==1)
3176  RENAME(doVertLowPass)(tempBlock1, 16, c);
3177  else if(t==2)
3178  RENAME(doVertDefFilter)(tempBlock1, 16, c);
3179  }else if(mode & H_A_DEBLOCK){
3180  RENAME(do_a_deblock)(tempBlock1, 16, 1, c, mode);
3181  }
3182 
3183  RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3184 
3185 #else
3186  if(mode & H_X1_FILTER)
3187  horizX1Filter(dstBlock-4, stride, c->QP);
3188  else if(mode & H_DEBLOCK){
3189 #if TEMPLATE_PP_ALTIVEC
3190  DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
3191  int t;
3192  transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3193 
3194  t = vertClassify_altivec(tempBlock-48, 16, c);
3195  if(t==1) {
3196  doVertLowPass_altivec(tempBlock-48, 16, c);
3197  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3198  }
3199  else if(t==2) {
3200  doVertDefFilter_altivec(tempBlock-48, 16, c);
3201  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3202  }
3203 #else
3204  const int t= RENAME(horizClassify)(dstBlock-4, stride, c);
3205 
3206  if(t==1)
3207  RENAME(doHorizLowPass)(dstBlock-4, stride, c);
3208  else if(t==2)
3209  RENAME(doHorizDefFilter)(dstBlock-4, stride, c);
3210 #endif
3211  }else if(mode & H_A_DEBLOCK){
3212  RENAME(do_a_deblock)(dstBlock-8, 1, stride, c, mode);
3213  }
3214 #endif //TEMPLATE_PP_MMX
3215  if(mode & DERING){
3216  RENAME(dering)(dstBlock - stride - 8, stride, c, x<=8, 0, y<=0);
3217  }
3218 
3219  if(mode & TEMP_NOISE_FILTER)
3220  {
3221  RENAME(tempNoiseReducer)(dstBlock-8, stride,
3222  c->tempBlurred[isColor] + y*dstStride + x,
3223  c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3224  c->ppMode.maxTmpNoise);
3225  }
3226  }
3227 
3228  dstBlock+=8;
3229  srcBlock+=8;
3230 
3231 #if TEMPLATE_PP_MMX
3232  FFSWAP(uint8_t *, tempBlock1, tempBlock2);
3233 #endif
3234  }
3235  }
3236 
3237  if(mode & DERING){
3238  RENAME(dering)(dstBlock - dstStride - 8, dstStride, c, x<=8, 1, y<=0);
3239  }
3240 
3241  if((mode & TEMP_NOISE_FILTER)){
3242  RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3243  c->tempBlurred[isColor] + y*dstStride + x,
3244  c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3245  c->ppMode.maxTmpNoise);
3246  }
3247 
3248  /* did we use a tmp buffer for the last lines? */
3249  if(y+15 >= height){
3250  uint8_t *dstBlock= &(dst[y*dstStride]);
3251  if(width==FFABS(dstStride))
3252  linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3253  else{
3254  int i;
3255  for(i=0; i<height-y; i++){
3256  memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3257  }
3258  }
3259  }
3260  }
3261 #if TEMPLATE_PP_MMX
3262  __asm__ volatile("emms");
3263 #endif
3264 
3265 #ifdef DEBUG_BRIGHTNESS
3266  if(!isColor){
3267  int max=1;
3268  int i;
3269  for(i=0; i<256; i++)
3270  if(yHistogram[i] > max) max=yHistogram[i];
3271 
3272  for(i=1; i<256; i++){
3273  int x;
3274  int start=yHistogram[i-1]/(max/256+1);
3275  int end=yHistogram[i]/(max/256+1);
3276  int inc= end > start ? 1 : -1;
3277  for(x=start; x!=end+inc; x+=inc)
3278  dst[ i*dstStride + x]+=128;
3279  }
3280 
3281  for(i=0; i<100; i+=2){
3282  dst[ (white)*dstStride + i]+=128;
3283  dst[ (black)*dstStride + i]+=128;
3284  }
3285  }
3286 #endif
3287 }
3288 
3289 #undef RENAME
3290 #undef TEMPLATE_PP_C
3291 #undef TEMPLATE_PP_ALTIVEC
3292 #undef TEMPLATE_PP_MMX
3293 #undef TEMPLATE_PP_MMXEXT
3294 #undef TEMPLATE_PP_SSE2
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
vertClassify_altivec
static int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:59
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:98
TEMPLATE_PP_MMX
#define TEMPLATE_PP_MMX
Definition: postprocess_template.c:52
RENAME
#define RENAME(element)
Definition: ac3enc_template.c:44
doVertDefFilter_altivec
static void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:412
mode
mode
Definition: ebur128.h:83
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:112
LINEAR_BLEND_DEINT_FILTER
#define LINEAR_BLEND_DEINT_FILTER
Definition: postprocess_internal.h:63
av_clip_uint8
#define av_clip_uint8
Definition: common.h:106
LINEAR_IPOL_DEINT_FILTER
#define LINEAR_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:62
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:291
H_X1_FILTER
#define H_X1_FILTER
Definition: postprocess_internal.h:55
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
BLOCK_SIZE
#define BLOCK_SIZE
Definition: adx.h:51
width
#define width
Definition: dsp.h:85
LEVEL_FIX
#define LEVEL_FIX
Brightness & Contrast.
Definition: postprocess_internal.h:39
src
#define src
Definition: vp8dsp.c:248
min
float min
Definition: vorbis_enc_data.h:429