FFmpeg
postprocess_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 /**
22  * @file
23  * mmx/mmx2/sse2 postprocess code.
24  */
25 
26 #include "libavutil/mem_internal.h"
27 #include "libavutil/x86/asm.h"
28 
29 /* A single TEMPLATE_PP_* should be defined (to 1) when this template is
30  * included. The following macros will define its dependencies to 1 as well
31  * (like MMX2 depending on MMX), and will define to 0 all the others. Every
32  * TEMPLATE_PP_* need to be undef at the end. */
33 
/* Select the RENAME() name suffix for the active template instantiation.
 * Exactly one TEMPLATE_PP_* is defined by the including file; every branch
 * below defines the variants that were left undefined to 0 so they remain
 * usable in "#if" tests further down. */
#ifdef TEMPLATE_PP_C
#   define RENAME(a) a ## _C
#else
#   define TEMPLATE_PP_C 0
#endif

#ifdef TEMPLATE_PP_ALTIVEC
#   define RENAME(a) a ## _altivec
#else
#   define TEMPLATE_PP_ALTIVEC 0
#endif

#ifdef TEMPLATE_PP_MMX
#   define RENAME(a) a ## _MMX
#else
#   define TEMPLATE_PP_MMX 0
#endif

/* MMXEXT implies MMX: force TEMPLATE_PP_MMX to 1 when MMXEXT is selected. */
#ifdef TEMPLATE_PP_MMXEXT
#   undef  TEMPLATE_PP_MMX
#   define TEMPLATE_PP_MMX 1
#   define RENAME(a) a ## _MMX2
#else
#   define TEMPLATE_PP_MMXEXT 0
#endif

/* SSE2 implies MMXEXT (and therefore MMX as well). */
#ifdef TEMPLATE_PP_SSE2
#   undef  TEMPLATE_PP_MMX
#   define TEMPLATE_PP_MMX 1
#   undef  TEMPLATE_PP_MMXEXT
#   define TEMPLATE_PP_MMXEXT 1
#   define RENAME(a) a ## _SSE2
#else
#   define TEMPLATE_PP_SSE2 0
#endif

#undef REAL_PAVGB
#undef PAVGB
#undef PMINUB
#undef PMAXUB

/* Packed byte average with rounding ("pavgb"); only emitted for MMXEXT+.
 * The REAL_/plain macro pair forces argument expansion before
 * stringification. */
#if TEMPLATE_PP_MMXEXT
#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#endif
#define PAVGB(a,b) REAL_PAVGB(a,b)

/* Byte-wise unsigned minimum; the third argument (a scratch register for
 * non-MMXEXT fallbacks) is unused in this expansion. */
#if TEMPLATE_PP_MMXEXT
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#endif

/* Byte-wise unsigned maximum. */
#if TEMPLATE_PP_MMXEXT
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#endif
87 
88 //FIXME? |255-0| = 1 (should not be a problem ...)
89 #if TEMPLATE_PP_MMXEXT
/**
 * Check if the middle 8x8 Block in the given 8x16 block is flat.
 *
 * @param src    pointer to the top of the 8x16 block; advanced internally by
 *               4 lines so the checks run on the central 8x8 block
 * @param stride distance in bytes between vertically adjacent pixels
 * @param c      postprocessing context; supplies the per-QP DC offset /
 *               threshold tables and the packed QP bytes (pQPb)
 * @return 0 if the block is flat and the DC check passed,
 *         1 if flat but the DC check failed (max-min spread > 2*QP),
 *         2 if not flat (too few "similar neighbour" rows)
 */
static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c){
    int numEq= 0, dcOk;
    src+= stride*4; // src points to begin of the 8x8 Block
    /* Load per-QP constants: mm7 = DC offset, mm6 = DC compare threshold. */
    __asm__ volatile(
        "movq %0, %%mm7                         \n\t"
        "movq %1, %%mm6                         \n\t"
        : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
        );

    /* For each pair of adjacent rows: accumulate a per-byte "difference is
     * small" flag into mm0 (via paddb of pcmpgtb results, i.e. counting
     * downwards), while mm4/mm3 track the per-byte max/min over all rows. */
    __asm__ volatile(
        "lea (%2, %3), %%"FF_REG_a"             \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %1      eax     eax+%2  eax+2%2 %1+4%2  ecx     ecx+%2  ecx+2%2 %1+8%2  ecx+4%2

        "movq (%2), %%mm0                       \n\t"
        "movq (%%"FF_REG_a"), %%mm1             \n\t"
        "movq %%mm0, %%mm3                      \n\t"
        "movq %%mm0, %%mm4                      \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm0                     \n\t" // mm0 = difference
        "paddb %%mm7, %%mm0                     \n\t"
        "pcmpgtb %%mm6, %%mm0                   \n\t"

        "movq (%%"FF_REG_a",%3), %%mm2          \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1                     \n\t"
        "paddb %%mm7, %%mm1                     \n\t"
        "pcmpgtb %%mm6, %%mm1                   \n\t"
        "paddb %%mm1, %%mm0                     \n\t"

        "movq (%%"FF_REG_a", %3, 2), %%mm1      \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2                     \n\t"
        "paddb %%mm7, %%mm2                     \n\t"
        "pcmpgtb %%mm6, %%mm2                   \n\t"
        "paddb %%mm2, %%mm0                     \n\t"

        "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"

        "movq (%2, %3, 4), %%mm2                \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1                     \n\t"
        "paddb %%mm7, %%mm1                     \n\t"
        "pcmpgtb %%mm6, %%mm1                   \n\t"
        "paddb %%mm1, %%mm0                     \n\t"

        "movq (%%"FF_REG_a"), %%mm1             \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2                     \n\t"
        "paddb %%mm7, %%mm2                     \n\t"
        "pcmpgtb %%mm6, %%mm2                   \n\t"
        "paddb %%mm2, %%mm0                     \n\t"

        "movq (%%"FF_REG_a", %3), %%mm2         \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1                     \n\t"
        "paddb %%mm7, %%mm1                     \n\t"
        "pcmpgtb %%mm6, %%mm1                   \n\t"
        "paddb %%mm1, %%mm0                     \n\t"

        "movq (%%"FF_REG_a", %3, 2), %%mm1      \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2                     \n\t"
        "paddb %%mm7, %%mm2                     \n\t"
        "pcmpgtb %%mm6, %%mm2                   \n\t"
        "paddb %%mm2, %%mm0                     \n\t"
        "psubusb %%mm3, %%mm4                   \n\t" // mm4 = per-byte max - min spread

        "                                       \n\t"
        "pxor %%mm7, %%mm7                      \n\t"
        "psadbw %%mm7, %%mm0                    \n\t" // horizontal sum of the flag bytes
        "movq %4, %%mm7                         \n\t" // QP,..., QP
        "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
        "psubusb %%mm7, %%mm4                   \n\t" // Diff <= 2QP -> 0
        "packssdw %%mm4, %%mm4                  \n\t"
        "movd %%mm0, %0                         \n\t"
        "movd %%mm4, %1                         \n\t" // nonzero iff some column spread > 2QP

        : "=r" (numEq), "=r" (dcOk)
        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
        : "%"FF_REG_a
    );

    /* The flags accumulated downwards (each match contributed -1 per byte);
     * negate and mask to recover the count. */
    numEq= (-numEq) &0xFF;
    if(numEq > c->ppMode.flatnessThreshold){
        if(dcOk) return 0;
        else return 1;
    }else{
        return 2;
    }
}
191 #endif //TEMPLATE_PP_MMXEXT
192 
193 /**
194  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
195  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
196  */
197 #if !TEMPLATE_PP_ALTIVEC
/* Vertical 9-tap (1,1,2,2,4,2,2,1,1)/16 low-pass over an 8-pixel-wide column;
 * reads 8x16, writes only the middle 8x8 (see the doxygen comment above). */
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
{
#if TEMPLATE_PP_MMXEXT
    src+= stride*3;
    /* mm6/mm7 hold clamped copies of the first/last line: a neighbour line is
     * replaced by its inner neighbour when they differ by more than QP, so
     * strong edges outside the block do not bleed in.  The filter itself is
     * built entirely from pavgb cascades (see the tap-weight comments). */
    __asm__ volatile(        //"movv %0 %1 %2\n\t"
        "movq %2, %%mm0                         \n\t"  // QP,..., QP
        "pxor %%mm4, %%mm4                      \n\t"

        "movq (%0), %%mm6                       \n\t"
        "movq (%0, %1), %%mm5                   \n\t"
        "movq %%mm5, %%mm1                      \n\t"
        "movq %%mm6, %%mm2                      \n\t"
        "psubusb %%mm6, %%mm5                   \n\t"
        "psubusb %%mm1, %%mm2                   \n\t"
        "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
        "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
        "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF

        "pand %%mm2, %%mm6                      \n\t"
        "pandn %%mm1, %%mm2                     \n\t"
        "por %%mm2, %%mm6                       \n\t"// First Line to Filter

        "movq (%0, %1, 8), %%mm5                \n\t"
        "lea (%0, %1, 4), %%"FF_REG_a"          \n\t"
        "lea (%0, %1, 8), %%"FF_REG_c"          \n\t"
        "sub %1, %%"FF_REG_c"                   \n\t"
        "add %1, %0                             \n\t" // %0 points to line 1 not 0
        "movq (%0, %1, 8), %%mm7                \n\t"
        "movq %%mm5, %%mm1                      \n\t"
        "movq %%mm7, %%mm2                      \n\t"
        "psubusb %%mm7, %%mm5                   \n\t"
        "psubusb %%mm1, %%mm2                   \n\t"
        "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
        "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
        "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF

        "pand %%mm2, %%mm7                      \n\t"
        "pandn %%mm1, %%mm2                     \n\t"
        "por %%mm2, %%mm7                       \n\t" // First Line to Filter


        //      1       2       3       4       5       6       7       8
        //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ecx     eax+4%1
        // 6 4 2 2 1 1
        // 6 4 4 2
        // 6 8 2

        "movq (%0, %1), %%mm0                   \n\t" //  1
        "movq %%mm0, %%mm1                      \n\t" //  1
        PAVGB(%%mm6, %%mm0)                           //1 1        /2
        PAVGB(%%mm6, %%mm0)                           //3 1        /4

        "movq (%0, %1, 4), %%mm2                \n\t" //     1
        "movq %%mm2, %%mm5                      \n\t" //     1
        PAVGB((%%FF_REGa), %%mm2)                     //    11     /2
        PAVGB((%0, %1, 2), %%mm2)                     //   211     /4
        "movq %%mm2, %%mm3                      \n\t" //   211     /4
        "movq (%0), %%mm4                       \n\t" // 1
        PAVGB(%%mm4, %%mm3)                           // 4 211     /8
        PAVGB(%%mm0, %%mm3)                           //642211     /16
        "movq %%mm3, (%0)                       \n\t" // X
        // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
        "movq %%mm1, %%mm0                      \n\t" //  1
        PAVGB(%%mm6, %%mm0)                           //1 1        /2
        "movq %%mm4, %%mm3                      \n\t" // 1
        PAVGB((%0,%1,2), %%mm3)                       // 1 1       /2
        PAVGB((%%FF_REGa,%1,2), %%mm5)                //     11    /2
        PAVGB((%%FF_REGa), %%mm5)                     //    211    /4
        PAVGB(%%mm5, %%mm3)                           // 2 2211    /8
        PAVGB(%%mm0, %%mm3)                           //4242211    /16
        "movq %%mm3, (%0,%1)                    \n\t" //  X
        // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
        PAVGB(%%mm4, %%mm6)                           //11         /2
        "movq (%%"FF_REG_c"), %%mm0             \n\t" //       1
        PAVGB((%%FF_REGa, %1, 2), %%mm0)              //      11/2
        "movq %%mm0, %%mm3                      \n\t" //      11/2
        PAVGB(%%mm1, %%mm0)                           //  2   11/4
        PAVGB(%%mm6, %%mm0)                           //222   11/8
        PAVGB(%%mm2, %%mm0)                           //22242211/16
        "movq (%0, %1, 2), %%mm2                \n\t" //   1
        "movq %%mm0, (%0, %1, 2)                \n\t" //   X
        // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
        "movq (%%"FF_REG_a", %1, 4), %%mm0      \n\t" //        1
        PAVGB((%%FF_REGc), %%mm0)                     //       11        /2
        PAVGB(%%mm0, %%mm6)                           //11     11        /4
        PAVGB(%%mm1, %%mm4)                           // 11              /2
        PAVGB(%%mm2, %%mm1)                           //  11             /2
        PAVGB(%%mm1, %%mm6)                           //1122   11        /8
        PAVGB(%%mm5, %%mm6)                           //112242211        /16
        "movq (%%"FF_REG_a"), %%mm5             \n\t" //    1
        "movq %%mm6, (%%"FF_REG_a")             \n\t" //    X
        // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
        "movq (%%"FF_REG_a", %1, 4), %%mm6      \n\t" //        1
        PAVGB(%%mm7, %%mm6)                           //        11       /2
        PAVGB(%%mm4, %%mm6)                           // 11     11       /4
        PAVGB(%%mm3, %%mm6)                           // 11   2211       /8
        PAVGB(%%mm5, %%mm2)                           //   11            /2
        "movq (%0, %1, 4), %%mm4                \n\t" //     1
        PAVGB(%%mm4, %%mm2)                           //   112           /4
        PAVGB(%%mm2, %%mm6)                           // 112242211       /16
        "movq %%mm6, (%0, %1, 4)                \n\t" //     X
        // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
        PAVGB(%%mm7, %%mm1)                           //  11     2       /4
        PAVGB(%%mm4, %%mm5)                           //    11           /2
        PAVGB(%%mm5, %%mm0)                           //    11 11        /4
        "movq (%%"FF_REG_a", %1, 2), %%mm6      \n\t" //      1
        PAVGB(%%mm6, %%mm1)                           //  11  4  2       /8
        PAVGB(%%mm0, %%mm1)                           //  11224222       /16
        "movq %%mm1, (%%"FF_REG_a", %1, 2)      \n\t" //      X
        // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
        PAVGB((%%FF_REGc), %%mm2)                     //   112 4         /8
        "movq (%%"FF_REG_a", %1, 4), %%mm0      \n\t" //        1
        PAVGB(%%mm0, %%mm6)                           //      1 1        /2
        PAVGB(%%mm7, %%mm6)                           //      1 12       /4
        PAVGB(%%mm2, %%mm6)                           //   1122424       /4
        "movq %%mm6, (%%"FF_REG_c")             \n\t" //       X
        // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
        PAVGB(%%mm7, %%mm5)                           //    11   2       /4
        PAVGB(%%mm7, %%mm5)                           //    11   6       /8

        PAVGB(%%mm3, %%mm0)                           //      112        /4
        PAVGB(%%mm0, %%mm5)                           //    112246       /16
        "movq %%mm5, (%%"FF_REG_a", %1, 4)      \n\t" //        X
        "sub %1, %0                             \n\t" // restore src (was advanced by one line above)

        :
        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
        : "%"FF_REG_a, "%"FF_REG_c
    );
#else //TEMPLATE_PP_MMXEXT
    /* C fallback: same 9-tap kernel, computed with a sliding-window sum per
     * column.  'first'/'last' implement the QP edge clamp done by mm6/mm7 in
     * the asm path: a border neighbour is used only if it is within QP of the
     * adjacent block line. */
    const int l1= stride;
    const int l2= stride + l1;
    const int l3= stride + l2;
    const int l4= stride + l3;
    const int l5= stride + l4;
    const int l6= stride + l5;
    const int l7= stride + l6;
    const int l8= stride + l7;
    const int l9= stride + l8;
    int x;
    src+= stride*3;
    for(x=0; x<BLOCK_SIZE; x++){
        const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
        const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];

        /* sums[i] is the running 7-sample window (with the +4 rounding bias)
         * centred so that output i averages sums[i] and sums[i+2]. */
        int sums[10];
        sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
        sums[1] = sums[0] - first  + src[l4];
        sums[2] = sums[1] - first  + src[l5];
        sums[3] = sums[2] - first  + src[l6];
        sums[4] = sums[3] - first  + src[l7];
        sums[5] = sums[4] - src[l1] + src[l8];
        sums[6] = sums[5] - src[l2] + last;
        sums[7] = sums[6] - src[l3] + last;
        sums[8] = sums[7] - src[l4] + last;
        sums[9] = sums[8] - src[l5] + last;

        src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
        src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
        src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
        src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
        src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
        src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
        src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
        src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;

        src++;
    }
#endif //TEMPLATE_PP_MMXEXT
}
368 #endif //TEMPLATE_PP_ALTIVEC
369 
370 /**
371  * Experimental Filter 1
372  * will not damage linear gradients
373  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
374  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
375  * MMX2 version does correct clipping C version does not
376  */
377 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
378 {
379 #if TEMPLATE_PP_MMXEXT
380  src+= stride*3;
381 
382  __asm__ volatile(
383  "pxor %%mm7, %%mm7 \n\t" // 0
384  "lea (%0, %1), %%"FF_REG_a" \n\t"
385  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
386 // 0 1 2 3 4 5 6 7 8 9
387 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
388  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
389  "movq (%0, %1, 4), %%mm1 \n\t" // line 4
390  "movq %%mm1, %%mm2 \n\t" // line 4
391  "psubusb %%mm0, %%mm1 \n\t"
392  "psubusb %%mm2, %%mm0 \n\t"
393  "por %%mm1, %%mm0 \n\t" // |l2 - l3|
394  "movq (%%"FF_REG_c"), %%mm3 \n\t" // line 5
395  "movq (%%"FF_REG_c", %1), %%mm4 \n\t" // line 6
396  "movq %%mm3, %%mm5 \n\t" // line 5
397  "psubusb %%mm4, %%mm3 \n\t"
398  "psubusb %%mm5, %%mm4 \n\t"
399  "por %%mm4, %%mm3 \n\t" // |l5 - l6|
400  PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
401  "movq %%mm2, %%mm1 \n\t" // line 4
402  "psubusb %%mm5, %%mm2 \n\t"
403  "movq %%mm2, %%mm4 \n\t"
404  "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
405  "psubusb %%mm1, %%mm5 \n\t"
406  "por %%mm5, %%mm4 \n\t" // |l4 - l5|
407  "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
408  "movq %%mm4, %%mm3 \n\t" // d
409  "movq %2, %%mm0 \n\t"
410  "paddusb %%mm0, %%mm0 \n\t"
411  "psubusb %%mm0, %%mm4 \n\t"
412  "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
413  "psubusb "MANGLE(b01)", %%mm3 \n\t"
414  "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
415 
416  PAVGB(%%mm7, %%mm3) // d/2
417  "movq %%mm3, %%mm1 \n\t" // d/2
418  PAVGB(%%mm7, %%mm3) // d/4
419  PAVGB(%%mm1, %%mm3) // 3*d/8
420 
421  "movq (%0, %1, 4), %%mm0 \n\t" // line 4
422  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
423  "psubusb %%mm3, %%mm0 \n\t"
424  "pxor %%mm2, %%mm0 \n\t"
425  "movq %%mm0, (%0, %1, 4) \n\t" // line 4
426 
427  "movq (%%"FF_REG_c"), %%mm0 \n\t" // line 5
428  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
429  "paddusb %%mm3, %%mm0 \n\t"
430  "pxor %%mm2, %%mm0 \n\t"
431  "movq %%mm0, (%%"FF_REG_c") \n\t" // line 5
432 
433  PAVGB(%%mm7, %%mm1) // d/4
434 
435  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
436  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
437  "psubusb %%mm1, %%mm0 \n\t"
438  "pxor %%mm2, %%mm0 \n\t"
439  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" // line 3
440 
441  "movq (%%"FF_REG_c", %1), %%mm0 \n\t" // line 6
442  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
443  "paddusb %%mm1, %%mm0 \n\t"
444  "pxor %%mm2, %%mm0 \n\t"
445  "movq %%mm0, (%%"FF_REG_c", %1) \n\t" // line 6
446 
447  PAVGB(%%mm7, %%mm1) // d/8
448 
449  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // line 2
450  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
451  "psubusb %%mm1, %%mm0 \n\t"
452  "pxor %%mm2, %%mm0 \n\t"
453  "movq %%mm0, (%%"FF_REG_a", %1) \n\t" // line 2
454 
455  "movq (%%"FF_REG_c", %1, 2), %%mm0 \n\t" // line 7
456  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
457  "paddusb %%mm1, %%mm0 \n\t"
458  "pxor %%mm2, %%mm0 \n\t"
459  "movq %%mm0, (%%"FF_REG_c", %1, 2) \n\t" // line 7
460 
461  :
462  : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
464  : "%"FF_REG_a, "%"FF_REG_c
465  );
466 #else //TEMPLATE_PP_MMXEXT
467 
468  const int l1= stride;
469  const int l2= stride + l1;
470  const int l3= stride + l2;
471  const int l4= stride + l3;
472  const int l5= stride + l4;
473  const int l6= stride + l5;
474  const int l7= stride + l6;
475 // const int l8= stride + l7;
476 // const int l9= stride + l8;
477  int x;
478 
479  src+= stride*3;
480  for(x=0; x<BLOCK_SIZE; x++){
481  int a= src[l3] - src[l4];
482  int b= src[l4] - src[l5];
483  int c= src[l5] - src[l6];
484 
485  int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
486  d= FFMAX(d, 0);
487 
488  if(d < co->QP*2){
489  int v = d * FFSIGN(-b);
490 
491  src[l2] +=v>>3;
492  src[l3] +=v>>2;
493  src[l4] +=(3*v)>>3;
494  src[l5] -=(3*v)>>3;
495  src[l6] -=v>>2;
496  src[l7] -=v>>3;
497  }
498  src++;
499  }
500 #endif //TEMPLATE_PP_MMXEXT
501 }
502 
503 #if !TEMPLATE_PP_ALTIVEC
/* Default vertical deblocking filter: computes left/middle/right "energy"
 * around the block boundary and, when the middle energy is below 8*QP,
 * moves lines 4 and 5 towards each other by a clipped correction d
 * (the C fallback at the bottom is the readable reference for the math). */
static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
{
#if TEMPLATE_PP_MMXEXT
/*
    uint8_t tmp[16];
    const int l1= stride;
    const int l2= stride + l1;
    const int l3= stride + l2;
    const int l4= (int)tmp - (int)src - stride*3;
    const int l5= (int)tmp - (int)src - stride*3 + 8;
    const int l6= stride*3 + l3;
    const int l7= stride + l6;
    const int l8= stride + l7;

    memcpy(tmp, src+stride*7, 8);
    memcpy(tmp+8, src+stride*8, 8);
*/
    src+= stride*4;
    __asm__ volatile(

#if 0 //slightly more accurate and slightly slower
        "pxor %%mm7, %%mm7                      \n\t" // 0
        "lea (%0, %1), %%"FF_REG_a"             \n\t"
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
//      0       1       2       3       4       5       6       7
//      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1


        "movq (%0, %1, 2), %%mm0                \n\t" // l2
        "movq (%0), %%mm1                       \n\t" // l0
        "movq %%mm0, %%mm2                      \n\t" // l2
        PAVGB(%%mm7, %%mm0)                           // ~l2/2
        PAVGB(%%mm1, %%mm0)                           // ~(l2 + 2l0)/4
        PAVGB(%%mm2, %%mm0)                           // ~(5l2 + 2l0)/8

        "movq (%%"FF_REG_a"), %%mm1             \n\t" // l1
        "movq (%%"FF_REG_a", %1, 2), %%mm3      \n\t" // l3
        "movq %%mm1, %%mm4                      \n\t" // l1
        PAVGB(%%mm7, %%mm1)                           // ~l1/2
        PAVGB(%%mm3, %%mm1)                           // ~(l1 + 2l3)/4
        PAVGB(%%mm4, %%mm1)                           // ~(5l1 + 2l3)/8

        "movq %%mm0, %%mm4                      \n\t" // ~(5l2 + 2l0)/8
        "psubusb %%mm1, %%mm0                   \n\t"
        "psubusb %%mm4, %%mm1                   \n\t"
        "por %%mm0, %%mm1                       \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0

        "movq (%0, %1, 4), %%mm0                \n\t" // l4
        "movq %%mm0, %%mm4                      \n\t" // l4
        PAVGB(%%mm7, %%mm0)                           // ~l4/2
        PAVGB(%%mm2, %%mm0)                           // ~(l4 + 2l2)/4
        PAVGB(%%mm4, %%mm0)                           // ~(5l4 + 2l2)/8

        "movq (%%"FF_REG_c"), %%mm2             \n\t" // l5
        "movq %%mm3, %%mm5                      \n\t" // l3
        PAVGB(%%mm7, %%mm3)                           // ~l3/2
        PAVGB(%%mm2, %%mm3)                           // ~(l3 + 2l5)/4
        PAVGB(%%mm5, %%mm3)                           // ~(5l3 + 2l5)/8

        "movq %%mm0, %%mm6                      \n\t" // ~(5l4 + 2l2)/8
        "psubusb %%mm3, %%mm0                   \n\t"
        "psubusb %%mm6, %%mm3                   \n\t"
        "por %%mm0, %%mm3                       \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
        "pcmpeqb %%mm7, %%mm0                   \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0

        "movq (%%"FF_REG_c", %1), %%mm6         \n\t" // l6
        "movq %%mm6, %%mm5                      \n\t" // l6
        PAVGB(%%mm7, %%mm6)                           // ~l6/2
        PAVGB(%%mm4, %%mm6)                           // ~(l6 + 2l4)/4
        PAVGB(%%mm5, %%mm6)                           // ~(5l6 + 2l4)/8

        "movq (%%"FF_REG_c", %1, 2), %%mm5      \n\t" // l7
        "movq %%mm2, %%mm4                      \n\t" // l5
        PAVGB(%%mm7, %%mm2)                           // ~l5/2
        PAVGB(%%mm5, %%mm2)                           // ~(l5 + 2l7)/4
        PAVGB(%%mm4, %%mm2)                           // ~(5l5 + 2l7)/8

        "movq %%mm6, %%mm4                      \n\t" // ~(5l6 + 2l4)/8
        "psubusb %%mm2, %%mm6                   \n\t"
        "psubusb %%mm4, %%mm2                   \n\t"
        "por %%mm6, %%mm2                       \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0


        PMINUB(%%mm2, %%mm1, %%mm4)                   // MIN(|lenergy|,|renergy|)/8
        "movq %2, %%mm4                         \n\t" // QP //FIXME QP+1 ?
        "paddusb "MANGLE(b01)", %%mm4           \n\t"
        "pcmpgtb %%mm3, %%mm4                   \n\t" // |menergy|/8 < QP
        "psubusb %%mm1, %%mm3                   \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
        "pand %%mm4, %%mm3                      \n\t"

        "movq %%mm3, %%mm1                      \n\t"
//        "psubusb "MANGLE(b01)", %%mm3           \n\t"
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm7, %%mm3)
        "paddusb %%mm1, %%mm3                   \n\t"
//        "paddusb "MANGLE(b01)", %%mm3           \n\t"

        "movq (%%"FF_REG_a", %1, 2), %%mm6      \n\t" //l3
        "movq (%0, %1, 4), %%mm5                \n\t" //l4
        "movq (%0, %1, 4), %%mm4                \n\t" //l4
        "psubusb %%mm6, %%mm5                   \n\t"
        "psubusb %%mm4, %%mm6                   \n\t"
        "por %%mm6, %%mm5                       \n\t" // |l3-l4|
        "pcmpeqb %%mm7, %%mm6                   \n\t" // SIGN(l3-l4)
        "pxor %%mm6, %%mm0                      \n\t"
        "pand %%mm0, %%mm3                      \n\t"
        PMINUB(%%mm5, %%mm3, %%mm0)

        "psubusb "MANGLE(b01)", %%mm3           \n\t"
        PAVGB(%%mm7, %%mm3)

        "movq (%%"FF_REG_a", %1, 2), %%mm0      \n\t"
        "movq (%0, %1, 4), %%mm2                \n\t"
        "pxor %%mm6, %%mm0                      \n\t"
        "pxor %%mm6, %%mm2                      \n\t"
        "psubb %%mm3, %%mm0                     \n\t"
        "paddb %%mm3, %%mm2                     \n\t"
        "pxor %%mm6, %%mm0                      \n\t"
        "pxor %%mm6, %%mm2                      \n\t"
        "movq %%mm0, (%%"FF_REG_a", %1, 2)      \n\t"
        "movq %%mm2, (%0, %1, 4)                \n\t"
#endif //0

        /* Active path: works on (value - 128) differences built with PAVGB
         * against complemented operands, so the energies live biased
         * around 128. */
        "lea (%0, %1), %%"FF_REG_a"             \n\t"
        "pcmpeqb %%mm6, %%mm6                   \n\t" // -1
//      0       1       2       3       4       5       6       7
//      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1


        "movq (%%"FF_REG_a", %1, 2), %%mm1      \n\t" // l3
        "movq (%0, %1, 4), %%mm0                \n\t" // l4
        "pxor %%mm6, %%mm1                      \n\t" // -l3-1
        PAVGB(%%mm1, %%mm0)                           // -q+128 = (l4-l3+256)/2
// mm1=-l3-1, mm0=128-q

        "movq (%%"FF_REG_a", %1, 4), %%mm2      \n\t" // l5
        "movq (%%"FF_REG_a", %1), %%mm3         \n\t" // l2
        "pxor %%mm6, %%mm2                      \n\t" // -l5-1
        "movq %%mm2, %%mm5                      \n\t" // -l5-1
        "movq "MANGLE(b80)", %%mm4              \n\t" // 128
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
        PAVGB(%%mm3, %%mm2)                           // (l2-l5+256)/2
        PAVGB(%%mm0, %%mm4)                           // ~(l4-l3)/4 + 128
        PAVGB(%%mm2, %%mm4)                           // ~(l2-l5)/4 +(l4-l3)/8 + 128
        PAVGB(%%mm0, %%mm4)                           // ~(l2-l5)/8 +5(l4-l3)/16 + 128
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1

        "movq (%%"FF_REG_a"), %%mm2             \n\t" // l1
        "pxor %%mm6, %%mm2                      \n\t" // -l1-1
        PAVGB(%%mm3, %%mm2)                           // (l2-l1+256)/2
        PAVGB((%0), %%mm1)                            // (l0-l3+256)/2
        "movq "MANGLE(b80)", %%mm3              \n\t" // 128
        PAVGB(%%mm2, %%mm3)                           // ~(l2-l1)/4 + 128
        PAVGB(%%mm1, %%mm3)                           // ~(l0-l3)/4 +(l2-l1)/8 + 128
        PAVGB(%%mm2, %%mm3)                           // ~(l0-l3)/8 +5(l2-l1)/16 + 128
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1

        PAVGB((%%FF_REGc, %1), %%mm5)                 // (l6-l5+256)/2
        "movq (%%"FF_REG_c", %1, 2), %%mm1      \n\t" // l7
        "pxor %%mm6, %%mm1                      \n\t" // -l7-1
        PAVGB((%0, %1, 4), %%mm1)                     // (l4-l7+256)/2
        "movq "MANGLE(b80)", %%mm2              \n\t" // 128
        PAVGB(%%mm5, %%mm2)                           // ~(l6-l5)/4 + 128
        PAVGB(%%mm1, %%mm2)                           // ~(l4-l7)/4 +(l6-l5)/8 + 128
        PAVGB(%%mm5, %%mm2)                           // ~(l4-l7)/8 +5(l6-l5)/16 + 128
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128

        "movq "MANGLE(b00)", %%mm1              \n\t" // 0
        "movq "MANGLE(b00)", %%mm5              \n\t" // 0
        "psubb %%mm2, %%mm1                     \n\t" // 128 - renergy/16
        "psubb %%mm3, %%mm5                     \n\t" // 128 - lenergy/16
        PMAXUB(%%mm1, %%mm2)                          // 128 + |renergy/16|
        PMAXUB(%%mm5, %%mm3)                          // 128 + |lenergy/16|
        PMINUB(%%mm2, %%mm3, %%mm1)                   // 128 + MIN(|lenergy|,|renergy|)/16

// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128

        "movq "MANGLE(b00)", %%mm7              \n\t" // 0
        "movq %2, %%mm2                         \n\t" // QP
        PAVGB(%%mm6, %%mm2)                           // 128 + QP/2
        "psubb %%mm6, %%mm2                     \n\t"

        "movq %%mm4, %%mm1                      \n\t"
        "pcmpgtb %%mm7, %%mm1                   \n\t" // SIGN(menergy)
        "pxor %%mm1, %%mm4                      \n\t"
        "psubb %%mm1, %%mm4                     \n\t" // 128 + |menergy|/16
        "pcmpgtb %%mm4, %%mm2                   \n\t" // |menergy|/16 < QP/2
        "psubusb %%mm3, %%mm4                   \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16

        "movq %%mm4, %%mm3                      \n\t" // d
        "psubusb "MANGLE(b01)", %%mm4           \n\t"
        PAVGB(%%mm7, %%mm4)                           // d/32
        PAVGB(%%mm7, %%mm4)                           // (d + 32)/64
        "paddb %%mm3, %%mm4                     \n\t" // 5d/64
        "pand %%mm2, %%mm4                      \n\t"

        "movq "MANGLE(b80)", %%mm5              \n\t" // 128
        "psubb %%mm0, %%mm5                     \n\t" // q
        "paddsb %%mm6, %%mm5                    \n\t" // fix bad rounding
        "pcmpgtb %%mm5, %%mm7                   \n\t" // SIGN(q)
        "pxor %%mm7, %%mm5                      \n\t"

        PMINUB(%%mm5, %%mm4, %%mm3)                   // MIN(|q|, 5d/64)
        "pxor %%mm1, %%mm7                      \n\t" // SIGN(d*q)

        /* Apply the signed correction to l3 and l4 with the xor sandwich. */
        "pand %%mm7, %%mm4                      \n\t"
        "movq (%%"FF_REG_a", %1, 2), %%mm0      \n\t"
        "movq (%0, %1, 4), %%mm2                \n\t"
        "pxor %%mm1, %%mm0                      \n\t"
        "pxor %%mm1, %%mm2                      \n\t"
        "paddb %%mm4, %%mm0                     \n\t"
        "psubb %%mm4, %%mm2                     \n\t"
        "pxor %%mm1, %%mm0                      \n\t"
        "pxor %%mm1, %%mm2                      \n\t"
        "movq %%mm0, (%%"FF_REG_a", %1, 2)      \n\t"
        "movq %%mm2, (%0, %1, 4)                \n\t"

        :
        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
          NAMED_CONSTRAINTS_ADD(b80,b00,b01)
        : "%"FF_REG_a, "%"FF_REG_c
    );

/*
    {
    int x;
    src-= stride;
    for(x=0; x<BLOCK_SIZE; x++){
        const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
        if(FFABS(middleEnergy)< 8*QP){
            const int q=(src[l4] - src[l5])/2;
            const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
            const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
            d= FFMAX(d, 0);

            d= (5*d + 32) >> 6;
            d*= FFSIGN(-middleEnergy);

            if(q>0){
                d= d<0 ? 0 : d;
                d= d>q ? q : d;
            }else{
                d= d>0 ? 0 : d;
                d= d<q ? q : d;
            }

            src[l4]-= d;
            src[l5]+= d;
        }
        src++;
    }
    src-=8;
    for(x=0; x<8; x++){
        int y;
        for(y=4; y<6; y++){
            int d= src[x+y*stride] - tmp[x+(y-4)*8];
            int ad= FFABS(d);
            static int max=0;
            static int sum=0;
            static int num=0;
            static int bias=0;

            if(max<ad) max=ad;
            sum+= ad>3 ? 1 : 0;
            if(ad>3){
                src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
            }
            if(y==4) bias+=d;
            num++;
            if(num%1000000 == 0){
                av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
            }
        }
    }
}
*/
#else //TEMPLATE_PP_MMXEXT
    /* Reference C implementation of the same filter. */
    const int l1= stride;
    const int l2= stride + l1;
    const int l3= stride + l2;
    const int l4= stride + l3;
    const int l5= stride + l4;
    const int l6= stride + l5;
    const int l7= stride + l6;
    const int l8= stride + l7;
//  const int l9= stride + l8;
    int x;
    src+= stride*3;
    for(x=0; x<BLOCK_SIZE; x++){
        const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
        if(FFABS(middleEnergy) < 8*c->QP){
            const int q=(src[l4] - src[l5])/2;
            const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
            const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
            d= FFMAX(d, 0);

            d= (5*d + 32) >> 6;
            d*= FFSIGN(-middleEnergy);

            /* Clip d to the [0,q] (or [q,0]) interval so the step is never
             * over-corrected. */
            if(q>0){
                d = FFMAX(d, 0);
                d = FFMIN(d, q);
            }else{
                d = FFMIN(d, 0);
                d = FFMAX(d, q);
            }

            src[l4]-= d;
            src[l5]+= d;
        }
        src++;
    }
#endif //TEMPLATE_PP_MMXEXT
}
828 #endif //TEMPLATE_PP_ALTIVEC
829 
830 #if !TEMPLATE_PP_ALTIVEC
831 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
832 {
833 #if HAVE_7REGS && TEMPLATE_PP_MMXEXT
834  DECLARE_ALIGNED(8, uint64_t, tmp)[3];
835  __asm__ volatile(
836  "pxor %%mm6, %%mm6 \n\t"
837  "pcmpeqb %%mm7, %%mm7 \n\t"
838  "movq %2, %%mm0 \n\t"
839  "punpcklbw %%mm6, %%mm0 \n\t"
840  "psrlw $1, %%mm0 \n\t"
841  "psubw %%mm7, %%mm0 \n\t"
842  "packuswb %%mm0, %%mm0 \n\t"
843  "movq %%mm0, %3 \n\t"
844 
845  "lea (%0, %1), %%"FF_REG_a" \n\t"
846  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
847 
848 // 0 1 2 3 4 5 6 7 8 9
849 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
850 
851 #undef REAL_FIND_MIN_MAX
852 #undef FIND_MIN_MAX
853 #define REAL_FIND_MIN_MAX(addr)\
854  "movq " #addr ", %%mm0 \n\t"\
855  "pminub %%mm0, %%mm7 \n\t"\
856  "pmaxub %%mm0, %%mm6 \n\t"
857 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
858 
859 FIND_MIN_MAX((%%FF_REGa))
860 FIND_MIN_MAX((%%FF_REGa, %1))
861 FIND_MIN_MAX((%%FF_REGa, %1, 2))
862 FIND_MIN_MAX((%0, %1, 4))
863 FIND_MIN_MAX((%%FF_REGd))
864 FIND_MIN_MAX((%%FF_REGd, %1))
865 FIND_MIN_MAX((%%FF_REGd, %1, 2))
866 FIND_MIN_MAX((%0, %1, 8))
867 
868  "movq %%mm7, %%mm4 \n\t"
869  "psrlq $8, %%mm7 \n\t"
870  "pminub %%mm4, %%mm7 \n\t" // min of pixels
871  "pshufw $0xF9, %%mm7, %%mm4 \n\t"
872  "pminub %%mm4, %%mm7 \n\t" // min of pixels
873  "pshufw $0xFE, %%mm7, %%mm4 \n\t"
874  "pminub %%mm4, %%mm7 \n\t"
875 
876 
877  "movq %%mm6, %%mm4 \n\t"
878  "psrlq $8, %%mm6 \n\t"
879  "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
880  "pshufw $0xF9, %%mm6, %%mm4 \n\t"
881  "pmaxub %%mm4, %%mm6 \n\t"
882  "pshufw $0xFE, %%mm6, %%mm4 \n\t"
883  "pmaxub %%mm4, %%mm6 \n\t"
884  "movq %%mm6, %%mm0 \n\t" // max
885  "psubb %%mm7, %%mm6 \n\t" // max - min
886  "push %%"FF_REG_a" \n\t"
887  "movd %%mm6, %%eax \n\t"
888  "cmpb "MANGLE(deringThreshold)", %%al \n\t"
889  "pop %%"FF_REG_a" \n\t"
890  " jb 1f \n\t"
891  PAVGB(%%mm0, %%mm7) // a=(max + min)/2
892  "punpcklbw %%mm7, %%mm7 \n\t"
893  "punpcklbw %%mm7, %%mm7 \n\t"
894  "punpcklbw %%mm7, %%mm7 \n\t"
895  "movq %%mm7, (%4) \n\t"
896 
897  "movq (%0), %%mm0 \n\t" // L10
898  "movq %%mm0, %%mm1 \n\t" // L10
899  "movq %%mm0, %%mm2 \n\t" // L10
900  "psllq $8, %%mm1 \n\t"
901  "psrlq $8, %%mm2 \n\t"
902  "movd -4(%0), %%mm3 \n\t"
903  "movd 8(%0), %%mm4 \n\t"
904  "psrlq $24, %%mm3 \n\t"
905  "psllq $56, %%mm4 \n\t"
906  "por %%mm3, %%mm1 \n\t" // L00
907  "por %%mm4, %%mm2 \n\t" // L20
908  "movq %%mm1, %%mm3 \n\t" // L00
909  PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
910  PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
911  "psubusb %%mm7, %%mm0 \n\t"
912  "psubusb %%mm7, %%mm2 \n\t"
913  "psubusb %%mm7, %%mm3 \n\t"
914  "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
915  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
916  "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
917  "paddb %%mm2, %%mm0 \n\t"
918  "paddb %%mm3, %%mm0 \n\t"
919 
920  "movq (%%"FF_REG_a"), %%mm2 \n\t" // L11
921  "movq %%mm2, %%mm3 \n\t" // L11
922  "movq %%mm2, %%mm4 \n\t" // L11
923  "psllq $8, %%mm3 \n\t"
924  "psrlq $8, %%mm4 \n\t"
925  "movd -4(%%"FF_REG_a"), %%mm5 \n\t"
926  "movd 8(%%"FF_REG_a"), %%mm6 \n\t"
927  "psrlq $24, %%mm5 \n\t"
928  "psllq $56, %%mm6 \n\t"
929  "por %%mm5, %%mm3 \n\t" // L01
930  "por %%mm6, %%mm4 \n\t" // L21
931  "movq %%mm3, %%mm5 \n\t" // L01
932  PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
933  PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
934  "psubusb %%mm7, %%mm2 \n\t"
935  "psubusb %%mm7, %%mm4 \n\t"
936  "psubusb %%mm7, %%mm5 \n\t"
937  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
938  "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
939  "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
940  "paddb %%mm4, %%mm2 \n\t"
941  "paddb %%mm5, %%mm2 \n\t"
942 // 0, 2, 3, 1
943 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
944  "movq " #src ", " #sx " \n\t" /* src[0] */\
945  "movq " #sx ", " #lx " \n\t" /* src[0] */\
946  "movq " #sx ", " #t0 " \n\t" /* src[0] */\
947  "psllq $8, " #lx " \n\t"\
948  "psrlq $8, " #t0 " \n\t"\
949  "movd -4" #src ", " #t1 " \n\t"\
950  "psrlq $24, " #t1 " \n\t"\
951  "por " #t1 ", " #lx " \n\t" /* src[-1] */\
952  "movd 8" #src ", " #t1 " \n\t"\
953  "psllq $56, " #t1 " \n\t"\
954  "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
955  "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
956  PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
957  PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
958  PAVGB(lx, pplx) \
959  "movq " #lx ", 8(%4) \n\t"\
960  "movq (%4), " #lx " \n\t"\
961  "psubusb " #lx ", " #t1 " \n\t"\
962  "psubusb " #lx ", " #t0 " \n\t"\
963  "psubusb " #lx ", " #sx " \n\t"\
964  "movq "MANGLE(b00)", " #lx " \n\t"\
965  "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
966  "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
967  "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
968  "paddb " #t1 ", " #t0 " \n\t"\
969  "paddb " #t0 ", " #sx " \n\t"\
970 \
971  PAVGB(plx, pplx) /* filtered */\
972  "movq " #dst ", " #t0 " \n\t" /* dst */\
973  "movq " #t0 ", " #t1 " \n\t" /* dst */\
974  "psubusb %3, " #t0 " \n\t"\
975  "paddusb %3, " #t1 " \n\t"\
976  PMAXUB(t0, pplx)\
977  PMINUB(t1, pplx, t0)\
978  "paddb " #sx ", " #ppsx " \n\t"\
979  "paddb " #psx ", " #ppsx " \n\t"\
980  "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
981  "pand "MANGLE(b08)", " #ppsx " \n\t"\
982  "pcmpeqb " #lx ", " #ppsx " \n\t"\
983  "pand " #ppsx ", " #pplx " \n\t"\
984  "pandn " #dst ", " #ppsx " \n\t"\
985  "por " #pplx ", " #ppsx " \n\t"\
986  "movq " #ppsx ", " #dst " \n\t"\
987  "movq 8(%4), " #lx " \n\t"
988 
989 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
990  REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
991 /*
992 0000000
993 1111111
994 
995 1111110
996 1111101
997 1111100
998 1111011
999 1111010
1000 1111001
1001 
1002 1111000
1003 1110111
1004 
1005 */
1006 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1007 DERING_CORE((%%FF_REGa) ,(%%FF_REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1008 DERING_CORE((%%FF_REGa, %1) ,(%%FF_REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1009 DERING_CORE((%%FF_REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1010 DERING_CORE((%0, %1, 4) ,(%%FF_REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1011 DERING_CORE((%%FF_REGd) ,(%%FF_REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1012 DERING_CORE((%%FF_REGd, %1) ,(%%FF_REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1013 DERING_CORE((%%FF_REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1014 DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1015 
1016  "1: \n\t"
1017  : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
1018  NAMED_CONSTRAINTS_ADD(deringThreshold,b00,b02,b08)
1019  : "%"FF_REG_a, "%"FF_REG_d
1020  );
1021 #else // HAVE_7REGS && TEMPLATE_PP_MMXEXT
1022  int y;
1023  int min=255;
1024  int max=0;
1025  int avg;
1026  uint8_t *p;
1027  int s[10];
1028  const int QP2= c->QP/2 + 1;
1029 
1030  src --;
1031  for(y=1; y<9; y++){
1032  int x;
1033  p= src + stride*y;
1034  for(x=1; x<9; x++){
1035  p++;
1036  if(*p > max) max= *p;
1037  if(*p < min) min= *p;
1038  }
1039  }
1040  avg= (min + max + 1)>>1;
1041 
1042  if(max - min <deringThreshold) return;
1043 
1044  for(y=0; y<10; y++){
1045  int t = 0;
1046 
1047  if(src[stride*y + 0] > avg) t+= 1;
1048  if(src[stride*y + 1] > avg) t+= 2;
1049  if(src[stride*y + 2] > avg) t+= 4;
1050  if(src[stride*y + 3] > avg) t+= 8;
1051  if(src[stride*y + 4] > avg) t+= 16;
1052  if(src[stride*y + 5] > avg) t+= 32;
1053  if(src[stride*y + 6] > avg) t+= 64;
1054  if(src[stride*y + 7] > avg) t+= 128;
1055  if(src[stride*y + 8] > avg) t+= 256;
1056  if(src[stride*y + 9] > avg) t+= 512;
1057 
1058  t |= (~t)<<16;
1059  t &= (t<<1) & (t>>1);
1060  s[y] = t;
1061  }
1062 
1063  for(y=1; y<9; y++){
1064  int t = s[y-1] & s[y] & s[y+1];
1065  t|= t>>16;
1066  s[y-1]= t;
1067  }
1068 
1069  for(y=1; y<9; y++){
1070  int x;
1071  int t = s[y-1];
1072 
1073  p= src + stride*y;
1074  for(x=1; x<9; x++){
1075  p++;
1076  if(t & (1<<x)){
1077  int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1078  +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1079  +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1080  f= (f + 8)>>4;
1081 
1082 #ifdef DEBUG_DERING_THRESHOLD
1083  __asm__ volatile("emms\n\t":);
1084  {
1085  static uint64_t numPixels=0;
1086  if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1087 // if((max-min)<20 || (max-min)*QP<200)
1088 // if((max-min)*QP < 500)
1089 // if(max-min<QP/2)
1090  if(max-min < 20){
1091  static int numSkipped=0;
1092  static int errorSum=0;
1093  static int worstQP=0;
1094  static int worstRange=0;
1095  static int worstDiff=0;
1096  int diff= (f - *p);
1097  int absDiff= FFABS(diff);
1098  int error= diff*diff;
1099 
1100  if(x==1 || x==8 || y==1 || y==8) continue;
1101 
1102  numSkipped++;
1103  if(absDiff > worstDiff){
1104  worstDiff= absDiff;
1105  worstQP= QP;
1106  worstRange= max-min;
1107  }
1108  errorSum+= error;
1109 
1110  if(1024LL*1024LL*1024LL % numSkipped == 0){
1111  av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1112  "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1113  (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
1114  worstDiff, (float)numSkipped/numPixels);
1115  }
1116  }
1117  }
1118 #endif
1119  if (*p + QP2 < f) *p= *p + QP2;
1120  else if(*p - QP2 > f) *p= *p - QP2;
1121  else *p=f;
1122  }
1123  }
1124  }
1125 #ifdef DEBUG_DERING_THRESHOLD
1126  if(max-min < 20){
1127  for(y=1; y<9; y++){
1128  int x;
1129  int t = 0;
1130  p= src + stride*y;
1131  for(x=1; x<9; x++){
1132  p++;
1133  *p = FFMIN(*p + 20, 255);
1134  }
1135  }
1136 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1137  }
1138 #endif
1139 #endif //TEMPLATE_PP_MMXEXT
1140 }
1141 #endif //TEMPLATE_PP_ALTIVEC
1142 
1143 /**
1144  * Deinterlace the given block by linearly interpolating every second line.
1145  * will be called for every 8x8 block and can read & write from line 4-15
1146  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1147  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1148  *
1149  * Each odd line (5,7,9,11 of the picture, i.e. lines 1,3,5,7 of the shifted
1150  * block) is replaced by the average of its two vertical neighbours.
1151  */
1149 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1150 {
1151 #if TEMPLATE_PP_MMXEXT
1152  src+= 4*stride;
1153  __asm__ volatile(
1154  "lea (%0, %1), %%"FF_REG_a" \n\t"
1155  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
1156 // 0 1 2 3 4 5 6 7 8 9
1157 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
1158 
// line1 = avg(line0, line2); PAVGB rounds up: (a + b + 1) >> 1 per byte
1159  "movq (%0), %%mm0 \n\t"
1160  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1161  PAVGB(%%mm1, %%mm0)
1162  "movq %%mm0, (%%"FF_REG_a") \n\t"
// line3 = avg(line2, line4)
1163  "movq (%0, %1, 4), %%mm0 \n\t"
1164  PAVGB(%%mm0, %%mm1)
1165  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t"
// line5 = avg(line4, line6)
1166  "movq (%%"FF_REG_c", %1), %%mm1 \n\t"
1167  PAVGB(%%mm1, %%mm0)
1168  "movq %%mm0, (%%"FF_REG_c") \n\t"
// line7 = avg(line6, line8)
1169  "movq (%0, %1, 8), %%mm0 \n\t"
1170  PAVGB(%%mm0, %%mm1)
1171  "movq %%mm1, (%%"FF_REG_c", %1, 2) \n\t"
1172 
1173  : : "r" (src), "r" ((x86_reg)stride)
1174  : "%"FF_REG_a, "%"FF_REG_c
1175  );
1176 #else
1177  int a, b, x;
1178  src+= 4*stride;
1179 
 // SWAR fallback: two passes of 4 pixels (one uint32 load) per iteration.
 // (a|b) - (((a^b)&0xFEFEFEFE)>>1) is the per-byte average rounded UP,
 // computed on four packed bytes at once (matches PAVGB rounding).
1180  for(x=0; x<2; x++){
1181  a= *(uint32_t*)&src[stride*0];
1182  b= *(uint32_t*)&src[stride*2];
1183  *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1184  a= *(uint32_t*)&src[stride*4];
1185  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1186  b= *(uint32_t*)&src[stride*6];
1187  *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1188  a= *(uint32_t*)&src[stride*8];
1189  *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1190  src += 4;
1191  }
1192 #endif
1193 }
1194 
1195 /**
1196  * Deinterlace the given block by cubic interpolating every second line.
1197  * will be called for every 8x8 block and can read & write from line 4-15
1198  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1199  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1200  * this filter will read lines 3-15 and write 7-13
1201  *
1202  * Interpolated line c from neighbours a,b,d,e (b,d adjacent; a,e two lines
1203  * away): c = (-a + 9b + 9d - e) / 16, clipped to [0,255].
1204  */
1202 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1203 {
1204 #if TEMPLATE_PP_SSE2
1205  src+= stride*3;
1206  __asm__ volatile(
1207  "lea (%0, %1), %%"FF_REG_a" \n\t"
1208  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1209  "lea (%%"FF_REG_d", %1, 4), %%"FF_REG_c"\n\t"
1210  "add %1, %%"FF_REG_c" \n\t"
1211  "pxor %%xmm7, %%xmm7 \n\t"
// Approximation of the cubic tap in 16-bit words (8 pixels per movq):
//   xmm1 = avg(b,d), xmm0 = avg(a,e)
//   c    = avg(b,d) - (avg(a,e) - avg(b,d))/8  ~= (9(b+d) - (a+e))/16
// packuswb saturates the result back to unsigned bytes.
1212 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1213  "movq " #a ", %%xmm0 \n\t"\
1214  "movq " #b ", %%xmm1 \n\t"\
1215  "movq " #d ", %%xmm2 \n\t"\
1216  "movq " #e ", %%xmm3 \n\t"\
1217  "pavgb %%xmm2, %%xmm1 \n\t"\
1218  "pavgb %%xmm3, %%xmm0 \n\t"\
1219  "punpcklbw %%xmm7, %%xmm0 \n\t"\
1220  "punpcklbw %%xmm7, %%xmm1 \n\t"\
1221  "psubw %%xmm1, %%xmm0 \n\t"\
1222  "psraw $3, %%xmm0 \n\t"\
1223  "psubw %%xmm0, %%xmm1 \n\t"\
1224  "packuswb %%xmm1, %%xmm1 \n\t"\
1225  "movlps %%xmm1, " #c " \n\t"
1226 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
1227 
1228 DEINT_CUBIC((%0) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd, %1))
1229 DEINT_CUBIC((%%FF_REGa, %1), (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%0, %1, 8))
1230 DEINT_CUBIC((%0, %1, 4) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGc))
1231 DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8) , (%%FF_REGd, %1, 4), (%%FF_REGc) , (%%FF_REGc, %1, 2))
1232 
1233  : : "r" (src), "r" ((x86_reg)stride)
1234  :
1235  XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",)
1236  "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c
1237  );
1238 #undef REAL_DEINT_CUBIC
1239 #else //TEMPLATE_PP_SSE2
 // Scalar fallback: exact (-1 9 9 -1)>>4 tap, one column at a time.
 // NOTE(review): unlike the SSE2 path this does not round before shifting.
1240  int x;
1241  src+= stride*3;
1242  for(x=0; x<8; x++){
1243  src[stride*3] = av_clip_uint8((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1244  src[stride*5] = av_clip_uint8((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1245  src[stride*7] = av_clip_uint8((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1246  src[stride*9] = av_clip_uint8((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1247  src++;
1248  }
1249 #endif //TEMPLATE_PP_SSE2
1250 }
1251 
1252 /**
1253  * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1254  * will be called for every 8x8 block and can read & write from line 4-15
1255  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1256  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1257  * this filter will read lines 4-13 and write 5-11
1258  *
1259  * @param tmp 8-byte scratch row carrying the last processed line between
1260  *            vertically adjacent blocks (read at entry, rewritten at exit).
1261  */
1259 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1260 {
1261 #if TEMPLATE_PP_MMXEXT
1262  src+= stride*4;
1263  __asm__ volatile(
1264  "lea (%0, %1), %%"FF_REG_a" \n\t"
1265  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1266  "pxor %%mm7, %%mm7 \n\t"
1267  "movq (%2), %%mm0 \n\t" // mm0 = carried line from the block above (tmp)
1268 // 0 1 2 3 4 5 6 7 8 9 10
1269 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1270 
// One tap: b = clip((-mm0 + 4a + 2b + 4c - d + rounding)/8), done in two
// 16-bit halves (punpcklbw/punpckhbw against zeroed mm7). PAVGB pre-averages
// (a,c) and (mm0,d) to keep everything within 16-bit range; mm0 ends up
// holding d for the next tap.
1271 #define REAL_DEINT_FF(a,b,c,d)\
1272  "movq " #a ", %%mm1 \n\t"\
1273  "movq " #b ", %%mm2 \n\t"\
1274  "movq " #c ", %%mm3 \n\t"\
1275  "movq " #d ", %%mm4 \n\t"\
1276  PAVGB(%%mm3, %%mm1) \
1277  PAVGB(%%mm4, %%mm0) \
1278  "movq %%mm0, %%mm3 \n\t"\
1279  "punpcklbw %%mm7, %%mm0 \n\t"\
1280  "punpckhbw %%mm7, %%mm3 \n\t"\
1281  "movq %%mm1, %%mm4 \n\t"\
1282  "punpcklbw %%mm7, %%mm1 \n\t"\
1283  "punpckhbw %%mm7, %%mm4 \n\t"\
1284  "psllw $2, %%mm1 \n\t"\
1285  "psllw $2, %%mm4 \n\t"\
1286  "psubw %%mm0, %%mm1 \n\t"\
1287  "psubw %%mm3, %%mm4 \n\t"\
1288  "movq %%mm2, %%mm5 \n\t"\
1289  "movq %%mm2, %%mm0 \n\t"\
1290  "punpcklbw %%mm7, %%mm2 \n\t"\
1291  "punpckhbw %%mm7, %%mm5 \n\t"\
1292  "paddw %%mm2, %%mm1 \n\t"\
1293  "paddw %%mm5, %%mm4 \n\t"\
1294  "psraw $2, %%mm1 \n\t"\
1295  "psraw $2, %%mm4 \n\t"\
1296  "packuswb %%mm4, %%mm1 \n\t"\
1297  "movq %%mm1, " #b " \n\t"\
1298 
1299 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
1300 
1301 DEINT_FF((%0) , (%%FF_REGa) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2))
1302 DEINT_FF((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1303 DEINT_FF((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2))
1304 DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1305 
1306  "movq %%mm0, (%2) \n\t" // save carry line for the next block below
1307  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
1308  : "%"FF_REG_a, "%"FF_REG_d
1309  );
1310 #else //TEMPLATE_PP_MMXEXT
 // Scalar fallback: same (-1 4 2 4 -1)>>3 tap per column, with t1/t2
 // rolling through the "line above" values and tmp[] carrying line 8
 // into the next block.
1311  int x;
1312  src+= stride*4;
1313  for(x=0; x<8; x++){
1314  int t1= tmp[x];
1315  int t2= src[stride*1];
1316 
1317  src[stride*1]= av_clip_uint8((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1318  t1= src[stride*4];
1319  src[stride*3]= av_clip_uint8((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1320  t2= src[stride*6];
1321  src[stride*5]= av_clip_uint8((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1322  t1= src[stride*8];
1323  src[stride*7]= av_clip_uint8((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1324  tmp[x]= t1;
1325 
1326  src++;
1327  }
1328 #endif //TEMPLATE_PP_MMXEXT
1329 }
1330 
1331 /**
1332  * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
1333  * will be called for every 8x8 block and can read & write from line 4-15
1334  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1335  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1336  * this filter will read lines 4-13 and write 4-11
1337  *
1338  * @param tmp  8-byte scratch row: line two above the current one,
1339  *             carried between vertically adjacent blocks.
1340  * @param tmp2 8-byte scratch row: line directly above the current one,
1341  *             carried between vertically adjacent blocks.
1342  */
1338 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1339 {
1340 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
1341  src+= stride*4;
1342  __asm__ volatile(
1343  "lea (%0, %1), %%"FF_REG_a" \n\t"
1344  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1345  "pxor %%mm7, %%mm7 \n\t"
1346  "movq (%2), %%mm0 \n\t" // carried line n-2
1347  "movq (%3), %%mm1 \n\t" // carried line n-1
1348 // 0 1 2 3 4 5 6 7 8 9 10
1349 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1350 
// One tap: a = clip((-(t1+c) + 2*(t2+b) + 6a)/8), where t1/t2 are the two
// lines above 'a', b/c the two below. Computed as 3a + 2*avg(t2,b)
// - avg(t1,c) in two 16-bit halves; the old 'a' is parked in t1 so the
// registers roll forward for the next invocation.
1351 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1352  "movq " #a ", %%mm2 \n\t"\
1353  "movq " #b ", %%mm3 \n\t"\
1354  "movq " #c ", %%mm4 \n\t"\
1355  PAVGB(t2, %%mm3) \
1356  PAVGB(t1, %%mm4) \
1357  "movq %%mm2, %%mm5 \n\t"\
1358  "movq %%mm2, " #t1 " \n\t"\
1359  "punpcklbw %%mm7, %%mm2 \n\t"\
1360  "punpckhbw %%mm7, %%mm5 \n\t"\
1361  "movq %%mm2, %%mm6 \n\t"\
1362  "paddw %%mm2, %%mm2 \n\t"\
1363  "paddw %%mm6, %%mm2 \n\t"\
1364  "movq %%mm5, %%mm6 \n\t"\
1365  "paddw %%mm5, %%mm5 \n\t"\
1366  "paddw %%mm6, %%mm5 \n\t"\
1367  "movq %%mm3, %%mm6 \n\t"\
1368  "punpcklbw %%mm7, %%mm3 \n\t"\
1369  "punpckhbw %%mm7, %%mm6 \n\t"\
1370  "paddw %%mm3, %%mm3 \n\t"\
1371  "paddw %%mm6, %%mm6 \n\t"\
1372  "paddw %%mm3, %%mm2 \n\t"\
1373  "paddw %%mm6, %%mm5 \n\t"\
1374  "movq %%mm4, %%mm6 \n\t"\
1375  "punpcklbw %%mm7, %%mm4 \n\t"\
1376  "punpckhbw %%mm7, %%mm6 \n\t"\
1377  "psubw %%mm4, %%mm2 \n\t"\
1378  "psubw %%mm6, %%mm5 \n\t"\
1379  "psraw $2, %%mm2 \n\t"\
1380  "psraw $2, %%mm5 \n\t"\
1381  "packuswb %%mm5, %%mm2 \n\t"\
1382  "movq %%mm2, " #a " \n\t"\
1383 
1384 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
1385 
1386 DEINT_L5(%%mm0, %%mm1, (%0) , (%%FF_REGa) , (%%FF_REGa, %1) )
1387 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa) , (%%FF_REGa, %1) , (%%FF_REGa, %1, 2))
1388 DEINT_L5(%%mm0, %%mm1, (%%FF_REGa, %1) , (%%FF_REGa, %1, 2), (%0, %1, 4) )
1389 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1390 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1) )
1391 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd) , (%%FF_REGd, %1) , (%%FF_REGd, %1, 2))
1392 DEINT_L5(%%mm0, %%mm1, (%%FF_REGd, %1) , (%%FF_REGd, %1, 2), (%0, %1, 8) )
1393 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1394 
1395  "movq %%mm0, (%2) \n\t" // store new carry lines for the next block
1396  "movq %%mm1, (%3) \n\t"
1397  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1398  : "%"FF_REG_a, "%"FF_REG_d
1399  );
1400 #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
 // Scalar fallback: exact (-1 2 6 2 -1)>>3 tap per column; t1/t2/t3 roll
 // through the three most recent original (pre-filter) line values so every
 // tap uses unfiltered neighbours. tmp/tmp2 carry the last two lines.
1401  int x;
1402  src+= stride*4;
1403  for(x=0; x<8; x++){
1404  int t1= tmp[x];
1405  int t2= tmp2[x];
1406  int t3= src[0];
1407 
1408  src[stride*0]= av_clip_uint8((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1409  t1= src[stride*1];
1410  src[stride*1]= av_clip_uint8((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1411  t2= src[stride*2];
1412  src[stride*2]= av_clip_uint8((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1413  t3= src[stride*3];
1414  src[stride*3]= av_clip_uint8((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1415  t1= src[stride*4];
1416  src[stride*4]= av_clip_uint8((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1417  t2= src[stride*5];
1418  src[stride*5]= av_clip_uint8((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1419  t3= src[stride*6];
1420  src[stride*6]= av_clip_uint8((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1421  t1= src[stride*7];
1422  src[stride*7]= av_clip_uint8((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1423 
1424  tmp[x]= t3;
1425  tmp2[x]= t1;
1426 
1427  src++;
1428  }
1429 #endif // TEMPLATE_PP_MMXEXT && HAVE_6REGS
1430 }
1431 
1432 /**
1433  * Deinterlace the given block by filtering all lines with a (1 2 1) filter.
1434  * will be called for every 8x8 block and can read & write from line 4-15
1435  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1436  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1437  * this filter will read lines 4-13 and write 4-11
1438  *
1439  * Each line n becomes ~ (line(n-1) + 2*line(n) + line(n+1)) / 4, computed as
1440  * avg(line(n), avg(line(n-1), line(n+1))).
1441  *
1442  * @param tmp 8-byte scratch row carrying the line above the block
1443  *            (read at entry, rewritten with line 8 at exit).
1444  */
1439 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1440 {
1441 #if TEMPLATE_PP_MMXEXT
1442  src+= 4*stride;
1443  __asm__ volatile(
1444  "lea (%0, %1), %%"FF_REG_a" \n\t"
1445  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1446 // 0 1 2 3 4 5 6 7 8 9
1447 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1448 
// Rolling three-register pipeline: at each step one new line is loaded and
// the middle line is overwritten with avg(middle, avg(above, below)).
1449  "movq (%2), %%mm0 \n\t" // L0
1450  "movq (%%"FF_REG_a"), %%mm1 \n\t" // L2
1451  PAVGB(%%mm1, %%mm0) // L0+L2
1452  "movq (%0), %%mm2 \n\t" // L1
1453  PAVGB(%%mm2, %%mm0)
1454  "movq %%mm0, (%0) \n\t"
1455  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // L3
1456  PAVGB(%%mm0, %%mm2) // L1+L3
1457  PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1458  "movq %%mm2, (%%"FF_REG_a") \n\t"
1459  "movq (%%"FF_REG_a", %1, 2), %%mm2 \n\t" // L4
1460  PAVGB(%%mm2, %%mm1) // L2+L4
1461  PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1462  "movq %%mm1, (%%"FF_REG_a", %1) \n\t"
1463  "movq (%0, %1, 4), %%mm1 \n\t" // L5
1464  PAVGB(%%mm1, %%mm0) // L3+L5
1465  PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1466  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
1467  "movq (%%"FF_REG_d"), %%mm0 \n\t" // L6
1468  PAVGB(%%mm0, %%mm2) // L4+L6
1469  PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1470  "movq %%mm2, (%0, %1, 4) \n\t"
1471  "movq (%%"FF_REG_d", %1), %%mm2 \n\t" // L7
1472  PAVGB(%%mm2, %%mm1) // L5+L7
1473  PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1474  "movq %%mm1, (%%"FF_REG_d") \n\t"
1475  "movq (%%"FF_REG_d", %1, 2), %%mm1 \n\t" // L8
1476  PAVGB(%%mm1, %%mm0) // L6+L8
1477  PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1478  "movq %%mm0, (%%"FF_REG_d", %1) \n\t"
1479  "movq (%0, %1, 8), %%mm0 \n\t" // L9
1480  PAVGB(%%mm0, %%mm2) // L7+L9
1481  PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1482  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
1483  "movq %%mm1, (%2) \n\t" // carry L8 to the next block
1484 
1485  : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
1486  : "%"FF_REG_a, "%"FF_REG_d
1487  );
1488 #else //TEMPLATE_PP_MMXEXT
 // SWAR fallback, four packed bytes per uint32:
 //   (x&y) + (((x^y)&0xFEFEFEFE)>>1)  = per-byte average rounded DOWN
 //   (x|y) - (((x^y)&0xFEFEFEFE)>>1)  = per-byte average rounded UP
 // a/b/c rotate through above/current/below lines exactly like the MMX path.
1489  int a, b, c, x;
1490  src+= 4*stride;
1491 
1492  for(x=0; x<2; x++){
1493  a= *(uint32_t*)&tmp[stride*0];
1494  b= *(uint32_t*)&src[stride*0];
1495  c= *(uint32_t*)&src[stride*1];
1496  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1497  *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1498 
1499  a= *(uint32_t*)&src[stride*2];
1500  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1501  *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1502 
1503  b= *(uint32_t*)&src[stride*3];
1504  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1505  *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1506 
1507  c= *(uint32_t*)&src[stride*4];
1508  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1509  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1510 
1511  a= *(uint32_t*)&src[stride*5];
1512  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1513  *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1514 
1515  b= *(uint32_t*)&src[stride*6];
1516  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1517  *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1518 
1519  c= *(uint32_t*)&src[stride*7];
1520  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1521  *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1522 
1523  a= *(uint32_t*)&src[stride*8];
1524  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1525  *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1526 
1527  *(uint32_t*)&tmp[stride*0]= c; // carry for the next block
1528  src += 4;
1529  tmp += 4;
1530  }
1531 #endif //TEMPLATE_PP_MMXEXT
1532 }
1533 
1534 /**
1535  * Deinterlace the given block by applying a median filter to every second line.
1536  * will be called for every 8x8 block and can read & write from line 4-15,
1537  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1538  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1539  *
1540  * Each odd line is replaced by the per-pixel median of itself and its two
1541  * vertical neighbours.
1542  */
1540 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1541 {
1542 #if TEMPLATE_PP_MMXEXT
1543  src+= 4*stride;
1544  __asm__ volatile(
1545  "lea (%0, %1), %%"FF_REG_a" \n\t"
1546  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1547 // 0 1 2 3 4 5 6 7 8 9
1548 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1549 
// median(a,b,c) = min(max(a,b), max(min(a,b), c)) per byte, via pmaxub/pminub.
1550  "movq (%0), %%mm0 \n\t"
1551  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
1552  "movq (%%"FF_REG_a"), %%mm1 \n\t"
1553  "movq %%mm0, %%mm3 \n\t"
1554  "pmaxub %%mm1, %%mm0 \n\t"
1555  "pminub %%mm3, %%mm1 \n\t"
1556  "pmaxub %%mm2, %%mm1 \n\t"
1557  "pminub %%mm1, %%mm0 \n\t"
1558  "movq %%mm0, (%%"FF_REG_a") \n\t"
1559 
1560  "movq (%0, %1, 4), %%mm0 \n\t"
1561  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t"
1562  "movq %%mm2, %%mm3 \n\t"
1563  "pmaxub %%mm1, %%mm2 \n\t"
1564  "pminub %%mm3, %%mm1 \n\t"
1565  "pmaxub %%mm0, %%mm1 \n\t"
1566  "pminub %%mm1, %%mm2 \n\t"
1567  "movq %%mm2, (%%"FF_REG_a", %1, 2) \n\t"
1568 
1569  "movq (%%"FF_REG_d"), %%mm2 \n\t"
1570  "movq (%%"FF_REG_d", %1), %%mm1 \n\t"
1571  "movq %%mm2, %%mm3 \n\t"
1572  "pmaxub %%mm0, %%mm2 \n\t"
1573  "pminub %%mm3, %%mm0 \n\t"
1574  "pmaxub %%mm1, %%mm0 \n\t"
1575  "pminub %%mm0, %%mm2 \n\t"
1576  "movq %%mm2, (%%"FF_REG_d") \n\t"
1577 
1578  "movq (%%"FF_REG_d", %1, 2), %%mm2 \n\t"
1579  "movq (%0, %1, 8), %%mm0 \n\t"
1580  "movq %%mm2, %%mm3 \n\t"
1581  "pmaxub %%mm0, %%mm2 \n\t"
1582  "pminub %%mm3, %%mm0 \n\t"
1583  "pmaxub %%mm1, %%mm0 \n\t"
1584  "pminub %%mm0, %%mm2 \n\t"
1585  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
1586 
1587 
1588  : : "r" (src), "r" ((x86_reg)stride)
1589  : "%"FF_REG_a, "%"FF_REG_d
1590  );
1591 
1592 #else //TEMPLATE_PP_MMXEXT
1593  int x, y;
1594  src+= 4*stride;
1595  // FIXME - there should be a way to do a few columns in parallel like w/mmx
1596  for(x=0; x<8; x++){
1597  uint8_t *colsrc = src;
1598  for (y=0; y<4; y++){
1599  int a, b, c, d, e, f;
1600  a = colsrc[0 ];
1601  b = colsrc[stride ];
1602  c = colsrc[stride*2];
 // Branchless median of a,b,c: d/e/f are all-ones (-1) or 0 sign
 // masks of the pairwise differences; XOR-combining them selects
 // exactly the middle value in the AND of the three terms.
1603  d = (a-b)>>31;
1604  e = (b-c)>>31;
1605  f = (c-a)>>31;
1606  colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
1607  colsrc += stride*2;
1608  }
1609  src++;
1610  }
1611 #endif //TEMPLATE_PP_MMXEXT
1612 }
1613 
1614 #if TEMPLATE_PP_MMX
1615 /**
1616  * Transpose and shift the given 8x8 Block into dst1 and dst2.
1617  *
1618  * The transposed rows are stored with a 16-byte pitch: all 8 rows go into
1619  * dst1 starting at byte offset 128 (i.e. dst1+128+16*i), and rows 3-7 are
1620  * duplicated into dst2 starting at byte offset 48 (dst2+48+16*i), so two
1621  * overlapping vertical filters can reuse one transposition.
1622  */
1618 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, const uint8_t *src, int srcStride)
1619 {
1620  __asm__(
1621  "lea (%0, %1), %%"FF_REG_a" \n\t"
1622 // 0 1 2 3 4 5 6 7 8 9
1623 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
// First half: transpose source rows 0-3 (a 4x8 slice) via the classic
// punpcklbw/punpckhbw + punpcklwd/punpckhwd ladder, then scatter the four
// resulting columns as dwords.
1624  "movq (%0), %%mm0 \n\t" // 12345678
1625  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
1626  "movq %%mm0, %%mm2 \n\t" // 12345678
1627  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1628  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1629 
1630  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1631  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
1632  "movq %%mm1, %%mm4 \n\t"
1633  "punpcklbw %%mm3, %%mm1 \n\t"
1634  "punpckhbw %%mm3, %%mm4 \n\t"
1635 
1636  "movq %%mm0, %%mm3 \n\t"
1637  "punpcklwd %%mm1, %%mm0 \n\t"
1638  "punpckhwd %%mm1, %%mm3 \n\t"
1639  "movq %%mm2, %%mm1 \n\t"
1640  "punpcklwd %%mm4, %%mm2 \n\t"
1641  "punpckhwd %%mm4, %%mm1 \n\t"
1642 
1643  "movd %%mm0, 128(%2) \n\t"
1644  "psrlq $32, %%mm0 \n\t"
1645  "movd %%mm0, 144(%2) \n\t"
1646  "movd %%mm3, 160(%2) \n\t"
1647  "psrlq $32, %%mm3 \n\t"
1648  "movd %%mm3, 176(%2) \n\t"
1649  "movd %%mm3, 48(%3) \n\t" // row 3 onward also goes to dst2
1650  "movd %%mm2, 192(%2) \n\t"
1651  "movd %%mm2, 64(%3) \n\t"
1652  "psrlq $32, %%mm2 \n\t"
1653  "movd %%mm2, 80(%3) \n\t"
1654  "movd %%mm1, 96(%3) \n\t"
1655  "psrlq $32, %%mm1 \n\t"
1656  "movd %%mm1, 112(%3) \n\t"
1657 
1658  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_a"\n\t"
1659 
// Second half: same ladder for source rows 4-7; results land 4 bytes to
// the right of the first half (offsets +4).
1660  "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
1661  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
1662  "movq %%mm0, %%mm2 \n\t" // 12345678
1663  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1664  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1665 
1666  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1667  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
1668  "movq %%mm1, %%mm4 \n\t"
1669  "punpcklbw %%mm3, %%mm1 \n\t"
1670  "punpckhbw %%mm3, %%mm4 \n\t"
1671 
1672  "movq %%mm0, %%mm3 \n\t"
1673  "punpcklwd %%mm1, %%mm0 \n\t"
1674  "punpckhwd %%mm1, %%mm3 \n\t"
1675  "movq %%mm2, %%mm1 \n\t"
1676  "punpcklwd %%mm4, %%mm2 \n\t"
1677  "punpckhwd %%mm4, %%mm1 \n\t"
1678 
1679  "movd %%mm0, 132(%2) \n\t"
1680  "psrlq $32, %%mm0 \n\t"
1681  "movd %%mm0, 148(%2) \n\t"
1682  "movd %%mm3, 164(%2) \n\t"
1683  "psrlq $32, %%mm3 \n\t"
1684  "movd %%mm3, 180(%2) \n\t"
1685  "movd %%mm3, 52(%3) \n\t"
1686  "movd %%mm2, 196(%2) \n\t"
1687  "movd %%mm2, 68(%3) \n\t"
1688  "psrlq $32, %%mm2 \n\t"
1689  "movd %%mm2, 84(%3) \n\t"
1690  "movd %%mm1, 100(%3) \n\t"
1691  "psrlq $32, %%mm1 \n\t"
1692  "movd %%mm1, 116(%3) \n\t"
1693 
1694 
1695  :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
1696  : "%"FF_REG_a
1697  );
1698 }
1699 
1700 /**
1701  * Transpose the given 8x8 block.
1702  *
1703  * Inverse companion of transpose1(): reads the 8 rows from src with a fixed
1704  * 16-byte pitch (src, src+16, ..., src+112) and writes the transposed block
1705  * back to dst with the caller-supplied dstStride.
1706  */
1703 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, const uint8_t *src)
1704 {
1705  __asm__(
1706  "lea (%0, %1), %%"FF_REG_a" \n\t"
1707  "lea (%%"FF_REG_a",%1,4), %%"FF_REG_d" \n\t"
1708 // 0 1 2 3 4 5 6 7 8 9
1709 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
// First half: byte/word unpack ladder over src rows 0-3 (pitch 16),
// producing the left 4 columns; scattered as dwords into dst rows 0-7.
1710  "movq (%2), %%mm0 \n\t" // 12345678
1711  "movq 16(%2), %%mm1 \n\t" // abcdefgh
1712  "movq %%mm0, %%mm2 \n\t" // 12345678
1713  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1714  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1715 
1716  "movq 32(%2), %%mm1 \n\t"
1717  "movq 48(%2), %%mm3 \n\t"
1718  "movq %%mm1, %%mm4 \n\t"
1719  "punpcklbw %%mm3, %%mm1 \n\t"
1720  "punpckhbw %%mm3, %%mm4 \n\t"
1721 
1722  "movq %%mm0, %%mm3 \n\t"
1723  "punpcklwd %%mm1, %%mm0 \n\t"
1724  "punpckhwd %%mm1, %%mm3 \n\t"
1725  "movq %%mm2, %%mm1 \n\t"
1726  "punpcklwd %%mm4, %%mm2 \n\t"
1727  "punpckhwd %%mm4, %%mm1 \n\t"
1728 
1729  "movd %%mm0, (%0) \n\t"
1730  "psrlq $32, %%mm0 \n\t"
1731  "movd %%mm0, (%%"FF_REG_a") \n\t"
1732  "movd %%mm3, (%%"FF_REG_a", %1) \n\t"
1733  "psrlq $32, %%mm3 \n\t"
1734  "movd %%mm3, (%%"FF_REG_a", %1, 2) \n\t"
1735  "movd %%mm2, (%0, %1, 4) \n\t"
1736  "psrlq $32, %%mm2 \n\t"
1737  "movd %%mm2, (%%"FF_REG_d") \n\t"
1738  "movd %%mm1, (%%"FF_REG_d", %1) \n\t"
1739  "psrlq $32, %%mm1 \n\t"
1740  "movd %%mm1, (%%"FF_REG_d", %1, 2) \n\t"
1741 
1742 
// Second half: src rows 4-7 yield the right 4 columns (dst offsets +4).
1743  "movq 64(%2), %%mm0 \n\t" // 12345678
1744  "movq 80(%2), %%mm1 \n\t" // abcdefgh
1745  "movq %%mm0, %%mm2 \n\t" // 12345678
1746  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1747  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1748 
1749  "movq 96(%2), %%mm1 \n\t"
1750  "movq 112(%2), %%mm3 \n\t"
1751  "movq %%mm1, %%mm4 \n\t"
1752  "punpcklbw %%mm3, %%mm1 \n\t"
1753  "punpckhbw %%mm3, %%mm4 \n\t"
1754 
1755  "movq %%mm0, %%mm3 \n\t"
1756  "punpcklwd %%mm1, %%mm0 \n\t"
1757  "punpckhwd %%mm1, %%mm3 \n\t"
1758  "movq %%mm2, %%mm1 \n\t"
1759  "punpcklwd %%mm4, %%mm2 \n\t"
1760  "punpckhwd %%mm4, %%mm1 \n\t"
1761 
1762  "movd %%mm0, 4(%0) \n\t"
1763  "psrlq $32, %%mm0 \n\t"
1764  "movd %%mm0, 4(%%"FF_REG_a") \n\t"
1765  "movd %%mm3, 4(%%"FF_REG_a", %1) \n\t"
1766  "psrlq $32, %%mm3 \n\t"
1767  "movd %%mm3, 4(%%"FF_REG_a", %1, 2) \n\t"
1768  "movd %%mm2, 4(%0, %1, 4) \n\t"
1769  "psrlq $32, %%mm2 \n\t"
1770  "movd %%mm2, 4(%%"FF_REG_d") \n\t"
1771  "movd %%mm1, 4(%%"FF_REG_d", %1) \n\t"
1772  "psrlq $32, %%mm1 \n\t"
1773  "movd %%mm1, 4(%%"FF_REG_d", %1, 2) \n\t"
1774 
1775  :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
1776  : "%"FF_REG_a, "%"FF_REG_d
1777  );
1778 }
1779 #endif //TEMPLATE_PP_MMX
1780 //static long test=0;
1781 
1782 #if !TEMPLATE_PP_ALTIVEC
1783 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
1784  uint8_t *tempBlurred, uint32_t *tempBlurredPast, const int *maxNoise)
1785 {
1786  // to save a register (FIXME do this outside of the loops)
1787  tempBlurredPast[127]= maxNoise[0];
1788  tempBlurredPast[128]= maxNoise[1];
1789  tempBlurredPast[129]= maxNoise[2];
1790 
1791 #define FAST_L2_DIFF
1792 //#define L1_DIFF //u should change the thresholds too if u try that one
1793 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
1794  __asm__ volatile(
1795  "lea (%2, %2, 2), %%"FF_REG_a" \n\t" // 3*stride
1796  "lea (%2, %2, 4), %%"FF_REG_d" \n\t" // 5*stride
1797  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1798 // 0 1 2 3 4 5 6 7 8 9
1799 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
1800 //FIXME reorder?
1801 #ifdef L1_DIFF //needs mmx2
1802  "movq (%0), %%mm0 \n\t" // L0
1803  "psadbw (%1), %%mm0 \n\t" // |L0-R0|
1804  "movq (%0, %2), %%mm1 \n\t" // L1
1805  "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
1806  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1807  "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
1808  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1809  "psadbw (%1, %%"FF_REG_a"), %%mm3 \n\t" // |L3-R3|
1810 
1811  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1812  "paddw %%mm1, %%mm0 \n\t"
1813  "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
1814  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1815  "paddw %%mm2, %%mm0 \n\t"
1816  "psadbw (%1, %%"FF_REG_d"), %%mm5 \n\t" // |L5-R5|
1817  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1818  "paddw %%mm3, %%mm0 \n\t"
1819  "psadbw (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // |L6-R6|
1820  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1821  "paddw %%mm4, %%mm0 \n\t"
1822  "psadbw (%1, %%"FF_REG_c"), %%mm7 \n\t" // |L7-R7|
1823  "paddw %%mm5, %%mm6 \n\t"
1824  "paddw %%mm7, %%mm6 \n\t"
1825  "paddw %%mm6, %%mm0 \n\t"
1826 #else //L1_DIFF
1827 #if defined (FAST_L2_DIFF)
1828  "pcmpeqb %%mm7, %%mm7 \n\t"
1829  "movq "MANGLE(b80)", %%mm6 \n\t"
1830  "pxor %%mm0, %%mm0 \n\t"
1831 #define REAL_L2_DIFF_CORE(a, b)\
1832  "movq " #a ", %%mm5 \n\t"\
1833  "movq " #b ", %%mm2 \n\t"\
1834  "pxor %%mm7, %%mm2 \n\t"\
1835  PAVGB(%%mm2, %%mm5)\
1836  "paddb %%mm6, %%mm5 \n\t"\
1837  "movq %%mm5, %%mm2 \n\t"\
1838  "psllw $8, %%mm5 \n\t"\
1839  "pmaddwd %%mm5, %%mm5 \n\t"\
1840  "pmaddwd %%mm2, %%mm2 \n\t"\
1841  "paddd %%mm2, %%mm5 \n\t"\
1842  "psrld $14, %%mm5 \n\t"\
1843  "paddd %%mm5, %%mm0 \n\t"
1844 
1845 #else //defined (FAST_L2_DIFF)
1846  "pxor %%mm7, %%mm7 \n\t"
1847  "pxor %%mm0, %%mm0 \n\t"
1848 #define REAL_L2_DIFF_CORE(a, b)\
1849  "movq " #a ", %%mm5 \n\t"\
1850  "movq " #b ", %%mm2 \n\t"\
1851  "movq %%mm5, %%mm1 \n\t"\
1852  "movq %%mm2, %%mm3 \n\t"\
1853  "punpcklbw %%mm7, %%mm5 \n\t"\
1854  "punpckhbw %%mm7, %%mm1 \n\t"\
1855  "punpcklbw %%mm7, %%mm2 \n\t"\
1856  "punpckhbw %%mm7, %%mm3 \n\t"\
1857  "psubw %%mm2, %%mm5 \n\t"\
1858  "psubw %%mm3, %%mm1 \n\t"\
1859  "pmaddwd %%mm5, %%mm5 \n\t"\
1860  "pmaddwd %%mm1, %%mm1 \n\t"\
1861  "paddd %%mm1, %%mm5 \n\t"\
1862  "paddd %%mm5, %%mm0 \n\t"
1863 
1864 #endif //defined (FAST_L2_DIFF)
1865 
1866 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
1867 
1868 L2_DIFF_CORE((%0) , (%1))
1869 L2_DIFF_CORE((%0, %2) , (%1, %2))
1870 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
1871 L2_DIFF_CORE((%0, %%FF_REGa) , (%1, %%FF_REGa))
1872 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
1873 L2_DIFF_CORE((%0, %%FF_REGd) , (%1, %%FF_REGd))
1874 L2_DIFF_CORE((%0, %%FF_REGa,2), (%1, %%FF_REGa,2))
1875 L2_DIFF_CORE((%0, %%FF_REGc) , (%1, %%FF_REGc))
1876 
1877 #endif //L1_DIFF
1878 
1879  "movq %%mm0, %%mm4 \n\t"
1880  "psrlq $32, %%mm0 \n\t"
1881  "paddd %%mm0, %%mm4 \n\t"
1882  "movd %%mm4, %%ecx \n\t"
1883  "shll $2, %%ecx \n\t"
1884  "mov %3, %%"FF_REG_d" \n\t"
1885  "addl -4(%%"FF_REG_d"), %%ecx \n\t"
1886  "addl 4(%%"FF_REG_d"), %%ecx \n\t"
1887  "addl -1024(%%"FF_REG_d"), %%ecx \n\t"
1888  "addl $4, %%ecx \n\t"
1889  "addl 1024(%%"FF_REG_d"), %%ecx \n\t"
1890  "shrl $3, %%ecx \n\t"
1891  "movl %%ecx, (%%"FF_REG_d") \n\t"
1892 
1893 // "mov %3, %%"FF_REG_c" \n\t"
1894 // "mov %%"FF_REG_c", test \n\t"
1895 // "jmp 4f \n\t"
1896  "cmpl 512(%%"FF_REG_d"), %%ecx \n\t"
1897  " jb 2f \n\t"
1898  "cmpl 516(%%"FF_REG_d"), %%ecx \n\t"
1899  " jb 1f \n\t"
1900 
1901  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1902  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1903  "movq (%0), %%mm0 \n\t" // L0
1904  "movq (%0, %2), %%mm1 \n\t" // L1
1905  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1906  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1907  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1908  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1909  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1910  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1911  "movq %%mm0, (%1) \n\t" // L0
1912  "movq %%mm1, (%1, %2) \n\t" // L1
1913  "movq %%mm2, (%1, %2, 2) \n\t" // L2
1914  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // L3
1915  "movq %%mm4, (%1, %2, 4) \n\t" // L4
1916  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // L5
1917  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // L6
1918  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // L7
1919  "jmp 4f \n\t"
1920 
1921  "1: \n\t"
1922  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1923  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1924  "movq (%0), %%mm0 \n\t" // L0
1925  PAVGB((%1), %%mm0) // L0
1926  "movq (%0, %2), %%mm1 \n\t" // L1
1927  PAVGB((%1, %2), %%mm1) // L1
1928  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1929  PAVGB((%1, %2, 2), %%mm2) // L2
1930  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1931  PAVGB((%1, %%FF_REGa), %%mm3) // L3
1932  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1933  PAVGB((%1, %2, 4), %%mm4) // L4
1934  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1935  PAVGB((%1, %%FF_REGd), %%mm5) // L5
1936  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1937  PAVGB((%1, %%FF_REGa, 2), %%mm6) // L6
1938  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1939  PAVGB((%1, %%FF_REGc), %%mm7) // L7
1940  "movq %%mm0, (%1) \n\t" // R0
1941  "movq %%mm1, (%1, %2) \n\t" // R1
1942  "movq %%mm2, (%1, %2, 2) \n\t" // R2
1943  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
1944  "movq %%mm4, (%1, %2, 4) \n\t" // R4
1945  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // R5
1946  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // R6
1947  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // R7
1948  "movq %%mm0, (%0) \n\t" // L0
1949  "movq %%mm1, (%0, %2) \n\t" // L1
1950  "movq %%mm2, (%0, %2, 2) \n\t" // L2
1951  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
1952  "movq %%mm4, (%0, %2, 4) \n\t" // L4
1953  "movq %%mm5, (%0, %%"FF_REG_d") \n\t" // L5
1954  "movq %%mm6, (%0, %%"FF_REG_a", 2) \n\t" // L6
1955  "movq %%mm7, (%0, %%"FF_REG_c") \n\t" // L7
1956  "jmp 4f \n\t"
1957 
1958  "2: \n\t"
1959  "cmpl 508(%%"FF_REG_d"), %%ecx \n\t"
1960  " jb 3f \n\t"
1961 
1962  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1963  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1964  "movq (%0), %%mm0 \n\t" // L0
1965  "movq (%0, %2), %%mm1 \n\t" // L1
1966  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1967  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1968  "movq (%1), %%mm4 \n\t" // R0
1969  "movq (%1, %2), %%mm5 \n\t" // R1
1970  "movq (%1, %2, 2), %%mm6 \n\t" // R2
1971  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
1972  PAVGB(%%mm4, %%mm0)
1973  PAVGB(%%mm5, %%mm1)
1974  PAVGB(%%mm6, %%mm2)
1975  PAVGB(%%mm7, %%mm3)
1976  PAVGB(%%mm4, %%mm0)
1977  PAVGB(%%mm5, %%mm1)
1978  PAVGB(%%mm6, %%mm2)
1979  PAVGB(%%mm7, %%mm3)
1980  "movq %%mm0, (%1) \n\t" // R0
1981  "movq %%mm1, (%1, %2) \n\t" // R1
1982  "movq %%mm2, (%1, %2, 2) \n\t" // R2
1983  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
1984  "movq %%mm0, (%0) \n\t" // L0
1985  "movq %%mm1, (%0, %2) \n\t" // L1
1986  "movq %%mm2, (%0, %2, 2) \n\t" // L2
1987  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
1988 
1989  "movq (%0, %2, 4), %%mm0 \n\t" // L4
1990  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
1991  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
1992  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
1993  "movq (%1, %2, 4), %%mm4 \n\t" // R4
1994  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
1995  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
1996  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
1997  PAVGB(%%mm4, %%mm0)
1998  PAVGB(%%mm5, %%mm1)
1999  PAVGB(%%mm6, %%mm2)
2000  PAVGB(%%mm7, %%mm3)
2001  PAVGB(%%mm4, %%mm0)
2002  PAVGB(%%mm5, %%mm1)
2003  PAVGB(%%mm6, %%mm2)
2004  PAVGB(%%mm7, %%mm3)
2005  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2006  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2007  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2008  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2009  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2010  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2011  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2012  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2013  "jmp 4f \n\t"
2014 
2015  "3: \n\t"
2016  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2017  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2018  "movq (%0), %%mm0 \n\t" // L0
2019  "movq (%0, %2), %%mm1 \n\t" // L1
2020  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2021  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2022  "movq (%1), %%mm4 \n\t" // R0
2023  "movq (%1, %2), %%mm5 \n\t" // R1
2024  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2025  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
2026  PAVGB(%%mm4, %%mm0)
2027  PAVGB(%%mm5, %%mm1)
2028  PAVGB(%%mm6, %%mm2)
2029  PAVGB(%%mm7, %%mm3)
2030  PAVGB(%%mm4, %%mm0)
2031  PAVGB(%%mm5, %%mm1)
2032  PAVGB(%%mm6, %%mm2)
2033  PAVGB(%%mm7, %%mm3)
2034  PAVGB(%%mm4, %%mm0)
2035  PAVGB(%%mm5, %%mm1)
2036  PAVGB(%%mm6, %%mm2)
2037  PAVGB(%%mm7, %%mm3)
2038  "movq %%mm0, (%1) \n\t" // R0
2039  "movq %%mm1, (%1, %2) \n\t" // R1
2040  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2041  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2042  "movq %%mm0, (%0) \n\t" // L0
2043  "movq %%mm1, (%0, %2) \n\t" // L1
2044  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2045  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2046 
2047  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2048  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
2049  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
2050  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
2051  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2052  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2053  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2054  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2055  PAVGB(%%mm4, %%mm0)
2056  PAVGB(%%mm5, %%mm1)
2057  PAVGB(%%mm6, %%mm2)
2058  PAVGB(%%mm7, %%mm3)
2059  PAVGB(%%mm4, %%mm0)
2060  PAVGB(%%mm5, %%mm1)
2061  PAVGB(%%mm6, %%mm2)
2062  PAVGB(%%mm7, %%mm3)
2063  PAVGB(%%mm4, %%mm0)
2064  PAVGB(%%mm5, %%mm1)
2065  PAVGB(%%mm6, %%mm2)
2066  PAVGB(%%mm7, %%mm3)
2067  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2068  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2069  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2070  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2071  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2072  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2073  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2074  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2075 
2076  "4: \n\t"
2077 
2078  :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
2080  : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c, "memory"
2081  );
2082 #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2083 {
2084  int y;
2085  int d=0;
2086 // int sysd=0;
2087  int i;
2088 
2089  for(y=0; y<8; y++){
2090  int x;
2091  for(x=0; x<8; x++){
2092  int ref= tempBlurred[ x + y*stride ];
2093  int cur= src[ x + y*stride ];
2094  int d1=ref - cur;
2095 // if(x==0 || x==7) d1+= d1>>1;
2096 // if(y==0 || y==7) d1+= d1>>1;
2097 // d+= FFABS(d1);
2098  d+= d1*d1;
2099 // sysd+= d1;
2100  }
2101  }
2102  i=d;
2103  d= (
2104  4*d
2105  +(*(tempBlurredPast-256))
2106  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
2107  +(*(tempBlurredPast+256))
2108  +4)>>3;
2109  *tempBlurredPast=i;
2110 // ((*tempBlurredPast)*3 + d + 2)>>2;
2111 
2112 /*
2113 Switch between
2114  1 0 0 0 0 0 0 (0)
2115 64 32 16 8 4 2 1 (1)
2116 64 48 36 27 20 15 11 (33) (approx)
2117 64 56 49 43 37 33 29 (200) (approx)
2118 */
2119  if(d > maxNoise[1]){
2120  if(d < maxNoise[2]){
2121  for(y=0; y<8; y++){
2122  int x;
2123  for(x=0; x<8; x++){
2124  int ref= tempBlurred[ x + y*stride ];
2125  int cur= src[ x + y*stride ];
2126  tempBlurred[ x + y*stride ]=
2127  src[ x + y*stride ]=
2128  (ref + cur + 1)>>1;
2129  }
2130  }
2131  }else{
2132  for(y=0; y<8; y++){
2133  int x;
2134  for(x=0; x<8; x++){
2135  tempBlurred[ x + y*stride ]= src[ x + y*stride ];
2136  }
2137  }
2138  }
2139  }else{
2140  if(d < maxNoise[0]){
2141  for(y=0; y<8; y++){
2142  int x;
2143  for(x=0; x<8; x++){
2144  int ref= tempBlurred[ x + y*stride ];
2145  int cur= src[ x + y*stride ];
2146  tempBlurred[ x + y*stride ]=
2147  src[ x + y*stride ]=
2148  (ref*7 + cur + 4)>>3;
2149  }
2150  }
2151  }else{
2152  for(y=0; y<8; y++){
2153  int x;
2154  for(x=0; x<8; x++){
2155  int ref= tempBlurred[ x + y*stride ];
2156  int cur= src[ x + y*stride ];
2157  tempBlurred[ x + y*stride ]=
2158  src[ x + y*stride ]=
2159  (ref*3 + cur + 2)>>2;
2160  }
2161  }
2162  }
2163  }
2164 }
2165 #endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2166 }
2167 #endif //TEMPLATE_PP_ALTIVEC
2168 
2169 #if TEMPLATE_PP_MMXEXT
2170 /**
2171  * accurate deblock filter
2172  */
2173 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, const PPContext *c, int mode){
2174  int64_t dc_mask, eq_mask, both_masks;
2175  int64_t sums[10*8*2];
2176  src+= step*3; // src points to begin of the 8x8 Block
2177 
2178  __asm__ volatile(
2179  "movq %0, %%mm7 \n\t"
2180  "movq %1, %%mm6 \n\t"
2181  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2182  );
2183 
2184  __asm__ volatile(
2185  "lea (%2, %3), %%"FF_REG_a" \n\t"
2186 // 0 1 2 3 4 5 6 7 8 9
2187 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
2188 
2189  "movq (%2), %%mm0 \n\t"
2190  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2191  "movq %%mm1, %%mm3 \n\t"
2192  "movq %%mm1, %%mm4 \n\t"
2193  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
2194  "paddb %%mm7, %%mm0 \n\t"
2195  "pcmpgtb %%mm6, %%mm0 \n\t"
2196 
2197  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
2198  PMAXUB(%%mm2, %%mm4)
2199  PMINUB(%%mm2, %%mm3, %%mm5)
2200  "psubb %%mm2, %%mm1 \n\t"
2201  "paddb %%mm7, %%mm1 \n\t"
2202  "pcmpgtb %%mm6, %%mm1 \n\t"
2203  "paddb %%mm1, %%mm0 \n\t"
2204 
2205  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2206  PMAXUB(%%mm1, %%mm4)
2207  PMINUB(%%mm1, %%mm3, %%mm5)
2208  "psubb %%mm1, %%mm2 \n\t"
2209  "paddb %%mm7, %%mm2 \n\t"
2210  "pcmpgtb %%mm6, %%mm2 \n\t"
2211  "paddb %%mm2, %%mm0 \n\t"
2212 
2213  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
2214 
2215  "movq (%2, %3, 4), %%mm2 \n\t"
2216  PMAXUB(%%mm2, %%mm4)
2217  PMINUB(%%mm2, %%mm3, %%mm5)
2218  "psubb %%mm2, %%mm1 \n\t"
2219  "paddb %%mm7, %%mm1 \n\t"
2220  "pcmpgtb %%mm6, %%mm1 \n\t"
2221  "paddb %%mm1, %%mm0 \n\t"
2222 
2223  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2224  PMAXUB(%%mm1, %%mm4)
2225  PMINUB(%%mm1, %%mm3, %%mm5)
2226  "psubb %%mm1, %%mm2 \n\t"
2227  "paddb %%mm7, %%mm2 \n\t"
2228  "pcmpgtb %%mm6, %%mm2 \n\t"
2229  "paddb %%mm2, %%mm0 \n\t"
2230 
2231  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
2232  PMAXUB(%%mm2, %%mm4)
2233  PMINUB(%%mm2, %%mm3, %%mm5)
2234  "psubb %%mm2, %%mm1 \n\t"
2235  "paddb %%mm7, %%mm1 \n\t"
2236  "pcmpgtb %%mm6, %%mm1 \n\t"
2237  "paddb %%mm1, %%mm0 \n\t"
2238 
2239  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2240  PMAXUB(%%mm1, %%mm4)
2241  PMINUB(%%mm1, %%mm3, %%mm5)
2242  "psubb %%mm1, %%mm2 \n\t"
2243  "paddb %%mm7, %%mm2 \n\t"
2244  "pcmpgtb %%mm6, %%mm2 \n\t"
2245  "paddb %%mm2, %%mm0 \n\t"
2246 
2247  "movq (%2, %3, 8), %%mm2 \n\t"
2248  PMAXUB(%%mm2, %%mm4)
2249  PMINUB(%%mm2, %%mm3, %%mm5)
2250  "psubb %%mm2, %%mm1 \n\t"
2251  "paddb %%mm7, %%mm1 \n\t"
2252  "pcmpgtb %%mm6, %%mm1 \n\t"
2253  "paddb %%mm1, %%mm0 \n\t"
2254 
2255  "movq (%%"FF_REG_a", %3, 4), %%mm1 \n\t"
2256  "psubb %%mm1, %%mm2 \n\t"
2257  "paddb %%mm7, %%mm2 \n\t"
2258  "pcmpgtb %%mm6, %%mm2 \n\t"
2259  "paddb %%mm2, %%mm0 \n\t"
2260  "psubusb %%mm3, %%mm4 \n\t"
2261 
2262  "pxor %%mm6, %%mm6 \n\t"
2263  "movq %4, %%mm7 \n\t" // QP,..., QP
2264  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2265  "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2266  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2267  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2268  "movq %%mm7, %1 \n\t"
2269 
2270  "movq %5, %%mm7 \n\t"
2271  "punpcklbw %%mm7, %%mm7 \n\t"
2272  "punpcklbw %%mm7, %%mm7 \n\t"
2273  "punpcklbw %%mm7, %%mm7 \n\t"
2274  "psubb %%mm0, %%mm6 \n\t"
2275  "pcmpgtb %%mm7, %%mm6 \n\t"
2276  "movq %%mm6, %0 \n\t"
2277 
2278  : "=m" (eq_mask), "=m" (dc_mask)
2279  : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2280  : "%"FF_REG_a
2281  );
2282 
2283  both_masks = dc_mask & eq_mask;
2284 
2285  if(both_masks){
2286  x86_reg offset= -8*step;
2287  int64_t *temp_sums= sums;
2288 
2289  __asm__ volatile(
2290  "movq %2, %%mm0 \n\t" // QP,..., QP
2291  "pxor %%mm4, %%mm4 \n\t"
2292 
2293  "movq (%0), %%mm6 \n\t"
2294  "movq (%0, %1), %%mm5 \n\t"
2295  "movq %%mm5, %%mm1 \n\t"
2296  "movq %%mm6, %%mm2 \n\t"
2297  "psubusb %%mm6, %%mm5 \n\t"
2298  "psubusb %%mm1, %%mm2 \n\t"
2299  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2300  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2301  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2302 
2303  "pxor %%mm6, %%mm1 \n\t"
2304  "pand %%mm0, %%mm1 \n\t"
2305  "pxor %%mm1, %%mm6 \n\t"
2306  // 0:QP 6:First
2307 
2308  "movq (%0, %1, 8), %%mm5 \n\t"
2309  "add %1, %0 \n\t" // %0 points to line 1 not 0
2310  "movq (%0, %1, 8), %%mm7 \n\t"
2311  "movq %%mm5, %%mm1 \n\t"
2312  "movq %%mm7, %%mm2 \n\t"
2313  "psubusb %%mm7, %%mm5 \n\t"
2314  "psubusb %%mm1, %%mm2 \n\t"
2315  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2316  "movq %2, %%mm0 \n\t" // QP,..., QP
2317  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2318  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2319 
2320  "pxor %%mm7, %%mm1 \n\t"
2321  "pand %%mm0, %%mm1 \n\t"
2322  "pxor %%mm1, %%mm7 \n\t"
2323 
2324  "movq %%mm6, %%mm5 \n\t"
2325  "punpckhbw %%mm4, %%mm6 \n\t"
2326  "punpcklbw %%mm4, %%mm5 \n\t"
2327  // 4:0 5/6:First 7:Last
2328 
2329  "movq %%mm5, %%mm0 \n\t"
2330  "movq %%mm6, %%mm1 \n\t"
2331  "psllw $2, %%mm0 \n\t"
2332  "psllw $2, %%mm1 \n\t"
2333  "paddw "MANGLE(w04)", %%mm0 \n\t"
2334  "paddw "MANGLE(w04)", %%mm1 \n\t"
2335 
2336 #define NEXT\
2337  "movq (%0), %%mm2 \n\t"\
2338  "movq (%0), %%mm3 \n\t"\
2339  "add %1, %0 \n\t"\
2340  "punpcklbw %%mm4, %%mm2 \n\t"\
2341  "punpckhbw %%mm4, %%mm3 \n\t"\
2342  "paddw %%mm2, %%mm0 \n\t"\
2343  "paddw %%mm3, %%mm1 \n\t"
2344 
2345 #define PREV\
2346  "movq (%0), %%mm2 \n\t"\
2347  "movq (%0), %%mm3 \n\t"\
2348  "add %1, %0 \n\t"\
2349  "punpcklbw %%mm4, %%mm2 \n\t"\
2350  "punpckhbw %%mm4, %%mm3 \n\t"\
2351  "psubw %%mm2, %%mm0 \n\t"\
2352  "psubw %%mm3, %%mm1 \n\t"
2353 
2354 
2355  NEXT //0
2356  NEXT //1
2357  NEXT //2
2358  "movq %%mm0, (%3) \n\t"
2359  "movq %%mm1, 8(%3) \n\t"
2360 
2361  NEXT //3
2362  "psubw %%mm5, %%mm0 \n\t"
2363  "psubw %%mm6, %%mm1 \n\t"
2364  "movq %%mm0, 16(%3) \n\t"
2365  "movq %%mm1, 24(%3) \n\t"
2366 
2367  NEXT //4
2368  "psubw %%mm5, %%mm0 \n\t"
2369  "psubw %%mm6, %%mm1 \n\t"
2370  "movq %%mm0, 32(%3) \n\t"
2371  "movq %%mm1, 40(%3) \n\t"
2372 
2373  NEXT //5
2374  "psubw %%mm5, %%mm0 \n\t"
2375  "psubw %%mm6, %%mm1 \n\t"
2376  "movq %%mm0, 48(%3) \n\t"
2377  "movq %%mm1, 56(%3) \n\t"
2378 
2379  NEXT //6
2380  "psubw %%mm5, %%mm0 \n\t"
2381  "psubw %%mm6, %%mm1 \n\t"
2382  "movq %%mm0, 64(%3) \n\t"
2383  "movq %%mm1, 72(%3) \n\t"
2384 
2385  "movq %%mm7, %%mm6 \n\t"
2386  "punpckhbw %%mm4, %%mm7 \n\t"
2387  "punpcklbw %%mm4, %%mm6 \n\t"
2388 
2389  NEXT //7
2390  "mov %4, %0 \n\t"
2391  "add %1, %0 \n\t"
2392  PREV //0
2393  "movq %%mm0, 80(%3) \n\t"
2394  "movq %%mm1, 88(%3) \n\t"
2395 
2396  PREV //1
2397  "paddw %%mm6, %%mm0 \n\t"
2398  "paddw %%mm7, %%mm1 \n\t"
2399  "movq %%mm0, 96(%3) \n\t"
2400  "movq %%mm1, 104(%3) \n\t"
2401 
2402  PREV //2
2403  "paddw %%mm6, %%mm0 \n\t"
2404  "paddw %%mm7, %%mm1 \n\t"
2405  "movq %%mm0, 112(%3) \n\t"
2406  "movq %%mm1, 120(%3) \n\t"
2407 
2408  PREV //3
2409  "paddw %%mm6, %%mm0 \n\t"
2410  "paddw %%mm7, %%mm1 \n\t"
2411  "movq %%mm0, 128(%3) \n\t"
2412  "movq %%mm1, 136(%3) \n\t"
2413 
2414  PREV //4
2415  "paddw %%mm6, %%mm0 \n\t"
2416  "paddw %%mm7, %%mm1 \n\t"
2417  "movq %%mm0, 144(%3) \n\t"
2418  "movq %%mm1, 152(%3) \n\t"
2419 
2420  "mov %4, %0 \n\t" //FIXME
2421 
2422  : "+&r"(src)
2423  : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
2425  );
2426 
2427  src+= step; // src points to begin of the 8x8 Block
2428 
2429  __asm__ volatile(
2430  "movq %4, %%mm6 \n\t"
2431  "pcmpeqb %%mm5, %%mm5 \n\t"
2432  "pxor %%mm6, %%mm5 \n\t"
2433  "pxor %%mm7, %%mm7 \n\t"
2434 
2435  "1: \n\t"
2436  "movq (%1), %%mm0 \n\t"
2437  "movq 8(%1), %%mm1 \n\t"
2438  "paddw 32(%1), %%mm0 \n\t"
2439  "paddw 40(%1), %%mm1 \n\t"
2440  "movq (%0, %3), %%mm2 \n\t"
2441  "movq %%mm2, %%mm3 \n\t"
2442  "movq %%mm2, %%mm4 \n\t"
2443  "punpcklbw %%mm7, %%mm2 \n\t"
2444  "punpckhbw %%mm7, %%mm3 \n\t"
2445  "paddw %%mm2, %%mm0 \n\t"
2446  "paddw %%mm3, %%mm1 \n\t"
2447  "paddw %%mm2, %%mm0 \n\t"
2448  "paddw %%mm3, %%mm1 \n\t"
2449  "psrlw $4, %%mm0 \n\t"
2450  "psrlw $4, %%mm1 \n\t"
2451  "packuswb %%mm1, %%mm0 \n\t"
2452  "pand %%mm6, %%mm0 \n\t"
2453  "pand %%mm5, %%mm4 \n\t"
2454  "por %%mm4, %%mm0 \n\t"
2455  "movq %%mm0, (%0, %3) \n\t"
2456  "add $16, %1 \n\t"
2457  "add %2, %0 \n\t"
2458  " js 1b \n\t"
2459 
2460  : "+r"(offset), "+r"(temp_sums)
2461  : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
2462  );
2463  }else
2464  src+= step; // src points to begin of the 8x8 Block
2465 
2466  if(eq_mask != -1LL){
2467  uint8_t *temp_src= src;
2468  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
2469  __asm__ volatile(
2470  "pxor %%mm7, %%mm7 \n\t"
2471 // 0 1 2 3 4 5 6 7 8 9
2472 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
2473 
2474  "movq (%0), %%mm0 \n\t"
2475  "movq %%mm0, %%mm1 \n\t"
2476  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2477  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2478 
2479  "movq (%0, %1), %%mm2 \n\t"
2480  "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
2481  "movq %%mm2, %%mm3 \n\t"
2482  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2483  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2484 
2485  "movq (%%"FF_REG_a"), %%mm4 \n\t"
2486  "movq %%mm4, %%mm5 \n\t"
2487  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2488  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2489 
2490  "paddw %%mm0, %%mm0 \n\t" // 2L0
2491  "paddw %%mm1, %%mm1 \n\t" // 2H0
2492  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
2493  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
2494  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2495  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2496 
2497  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2498  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2499  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2500  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2501 
2502  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
2503  "movq %%mm2, %%mm3 \n\t"
2504  "punpcklbw %%mm7, %%mm2 \n\t" // L3
2505  "punpckhbw %%mm7, %%mm3 \n\t" // H3
2506 
2507  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2508  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2509  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2510  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2511  "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2512  "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2513 
2514  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
2515  "movq %%mm0, %%mm1 \n\t"
2516  "punpcklbw %%mm7, %%mm0 \n\t" // L4
2517  "punpckhbw %%mm7, %%mm1 \n\t" // H4
2518 
2519  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
2520  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
2521  "movq %%mm2, 16(%4) \n\t" // L3 - L4
2522  "movq %%mm3, 24(%4) \n\t" // H3 - H4
2523  "paddw %%mm4, %%mm4 \n\t" // 2L2
2524  "paddw %%mm5, %%mm5 \n\t" // 2H2
2525  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
2526  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
2527 
2528  "lea (%%"FF_REG_a", %1), %0 \n\t"
2529  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
2530  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
2531  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
2532  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
2533 //50 opcodes so far
2534  "movq (%0, %1, 2), %%mm2 \n\t"
2535  "movq %%mm2, %%mm3 \n\t"
2536  "punpcklbw %%mm7, %%mm2 \n\t" // L5
2537  "punpckhbw %%mm7, %%mm3 \n\t" // H5
2538  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
2539  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
2540  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2541  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
2542 
2543  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2544  "punpcklbw %%mm7, %%mm6 \n\t" // L6
2545  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
2546  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2547  "punpckhbw %%mm7, %%mm6 \n\t" // H6
2548  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
2549 
2550  "paddw %%mm0, %%mm0 \n\t" // 2L4
2551  "paddw %%mm1, %%mm1 \n\t" // 2H4
2552  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
2553  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
2554 
2555  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
2556  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
2557  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
2558  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
2559 
2560  "movq (%0, %1, 4), %%mm2 \n\t"
2561  "movq %%mm2, %%mm3 \n\t"
2562  "punpcklbw %%mm7, %%mm2 \n\t" // L7
2563  "punpckhbw %%mm7, %%mm3 \n\t" // H7
2564 
2565  "paddw %%mm2, %%mm2 \n\t" // 2L7
2566  "paddw %%mm3, %%mm3 \n\t" // 2H7
2567  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
2568  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
2569 
2570  "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2571  "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2572 
2573  "movq %%mm7, %%mm6 \n\t" // 0
2574  "psubw %%mm0, %%mm6 \n\t"
2575  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2576  "movq %%mm7, %%mm6 \n\t" // 0
2577  "psubw %%mm1, %%mm6 \n\t"
2578  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2579  "movq %%mm7, %%mm6 \n\t" // 0
2580  "psubw %%mm2, %%mm6 \n\t"
2581  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2582  "movq %%mm7, %%mm6 \n\t" // 0
2583  "psubw %%mm3, %%mm6 \n\t"
2584  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2585 
2586  "pminsw %%mm2, %%mm0 \n\t"
2587  "pminsw %%mm3, %%mm1 \n\t"
2588 
2589  "movd %2, %%mm2 \n\t" // QP
2590  "punpcklbw %%mm7, %%mm2 \n\t"
2591 
2592  "movq %%mm7, %%mm6 \n\t" // 0
2593  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
2594  "pxor %%mm6, %%mm4 \n\t"
2595  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
2596  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
2597  "pxor %%mm7, %%mm5 \n\t"
2598  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
2599 // 100 opcodes
2600  "psllw $3, %%mm2 \n\t" // 8QP
2601  "movq %%mm2, %%mm3 \n\t" // 8QP
2602  "pcmpgtw %%mm4, %%mm2 \n\t"
2603  "pcmpgtw %%mm5, %%mm3 \n\t"
2604  "pand %%mm2, %%mm4 \n\t"
2605  "pand %%mm3, %%mm5 \n\t"
2606 
2607 
2608  "psubusw %%mm0, %%mm4 \n\t" // hd
2609  "psubusw %%mm1, %%mm5 \n\t" // ld
2610 
2611 
2612  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
2613  "pmullw %%mm2, %%mm4 \n\t"
2614  "pmullw %%mm2, %%mm5 \n\t"
2615  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
2616  "paddw %%mm2, %%mm4 \n\t"
2617  "paddw %%mm2, %%mm5 \n\t"
2618  "psrlw $6, %%mm4 \n\t"
2619  "psrlw $6, %%mm5 \n\t"
2620 
2621  "movq 16(%4), %%mm0 \n\t" // L3 - L4
2622  "movq 24(%4), %%mm1 \n\t" // H3 - H4
2623 
2624  "pxor %%mm2, %%mm2 \n\t"
2625  "pxor %%mm3, %%mm3 \n\t"
2626 
2627  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
2628  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
2629  "pxor %%mm2, %%mm0 \n\t"
2630  "pxor %%mm3, %%mm1 \n\t"
2631  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
2632  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
2633  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
2634  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
2635 
2636  "pxor %%mm6, %%mm2 \n\t"
2637  "pxor %%mm7, %%mm3 \n\t"
2638  "pand %%mm2, %%mm4 \n\t"
2639  "pand %%mm3, %%mm5 \n\t"
2640 
2641  "pminsw %%mm0, %%mm4 \n\t"
2642  "pminsw %%mm1, %%mm5 \n\t"
2643  "pxor %%mm6, %%mm4 \n\t"
2644  "pxor %%mm7, %%mm5 \n\t"
2645  "psubw %%mm6, %%mm4 \n\t"
2646  "psubw %%mm7, %%mm5 \n\t"
2647  "packsswb %%mm5, %%mm4 \n\t"
2648  "movq %3, %%mm1 \n\t"
2649  "pandn %%mm4, %%mm1 \n\t"
2650  "movq (%0), %%mm0 \n\t"
2651  "paddb %%mm1, %%mm0 \n\t"
2652  "movq %%mm0, (%0) \n\t"
2653  "movq (%0, %1), %%mm0 \n\t"
2654  "psubb %%mm1, %%mm0 \n\t"
2655  "movq %%mm0, (%0, %1) \n\t"
2656 
2657  : "+r" (temp_src)
2658  : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
2659  NAMED_CONSTRAINTS_ADD(w05,w20)
2660  : "%"FF_REG_a
2661  );
2662  }
2663 }
2664 #endif //TEMPLATE_PP_MMX
2665 
2666 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2667  const int8_t QPs[], int QPStride, int isColor, PPContext *c);
2668 
2669 /**
2670  * Copy a block from src to dst and fix the black level.
2671  * levelFix == 0 -> do not touch the brightness & contrast
2672  */
2673 #undef REAL_SCALED_CPY
2674 #undef SCALED_CPY
2675 
2676 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
2677  int levelFix, int64_t *packedOffsetAndScale)
2678 {
2679  if(levelFix){
2680 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
2681  __asm__ volatile(
2682  "movq (%%"FF_REG_a"), %%mm2 \n\t" // packedYOffset
2683  "movq 8(%%"FF_REG_a"), %%mm3 \n\t" // packedYScale
2684  "lea (%2,%4), %%"FF_REG_a" \n\t"
2685  "lea (%3,%5), %%"FF_REG_d" \n\t"
2686  "pxor %%mm4, %%mm4 \n\t"
2687 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
2688  "movq " #src1 ", %%mm0 \n\t"\
2689  "movq " #src1 ", %%mm5 \n\t"\
2690  "movq " #src2 ", %%mm1 \n\t"\
2691  "movq " #src2 ", %%mm6 \n\t"\
2692  "punpcklbw %%mm0, %%mm0 \n\t"\
2693  "punpckhbw %%mm5, %%mm5 \n\t"\
2694  "punpcklbw %%mm1, %%mm1 \n\t"\
2695  "punpckhbw %%mm6, %%mm6 \n\t"\
2696  "pmulhuw %%mm3, %%mm0 \n\t"\
2697  "pmulhuw %%mm3, %%mm5 \n\t"\
2698  "pmulhuw %%mm3, %%mm1 \n\t"\
2699  "pmulhuw %%mm3, %%mm6 \n\t"\
2700  "psubw %%mm2, %%mm0 \n\t"\
2701  "psubw %%mm2, %%mm5 \n\t"\
2702  "psubw %%mm2, %%mm1 \n\t"\
2703  "psubw %%mm2, %%mm6 \n\t"\
2704  "packuswb %%mm5, %%mm0 \n\t"\
2705  "packuswb %%mm6, %%mm1 \n\t"\
2706  "movq %%mm0, " #dst1 " \n\t"\
2707  "movq %%mm1, " #dst2 " \n\t"\
2708 
2709 #define SCALED_CPY(src1, src2, dst1, dst2)\
2710  REAL_SCALED_CPY(src1, src2, dst1, dst2)
2711 
2712 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
2713 SCALED_CPY((%2, %4, 2), (%%FF_REGa, %4, 2), (%3, %5, 2), (%%FF_REGd, %5, 2))
2714 SCALED_CPY((%2, %4, 4), (%%FF_REGa, %4, 4), (%3, %5, 4), (%%FF_REGd, %5, 4))
2715  "lea (%%"FF_REG_a",%4,4), %%"FF_REG_a" \n\t"
2716  "lea (%%"FF_REG_d",%5,4), %%"FF_REG_d" \n\t"
2717 SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5, 2))
2718 
2719 
2720  : "=&a" (packedOffsetAndScale)
2721  : "0" (packedOffsetAndScale),
2722  "r"(src),
2723  "r"(dst),
2724  "r" ((x86_reg)srcStride),
2725  "r" ((x86_reg)dstStride)
2726  : "%"FF_REG_d
2727  );
2728 #else //TEMPLATE_PP_MMX && HAVE_6REGS
2729  for (int i = 0; i < 8; i++)
2730  memcpy( &(dst[dstStride*i]),
2731  &(src[srcStride*i]), BLOCK_SIZE);
2732 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
2733  }else{
2734 #if TEMPLATE_PP_MMX && HAVE_6REGS
2735  __asm__ volatile(
2736  "lea (%0,%2), %%"FF_REG_a" \n\t"
2737  "lea (%1,%3), %%"FF_REG_d" \n\t"
2738 
2739 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
2740  "movq " #src1 ", %%mm0 \n\t"\
2741  "movq " #src2 ", %%mm1 \n\t"\
2742  "movq %%mm0, " #dst1 " \n\t"\
2743  "movq %%mm1, " #dst2 " \n\t"\
2744 
2745 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
2746  REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
2747 
2748 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
2749 SIMPLE_CPY((%0, %2, 2), (%%FF_REGa, %2, 2), (%1, %3, 2), (%%FF_REGd, %3, 2))
2750 SIMPLE_CPY((%0, %2, 4), (%%FF_REGa, %2, 4), (%1, %3, 4), (%%FF_REGd, %3, 4))
2751  "lea (%%"FF_REG_a",%2,4), %%"FF_REG_a" \n\t"
2752  "lea (%%"FF_REG_d",%3,4), %%"FF_REG_d" \n\t"
2753 SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3, 2))
2754 
2755  : : "r" (src),
2756  "r" (dst),
2757  "r" ((x86_reg)srcStride),
2758  "r" ((x86_reg)dstStride)
2759  : "%"FF_REG_a, "%"FF_REG_d
2760  );
2761 #else //TEMPLATE_PP_MMX && HAVE_6REGS
2762  for (int i = 0; i < 8; i++)
2763  memcpy( &(dst[dstStride*i]),
2764  &(src[srcStride*i]), BLOCK_SIZE);
2765 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
2766  }
2767 }
2768 
2769 /**
2770  * Duplicate the given 8 src pixels 5 times upward (into the rows above src)
2771  */
2772 static inline void RENAME(duplicate)(uint8_t src[], int stride)
2773 {
2774 #if TEMPLATE_PP_MMX
2775  __asm__ volatile(
2776  "movq (%0), %%mm0 \n\t"
2777  "movq %%mm0, (%0, %1, 4) \n\t"
2778  "add %1, %0 \n\t"
2779  "movq %%mm0, (%0) \n\t"
2780  "movq %%mm0, (%0, %1) \n\t"
2781  "movq %%mm0, (%0, %1, 2) \n\t"
2782  "movq %%mm0, (%0, %1, 4) \n\t"
2783  : "+r" (src)
2784  : "r" ((x86_reg)-stride)
2785  );
2786 #else
2787  int i;
2788  uint8_t *p=src;
2789  for(i=0; i<5; i++){
2790  p-= stride;
2791  memcpy(p, src, 8);
2792  }
2793 #endif
2794 }
2795 
2796 #if ARCH_X86 && TEMPLATE_PP_MMXEXT
2797 static inline void RENAME(prefetchnta)(const void *p)
2798 {
2799  __asm__ volatile( "prefetchnta (%0)\n\t"
2800  : : "r" (p)
2801  );
2802 }
2803 
2804 static inline void RENAME(prefetcht0)(const void *p)
2805 {
2806  __asm__ volatile( "prefetcht0 (%0)\n\t"
2807  : : "r" (p)
2808  );
2809 }
2810 
2811 static inline void RENAME(prefetcht1)(const void *p)
2812 {
2813  __asm__ volatile( "prefetcht1 (%0)\n\t"
2814  : : "r" (p)
2815  );
2816 }
2817 
2818 static inline void RENAME(prefetcht2)(const void *p)
2819 {
2820  __asm__ volatile( "prefetcht2 (%0)\n\t"
2821  : : "r" (p)
2822  );
2823 }
2824 #elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
2825 static inline void RENAME(prefetchnta)(const void *p)
2826 {
2827  __builtin_prefetch(p,0,0);
2828 }
2829 static inline void RENAME(prefetcht0)(const void *p)
2830 {
2831  __builtin_prefetch(p,0,1);
2832 }
2833 static inline void RENAME(prefetcht1)(const void *p)
2834 {
2835  __builtin_prefetch(p,0,2);
2836 }
2837 static inline void RENAME(prefetcht2)(const void *p)
2838 {
2839  __builtin_prefetch(p,0,3);
2840 }
2841 #else
2842 static inline void RENAME(prefetchnta)(const void *p)
2843 {
2844  return;
2845 }
2846 static inline void RENAME(prefetcht0)(const void *p)
2847 {
2848  return;
2849 }
2850 static inline void RENAME(prefetcht1)(const void *p)
2851 {
2852  return;
2853 }
2854 static inline void RENAME(prefetcht2)(const void *p)
2855 {
2856  return;
2857 }
2858 #endif
2859 /**
2860  * Filter array of bytes (Y or U or V values)
2861  */
2862 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2863  const int8_t QPs[], int QPStride, int isColor, PPContext *c)
2864 {
2865  int x,y;
2866 #ifdef TEMPLATE_PP_TIME_MODE
2867  const int mode= TEMPLATE_PP_TIME_MODE;
2868 #else
2869  const int mode = isColor ? c->ppMode.chromMode : c->ppMode.lumMode;
2870 #endif
2871  int black=0, white=255; // blackest black and whitest white in the picture
2872  int QPCorrecture= 256*256;
2873 
2874  int copyAhead;
2875 #if TEMPLATE_PP_MMX
2876  int i;
2877 #endif
2878 
2879  const int qpHShift = isColor ? 4 - c->hChromaSubSample : 4;
2880  const int qpVShift = isColor ? 4 - c->vChromaSubSample : 4;
2881 
2882  //FIXME remove
2883  uint64_t * const yHistogram= c->yHistogram;
2884  uint8_t * const tempSrc = srcStride > 0 ? c->tempSrc : c->tempSrc - 23*srcStride;
2885  uint8_t * const tempDst = (dstStride > 0 ? c->tempDst : c->tempDst - 23*dstStride) + 32;
2886  //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
2887 
2888  if (mode & VISUALIZE){
2889  if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || TEMPLATE_PP_MMX) {
2890  av_log(c, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n");
2891  }
2892  }
2893 
2894 #if TEMPLATE_PP_MMX
2895  for(i=0; i<57; i++){
2896  int offset = ((i * c->ppMode.baseDcDiff) >> 8) + 1;
2897  int threshold= offset*2 + 1;
2898  c->mmxDcOffset[i] = 0x7F - offset;
2899  c->mmxDcThreshold[i] = 0x7F - threshold;
2900  c->mmxDcOffset[i] *= 0x0101010101010101LL;
2901  c->mmxDcThreshold[i] *= 0x0101010101010101LL;
2902  }
2903 #endif
2904 
2905  if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
2906  else if( (mode & LINEAR_BLEND_DEINT_FILTER)
2907  || (mode & FFMPEG_DEINT_FILTER)
2908  || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
2909  else if( (mode & V_DEBLOCK)
2911  || (mode & MEDIAN_DEINT_FILTER)
2912  || (mode & V_A_DEBLOCK)) copyAhead=13;
2913  else if(mode & V_X1_FILTER) copyAhead=11;
2914 // else if(mode & V_RK1_FILTER) copyAhead=10;
2915  else if(mode & DERING) copyAhead=9;
2916  else copyAhead=8;
2917 
2918  copyAhead-= 8;
2919 
2920  if(!isColor){
2921  uint64_t sum= 0;
2922  int i;
2923  uint64_t maxClipped;
2924  uint64_t clipped;
2925  AVRational scale;
2926 
2927  c->frameNum++;
2928  // first frame is fscked so we ignore it
2929  if (c->frameNum == 1)
2930  yHistogram[0] = width * (uint64_t)height/64*15/256;
2931 
2932  for(i=0; i<256; i++){
2933  sum+= yHistogram[i];
2934  }
2935 
2936  /* We always get a completely black picture first. */
2937  maxClipped = av_rescale(sum, c->ppMode.maxClippedThreshold.num,
2938  c->ppMode.maxClippedThreshold.den);
2939 
2940  clipped= sum;
2941  for(black=255; black>0; black--){
2942  if(clipped < maxClipped) break;
2943  clipped-= yHistogram[black];
2944  }
2945 
2946  clipped= sum;
2947  for(white=0; white<256; white++){
2948  if(clipped < maxClipped) break;
2949  clipped-= yHistogram[white];
2950  }
2951 
2952  scale = (AVRational){c->ppMode.maxAllowedY - c->ppMode.minAllowedY, white - black};
2953 
2954 #if TEMPLATE_PP_MMXEXT
2955  c->packedYScale = (uint16_t)av_rescale(scale.num, 256, scale.den);
2956  c->packedYOffset = (((black*c->packedYScale)>>8) - c->ppMode.minAllowedY) & 0xFFFF;
2957 #else
2958  c->packedYScale = (uint16_t)av_rescale(scale.num, 1024, scale.den);
2959  c->packedYOffset = (black - c->ppMode.minAllowedY) & 0xFFFF;
2960 #endif
2961 
2962  c->packedYOffset |= c->packedYOffset<<32;
2963  c->packedYOffset |= c->packedYOffset<<16;
2964 
2965  c->packedYScale |= c->packedYScale<<32;
2966  c->packedYScale |= c->packedYScale<<16;
2967 
2968  if(mode & LEVEL_FIX) QPCorrecture= (int)av_rescale(scale.num, 256*256, scale.den);
2969  else QPCorrecture= 256*256;
2970  }else{
2971  c->packedYScale = 0x0100010001000100LL;
2972  c->packedYOffset = 0;
2973  QPCorrecture= 256*256;
2974  }
2975 
2976  /* copy & deinterlace first row of blocks */
2977  y=-BLOCK_SIZE;
2978  {
2979  const uint8_t *srcBlock= &(src[y*srcStride]);
2980  uint8_t *dstBlock= tempDst + dstStride;
2981 
2982  // From this point on it is guaranteed that we can read and write 16 lines downward
2983  // finish 1 block before the next otherwise we might have a problem
2984  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2985  for(x=0; x<width; x+=BLOCK_SIZE){
2986  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
2987  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
2988  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
2989  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
2990 
2991  RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
2992  srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c->packedYOffset);
2993 
2994  RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
2995 
2997  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2998  else if(mode & LINEAR_BLEND_DEINT_FILTER)
2999  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x);
3000  else if(mode & MEDIAN_DEINT_FILTER)
3001  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3002  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3003  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3004  else if(mode & FFMPEG_DEINT_FILTER)
3005  RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x);
3006  else if(mode & LOWPASS5_DEINT_FILTER)
3007  RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x);
3008 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3009  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3010 */
3011  dstBlock+=8;
3012  srcBlock+=8;
3013  }
3014  if(width==FFABS(dstStride))
3015  linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3016  else{
3017  int i;
3018  for(i=0; i<copyAhead; i++){
3019  memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3020  }
3021  }
3022  }
3023 
3024  for(y=0; y<height; y+=BLOCK_SIZE){
3025  //1% speedup if these are here instead of the inner loop
3026  const uint8_t *srcBlock= &(src[y*srcStride]);
3027  uint8_t *dstBlock= &(dst[y*dstStride]);
3028 #if TEMPLATE_PP_MMX
3029  uint8_t *tempBlock1 = c->tempBlocks;
3030  uint8_t *tempBlock2 = c->tempBlocks + 8;
3031 #endif
3032  const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3033  int8_t *nonBQPptr = &c->nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3034  int QP=0, nonBQP=0;
3035  /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3036  if not than use a temporary buffer */
3037  if(y+15 >= height){
3038  int i;
3039  /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3040  blockcopy to dst later */
3041  linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3042  FFMAX(height-y-copyAhead, 0), srcStride);
3043 
3044  /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3045  for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3046  memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3047 
3048  /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3049  linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3050 
3051  /* duplicate last line of dst to fill the void up to line (copyAhead) */
3052  for(i=height-y+1; i<=copyAhead; i++)
3053  memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3054 
3055  dstBlock= tempDst + dstStride;
3056  srcBlock= tempSrc;
3057  }
3058 
3059  // From this point on it is guaranteed that we can read and write 16 lines downward
3060  // finish 1 block before the next otherwise we might have a problem
3061  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3062  for(x=0; x<width; ){
3063  int startx = x;
3064  int endx = FFMIN(width, x+32);
3065  uint8_t *dstBlockStart = dstBlock;
3066  const uint8_t *srcBlockStart = srcBlock;
3067  int qp_index = 0;
3068  for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
3069  QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3070  nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3071  if(!isColor){
3072  QP= (QP* QPCorrecture + 256*128)>>16;
3073  nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
3074  yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++;
3075  }
3076  c->QP_block[qp_index] = QP;
3077  c->nonBQP_block[qp_index] = nonBQP;
3078 #if TEMPLATE_PP_MMX
3079  __asm__ volatile(
3080  "movd %1, %%mm7 \n\t"
3081  "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3082  "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3083  "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3084  "movq %%mm7, %0 \n\t"
3085  : "=m" (c->pQPb_block[qp_index])
3086  : "r" (QP)
3087  );
3088 #endif
3089  }
3090  for(; x < endx; x+=BLOCK_SIZE){
3091  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
3092  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
3093  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
3094  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
3095 
3096  RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3097  srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c->packedYOffset);
3098 
3100  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3101  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3102  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x);
3103  else if(mode & MEDIAN_DEINT_FILTER)
3104  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3105  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3106  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3107  else if(mode & FFMPEG_DEINT_FILTER)
3108  RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x);
3109  else if(mode & LOWPASS5_DEINT_FILTER)
3110  RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x);
3111 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3112  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3113 */
3114  dstBlock+=8;
3115  srcBlock+=8;
3116  }
3117 
3118  dstBlock = dstBlockStart;
3119  srcBlock = srcBlockStart;
3120 
3121  for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
3122  const int stride= dstStride;
3123  //temporary while changing QP stuff to make things continue to work
3124  //eventually QP,nonBQP,etc will be arrays and this will be unnecessary
3125  c->QP = c->QP_block[qp_index];
3126  c->nonBQP = c->nonBQP_block[qp_index];
3127  c->pQPb = c->pQPb_block[qp_index];
3128  c->pQPb2 = c->pQPb2_block[qp_index];
3129 
3130  /* only deblock if we have 2 blocks */
3131  if(y + 8 < height){
3132  if(mode & V_X1_FILTER)
3133  RENAME(vertX1Filter)(dstBlock, stride, c);
3134  else if(mode & V_DEBLOCK){
3135  const int t = RENAME(vertClassify)(dstBlock, stride, c);
3136 
3137  if(t==1)
3138  RENAME(doVertLowPass)(dstBlock, stride, c);
3139  else if(t==2)
3140  RENAME(doVertDefFilter)(dstBlock, stride, c);
3141  }else if(mode & V_A_DEBLOCK){
3142  RENAME(do_a_deblock)(dstBlock, stride, 1, c, mode);
3143  }
3144  }
3145 
3146  dstBlock+=8;
3147  srcBlock+=8;
3148  }
3149 
3150  dstBlock = dstBlockStart;
3151  srcBlock = srcBlockStart;
3152 
3153  for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){
3154  const int stride= dstStride;
3155  c->QP = c->QP_block[qp_index];
3156  c->nonBQP = c->nonBQP_block[qp_index];
3157  c->pQPb = c->pQPb_block[qp_index];
3158  c->pQPb2 = c->pQPb2_block[qp_index];
3159 #if TEMPLATE_PP_MMX
3160  RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3161 #endif
3162  /* check if we have a previous block to deblock it with dstBlock */
3163  if(x - 8 >= 0){
3164 #if TEMPLATE_PP_MMX
3165  if(mode & H_X1_FILTER)
3166  RENAME(vertX1Filter)(tempBlock1, 16, c);
3167  else if(mode & H_DEBLOCK){
3168  const int t= RENAME(vertClassify)(tempBlock1, 16, c);
3169  if(t==1)
3170  RENAME(doVertLowPass)(tempBlock1, 16, c);
3171  else if(t==2)
3172  RENAME(doVertDefFilter)(tempBlock1, 16, c);
3173  }else if(mode & H_A_DEBLOCK){
3174  RENAME(do_a_deblock)(tempBlock1, 16, 1, c, mode);
3175  }
3176 
3177  RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3178 
3179 #else
3180  if(mode & H_X1_FILTER)
3181  horizX1Filter(dstBlock-4, stride, c->QP);
3182  else if(mode & H_DEBLOCK){
3183 #if TEMPLATE_PP_ALTIVEC
3184  DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
3185  int t;
3186  transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3187 
3188  t = vertClassify_altivec(tempBlock-48, 16, c);
3189  if(t==1) {
3190  doVertLowPass_altivec(tempBlock-48, 16, c);
3191  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3192  }
3193  else if(t==2) {
3194  doVertDefFilter_altivec(tempBlock-48, 16, c);
3195  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3196  }
3197 #else
3198  const int t= RENAME(horizClassify)(dstBlock-4, stride, c);
3199 
3200  if(t==1)
3201  RENAME(doHorizLowPass)(dstBlock-4, stride, c);
3202  else if(t==2)
3203  RENAME(doHorizDefFilter)(dstBlock-4, stride, c);
3204 #endif
3205  }else if(mode & H_A_DEBLOCK){
3206  RENAME(do_a_deblock)(dstBlock-8, 1, stride, c, mode);
3207  }
3208 #endif //TEMPLATE_PP_MMX
3209  if(mode & DERING){
3210  //FIXME filter first line
3211  if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, c);
3212  }
3213 
3214  if(mode & TEMP_NOISE_FILTER)
3215  {
3216  RENAME(tempNoiseReducer)(dstBlock-8, stride,
3217  c->tempBlurred[isColor] + y*dstStride + x,
3218  c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3219  c->ppMode.maxTmpNoise);
3220  }
3221  }
3222 
3223  dstBlock+=8;
3224  srcBlock+=8;
3225 
3226 #if TEMPLATE_PP_MMX
3227  FFSWAP(uint8_t *, tempBlock1, tempBlock2);
3228 #endif
3229  }
3230  }
3231 
3232  if(mode & DERING){
3233  if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, c);
3234  }
3235 
3236  if((mode & TEMP_NOISE_FILTER)){
3237  RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3238  c->tempBlurred[isColor] + y*dstStride + x,
3239  c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3240  c->ppMode.maxTmpNoise);
3241  }
3242 
3243  /* did we use a tmp buffer for the last lines*/
3244  if(y+15 >= height){
3245  uint8_t *dstBlock= &(dst[y*dstStride]);
3246  if(width==FFABS(dstStride))
3247  linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3248  else{
3249  int i;
3250  for(i=0; i<height-y; i++){
3251  memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3252  }
3253  }
3254  }
3255  }
3256 #if TEMPLATE_PP_MMX
3257  __asm__ volatile("emms");
3258 #endif
3259 
3260 #ifdef DEBUG_BRIGHTNESS
3261  if(!isColor){
3262  int max=1;
3263  int i;
3264  for(i=0; i<256; i++)
3265  if(yHistogram[i] > max) max=yHistogram[i];
3266 
3267  for(i=1; i<256; i++){
3268  int x;
3269  int start=yHistogram[i-1]/(max/256+1);
3270  int end=yHistogram[i]/(max/256+1);
3271  int inc= end > start ? 1 : -1;
3272  for(x=start; x!=end+inc; x+=inc)
3273  dst[ i*dstStride + x]+=128;
3274  }
3275 
3276  for(i=0; i<100; i+=2){
3277  dst[ (white)*dstStride + i]+=128;
3278  dst[ (black)*dstStride + i]+=128;
3279  }
3280  }
3281 #endif
3282 }
3283 
3284 #undef RENAME
3285 #undef TEMPLATE_PP_C
3286 #undef TEMPLATE_PP_ALTIVEC
3287 #undef TEMPLATE_PP_MMX
3288 #undef TEMPLATE_PP_MMXEXT
3289 #undef TEMPLATE_PP_SSE2
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:31
FFMPEG_DEINT_FILTER
#define FFMPEG_DEINT_FILTER
Definition: postprocess_internal.h:67
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:186
mem_internal.h
PPContext
postprocess context.
Definition: postprocess_internal.h:116
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
LOWPASS5_DEINT_FILTER
#define LOWPASS5_DEINT_FILTER
Definition: postprocess_internal.h:68
b
#define b
Definition: input.c:41
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:145
horizX1Filter
static void horizX1Filter(uint8_t *src, int stride, int QP)
Experimental Filter 1 (Horizontal) will not damage linear gradients Flat blocks should look like they...
Definition: postprocess.c:325
doVertLowPass_altivec
static void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
Definition: postprocess_altivec_template.c:214
t1
#define t1
Definition: regdef.h:29
max
#define max(a, b)
Definition: cuda_runtime.h:33
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
H_A_DEBLOCK
#define H_A_DEBLOCK
Definition: postprocess_internal.h:56
FFSIGN
#define FFSIGN(a)
Definition: common.h:65
QP
#define QP(qP, depth)
Definition: h264data.c:190
scale
static av_always_inline float scale(float x, float s)
Definition: vf_v360.c:1389
first
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But first
Definition: rate_distortion.txt:12
postProcess
static void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
Definition: postprocess.c:522
width
#define width
s
#define s(width, name)
Definition: cbs_vp9.c:256
V_A_DEBLOCK
#define V_A_DEBLOCK
Definition: postprocess_internal.h:52
V_DEBLOCK
#define V_DEBLOCK
Definition: postprocess_internal.h:36
TEMP_NOISE_FILTER
#define TEMP_NOISE_FILTER
Definition: postprocess_internal.h:70
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:98
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:64
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
MEDIAN_DEINT_FILTER
#define MEDIAN_DEINT_FILTER
Definition: postprocess_internal.h:66
linecpy
static void linecpy(void *dest, const void *src, int lines, int stride)
Definition: postprocess_internal.h:177
V_X1_FILTER
#define V_X1_FILTER
Definition: postprocess_internal.h:51
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
transpose_16x8_char_toPackedAlign_altivec
static void transpose_16x8_char_toPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1016
PAVGB
#define PAVGB(a, b)
Definition: postprocess_template.c:78
f
f
Definition: af_crystalizer.c:122
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem_internal.h:87
asm.h
transpose_8x16_char_fromPackedAlign_altivec
static void transpose_8x16_char_fromPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1121
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
PREV
@ PREV
Definition: vf_fftdnoiz.c:34
diff
static av_always_inline int diff(const struct color_info *a, const struct color_info *b, const int trans_thresh)
Definition: vf_paletteuse.c:162
height
#define height
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
H_DEBLOCK
#define H_DEBLOCK
Definition: postprocess_internal.h:37
AV_LOG_INFO
#define AV_LOG_INFO
Standard information.
Definition: log.h:191
DERING
#define DERING
Definition: postprocess_internal.h:38
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
VISUALIZE
#define VISUALIZE
Definition: postprocess_internal.h:73
t3
#define t3
Definition: regdef.h:31
av_always_inline
#define av_always_inline
Definition: attributes.h:49
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
av_rescale
int64_t av_rescale(int64_t a, int64_t b, int64_t c)
Rescale a 64-bit integer with rounding to nearest.
Definition: mathematics.c:129
stride
#define stride
Definition: h264pred_template.c:537
NEXT
@ NEXT
Definition: vf_fftdnoiz.c:35
CUBIC_IPOL_DEINT_FILTER
#define CUBIC_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:65
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
vertClassify_altivec
static int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:59
TEMPLATE_PP_MMX
#define TEMPLATE_PP_MMX
Definition: postprocess_template.c:49
t2
#define t2
Definition: regdef.h:30
doVertDefFilter_altivec
static void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:412
mode
mode
Definition: ebur128.h:83
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:112
LINEAR_BLEND_DEINT_FILTER
#define LINEAR_BLEND_DEINT_FILTER
Definition: postprocess_internal.h:63
av_clip_uint8
#define av_clip_uint8
Definition: common.h:101
LINEAR_IPOL_DEINT_FILTER
#define LINEAR_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:62
MANGLE
#define MANGLE(a)
Definition: asm.h:127
H_X1_FILTER
#define H_X1_FILTER
Definition: postprocess_internal.h:55
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
x86_reg
int x86_reg
Definition: asm.h:72
d
d
Definition: ffmpeg_filter.c:156
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
BLOCK_SIZE
#define BLOCK_SIZE
Definition: adx.h:49
int
int
Definition: ffmpeg_filter.c:156
LEVEL_FIX
#define LEVEL_FIX
Brightness & Contrast.
Definition: postprocess_internal.h:39
RENAME
#define RENAME(name)
Definition: ffv1dec.c:117
min
float min
Definition: vorbis_enc_data.h:429