FFmpeg
snowdsp.c
Go to the documentation of this file.
1 /*
2  * MMX and SSE2 optimized snow DSP utils
3  * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "libavutil/cpu.h"
23 #include "libavutil/x86/asm.h"
24 #include "libavcodec/avcodec.h"
25 #include "libavcodec/snow.h"
26 #include "libavcodec/snow_dwt.h"
27 
28 #if HAVE_INLINE_ASM
29 
/**
 * SSE2 inline-asm version of the horizontal 9/7 integer wavelet composition
 * of one line.
 *
 * The first w2 = (width+1)>>1 entries of b and the entries starting at b+w2
 * are updated by four lifting passes (labelled Lift 0..3 below); Lift 3 writes
 * its result to temp instead of updating b in place. The last section then
 * interleaves b[0..] (even output positions) with temp[0..] (odd output
 * positions) back into b.
 *
 * @param b     line buffer, modified in place; the vector loops access &b[0]
 *              with aligned instructions (movdqa / paddw with a memory
 *              operand), so b has to be 16-byte aligned when they run
 * @param temp  scratch buffer that receives the Lift 3 output
 * @param width number of coefficients in the line
 *
 * NOTE(review): the init code visible at the end of this file masks its SSE2
 * test with "& 0", so this function is currently never installed.
 * NOTE(review): none of the asm statements declares the XMM registers it
 * modifies, and constants set up in one asm statement (xmm3, xmm6, xmm7) are
 * assumed to survive until later asm statements.
 */
30 static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
31  const int w2= (width+1)>>1;
32  const int w_l= (width>>1);
33  const int w_r= w2 - 1;
34  int i;
35 
36  { // Lift 0
37  IDWTELEM * const ref = b + w2 - 1;
38  IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
39  // (the first time erroneously), we allow the SSE2 code to run an extra pass.
40  // The savings in code and time are well worth having to store this value and
41  // calculate b[0] correctly afterwards.
42 
43  i = 0;
/* Constants: xmm7 = -1 in every word; xmm3 = (-2 + -1) << 13 = -3 << 13 in every word. */
44  __asm__ volatile(
45  "pcmpeqd %%xmm7, %%xmm7 \n\t"
46  "pcmpeqd %%xmm3, %%xmm3 \n\t"
47  "psllw $1, %%xmm3 \n\t"
48  "paddw %%xmm7, %%xmm3 \n\t"
49  "psllw $13, %%xmm3 \n\t"
50  ::);
/* 16 coefficients per iteration:
 * b[i] += high word of ((ref[i] + ref[i+1] - 1) * (-3 << 13)). */
51  for(; i<w_l-15; i+=16){
52  __asm__ volatile(
53  "movdqu (%1), %%xmm1 \n\t"
54  "movdqu 16(%1), %%xmm5 \n\t"
55  "movdqu 2(%1), %%xmm2 \n\t"
56  "movdqu 18(%1), %%xmm6 \n\t"
57  "paddw %%xmm1, %%xmm2 \n\t"
58  "paddw %%xmm5, %%xmm6 \n\t"
59  "paddw %%xmm7, %%xmm2 \n\t"
60  "paddw %%xmm7, %%xmm6 \n\t"
61  "pmulhw %%xmm3, %%xmm2 \n\t"
62  "pmulhw %%xmm3, %%xmm6 \n\t"
63  "paddw (%0), %%xmm2 \n\t"
64  "paddw 16(%0), %%xmm6 \n\t"
65  "movdqa %%xmm2, (%0) \n\t"
66  "movdqa %%xmm6, 16(%0) \n\t"
67  :: "r"(&b[i]), "r"(&ref[i])
68  : "memory"
69  );
70  }
/* The helper (defined outside this file) finishes the elements the vector loop left over. */
71  snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
/* Redo b[0] from the saved original value, see the comment on b_0 above. */
72  b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
73  }
74 
75  { // Lift 1
76  IDWTELEM * const dst = b+w2;
77 
78  i = 0;
/* Scalar lead-in until &dst[i] is 32-byte aligned (or the range is exhausted). */
79  for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
80  dst[i] = dst[i] - (b[i] + b[i + 1]);
81  }
/* 16 coefficients per iteration: dst[i] -= b[i] + b[i+1]. */
82  for(; i<w_r-15; i+=16){
83  __asm__ volatile(
84  "movdqu (%1), %%xmm1 \n\t"
85  "movdqu 16(%1), %%xmm5 \n\t"
86  "movdqu 2(%1), %%xmm2 \n\t"
87  "movdqu 18(%1), %%xmm6 \n\t"
88  "paddw %%xmm1, %%xmm2 \n\t"
89  "paddw %%xmm5, %%xmm6 \n\t"
90  "movdqa (%0), %%xmm0 \n\t"
91  "movdqa 16(%0), %%xmm4 \n\t"
92  "psubw %%xmm2, %%xmm0 \n\t"
93  "psubw %%xmm6, %%xmm4 \n\t"
94  "movdqa %%xmm0, (%0) \n\t"
95  "movdqa %%xmm4, 16(%0) \n\t"
96  :: "r"(&dst[i]), "r"(&b[i])
97  : "memory"
98  );
99  }
100  snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
101  }
102 
103  { // Lift 2
104  IDWTELEM * const ref = b+w2 - 1;
105  IDWTELEM b_0 = b[0];
106 
107  i = 0;
/* Constants: xmm7 (still all ones from Lift 0) becomes 0x8000 in every word;
 * xmm6 = 7 + 0x8000 in every word. */
108  __asm__ volatile(
109  "psllw $15, %%xmm7 \n\t"
110  "pcmpeqw %%xmm6, %%xmm6 \n\t"
111  "psrlw $13, %%xmm6 \n\t"
112  "paddw %%xmm7, %%xmm6 \n\t"
113  ::);
/* 16 coefficients per iteration. Both inputs get the 0x8000 bias before the
 * unsigned rounding average (pavgw) and the bias is subtracted afterwards:
 *   t    = (pavgw(ref[i] + 0x8007, ref[i+1] + 0x8000) - 0x8000) >> 1
 *   b[i] = b[i] + ((t + b[i]) >> 2) */
114  for(; i<w_l-15; i+=16){
115  __asm__ volatile(
116  "movdqu (%1), %%xmm0 \n\t"
117  "movdqu 16(%1), %%xmm4 \n\t"
118  "movdqu 2(%1), %%xmm1 \n\t"
119  "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
120  "paddw %%xmm6, %%xmm0 \n\t"
121  "paddw %%xmm6, %%xmm4 \n\t"
122  "paddw %%xmm7, %%xmm1 \n\t"
123  "paddw %%xmm7, %%xmm5 \n\t"
124  "pavgw %%xmm1, %%xmm0 \n\t"
125  "pavgw %%xmm5, %%xmm4 \n\t"
126  "psubw %%xmm7, %%xmm0 \n\t"
127  "psubw %%xmm7, %%xmm4 \n\t"
128  "psraw $1, %%xmm0 \n\t"
129  "psraw $1, %%xmm4 \n\t"
130  "movdqa (%0), %%xmm1 \n\t"
131  "movdqa 16(%0), %%xmm5 \n\t"
132  "paddw %%xmm1, %%xmm0 \n\t"
133  "paddw %%xmm5, %%xmm4 \n\t"
134  "psraw $2, %%xmm0 \n\t"
135  "psraw $2, %%xmm4 \n\t"
136  "paddw %%xmm1, %%xmm0 \n\t"
137  "paddw %%xmm5, %%xmm4 \n\t"
138  "movdqa %%xmm0, (%0) \n\t"
139  "movdqa %%xmm4, 16(%0) \n\t"
140  :: "r"(&b[i]), "r"(&ref[i])
141  : "memory"
142  );
143  }
144  snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
/* NOTE(review): this uses W_BO-1 as rounding term while the MMX sibling in
 * this file uses W_BO for the same element - confirm which one is intended. */
145  b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
146  }
147 
148  { // Lift 3
149  IDWTELEM * const src = b+w2;
150 
151  i = 0;
/* Scalar lead-in until &temp[i] is 32-byte aligned (or the range is exhausted).
 * NOTE(review): unlike the lead-out call below, which passes W_AO+1 as "add",
 * this expression has no additive rounding term - confirm it is equivalent. */
152  for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
153  temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
154  }
/* temp[i] = src[i] + s + (s >> 1) with s = b[i] + b[i+1].
 * NOTE(review): each iteration reads and writes 16 coefficients (two XMM
 * registers) but i only advances by 8 and the bound is w_r-7, so the last
 * iteration can touch up to 8 elements past w_r; Lift 1 uses w_r-15 / 16 for
 * the same footprint. "paddw (%1)" also needs &b[i] to be 16-byte aligned,
 * which the lead-in above (aligning temp) does not establish. */
155  for(; i<w_r-7; i+=8){
156  __asm__ volatile(
157  "movdqu 2(%1), %%xmm2 \n\t"
158  "movdqu 18(%1), %%xmm6 \n\t"
159  "paddw (%1), %%xmm2 \n\t"
160  "paddw 16(%1), %%xmm6 \n\t"
161  "movdqu (%0), %%xmm0 \n\t"
162  "movdqu 16(%0), %%xmm4 \n\t"
163  "paddw %%xmm2, %%xmm0 \n\t"
164  "paddw %%xmm6, %%xmm4 \n\t"
165  "psraw $1, %%xmm2 \n\t"
166  "psraw $1, %%xmm6 \n\t"
167  "paddw %%xmm0, %%xmm2 \n\t"
168  "paddw %%xmm4, %%xmm6 \n\t"
169  "movdqa %%xmm2, (%2) \n\t"
170  "movdqa %%xmm6, 16(%2) \n\t"
171  :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
172  : "memory"
173  );
174  }
175  snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
176  }
177 
/* Interleave: b[2k] = b[k], b[2k+1] = temp[k], working from the end of the
 * line towards the start so that unread inputs are not overwritten. */
178  {
/* The helper is defined outside this file; presumably it handles the line
 * end and leaves i at the even index to continue from - TODO confirm. */
179  snow_interleave_line_header(&i, width, b, temp);
180 
/* Scalar steps until the low bits of i equal 0x3E, i.e. until i+2 is a multiple of 64. */
181  for (; (i & 0x3E) != 0x3E; i-=2){
182  b[i+1] = temp[i>>1];
183  b[i] = b[i>>1];
184  }
/* Each iteration merges 32 words from &b[i>>1] with 32 words from
 * &temp[i>>1] (punpcklwd / punpckhwd) into 64 words at &b[i]. */
185  for (i-=62; i>=0; i-=64){
186  __asm__ volatile(
187  "movdqa (%1), %%xmm0 \n\t"
188  "movdqa 16(%1), %%xmm2 \n\t"
189  "movdqa 32(%1), %%xmm4 \n\t"
190  "movdqa 48(%1), %%xmm6 \n\t"
191  "movdqa (%1), %%xmm1 \n\t"
192  "movdqa 16(%1), %%xmm3 \n\t"
193  "movdqa 32(%1), %%xmm5 \n\t"
194  "movdqa 48(%1), %%xmm7 \n\t"
195  "punpcklwd (%2), %%xmm0 \n\t"
196  "punpcklwd 16(%2), %%xmm2 \n\t"
197  "punpcklwd 32(%2), %%xmm4 \n\t"
198  "punpcklwd 48(%2), %%xmm6 \n\t"
199  "movdqa %%xmm0, (%0) \n\t"
200  "movdqa %%xmm2, 32(%0) \n\t"
201  "movdqa %%xmm4, 64(%0) \n\t"
202  "movdqa %%xmm6, 96(%0) \n\t"
203  "punpckhwd (%2), %%xmm1 \n\t"
204  "punpckhwd 16(%2), %%xmm3 \n\t"
205  "punpckhwd 32(%2), %%xmm5 \n\t"
206  "punpckhwd 48(%2), %%xmm7 \n\t"
207  "movdqa %%xmm1, 16(%0) \n\t"
208  "movdqa %%xmm3, 48(%0) \n\t"
209  "movdqa %%xmm5, 80(%0) \n\t"
210  "movdqa %%xmm7, 112(%0) \n\t"
211  :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
212  : "memory"
213  );
214  }
215  }
216 }
217 
/**
 * MMX inline-asm version of the horizontal 9/7 integer wavelet composition
 * of one line.
 *
 * Same structure as the SSE2 variant above: four lifting passes (Lift 0..3)
 * over b[0..w2) and b[w2..), with Lift 3 writing to temp, followed by
 * interleaving b[0..] and temp[0..] back into b. Here every vector iteration
 * handles 8 coefficients (two 64-bit MMX registers), and in Lift 0 and Lift 2
 * b[0] is computed in C first and the vector loop starts at i = 1.
 *
 * @param b     line buffer, modified in place
 * @param temp  scratch buffer that receives the Lift 3 output
 * @param width number of coefficients in the line
 *
 * Lift 2 uses pavgw on MMX registers; the init code in this file only
 * installs this function when AV_CPU_FLAG_MMXEXT is set.
 * NOTE(review): no emms is executed here, and the asm statements do not
 * declare the MMX registers they modify; mm7 set up in Lift 0 is reused by
 * Lift 2 - presumably callers take care of the MMX state, confirm.
 */
218 static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
219  const int w2= (width+1)>>1;
220  const int w_l= (width>>1);
221  const int w_r= w2 - 1;
222  int i;
223 
224  { // Lift 0
225  IDWTELEM * const ref = b + w2 - 1;
226 
227  i = 1;
/* First element handled in C; it uses ref[1] for both neighbours. */
228  b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
/* Constants: mm7 = -1 in every word; mm3 = -3 << 13 in every word. */
229  __asm__ volatile(
230  "pcmpeqw %%mm7, %%mm7 \n\t"
231  "pcmpeqw %%mm3, %%mm3 \n\t"
232  "psllw $1, %%mm3 \n\t"
233  "paddw %%mm7, %%mm3 \n\t"
234  "psllw $13, %%mm3 \n\t"
235  ::);
/* 8 coefficients per iteration:
 * b[i] += high word of ((ref[i] + ref[i+1] - 1) * (-3 << 13)). */
236  for(; i<w_l-7; i+=8){
237  __asm__ volatile(
238  "movq (%1), %%mm2 \n\t"
239  "movq 8(%1), %%mm6 \n\t"
240  "paddw 2(%1), %%mm2 \n\t"
241  "paddw 10(%1), %%mm6 \n\t"
242  "paddw %%mm7, %%mm2 \n\t"
243  "paddw %%mm7, %%mm6 \n\t"
244  "pmulhw %%mm3, %%mm2 \n\t"
245  "pmulhw %%mm3, %%mm6 \n\t"
246  "paddw (%0), %%mm2 \n\t"
247  "paddw 8(%0), %%mm6 \n\t"
248  "movq %%mm2, (%0) \n\t"
249  "movq %%mm6, 8(%0) \n\t"
250  :: "r"(&b[i]), "r"(&ref[i])
251  : "memory"
252  );
253  }
/* The helper (defined outside this file) finishes the elements the vector loop left over. */
254  snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
255  }
256 
257  { // Lift 1
258  IDWTELEM * const dst = b+w2;
259 
260  i = 0;
/* 8 coefficients per iteration: dst[i] -= b[i] + b[i+1]. */
261  for(; i<w_r-7; i+=8){
262  __asm__ volatile(
263  "movq (%1), %%mm2 \n\t"
264  "movq 8(%1), %%mm6 \n\t"
265  "paddw 2(%1), %%mm2 \n\t"
266  "paddw 10(%1), %%mm6 \n\t"
267  "movq (%0), %%mm0 \n\t"
268  "movq 8(%0), %%mm4 \n\t"
269  "psubw %%mm2, %%mm0 \n\t"
270  "psubw %%mm6, %%mm4 \n\t"
271  "movq %%mm0, (%0) \n\t"
272  "movq %%mm4, 8(%0) \n\t"
273  :: "r"(&dst[i]), "r"(&b[i])
274  : "memory"
275  );
276  }
277  snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
278  }
279 
280  { // Lift 2
281  IDWTELEM * const ref = b+w2 - 1;
282 
283  i = 1;
/* First element handled in C; it uses ref[1] for both neighbours. */
284  b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
/* Constants: mm7 (still all ones from Lift 0) becomes 0x8000 in every word;
 * mm6 = 7 + 0x8000 in every word. */
285  __asm__ volatile(
286  "psllw $15, %%mm7 \n\t"
287  "pcmpeqw %%mm6, %%mm6 \n\t"
288  "psrlw $13, %%mm6 \n\t"
289  "paddw %%mm7, %%mm6 \n\t"
290  ::);
/* 8 coefficients per iteration, using the 0x8000 bias around the unsigned
 * rounding average (pavgw):
 *   t    = (pavgw(ref[i] + 0x8007, ref[i+1] + 0x8000) - 0x8000) >> 1
 *   b[i] = b[i] + ((t + b[i]) >> 2) */
291  for(; i<w_l-7; i+=8){
292  __asm__ volatile(
293  "movq (%1), %%mm0 \n\t"
294  "movq 8(%1), %%mm4 \n\t"
295  "movq 2(%1), %%mm1 \n\t"
296  "movq 10(%1), %%mm5 \n\t"
297  "paddw %%mm6, %%mm0 \n\t"
298  "paddw %%mm6, %%mm4 \n\t"
299  "paddw %%mm7, %%mm1 \n\t"
300  "paddw %%mm7, %%mm5 \n\t"
301  "pavgw %%mm1, %%mm0 \n\t"
302  "pavgw %%mm5, %%mm4 \n\t"
303  "psubw %%mm7, %%mm0 \n\t"
304  "psubw %%mm7, %%mm4 \n\t"
305  "psraw $1, %%mm0 \n\t"
306  "psraw $1, %%mm4 \n\t"
307  "movq (%0), %%mm1 \n\t"
308  "movq 8(%0), %%mm5 \n\t"
309  "paddw %%mm1, %%mm0 \n\t"
310  "paddw %%mm5, %%mm4 \n\t"
311  "psraw $2, %%mm0 \n\t"
312  "psraw $2, %%mm4 \n\t"
313  "paddw %%mm1, %%mm0 \n\t"
314  "paddw %%mm5, %%mm4 \n\t"
315  "movq %%mm0, (%0) \n\t"
316  "movq %%mm4, 8(%0) \n\t"
317  :: "r"(&b[i]), "r"(&ref[i])
318  : "memory"
319  );
320  }
321  snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
322  }
323 
324  { // Lift 3
325  IDWTELEM * const src = b+w2;
326  i = 0;
327 
/* 8 coefficients per iteration:
 * temp[i] = src[i] + s + (s >> 1) with s = b[i] + b[i+1]. */
328  for(; i<w_r-7; i+=8){
329  __asm__ volatile(
330  "movq 2(%1), %%mm2 \n\t"
331  "movq 10(%1), %%mm6 \n\t"
332  "paddw (%1), %%mm2 \n\t"
333  "paddw 8(%1), %%mm6 \n\t"
334  "movq (%0), %%mm0 \n\t"
335  "movq 8(%0), %%mm4 \n\t"
336  "paddw %%mm2, %%mm0 \n\t"
337  "paddw %%mm6, %%mm4 \n\t"
338  "psraw $1, %%mm2 \n\t"
339  "psraw $1, %%mm6 \n\t"
340  "paddw %%mm0, %%mm2 \n\t"
341  "paddw %%mm4, %%mm6 \n\t"
342  "movq %%mm2, (%2) \n\t"
343  "movq %%mm6, 8(%2) \n\t"
344  :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
345  : "memory"
346  );
347  }
348  snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
349  }
350 
/* Interleave: b[2k] = b[k], b[2k+1] = temp[k], working from the end of the
 * line towards the start. */
351  {
/* The helper is defined outside this file; presumably it handles the line
 * end and leaves i at the even index to continue from - TODO confirm. */
352  snow_interleave_line_header(&i, width, b, temp);
353 
/* Scalar steps until the low bits of i equal 0x1E, i.e. until i+2 is a multiple of 32. */
354  for (; (i & 0x1E) != 0x1E; i-=2){
355  b[i+1] = temp[i>>1];
356  b[i] = b[i>>1];
357  }
/* Each iteration merges 16 words from &b[i>>1] with 16 words from
 * &temp[i>>1] (punpcklwd / punpckhwd) into 32 words at &b[i]. */
358  for (i-=30; i>=0; i-=32){
359  __asm__ volatile(
360  "movq (%1), %%mm0 \n\t"
361  "movq 8(%1), %%mm2 \n\t"
362  "movq 16(%1), %%mm4 \n\t"
363  "movq 24(%1), %%mm6 \n\t"
364  "movq (%1), %%mm1 \n\t"
365  "movq 8(%1), %%mm3 \n\t"
366  "movq 16(%1), %%mm5 \n\t"
367  "movq 24(%1), %%mm7 \n\t"
368  "punpcklwd (%2), %%mm0 \n\t"
369  "punpcklwd 8(%2), %%mm2 \n\t"
370  "punpcklwd 16(%2), %%mm4 \n\t"
371  "punpcklwd 24(%2), %%mm6 \n\t"
372  "movq %%mm0, (%0) \n\t"
373  "movq %%mm2, 16(%0) \n\t"
374  "movq %%mm4, 32(%0) \n\t"
375  "movq %%mm6, 48(%0) \n\t"
376  "punpckhwd (%2), %%mm1 \n\t"
377  "punpckhwd 8(%2), %%mm3 \n\t"
378  "punpckhwd 16(%2), %%mm5 \n\t"
379  "punpckhwd 24(%2), %%mm7 \n\t"
380  "movq %%mm1, 8(%0) \n\t"
381  "movq %%mm3, 24(%0) \n\t"
382  "movq %%mm5, 40(%0) \n\t"
383  "movq %%mm7, 56(%0) \n\t"
384  :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
385  : "memory"
386  );
387  }
388  }
389 }
390 
391 #if HAVE_7REGS
/*
 * String-building helpers for the vertical compose asm loops. Each macro
 * expands to four instructions that apply one operation to four registers,
 * so one invocation covers 64 bytes in the SSE2 loop. Register names are
 * passed without the "%%" prefix; memory operands are addressed as
 * offset(r, FF_REG_d), i.e. base register r plus the byte index kept in
 * FF_REG_d.
 */
/* Apply "op" (a load or an arithmetic instruction with memory source) to t0..t3 from r+{0,16,32,48}. */
392 #define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
393  ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
394  ""op" 16("r",%%"FF_REG_d"), %%"t1" \n\t"\
395  ""op" 32("r",%%"FF_REG_d"), %%"t2" \n\t"\
396  ""op" 48("r",%%"FF_REG_d"), %%"t3" \n\t"
397 
/* Aligned load (movdqa) of 64 bytes from r into t0..t3. */
398 #define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
399  snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
400 
/* Word-wise add (paddw) of 64 bytes from r to t0..t3; the memory operand must be 16-byte aligned. */
401 #define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
402  snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
403 
/* Register-to-register word-wise subtract: tN -= sN. Used by both the SSE2 and the MMX loop. */
404 #define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
405  "psubw %%"s0", %%"t0" \n\t"\
406  "psubw %%"s1", %%"t1" \n\t"\
407  "psubw %%"s2", %%"t2" \n\t"\
408  "psubw %%"s3", %%"t3" \n\t"
409 
/* Aligned store (movdqa) of s0..s3 to 64 bytes at w. */
410 #define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
411  "movdqa %%"s0", ("w",%%"FF_REG_d") \n\t"\
412  "movdqa %%"s1", 16("w",%%"FF_REG_d") \n\t"\
413  "movdqa %%"s2", 32("w",%%"FF_REG_d") \n\t"\
414  "movdqa %%"s3", 48("w",%%"FF_REG_d") \n\t"
415 
/* Arithmetic right shift of every word in t0..t3 by the immediate n (given as a string). */
416 #define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
417  "psraw $"n", %%"t0" \n\t"\
418  "psraw $"n", %%"t1" \n\t"\
419  "psraw $"n", %%"t2" \n\t"\
420  "psraw $"n", %%"t3" \n\t"
421 
/* Register-to-register word-wise add: tN += sN. */
422 #define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
423  "paddw %%"s0", %%"t0" \n\t"\
424  "paddw %%"s1", %%"t1" \n\t"\
425  "paddw %%"s2", %%"t2" \n\t"\
426  "paddw %%"s3", %%"t3" \n\t"
427 
/* Register-to-register signed multiply keeping the high word: tN = (tN * sN) >> 16. */
428 #define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
429  "pmulhw %%"s0", %%"t0" \n\t"\
430  "pmulhw %%"s1", %%"t1" \n\t"\
431  "pmulhw %%"s2", %%"t2" \n\t"\
432  "pmulhw %%"s3", %%"t3" \n\t"
433 
/* Register copy: tN = sN. */
434 #define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
435  "movdqa %%"s0", %%"t0" \n\t"\
436  "movdqa %%"s1", %%"t1" \n\t"\
437  "movdqa %%"s2", %%"t2" \n\t"\
438  "movdqa %%"s3", %%"t3" \n\t"
439 
440 static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
441  x86_reg i = width;
442 
443  while(i & 0x1F)
444  {
445  i--;
446  b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
447  b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
448  b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
449  b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
450  }
451  i+=i;
452 
453  __asm__ volatile (
454  "jmp 2f \n\t"
455  "1: \n\t"
456  snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
457  snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
458 
459 
460  "pcmpeqw %%xmm0, %%xmm0 \n\t"
461  "pcmpeqw %%xmm2, %%xmm2 \n\t"
462  "paddw %%xmm2, %%xmm2 \n\t"
463  "paddw %%xmm0, %%xmm2 \n\t"
464  "psllw $13, %%xmm2 \n\t"
465  snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
466  snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
467  snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
468  snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
469  snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
470  snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
471  snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
472  snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
473 
474  "pcmpeqw %%xmm7, %%xmm7 \n\t"
475  "pcmpeqw %%xmm5, %%xmm5 \n\t"
476  "psllw $15, %%xmm7 \n\t"
477  "psrlw $13, %%xmm5 \n\t"
478  "paddw %%xmm7, %%xmm5 \n\t"
479  snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
480  "movq (%2,%%"FF_REG_d"), %%xmm1 \n\t"
481  "movq 8(%2,%%"FF_REG_d"), %%xmm3 \n\t"
482  "paddw %%xmm7, %%xmm1 \n\t"
483  "paddw %%xmm7, %%xmm3 \n\t"
484  "pavgw %%xmm1, %%xmm0 \n\t"
485  "pavgw %%xmm3, %%xmm2 \n\t"
486  "movq 16(%2,%%"FF_REG_d"), %%xmm1 \n\t"
487  "movq 24(%2,%%"FF_REG_d"), %%xmm3 \n\t"
488  "paddw %%xmm7, %%xmm1 \n\t"
489  "paddw %%xmm7, %%xmm3 \n\t"
490  "pavgw %%xmm1, %%xmm4 \n\t"
491  "pavgw %%xmm3, %%xmm6 \n\t"
492  snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
493  snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
494  snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
495 
496  snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
497  snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
498  snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
499  snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
500  snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
501  snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
502  snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
503  snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
504  snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
505 
506  "2: \n\t"
507  "sub $64, %%"FF_REG_d" \n\t"
508  "jge 1b \n\t"
509  :"+d"(i)
510  :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
511 }
512 
/*
 * MMX counterparts of the SSE2 vertical compose helpers: each macro expands
 * to four instructions on four 64-bit registers, covering 32 bytes at
 * offsets 0/8/16/24 from (r + FF_REG_d).
 */
/* Apply "op" (a load or an arithmetic instruction with memory source) to t0..t3 from r+{0,8,16,24}. */
513 #define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
514  ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
515  ""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\
516  ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
517  ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"
518 
/* Load (movq) 32 bytes from r into t0..t3. */
519 #define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
520  snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
521 
/* Word-wise add (paddw) of 32 bytes from r to t0..t3. */
522 #define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
523  snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
524 
/* Store (movq) s0..s3 to 32 bytes at w. */
525 #define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
526  "movq %%"s0", ("w",%%"FF_REG_d") \n\t"\
527  "movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\
528  "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
529  "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"
530 
/* Register copy: tN = sN. */
531 #define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
532  "movq %%"s0", %%"t0" \n\t"\
533  "movq %%"s1", %%"t1" \n\t"\
534  "movq %%"s2", %%"t2" \n\t"\
535  "movq %%"s3", %%"t3" \n\t"
536 
537 
/**
 * MMX inline-asm version of the vertical 9/7 integer wavelet composition.
 *
 * Updates the four lines b4, b3, b2 and b1 in place from their vertical
 * neighbours, column by column, in that order. The trailing (width & 15)
 * columns are handled by the scalar loop; the remaining multiple of 16
 * columns is processed by the asm loop, 16 columns (32 bytes per line) per
 * iteration, from the highest offset down to 0.
 *
 * @param b0..b5 six lines; b1..b4 are written, b0 and b5 are only read
 * @param width  number of columns
 *
 * The loop uses pavgw on MMX registers; the init code in this file only
 * installs this function when AV_CPU_FLAG_MMXEXT is set.
 * NOTE(review): the asm statement writes memory and MMX registers but
 * declares no clobbers, and no emms is executed here - confirm callers and
 * build settings account for that.
 */
538 static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
539  x86_reg i = width;
/* Scalar tail: peel columns off the end until i is a multiple of 16. */
540  while(i & 15)
541  {
542  i--;
543  b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
544  b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
545  b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
546  b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
547  }
/* Double i: from here on it is used as a byte offset in the asm addressing. */
548  i+=i;
/* Operands: %0 = byte offset (FF_REG_d), %1..%6 = b0..b5. */
549  __asm__ volatile(
550  "jmp 2f \n\t"
551  "1: \n\t"
552 
/* b4 += ((b3 + b5 - 1) * (-3 << 13)) >> 16 */
553  snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
554  snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
555  "pcmpeqw %%mm0, %%mm0 \n\t"
556  "pcmpeqw %%mm2, %%mm2 \n\t"
557  "paddw %%mm2, %%mm2 \n\t"
558  "paddw %%mm0, %%mm2 \n\t"
559  "psllw $13, %%mm2 \n\t"
560  snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
561  snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
562  snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
563  snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
/* b3 -= b2 + b4 (mm1.. still hold the new b4) */
564  snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
565  snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
566  snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
567  snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
/* b2 update: mm7 = 0x8000 bias, mm5 = 0x8007; the bias is added before the
 * unsigned rounding average (pavgw) with b1 and subtracted afterwards. */
568  "pcmpeqw %%mm7, %%mm7 \n\t"
569  "pcmpeqw %%mm5, %%mm5 \n\t"
570  "psllw $15, %%mm7 \n\t"
571  "psrlw $13, %%mm5 \n\t"
572  "paddw %%mm7, %%mm5 \n\t"
573  snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
574  "movq (%2,%%"FF_REG_d"), %%mm1 \n\t"
575  "movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t"
576  "paddw %%mm7, %%mm1 \n\t"
577  "paddw %%mm7, %%mm3 \n\t"
578  "pavgw %%mm1, %%mm0 \n\t"
579  "pavgw %%mm3, %%mm2 \n\t"
580  "movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t"
581  "movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t"
582  "paddw %%mm7, %%mm1 \n\t"
583  "paddw %%mm7, %%mm3 \n\t"
584  "pavgw %%mm1, %%mm4 \n\t"
585  "pavgw %%mm3, %%mm6 \n\t"
586  snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
587  snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
588  snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
589 
590  snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
591  snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
592  snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
/* b1 += s + (s >> 1) with s = b0 + b2 (mm0.. still hold the new b2) */
593  snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
594  snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
595  snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
596  snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
597  snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
598  snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
599 
/* Step back one 32-byte group; leave once the offset goes negative. */
600  "2: \n\t"
601  "sub $32, %%"FF_REG_d" \n\t"
602  "jge 1b \n\t"
603  :"+d"(i)
604  :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
605 }
606 #endif //HAVE_7REGS
607 
608 #if HAVE_6REGS
/*
 * Building blocks for the SSE2 inner_add_yblock kernels. The _header macro
 * opens an __asm__ volatile( statement and the _end_* macros close it, so a
 * kernel body is: header, start/accum invocations, kernel-specific
 * instructions, end.
 *
 * Operand numbering (see snow_inner_add_yblock_sse2_end_common2):
 *   %0 dst8 (memory)        %1 dst_array (memory)   %2 tmp (register)
 *   %3 src_x<<1             %4 obmc (memory)        %5 block (in FF_REG_a)
 *   %6 b_h (memory)         %7 src_stride (memory)
 * Register roles set up by the header:
 *   FF_REG_c = src_stride, %2 = remaining rows (starts at b_h),
 *   FF_REG_S = current obmc pointer, xmm7 = 0, xmm3 = 8 in every word
 *   (rounding constant), FF_REG_D = dst_array[0] + (src_x<<1) bytes,
 *   reloaded at label 1 for every loop iteration.
 */
609 #define snow_inner_add_yblock_sse2_header \
610  IDWTELEM * * dst_array = sb->line + src_y;\
611  x86_reg tmp;\
612  __asm__ volatile(\
613  "mov %7, %%"FF_REG_c" \n\t"\
614  "mov %6, %2 \n\t"\
615  "mov %4, %%"FF_REG_S" \n\t"\
616  "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
617  "pcmpeqd %%xmm3, %%xmm3 \n\t"\
618  "psllw $15, %%xmm3 \n\t"\
619  "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
620  "1: \n\t"\
621  "mov %1, %%"FF_REG_D" \n\t"\
622  "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
623  "add %3, %%"FF_REG_D" \n\t"
624 
/*
 * 8-pixel-wide variant, two rows at once: load 8 bytes from block[ptr_offset]
 * and 8 bytes one src_stride further, widen both to words, and multiply them
 * by the 8 obmc bytes at s_offset and at s_offset+16 respectively.
 * Results: out_reg1 = first row, out_reg2 = second row. Uses xmm0/xmm4 as
 * scratch and FF_REG_d as pointer.
 */
625 #define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
626  "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
627  "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
628  "movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2" \n\t"\
629  "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
630  "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
631  "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
632  "movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\
633  "punpcklbw %%xmm7, %%xmm0 \n\t"\
634  "punpcklbw %%xmm7, %%xmm4 \n\t"\
635  "pmullw %%xmm0, %%"out_reg1" \n\t"\
636  "pmullw %%xmm4, %%"out_reg2" \n\t"
637 
/*
 * 16-pixel-wide variant, one row: load bytes 0..7 and 8..15 of
 * block[ptr_offset], widen to words, and multiply by the obmc bytes at
 * s_offset and s_offset+8. Results: out_reg1 = left half, out_reg2 = right half.
 */
638 #define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
639  "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
640  "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
641  "movq 8(%%"FF_REG_d"), %%"out_reg2" \n\t"\
642  "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
643  "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
644  "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
645  "movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\
646  "punpcklbw %%xmm7, %%xmm0 \n\t"\
647  "punpcklbw %%xmm7, %%xmm4 \n\t"\
648  "pmullw %%xmm0, %%"out_reg1" \n\t"\
649  "pmullw %%xmm4, %%"out_reg2" \n\t"
650 
/* Compute another weighted block into xmm2/xmm6 and add it, with unsigned saturation, to the running sums in xmm1/xmm5. */
651 #define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
652  snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
653  "paddusw %%xmm2, %%xmm1 \n\t"\
654  "paddusw %%xmm6, %%xmm5 \n\t"
655 
/* Same as the _8 accumulator, for the 16-wide layout. */
656 #define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
657  snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
658  "paddusw %%xmm2, %%xmm1 \n\t"\
659  "paddusw %%xmm6, %%xmm5 \n\t"
660 
/* Advance to the next row(s): obmc pointer += 32 bytes, dst8 and the four block[] pointers += FF_REG_c. */
661 #define snow_inner_add_yblock_sse2_end_common1\
662  "add $32, %%"FF_REG_S" \n\t"\
663  "add %%"FF_REG_c", %0 \n\t"\
664  "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
665  "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
666  "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
667  "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"
668 
/* Loop back while the row counter (decremented just before this macro) is non-zero, then close the asm statement with its operand and clobber lists. */
669 #define snow_inner_add_yblock_sse2_end_common2\
670  "jnz 1b \n\t"\
671  :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
672  :\
673  "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
674  XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
675  "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
676 
/* Loop tail for the two-rows-per-iteration kernel: FF_REG_c is doubled while the pointers are advanced and restored afterwards, dst_array moves on by two entries, and the row counter drops by 2. */
677 #define snow_inner_add_yblock_sse2_end_8\
678  "sal $1, %%"FF_REG_c" \n\t"\
679  "add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\
680  snow_inner_add_yblock_sse2_end_common1\
681  "sar $1, %%"FF_REG_c" \n\t"\
682  "sub $2, %2 \n\t"\
683  snow_inner_add_yblock_sse2_end_common2
684 
/* Loop tail for the one-row-per-iteration kernel: dst_array moves on by one entry and the row counter drops by 1. */
685 #define snow_inner_add_yblock_sse2_end_16\
686  "add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\
687  snow_inner_add_yblock_sse2_end_common1\
688  "dec %2 \n\t"\
689  snow_inner_add_yblock_sse2_end_common2
690 
/**
 * SSE2 kernel for 8-pixel-wide blocks with an obmc stride of 16, processing
 * two rows per loop iteration (so b_h must be even, as the dispatcher in
 * this file ensures).
 *
 * For each pair of rows the four block[] sources are weighted by their obmc
 * coefficients and summed, the sum is combined with the data of the current
 * and next dst_array line, and 8 output bytes per row are stored to dst8 and
 * dst8 + src_stride. The whole body is one asm statement assembled from the
 * snow_inner_add_yblock_sse2_* macros.
 *
 * obmc_stride, b_w and add are not referenced by the body.
 *
 * NOTE(review): this kernel widens the weighted sums to 32 bits and adds
 * 32 bytes of line data per row with paddd, then shifts by 8, whereas the
 * line offset is src_x<<1 and the sibling kernels add line data with 16-bit
 * paddw and shift by 4 - confirm the element size and scaling this path
 * expects.
 */
691 static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
692  int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
693 snow_inner_add_yblock_sse2_header
/* Weighted sum of the four sources: xmm1 = first row, xmm5 = second row. */
694 snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
695 snow_inner_add_yblock_sse2_accum_8("2", "8")
696 snow_inner_add_yblock_sse2_accum_8("1", "128")
697 snow_inner_add_yblock_sse2_accum_8("0", "136")
698 
/* First row: widen xmm1 to dwords, add the line data at FF_REG_D and the rounding constant. */
699  "mov %0, %%"FF_REG_d" \n\t"
700  "movdqa (%%"FF_REG_D"), %%xmm0 \n\t"
701  "movdqa %%xmm1, %%xmm2 \n\t"
702 
703  "punpckhwd %%xmm7, %%xmm1 \n\t"
704  "punpcklwd %%xmm7, %%xmm2 \n\t"
705  "paddd %%xmm2, %%xmm0 \n\t"
706  "movdqa 16(%%"FF_REG_D"), %%xmm2\n\t"
707  "paddd %%xmm1, %%xmm2 \n\t"
708  "paddd %%xmm3, %%xmm0 \n\t"
709  "paddd %%xmm3, %%xmm2 \n\t"
710 
/* Second row: same with dst_array[1] + (src_x<<1) as line pointer. */
711  "mov %1, %%"FF_REG_D" \n\t"
712  "mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t"
713  "add %3, %%"FF_REG_D" \n\t"
714 
715  "movdqa (%%"FF_REG_D"), %%xmm4 \n\t"
716  "movdqa %%xmm5, %%xmm6 \n\t"
717  "punpckhwd %%xmm7, %%xmm5 \n\t"
718  "punpcklwd %%xmm7, %%xmm6 \n\t"
719  "paddd %%xmm6, %%xmm4 \n\t"
720  "movdqa 16(%%"FF_REG_D"), %%xmm6\n\t"
721  "paddd %%xmm5, %%xmm6 \n\t"
722  "paddd %%xmm3, %%xmm4 \n\t"
723  "paddd %%xmm3, %%xmm6 \n\t"
724 
/* Scale down, pack to unsigned bytes with saturation and store 8 bytes at dst8. */
725  "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
726  "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
727  "packssdw %%xmm2, %%xmm0 \n\t"
728  "packuswb %%xmm7, %%xmm0 \n\t"
729  "movq %%xmm0, (%%"FF_REG_d") \n\t"
730 
/* Same for the second row, stored at dst8 + src_stride. */
731  "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
732  "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
733  "packssdw %%xmm6, %%xmm4 \n\t"
734  "packuswb %%xmm7, %%xmm4 \n\t"
735  "movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t"
736 snow_inner_add_yblock_sse2_end_8
737 }
738 
/**
 * SSE2 kernel for 16-pixel-wide blocks; the hard-coded obmc offsets
 * (0, 16, 512, 528) and the 32-byte obmc step per row correspond to an obmc
 * stride of 32. One row is produced per loop iteration.
 *
 * Per row: the four block[] sources are weighted by their obmc coefficients
 * and summed (xmm1 = pixels 0..7, xmm5 = pixels 8..15), the sum is shifted
 * right by 4, the 16-bit line data at dst_array[0] + (src_x<<1) and the
 * rounding constant 8 are added, the result is shifted right by 4 again,
 * packed to unsigned bytes with saturation and stored (unaligned) to dst8.
 *
 * obmc_stride, b_w and add are not referenced by the body.
 */
739 static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
740  int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
741 snow_inner_add_yblock_sse2_header
742 snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
743 snow_inner_add_yblock_sse2_accum_16("2", "16")
744 snow_inner_add_yblock_sse2_accum_16("1", "512")
745 snow_inner_add_yblock_sse2_accum_16("0", "528")
746 
/* FF_REG_d = current dst8 pointer; the paddw memory operands require the line data to be 16-byte aligned. */
747  "mov %0, %%"FF_REG_d" \n\t"
748  "psrlw $4, %%xmm1 \n\t"
749  "psrlw $4, %%xmm5 \n\t"
750  "paddw (%%"FF_REG_D"), %%xmm1 \n\t"
751  "paddw 16(%%"FF_REG_D"), %%xmm5 \n\t"
752  "paddw %%xmm3, %%xmm1 \n\t"
753  "paddw %%xmm3, %%xmm5 \n\t"
754  "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
755  "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
756  "packuswb %%xmm5, %%xmm1 \n\t"
757 
758  "movdqu %%xmm1, (%%"FF_REG_d") \n\t"
759 
760 snow_inner_add_yblock_sse2_end_16
761 }
762 
/*
 * Building blocks for the MMX inner_add_yblock kernels. As with the SSE2
 * set, _header opens an __asm__ volatile( statement and _end closes it.
 *
 * Operand numbering (see snow_inner_add_yblock_mmx_end):
 *   %0 dst8 (memory)        %1 dst_array (memory)   %2 tmp (register)
 *   %3 src_x<<1             %4 obmc (memory)        %5 block (in FF_REG_a)
 *   %6 b_h (memory)         %7 src_stride (memory)
 * Register roles set up by the header:
 *   FF_REG_c = src_stride, %2 = remaining rows (starts at b_h),
 *   FF_REG_S = current obmc pointer, mm7 = 0, mm3 = 8 in every word
 *   (rounding constant), FF_REG_D = dst_array[0] + (src_x<<1) bytes,
 *   reloaded at label 1 for every loop iteration.
 */
763 #define snow_inner_add_yblock_mmx_header \
764  IDWTELEM * * dst_array = sb->line + src_y;\
765  x86_reg tmp;\
766  __asm__ volatile(\
767  "mov %7, %%"FF_REG_c" \n\t"\
768  "mov %6, %2 \n\t"\
769  "mov %4, %%"FF_REG_S" \n\t"\
770  "pxor %%mm7, %%mm7 \n\t" /* 0 */\
771  "pcmpeqd %%mm3, %%mm3 \n\t"\
772  "psllw $15, %%mm3 \n\t"\
773  "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
774  "1: \n\t"\
775  "mov %1, %%"FF_REG_D" \n\t"\
776  "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
777  "add %3, %%"FF_REG_D" \n\t"
778 
/*
 * Load 4+4 bytes of block[ptr_offset] starting at byte d_offset, widen them
 * to words and multiply by the 4+4 obmc bytes at s_offset / s_offset+4.
 * Results: out_reg1 = pixels 0..3, out_reg2 = pixels 4..7 of the group.
 * Uses mm0/mm4 as scratch and FF_REG_d as pointer.
 */
779 #define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
780  "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
781  "movd "d_offset"(%%"FF_REG_d"), %%"out_reg1" \n\t"\
782  "movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2" \n\t"\
783  "punpcklbw %%mm7, %%"out_reg1" \n\t"\
784  "punpcklbw %%mm7, %%"out_reg2" \n\t"\
785  "movd "s_offset"(%%"FF_REG_S"), %%mm0 \n\t"\
786  "movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\
787  "punpcklbw %%mm7, %%mm0 \n\t"\
788  "punpcklbw %%mm7, %%mm4 \n\t"\
789  "pmullw %%mm0, %%"out_reg1" \n\t"\
790  "pmullw %%mm4, %%"out_reg2" \n\t"
791 
/* Compute another weighted group into mm2/mm6 and add it, with unsigned saturation, to the running sums in mm1/mm5. */
792 #define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
793  snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
794  "paddusw %%mm2, %%mm1 \n\t"\
795  "paddusw %%mm6, %%mm5 \n\t"
796 
/*
 * Finish one group of 8 pixels: shift the sums right by 4, add the 16-bit
 * line data at FF_REG_D + read_offset and the rounding constant, shift right
 * by 4 again, pack to unsigned bytes with saturation and store 8 bytes at
 * dst8 + write_offset.
 */
797 #define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
798  "mov %0, %%"FF_REG_d" \n\t"\
799  "psrlw $4, %%mm1 \n\t"\
800  "psrlw $4, %%mm5 \n\t"\
801  "paddw "read_offset"(%%"FF_REG_D"), %%mm1 \n\t"\
802  "paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\
803  "paddw %%mm3, %%mm1 \n\t"\
804  "paddw %%mm3, %%mm5 \n\t"\
805  "psraw $4, %%mm1 \n\t"\
806  "psraw $4, %%mm5 \n\t"\
807  "packuswb %%mm5, %%mm1 \n\t"\
808  "movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t"
809 
/*
 * Loop tail: advance the obmc pointer by s_step bytes, the four block[]
 * pointers and dst8 by FF_REG_c, dst_array by one entry, decrement the row
 * counter and loop while it is non-zero; then close the asm statement with
 * its operand and clobber lists.
 * NOTE(review): the MMX registers modified by the kernel are not listed as
 * clobbers and no emms is issued - confirm this is handled elsewhere.
 */
810 #define snow_inner_add_yblock_mmx_end(s_step)\
811  "add $"s_step", %%"FF_REG_S" \n\t"\
812  "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
813  "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
814  "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
815  "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"\
816  "add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1 \n\t"\
817  "add %%"FF_REG_c", %0 \n\t"\
818  "dec %2 \n\t"\
819  "jnz 1b \n\t"\
820  :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
821  :\
822  "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
823  "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
824 
/**
 * MMX kernel for 8-pixel-wide blocks; the obmc offsets (0, 8, 128, 136) and
 * the 16-byte obmc step per row correspond to an obmc stride of 16.
 *
 * One row per loop iteration: the four block[] sources are weighted by their
 * obmc coefficients and summed, then mixed with the line data of
 * dst_array[0] and stored as 8 bytes to dst8 (see the
 * snow_inner_add_yblock_mmx_* macros for the individual steps).
 *
 * obmc_stride, b_w and add are not referenced by the body.
 */
825 static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
826  int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
827 snow_inner_add_yblock_mmx_header
828 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
829 snow_inner_add_yblock_mmx_accum("2", "8", "0")
830 snow_inner_add_yblock_mmx_accum("1", "128", "0")
831 snow_inner_add_yblock_mmx_accum("0", "136", "0")
832 snow_inner_add_yblock_mmx_mix("0", "0")
833 snow_inner_add_yblock_mmx_end("16")
834 }
835 
/**
 * MMX kernel for 16-pixel-wide blocks; the obmc offsets and the 32-byte obmc
 * step per row correspond to an obmc stride of 32.
 *
 * One row per loop iteration, handled as two groups of 8 pixels: the first
 * group reads source/obmc bytes 0..7, line data at offset 0 and writes
 * dst8[0..7]; the second group reads source bytes 8..15 (obmc offsets 8, 24,
 * 520, 536), line data at byte offset 16 and writes dst8[8..15].
 *
 * obmc_stride, b_w and add are not referenced by the body.
 */
836 static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
837  int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
838 snow_inner_add_yblock_mmx_header
/* Left 8 pixels. */
839 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
840 snow_inner_add_yblock_mmx_accum("2", "16", "0")
841 snow_inner_add_yblock_mmx_accum("1", "512", "0")
842 snow_inner_add_yblock_mmx_accum("0", "528", "0")
843 snow_inner_add_yblock_mmx_mix("0", "0")
844 
/* Right 8 pixels. */
845 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
846 snow_inner_add_yblock_mmx_accum("2", "24", "8")
847 snow_inner_add_yblock_mmx_accum("1", "520", "8")
848 snow_inner_add_yblock_mmx_accum("0", "536", "8")
849 snow_inner_add_yblock_mmx_mix("16", "8")
850 snow_inner_add_yblock_mmx_end("32")
851 }
852 
853 static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
854  int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
855 
856  if (b_w == 16)
857  inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
858  else if (b_w == 8 && obmc_stride == 16) {
859  if (!(b_h & 1))
860  inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
861  else
862  inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
863  } else
864  ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
865 }
866 
867 static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
868  int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
869  if (b_w == 16)
870  inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
871  else if (b_w == 8 && obmc_stride == 16)
872  inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
873  else
874  ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
875 }
876 #endif /* HAVE_6REGS */
877 
878 #endif /* HAVE_INLINE_ASM */
879 
881 {
882 #if HAVE_INLINE_ASM
883  int mm_flags = av_get_cpu_flags();
884 
885  if (mm_flags & AV_CPU_FLAG_MMX) {
886  if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
887  c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
888 #if HAVE_7REGS
889  c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
890 #endif
891 #if HAVE_6REGS
892  c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
893 #endif
894  }
895  else{
896  if (mm_flags & AV_CPU_FLAG_MMXEXT) {
897  c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
898 #if HAVE_7REGS
899  c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
900 #endif
901  }
902 #if HAVE_6REGS
903  c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
904 #endif
905  }
906  }
907 #endif /* HAVE_INLINE_ASM */
908 }
#define W_AO
Definition: snow_dwt.h:73
#define W_DS
Definition: snow_dwt.h:87
void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t **block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer *sb, int add, uint8_t *dst8)
Definition: snow.c:36
else temp
Definition: vf_mcdeint.c:256
#define W_BO
Definition: snow_dwt.h:78
#define W_DO
Definition: snow_dwt.h:86
#define W_AM
Definition: snow_dwt.h:72
static av_always_inline void snow_horizontal_compose_lift_lead_out(int i, IDWTELEM *dst, IDWTELEM *src, IDWTELEM *ref, int width, int w, int lift_high, int mul, int add, int shift)
Definition: snow.h:218
#define src
Definition: vp8dsp.c:254
short IDWTELEM
Definition: dirac_dwt.h:27
The exact code depends on how similar the blocks are and how related they are to the block
uint8_t
#define av_cold
Definition: attributes.h:82
#define W_CS
Definition: snow_dwt.h:83
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can, and has in some cases, led to effects beyond the output of computations. The signed integer overflow problem in speed critical code: Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c...
Definition: undefined.txt:32
#define AV_CPU_FLAG_MMXEXT
SSE integer functions or AMD MMX ext.
Definition: cpu.h:32
#define W_DM
Definition: snow_dwt.h:85
#define W_BM
Definition: snow_dwt.h:77
void(* horizontal_compose97i)(IDWTELEM *b, IDWTELEM *temp, int width)
Definition: snow_dwt.h:60
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
const char * r
Definition: vf_curves.c:114
#define W_AS
Definition: snow_dwt.h:74
#define b
Definition: input.c:41
av_cold void ff_dwt_init_x86(SnowDWTContext *c)
Definition: snowdsp.c:880
#define width
int n
Definition: avisynth_c.h:760
#define W_BS
Definition: snow_dwt.h:79
Libavcodec external API header.
#define AV_CPU_FLAG_MMX
standard MMX
Definition: cpu.h:31
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:93
static av_always_inline void snow_interleave_line_header(int *i, int width, IDWTELEM *low, IDWTELEM *high)
Definition: snow.h:202
#define W_CM
Definition: snow_dwt.h:81
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:107
void(* inner_add_yblock)(const uint8_t *obmc, const int obmc_stride, uint8_t **block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer *sb, int add, uint8_t *dst8)
Definition: snow_dwt.h:61
void(* vertical_compose97i)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width)
Definition: snow_dwt.h:57
int x86_reg
Definition: asm.h:72
#define W_CO
Definition: snow_dwt.h:82
#define AV_CPU_FLAG_SSE2
PIV SSE2 functions.
Definition: cpu.h:36
Used to minimize the amount of memory used in order to optimize cache performance.
Definition: snow_dwt.h:44
static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, IDWTELEM *dst, IDWTELEM *src, IDWTELEM *ref, int width, int w)
Definition: snow.h:228