FFmpeg
me_cmp_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
22 #include "me_cmp_mips.h"
23 
/*
 * Accumulate the sum of absolute differences (SAD) between src and ref and
 * return it as one 32-bit total.
 *
 * Every loop pass consumes four rows of both buffers and the loop runs
 * (height >> 2) times, so rows beyond the last multiple of four are not
 * visited.
 *
 * LD_UB4, PCKEV_D4_UB, SAD_UB2_UH and HADD_UH_U32 are macros defined outside
 * this file; their behaviour is inferred from their names only (presumably:
 * load four vectors, pack the low doublewords of two rows into one vector,
 * per-lane absolute differences summed into halfwords, horizontal
 * reduction) -- TODO confirm against their definitions.
 */
24 static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
25  uint8_t *ref, int32_t ref_stride,
/* NOTE(review): listing line 26 is absent from this extract. The
 * cross-reference entry for this function lists a final `int32_t height`
 * parameter, which is the `height` read by the loop below. */
27 {
28  int32_t ht_cnt;
29  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    /* Running total, kept as eight 16-bit lanes until the final reduction. */
30  v8u16 sad = { 0 };
31 
32  for (ht_cnt = (height >> 2); ht_cnt--;) {
33  LD_UB4(src, src_stride, src0, src1, src2, src3);
34  src += (4 * src_stride);
35  LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
36  ref += (4 * ref_stride);
37 
    /* Four row vectors per buffer are combined into two before the SAD. */
38  PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
39  src0, src1, ref0, ref1);
40  sad += SAD_UB2_UH(src0, src1, ref0, ref1);
41  }
42 
43  return (HADD_UH_U32(sad));
44 }
45 
/*
 * Accumulate the sum of absolute differences between src and ref, one full
 * vector per row, and return the reduced 32-bit total.
 *
 * Each loop pass handles four rows as two groups of two; the loop runs
 * (height >> 2) times.
 *
 * LD_UB2, SAD_UB2_UH and HADD_UH_U32 are defined outside this file and are
 * described here by name only -- TODO confirm.
 */
46 static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
47  uint8_t *ref, int32_t ref_stride,
/* NOTE(review): listing line 48 is absent from this extract. The
 * cross-reference entry for this function lists a final `int32_t height`
 * parameter, which is the `height` read by the loop below. */
49 {
50  int32_t ht_cnt;
51  v16u8 src0, src1, ref0, ref1;
52  v8u16 sad = { 0 };
53 
54  for (ht_cnt = (height >> 2); ht_cnt--;) {
    /* Rows 0-1 of this group of four. */
55  LD_UB2(src, src_stride, src0, src1);
56  src += (2 * src_stride);
57  LD_UB2(ref, ref_stride, ref0, ref1);
58  ref += (2 * ref_stride);
59  sad += SAD_UB2_UH(src0, src1, ref0, ref1);
60 
    /* Rows 2-3 of this group of four. */
61  LD_UB2(src, src_stride, src0, src1);
62  src += (2 * src_stride);
63  LD_UB2(ref, ref_stride, ref0, ref1);
64  ref += (2 * ref_stride);
65  sad += SAD_UB2_UH(src0, src1, ref0, ref1);
66  }
67 
68  return (HADD_UH_U32(sad));
69 }
70 
/*
 * NOTE(review): the first signature line (listing line 71) is absent from
 * this extract. The cross-reference index gives the full signature as
 *   static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
 *       int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
 *
 * SAD between src and a version of ref in which each reference vector is
 * averaged with itself shifted by one byte (SLDI_B4_UB with a shift of 1,
 * then AVER_UB2_UB). Each loop pass handles eight rows as two identical
 * groups of four; the loop runs (height >> 3) times.
 *
 * The load/pack/shift/average/SAD macros are defined outside this file and
 * are described by name only -- TODO confirm the exact rounding of
 * AVER_UB2_UB.
 */
72  int32_t src_stride,
73  uint8_t *ref,
74  int32_t ref_stride,
/* NOTE(review): listing line 75 (the `int32_t height` parameter shown in the
 * cross-reference index) is absent from this extract. */
76 {
77  int32_t ht_cnt;
78  v16u8 src0, src1, src2, src3, comp0, comp1;
79  v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
80  v8u16 sad = { 0 };
81 
82  for (ht_cnt = (height >> 3); ht_cnt--;) {
83  LD_UB4(src, src_stride, src0, src1, src2, src3);
84  src += (4 * src_stride);
85  LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
86  ref += (4 * ref_stride);
87 
88  PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    /* ref4/ref5 keep the unshifted reference rows ... */
89  PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
    /* ... while ref0..ref3 are replaced by their one-byte-shifted copies. */
90  SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
91  ref0, ref1, ref2, ref3);
92  PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
93  AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
94  sad += SAD_UB2_UH(src0, src1, comp0, comp1);
95 
    /* Second group of four rows: same sequence as above. */
96  LD_UB4(src, src_stride, src0, src1, src2, src3);
97  src += (4 * src_stride);
98  LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
99  ref += (4 * ref_stride);
100 
101  PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
102  PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
103  SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
104  ref0, ref1, ref2, ref3);
105  PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
106  AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
107  sad += SAD_UB2_UH(src0, src1, comp0, comp1);
108  }
109 
110  return (HADD_UH_U32(sad));
111 }
112 
/*
 * NOTE(review): the first signature line (listing line 113) is absent from
 * this extract. The cross-reference index gives the full signature as
 *   static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
 *       int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
 *
 * SAD between src and the average of two reference loads taken one byte
 * apart (ref and ref + 1). Each loop pass handles eight rows as two
 * identical groups of four; the loop runs (height >> 3) times.
 *
 * LD_UB4, AVER_UB2_UB, SAD_UB2_UH and HADD_UH_U32 are defined outside this
 * file and are described by name only -- TODO confirm.
 */
114  int32_t src_stride,
115  uint8_t *ref,
116  int32_t ref_stride,
117  int32_t height)
118 {
119  int32_t ht_cnt;
120  v16u8 src0, src1, src2, src3, comp0, comp1;
    /* refN0 = row N loaded at ref, refN1 = the same row loaded at ref + 1. */
121  v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
122  v8u16 sad = { 0 };
123 
124  for (ht_cnt = (height >> 3); ht_cnt--;) {
125  LD_UB4(src, src_stride, src0, src1, src2, src3);
126  src += (4 * src_stride);
127  LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
128  LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
129  ref += (4 * ref_stride);
130 
131  AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
132  sad += SAD_UB2_UH(src0, src1, comp0, comp1);
133  AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
134  sad += SAD_UB2_UH(src2, src3, comp0, comp1);
135 
    /* Second group of four rows: same sequence as above. */
136  LD_UB4(src, src_stride, src0, src1, src2, src3);
137  src += (4 * src_stride);
138  LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
139  LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
140  ref += (4 * ref_stride);
141 
142  AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
143  sad += SAD_UB2_UH(src0, src1, comp0, comp1);
144  AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
145  sad += SAD_UB2_UH(src2, src3, comp0, comp1);
146  }
147 
148  return (HADD_UH_U32(sad));
149 }
150 
/*
 * NOTE(review): the first signature line (listing line 151) is absent from
 * this extract. The cross-reference index gives the full signature as
 *   static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
 *       int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
 *
 * SAD between src and the average of vertically adjacent reference rows.
 * Five reference rows are loaded per group of four source rows, and ref
 * advances by only four rows, so the fifth row is loaded again as the first
 * row of the next group. Each loop pass handles two such groups; the loop
 * runs (height >> 3) times.
 *
 * The load/pack/average/SAD macros are defined outside this file and are
 * described by name only -- TODO confirm.
 */
152  int32_t src_stride,
153  uint8_t *ref,
154  int32_t ref_stride,
155  int32_t height)
156 {
157  int32_t ht_cnt;
158  v16u8 src0, src1, src2, src3, comp0, comp1;
159  v16u8 ref0, ref1, ref2, ref3, ref4;
160  v8u16 sad = { 0 };
161 
162  for (ht_cnt = (height >> 3); ht_cnt--;) {
163  LD_UB4(src, src_stride, src0, src1, src2, src3);
164  src += (4 * src_stride);
165  LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
166  ref += (4 * ref_stride);
167 
168  PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    /* Build the pairs (row0,row1)/(row1,row2) and (row2,row3)/(row3,row4)
     * so that the average below combines each row with the one under it. */
169  PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
170  PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
171  AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
172  sad += SAD_UB2_UH(src0, src1, comp0, comp1);
173 
    /* Second group of four rows: same sequence as above. */
174  LD_UB4(src, src_stride, src0, src1, src2, src3);
175  src += (4 * src_stride);
176  LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
177  ref += (4 * ref_stride);
178 
179  PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
180  PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
181  PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
182  AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
183  sad += SAD_UB2_UH(src0, src1, comp0, comp1);
184  }
185 
186  return (HADD_UH_U32(sad));
187 }
188 
/*
 * NOTE(review): the first signature line (listing line 189) is absent from
 * this extract. The cross-reference index gives the full signature as
 *   static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
 *       int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
 *
 * SAD between src and the average of vertically adjacent reference rows,
 * one full vector per row. Each loop pass handles eight source rows; the
 * loop runs (height >> 3) times.
 *
 * In every half, ref4 holds the row above ref0. The first half loads five
 * rows and advances ref by five. The second half reuses the last row via
 * `ref4 = ref3`, loads four more and advances by only three, so the last of
 * those rows is loaded again as ref4 at the top of the next pass.
 *
 * LD_UB4/LD_UB5, AVER_UB2_UB, SAD_UB2_UH and HADD_UH_U32 are defined outside
 * this file and are described by name only -- TODO confirm.
 */
190  int32_t src_stride,
191  uint8_t *ref,
192  int32_t ref_stride,
193  int32_t height)
194 {
195  int32_t ht_cnt;
196  v16u8 src0, src1, src2, src3, comp0, comp1;
197  v16u8 ref0, ref1, ref2, ref3, ref4;
198  v8u16 sad = { 0 };
199 
200  for (ht_cnt = (height >> 3); ht_cnt--;) {
201  LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
202  ref += (5 * ref_stride);
203  LD_UB4(src, src_stride, src0, src1, src2, src3);
204  src += (4 * src_stride);
205 
    /* Averages of (ref4,ref0), (ref0,ref1), (ref1,ref2), (ref2,ref3). */
206  AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
207  sad += SAD_UB2_UH(src0, src1, comp0, comp1);
208  AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
209  sad += SAD_UB2_UH(src2, src3, comp0, comp1);
210 
    /* Carry the last loaded row over as the "row above" for the next four. */
211  ref4 = ref3;
212 
213  LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
214  ref += (3 * ref_stride);
215  LD_UB4(src, src_stride, src0, src1, src2, src3);
216  src += (4 * src_stride);
217 
218  AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
219  sad += SAD_UB2_UH(src0, src1, comp0, comp1);
220  AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
221  sad += SAD_UB2_UH(src2, src3, comp0, comp1);
222  }
223 
224  return (HADD_UH_U32(sad));
225 }
226 
/*
 * NOTE(review): the first signature line (listing line 227) is absent from
 * this extract. The cross-reference index gives the full signature as
 *   static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
 *       int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
 *
 * SAD between src and a reference built from 2x2 neighbourhoods: for every
 * output row, the horizontal pair sums of two consecutive reference rows
 * are added, shifted right by 2 with rounding, narrowed back to bytes and
 * compared against src.
 *
 * Five reference rows are loaded per four source rows and ref advances by
 * four, so the fifth row is loaded again as the first row of the next pass.
 * The loop runs (height >> 2) times.
 *
 * LD_UB4/LD_UB5, PCKEV_D2_UB, VSHF_B2_UB and HADD_UH_U32 are defined outside
 * this file and are described by name only -- TODO confirm.
 */
228  int32_t src_stride,
229  uint8_t *ref,
230  int32_t ref_stride,
231  int32_t height)
232 {
233  int32_t ht_cnt;
234  v16u8 src0, src1, src2, src3, temp0, temp1, diff;
235  v16u8 ref0, ref1, ref2, ref3, ref4;
    /* Shuffle pattern that lays out bytes as the pairs (0,1), (1,2), ...,
     * (7,8), so a pairwise horizontal add yields p[i] + p[i + 1]. */
236  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
237  v8u16 comp0, comp1, comp2, comp3;
238  v8u16 sad = { 0 };
239 
240  for (ht_cnt = (height >> 2); ht_cnt--;) {
241  LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
242  ref += (4 * ref_stride);
243  LD_UB4(src, src_stride, src0, src1, src2, src3);
244  src += (4 * src_stride);
245 
246  PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
247 
    /* Output row 0: pair sums of ref4 plus pair sums of ref0. */
248  VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
249  comp0 = __msa_hadd_u_h(temp0, temp0);
250  comp1 = __msa_hadd_u_h(temp1, temp1);
251  comp0 += comp1;
252  comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
253  comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);
254 
    /* Output row 1: comp1 still holds ref0's pair sums; add ref1's. */
255  temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
256  comp2 = __msa_hadd_u_h(temp0, temp0);
257  comp1 += comp2;
258  comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
259  comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
    /* Join output rows 0 and 1 into one vector to match the packed src0. */
260  comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
261  diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
262  sad += __msa_hadd_u_h(diff, diff);
263 
    /* Output row 2: ref1's pair sums (comp2) plus ref2's. */
264  temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
265  comp3 = __msa_hadd_u_h(temp1, temp1);
266  comp2 += comp3;
267  comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
268  comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);
269 
    /* Output row 3: ref2's pair sums (comp3) plus ref3's. */
270  temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
271  comp0 = __msa_hadd_u_h(temp0, temp0);
272  comp3 += comp0;
273  comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
274  comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
275  comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
276  diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
277  sad += __msa_hadd_u_h(diff, diff);
278  }
279 
280  return (HADD_UH_U32(sad));
281 }
282 
/*
 * NOTE(review): the first signature line (listing line 283) is absent from
 * this extract. The cross-reference index gives the full signature as
 *   static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
 *       int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
 *
 * SAD between src and a reference built from 2x2 neighbourhoods, one full
 * vector per row. Each reference row is loaded twice, at ref (refN0..) and
 * at ref + 1 (refN1..). The two loads are interleaved and pairwise added to
 * get horizontal sums; the sums of two consecutive rows are added, shifted
 * right by 2 with rounding and narrowed to bytes before the comparison.
 *
 * Each loop pass handles eight source rows; the loop runs (height >> 3)
 * times. The comp0/comp1 and comp2/comp3 pairs alternate as "previous row"
 * and "current row" sums, and that state is carried from the first half of
 * the pass into the second.
 *
 * LD_UB4/LD_UB5, ILVRL_B2_UB, SRARI_H2_UH and HADD_UH_U32 are defined
 * outside this file and are described by name only -- TODO confirm.
 */
284  int32_t src_stride,
285  uint8_t *ref,
286  int32_t ref_stride,
287  int32_t height)
288 {
289  int32_t ht_cnt;
290  v16u8 src0, src1, src2, src3, comp, diff;
291  v16u8 temp0, temp1, temp2, temp3;
292  v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
293  v8u16 comp0, comp1, comp2, comp3;
294  v8u16 sad = { 0 };
295 
296  for (ht_cnt = (height >> 3); ht_cnt--;) {
297  LD_UB4(src, src_stride, src0, src1, src2, src3);
298  src += (4 * src_stride);
    /* ref04/ref14 receive the first of the five rows (the row above ref00). */
299  LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
300  LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
301  ref += (5 * ref_stride);
302 
    /* Source row 0: horizontal sums of the top row plus those of row ref00. */
303  ILVRL_B2_UB(ref14, ref04, temp0, temp1);
304  comp0 = __msa_hadd_u_h(temp0, temp0);
305  comp1 = __msa_hadd_u_h(temp1, temp1);
306  ILVRL_B2_UB(ref10, ref00, temp2, temp3);
307  comp2 = __msa_hadd_u_h(temp2, temp2);
308  comp3 = __msa_hadd_u_h(temp3, temp3);
309  comp0 += comp2;
310  comp1 += comp3;
311  SRARI_H2_UH(comp0, comp1, 2);
312  comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
313  diff = __msa_asub_u_b(src0, comp);
314  sad += __msa_hadd_u_h(diff, diff);
315 
    /* Source row 1: comp2/comp3 (row ref00) plus the sums of row ref01. */
316  ILVRL_B2_UB(ref11, ref01, temp0, temp1);
317  comp0 = __msa_hadd_u_h(temp0, temp0);
318  comp1 = __msa_hadd_u_h(temp1, temp1);
319  comp2 += comp0;
320  comp3 += comp1;
321  SRARI_H2_UH(comp2, comp3, 2);
322  comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
323  diff = __msa_asub_u_b(src1, comp);
324  sad += __msa_hadd_u_h(diff, diff);
325 
    /* Source row 2: comp0/comp1 (row ref01) plus the sums of row ref02. */
326  ILVRL_B2_UB(ref12, ref02, temp2, temp3);
327  comp2 = __msa_hadd_u_h(temp2, temp2);
328  comp3 = __msa_hadd_u_h(temp3, temp3);
329  comp0 += comp2;
330  comp1 += comp3;
331  SRARI_H2_UH(comp0, comp1, 2);
332  comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
333  diff = __msa_asub_u_b(src2, comp);
334  sad += __msa_hadd_u_h(diff, diff);
335 
    /* Source row 3: comp2/comp3 (row ref02) plus the sums of row ref03.
     * comp0/comp1 keep row ref03's sums for the second half below. */
336  ILVRL_B2_UB(ref13, ref03, temp0, temp1);
337  comp0 = __msa_hadd_u_h(temp0, temp0);
338  comp1 = __msa_hadd_u_h(temp1, temp1);
339  comp2 += comp0;
340  comp3 += comp1;
341  SRARI_H2_UH(comp2, comp3, 2);
342  comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
343  diff = __msa_asub_u_b(src3, comp);
344  sad += __msa_hadd_u_h(diff, diff);
345 
    /* Second half: four more rows are loaded and ref advances by only
     * three, so the last of them is loaded again at the top of the next
     * pass. */
346  LD_UB4(src, src_stride, src0, src1, src2, src3);
347  src += (4 * src_stride);
348  LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
349  LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
350  ref += (3 * ref_stride);
351 
    /* Source row 4: carried comp0/comp1 plus the sums of the new row ref00. */
352  ILVRL_B2_UB(ref10, ref00, temp2, temp3);
353  comp2 = __msa_hadd_u_h(temp2, temp2);
354  comp3 = __msa_hadd_u_h(temp3, temp3);
355  comp0 += comp2;
356  comp1 += comp3;
357  SRARI_H2_UH(comp0, comp1, 2);
358  comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
359  diff = __msa_asub_u_b(src0, comp);
360  sad += __msa_hadd_u_h(diff, diff);
361 
    /* Source rows 5-7 repeat the alternating pattern used for rows 1-3. */
362  ILVRL_B2_UB(ref11, ref01, temp0, temp1);
363  comp0 = __msa_hadd_u_h(temp0, temp0);
364  comp1 = __msa_hadd_u_h(temp1, temp1);
365  comp2 += comp0;
366  comp3 += comp1;
367  SRARI_H2_UH(comp2, comp3, 2);
368  comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
369  diff = __msa_asub_u_b(src1, comp);
370  sad += __msa_hadd_u_h(diff, diff);
371 
372  ILVRL_B2_UB(ref12, ref02, temp2, temp3);
373  comp2 = __msa_hadd_u_h(temp2, temp2);
374  comp3 = __msa_hadd_u_h(temp3, temp3);
375  comp0 += comp2;
376  comp1 += comp3;
377  SRARI_H2_UH(comp0, comp1, 2);
378  comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
379  diff = __msa_asub_u_b(src2, comp);
380  sad += __msa_hadd_u_h(diff, diff);
381 
382  ILVRL_B2_UB(ref13, ref03, temp0, temp1);
383  comp0 = __msa_hadd_u_h(temp0, temp0);
384  comp1 = __msa_hadd_u_h(temp1, temp1);
385  comp2 += comp0;
386  comp3 += comp1;
387  SRARI_H2_UH(comp2, comp3, 2);
388  comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
389  diff = __msa_asub_u_b(src3, comp);
390  sad += __msa_hadd_u_h(diff, diff);
391  }
392 
393  return (HADD_UH_U32(sad));
394 }
395 
/*
 * CALC_MSE_B(src, ref, var): fold one src/ref vector pair into the
 * accumulator `var`.
 *
 * The two byte vectors are interleaved, passed through HSUB_UB2_SH to get
 * two vectors of 16-bit results, and each result is fed to DPADD_SH2_SW as
 * both operands with `var` as the destination for both halves. Going by the
 * macro names this accumulates squared byte differences into 32-bit lanes;
 * the helper macros are defined outside this file -- TODO confirm.
 *
 * The expansion is a bare { ... } block declaring its own temporaries, so
 * `var` must be an lvalue and the macro is meant to be used as a statement.
 */
396 #define CALC_MSE_B(src, ref, var) \
397 { \
398  v16u8 src_l0_m, src_l1_m; \
399  v8i16 res_l0_m, res_l1_m; \
400  \
401  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
402  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
403  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
404 }
405 
/*
 * Accumulate CALC_MSE_B over src_ptr and ref_ptr, four rows per loop pass,
 * and return the reduced total.
 *
 * Each pass reads four 32-bit values per buffer with LW4, inserts them into
 * one vector with INSERT_W4_UB and hands the pair to CALC_MSE_B. The loop
 * runs (height >> 2) times.
 *
 * LW4, INSERT_W4_UB and HADD_SW_S32 are defined outside this file and are
 * described by name only (presumably: one 4-byte load per row, word
 * inserts, signed horizontal reduction) -- TODO confirm.
 */
406 static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
407  uint8_t *ref_ptr, int32_t ref_stride,
408  int32_t height)
409 {
410  int32_t ht_cnt;
411  uint32_t sse;
412  uint32_t src0, src1, src2, src3;
413  uint32_t ref0, ref1, ref2, ref3;
414  v16u8 src = { 0 };
415  v16u8 ref = { 0 };
    /* 32-bit lane accumulator updated in place by CALC_MSE_B. */
416  v4i32 var = { 0 };
417 
418  for (ht_cnt = (height >> 2); ht_cnt--;) {
419  LW4(src_ptr, src_stride, src0, src1, src2, src3);
420  src_ptr += (4 * src_stride);
421  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
422  ref_ptr += (4 * ref_stride);
423 
424  INSERT_W4_UB(src0, src1, src2, src3, src);
425  INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
426  CALC_MSE_B(src, ref, var);
427  }
428 
    /* The reduction result is stored into an unsigned 32-bit value. */
429  sse = HADD_SW_S32(var);
430 
431  return sse;
432 }
433 
/*
 * Accumulate CALC_MSE_B over src_ptr and ref_ptr, four rows per loop pass,
 * and return the reduced total.
 *
 * Each pass loads four vectors per buffer, combines them into two with
 * PCKEV_D4_UB and feeds both pairs to CALC_MSE_B. The loop runs
 * (height >> 2) times.
 *
 * LD_UB4, PCKEV_D4_UB and HADD_SW_S32 are defined outside this file and are
 * described by name only -- TODO confirm.
 */
434 static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
435  uint8_t *ref_ptr, int32_t ref_stride,
436  int32_t height)
437 {
438  int32_t ht_cnt;
439  uint32_t sse;
440  v16u8 src0, src1, src2, src3;
441  v16u8 ref0, ref1, ref2, ref3;
    /* 32-bit lane accumulator updated in place by CALC_MSE_B. */
442  v4i32 var = { 0 };
443 
444  for (ht_cnt = (height >> 2); ht_cnt--;) {
445  LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
446  src_ptr += (4 * src_stride);
447  LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
448  ref_ptr += (4 * ref_stride);
449 
450  PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
451  src0, src1, ref0, ref1);
452  CALC_MSE_B(src0, ref0, var);
453  CALC_MSE_B(src1, ref1, var);
454  }
455 
456  sse = HADD_SW_S32(var);
457 
458  return sse;
459 }
460 
/*
 * Accumulate CALC_MSE_B over src_ptr and ref_ptr, one vector per row, and
 * return the reduced total.
 *
 * The loop body is the same load/advance/accumulate step written out four
 * times, so each pass handles four rows; the loop runs (height >> 2) times.
 *
 * LD_UB and HADD_SW_S32 are defined outside this file and are described by
 * name only -- TODO confirm.
 */
461 static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
462  uint8_t *ref_ptr, int32_t ref_stride,
463  int32_t height)
464 {
465  int32_t ht_cnt;
466  uint32_t sse;
467  v16u8 src, ref;
    /* 32-bit lane accumulator updated in place by CALC_MSE_B. */
468  v4i32 var = { 0 };
469 
470  for (ht_cnt = (height >> 2); ht_cnt--;) {
    /* Row 0 of this group of four. */
471  src = LD_UB(src_ptr);
472  src_ptr += src_stride;
473  ref = LD_UB(ref_ptr);
474  ref_ptr += ref_stride;
475  CALC_MSE_B(src, ref, var);
476 
    /* Row 1. */
477  src = LD_UB(src_ptr);
478  src_ptr += src_stride;
479  ref = LD_UB(ref_ptr);
480  ref_ptr += ref_stride;
481  CALC_MSE_B(src, ref, var);
482 
    /* Row 2. */
483  src = LD_UB(src_ptr);
484  src_ptr += src_stride;
485  ref = LD_UB(ref_ptr);
486  ref_ptr += ref_stride;
487  CALC_MSE_B(src, ref, var);
488 
    /* Row 3. */
489  src = LD_UB(src_ptr);
490  src_ptr += src_stride;
491  ref = LD_UB(ref_ptr);
492  ref_ptr += ref_stride;
493  CALC_MSE_B(src, ref, var);
494  }
495 
496  sse = HADD_SW_S32(var);
497 
498  return sse;
499 }
500 
/*
 * NOTE(review): the first signature line (listing line 501) is absent from
 * this extract. The cross-reference index gives the full signature as
 *   static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
 *                                        uint8_t *ref, int32_t ref_stride)
 *
 * Loads eight rows from src and from ref, forms their per-pixel differences
 * as 16-bit lanes, runs them through butterfly stages with a transpose
 * before and between the two groups of stages, and returns the sum of the
 * absolute values of the results.
 *
 * The last butterfly stage is not materialised: for each pair (a, b) the
 * code adds |a - b| via __msa_asub_s_h and |a + b| via ADD4 followed by
 * __msa_add_a_h against a zero vector.
 *
 * LD_UB8, ILVR_B8_UH, HSUB_UB4_UH, TRANSPOSE8x8_UH_UH, BUTTERFLY_8, ADD4 and
 * HADD_UH_U32 are defined outside this file and are described by name only
 * -- TODO confirm.
 */
502  uint8_t *ref, int32_t ref_stride)
503 {
504  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
505  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
506  v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
507  v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
508  v8i16 sum = { 0 };
509  v8i16 zero = { 0 };
510 
511  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
512  LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
    /* Interleave src and ref bytes, then reduce each pair to a difference. */
513  ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
514  src4, ref4, src5, ref5, src6, ref6, src7, ref7,
515  diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
516  HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
517  HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
518  TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
519  diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    /* First pass: three butterfly stages across the eight vectors. */
520  BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
521  temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
522  BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
523  diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
524  BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
525  temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
526  TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
527  temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
    /* Second pass: two explicit stages; the third is folded into the
     * absolute-value sums below. */
528  BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
529  diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
530  BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
531  temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
532  ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
533  diff0, diff1, diff2, diff3);
534  sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
535  sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
536  sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
537  sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
538  sum += __msa_add_a_h((v8i16) diff0, zero);
539  sum += __msa_add_a_h((v8i16) diff1, zero);
540  sum += __msa_add_a_h((v8i16) diff2, zero);
541  sum += __msa_add_a_h((v8i16) diff3, zero);
542 
543  return (HADD_UH_U32(sum));
544 }
545 
/*
 * NOTE(review): the first signature line (listing line 546) is absent from
 * this extract. The cross-reference index gives the full signature as
 *   static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
 *                                         uint8_t *ref, int32_t ref_stride)
 *
 * Same butterfly/transposition structure as the src-vs-ref variant, but
 * applied to the src pixels themselves: `ref` and `ref_stride` are never
 * read in this body.
 *
 * After summing the absolute values of all results, the value
 * abs(temp0[0] + temp4[0]) is subtracted again, i.e. lane 0 of the
 * temp0 + temp4 term is excluded from the returned score.
 *
 * LD_UB8, TRANSPOSE8x8_UB_UB, ILVR_B8_UH, BUTTERFLY_8, TRANSPOSE8x8_UH_UH,
 * ADD4 and HADD_UH_U32 are defined outside this file and are described by
 * name only -- TODO confirm.
 */
547  uint8_t *ref, int32_t ref_stride)
548 {
549  int32_t sum_res = 0;
550  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
551  v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
552  v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
553  v8i16 sum = { 0 };
554  v16i8 zero = { 0 };
555 
556  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    /* Here the transpose is done on bytes, before widening. */
557  TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
558  src0, src1, src2, src3, src4, src5, src6, src7);
    /* Interleaving with a zero vector presumably zero-extends each byte to
     * 16 bits -- TODO confirm against ILVR_B8_UH. */
559  ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
560  zero, src4, zero, src5, zero, src6, zero, src7,
561  diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
562  BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
563  temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
564  BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
565  diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
566  BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
567  temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
568  TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
569  temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
570  BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
571  diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
572  BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
573  temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
    /* Final stage folded into absolute-value sums: |a - b| and |a + b|. */
574  ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
575  diff0, diff1, diff2, diff3);
576  sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
577  sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
578  sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
579  sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
580  sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
581  sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
582  sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
583  sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
584  sum_res = (HADD_UH_U32(sum));
    /* Lane 0 is read through vector subscripting on the v8u16 values. */
585  sum_res -= abs(temp0[0] + temp4[0]);
586 
587  return sum_res;
588 }
589 
/* NOTE(review): the first signature line (listing line 590) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ...
 *
 * Forwards to sad_16width_msa(), passing `stride` as both the source and
 * the reference stride. The context argument is not used in the body. */
591  ptrdiff_t stride, int height)
592 {
593  return sad_16width_msa(src, stride, ref, stride, height);
594 }
595 
/* NOTE(review): the first signature line (listing line 596) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ...
 *
 * Forwards to sad_8width_msa(), passing `stride` as both the source and the
 * reference stride. The context argument is not used in the body. */
597  ptrdiff_t stride, int height)
598 {
599  return sad_8width_msa(src, stride, ref, stride, height);
600 }
601 
/* NOTE(review): the first signature line (listing line 602) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ...
 *
 * Forwards to sad_horiz_bilinear_filter_16width_msa() with pix1 as the
 * source and pix2 as the reference, using `stride` for both. The context
 * argument is not used in the body. */
603  ptrdiff_t stride, int h)
604 {
605  return sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
606 }
607 
/* NOTE(review): the first signature line (listing line 608) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ...
 *
 * Forwards to sad_vert_bilinear_filter_16width_msa() with pix1 as the
 * source and pix2 as the reference, using `stride` for both. The context
 * argument is not used in the body. */
609  ptrdiff_t stride, int h)
610 {
611  return sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
612 }
613 
/* NOTE(review): the first signature line (listing line 614) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ...
 *
 * Forwards to sad_hv_bilinear_filter_16width_msa() with pix1 as the source
 * and pix2 as the reference, using `stride` for both. The context argument
 * is not used in the body. */
615  ptrdiff_t stride, int h)
616 {
617  return sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
618 }
619 
/* NOTE(review): the first signature line (listing line 620) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ...
 *
 * Forwards to sad_horiz_bilinear_filter_8width_msa() with pix1 as the
 * source and pix2 as the reference, using `stride` for both. The context
 * argument is not used in the body. */
621  ptrdiff_t stride, int h)
622 {
623  return sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
624 }
625 
/* NOTE(review): the first signature line (listing line 626) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ...
 *
 * Forwards to sad_vert_bilinear_filter_8width_msa() with pix1 as the source
 * and pix2 as the reference, using `stride` for both. The context argument
 * is not used in the body. */
627  ptrdiff_t stride, int h)
628 {
629  return sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
630 }
631 
/* NOTE(review): the first signature line (listing line 632) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ...
 *
 * Forwards to sad_hv_bilinear_filter_8width_msa() with pix1 as the source
 * and pix2 as the reference, using `stride` for both. The context argument
 * is not used in the body. */
633  ptrdiff_t stride, int h)
634 {
635  return sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
636 }
637 
/* NOTE(review): the first signature line (listing line 638) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ...
 *
 * Forwards to sse_16width_msa(), passing `stride` as both the source and
 * the reference stride. The uint32_t result of the helper is returned
 * through this function's int return type. The context argument is not
 * used in the body. */
639  ptrdiff_t stride, int height)
640 {
641  return sse_16width_msa(src, stride, ref, stride, height);
642 }
643 
/* NOTE(review): the first signature line (listing line 644) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ...
 *
 * Forwards to sse_8width_msa(), passing `stride` as both the source and the
 * reference stride. The uint32_t result of the helper is returned through
 * this function's int return type. The context argument is not used in the
 * body. */
645  ptrdiff_t stride, int height)
646 {
647  return sse_8width_msa(src, stride, ref, stride, height);
648 }
649 
/* NOTE(review): the first signature line (listing line 650) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ...
 *
 * Forwards to sse_4width_msa(), passing `stride` as both the source and the
 * reference stride. The uint32_t result of the helper is returned through
 * this function's int return type. The context argument is not used in the
 * body. */
651  ptrdiff_t stride, int height)
652 {
653  return sse_4width_msa(src, stride, ref, stride, height);
654 }
655 
/* NOTE(review): the first signature line (listing line 656) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, ...
 *
 * Forwards to hadamard_diff_8x8_msa() with `src` as the helper's first
 * buffer and `dst` as its second, using `stride` for both. Neither `h` nor
 * the context argument is used in the body. */
657  ptrdiff_t stride, int h)
658 {
659  return hadamard_diff_8x8_msa(src, stride, dst, stride);
660 }
661 
/* NOTE(review): the first signature line (listing line 662) is absent from
 * this extract; the cross-reference index gives it as
 *   int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, ...
 *
 * Forwards to hadamard_intra_8x8_msa() with `src` as the helper's first
 * buffer and `dst` as its second, using `stride` for both. Neither `h` nor
 * the context argument is used in the body. */
663  ptrdiff_t stride, int h)
664 {
665  return hadamard_intra_8x8_msa(src, stride, dst, stride);
666 }
667 
668 /* Hadamard Transform functions */
/*
 * WRAPPER8_16_SQ(name8, name16): define a non-static function `name16` that
 * builds its score from calls to `name8`.
 *
 * name8 is always called on the block at (dst, src) and on the block eight
 * bytes to its right, each time with 8 as the last argument. Only when
 * h == 16 are the two blocks eight rows further down (8 * stride) added as
 * well; for any other h the score covers the top two blocks only.
 *
 * The expansion is a complete function definition, so the macro is intended
 * for use at file scope.
 */
669 #define WRAPPER8_16_SQ(name8, name16) \
670 int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \
671  ptrdiff_t stride, int h) \
672 { \
673  int score = 0; \
674  score += name8(s, dst, src, stride, 8); \
675  score += name8(s, dst + 8, src + 8, stride, 8); \
676  if(h == 16) { \
677  dst += 8 * stride; \
678  src += 8 * stride; \
679  score +=name8(s, dst, src, stride, 8); \
680  score +=name8(s, dst + 8, src + 8, stride, 8); \
681  } \
682  return score; \
683 }
684 
int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:626
int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:608
int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ptrdiff_t stride, int height)
Definition: me_cmp_msa.c:644
int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:614
#define SAD_UB2_UH(in0, in1, ref0, ref1)
int ff_hadamard8_diff16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h)
static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, int stride)
int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:656
int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:632
#define LD_UB4(...)
int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ptrdiff_t stride, int height)
Definition: me_cmp_msa.c:596
#define src
Definition: vp8dsp.c:254
int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:602
int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:662
static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride)
Definition: me_cmp_msa.c:501
uint8_t
#define LD_UB2(...)
#define HSUB_UB4_UH(...)
#define ILVRL_B2_UB(...)
#define height
#define CALC_MSE_B(src, ref, var)
Definition: me_cmp_msa.c:396
#define LD_UB5(...)
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:620
static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:283
static const uint16_t mask[17]
Definition: lzw.c:38
#define zero
Definition: regdef.h:64
#define PCKEV_D4_UB(...)
#define TRANSPOSE8x8_UH_UH(...)
static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:189
static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:46
static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:24
#define ILVR_B8_UH(...)
static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride)
Definition: me_cmp_msa.c:546
int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ptrdiff_t stride, int height)
Definition: me_cmp_msa.c:638
#define LD_UB8(...)
#define SLDI_B4_UB(...)
int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ptrdiff_t stride, int height)
Definition: me_cmp_msa.c:650
#define SRARI_H2_UH(...)
int32_t
#define AVER_UB2_UB(...)
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,out0, out1, out2, out3, out4, out5, out6, out7)
#define s(width, name)
Definition: cbs_vp9.c:257
static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:71
static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride, uint8_t *ref_ptr, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:434
#define INSERT_W4_UB(...)
static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride, uint8_t *ref_ptr, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:461
static void comp(unsigned char *dst, ptrdiff_t dst_stride, unsigned char *src, ptrdiff_t src_stride, int add)
Definition: eamad.c:83
#define src1
Definition: h264pred.c:139
#define abs(x)
Definition: cuda_runtime.h:35
#define TRANSPOSE8x8_UB_UB(...)
#define HADD_UH_U32(in)
#define src0
Definition: h264pred.c:138
#define WRAPPER8_16_SQ(name8, name16)
Definition: me_cmp_msa.c:669
MpegEncContext.
Definition: mpegvideo.h:81
static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride, uint8_t *ref_ptr, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:406
GLint GLenum GLboolean GLsizei stride
Definition: opengl_enc.c:104
#define LW4(psrc, stride, out0, out1, out2, out3)
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:107
int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ptrdiff_t stride, int height)
Definition: me_cmp_msa.c:590
static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:227
static av_always_inline int diff(const uint32_t a, const uint32_t b)
#define LD_UB(...)
#define VSHF_B2_UB(...)
static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:151
#define PCKEV_D2_UB(...)
int ff_hadamard8_intra16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h)
static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:113
#define HADD_SW_S32(in)