/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavcodec/me_cmp.h"
#include "asm.h"

int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
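
/* avg2() computes the byte-wise rounded-up average (a + b + 1) >> 1 in all
 * eight byte lanes at once.  It relies on a + b == (a | b) + (a & b), hence
 * ((a + b + 1) >> 1) == (a | b) - ((a ^ b) >> 1); masking with BYTE_VEC(0xfe)
 * clears each byte's low bit before the shift so no bit leaks into the
 * neighbouring lane.  A scalar sketch of the per-byte result, for
 * illustration only (not part of the original file):
 */
#if 0 /* reference sketch */
static inline uint8_t avg2_ref(uint8_t a, uint8_t b)
{
    return (a + b + 1) >> 1; /* what avg2() yields in each byte lane */
}
#endif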

static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
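
/* avg4() is the byte-wise rounded average (l1 + l2 + l3 + l4 + 2) >> 2.
 * r1 sums the pre-shifted high six bits of every byte; each per-byte term is
 * at most 0x3f, so four of them (<= 252) cannot carry into the next lane.
 * r2 sums the four low 2-bit parts plus the rounding constant 2 (<= 14,
 * again carry-free), shifts, and masks.  A scalar sketch of the per-byte
 * result, for illustration only:
 */
#if 0 /* reference sketch */
static inline uint8_t avg4_ref(uint8_t l1, uint8_t l2, uint8_t l3, uint8_t l4)
{
    return (l1 + l2 + l3 + l4 + 2) >> 2; /* what avg4() yields per byte */
}
#endif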

static int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 8 pixels at a time */
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = uldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = ldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return result;
}
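
/* perr(a, b) is the Alpha MVI sum-of-absolute-byte-differences primitive, so
 * pix_abs8x8_mvi() accumulates the SAD of an 8-pixel-wide block over h rows.
 * A plain-C equivalent of the whole function, for illustration only:
 */
#if 0 /* reference sketch */
static int pix_abs8x8_ref(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    do {
        int i;
        for (i = 0; i < 8; i++)
            result += FFABS(pix1[i] - pix2[i]);
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);
    return result;
}
#endif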

#if 0 /* now done in assembly */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 16 pixels at a time */
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            t    = ldq_u(pix2 + 8);
            p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else {
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = ldq(pix2);
            p2_r = ldq(pix2 + 8);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
#endif

static int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq(pix2);
            r    = ldq(pix2 + 8);
            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh. At least it is a bit faster
           that way :) */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign), r);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    default:
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign),
                        extql(l, disalign1) | extqh(m, disalign1));
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign),
                        extql(m, disalign1) | extqh(r, disalign1));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    }
    return result;
}
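
/* pix_abs16x16_x2_mvi() is the horizontal half-pel case: each pix2 byte is
 * replaced by the rounded average of itself and its right neighbour before
 * the SAD is taken.  Per pixel, for illustration only:
 *
 *     ref = (pix2[i] + pix2[i + 1] + 1) >> 1;
 *     result += FFABS(pix1[i] - ref);
 */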

static int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        uint64_t t, p2_l, p2_r;
        t    = ldq_u(pix2 + 8);
        p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            t     = ldq_u(pix2 + 8);
            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    } else {
        uint64_t p2_l, p2_r;
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l = np2_l;
            p2_r = np2_r;
        } while (--h);
    }
    return result;
}
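
/* pix_abs16x16_y2_mvi() is the vertical half-pel case: the reference is the
 * rounded average of the current pix2 row and the row below it.  The loop
 * carries the previous row's quadwords in p2_l/p2_r so each row is loaded
 * only once.  Per pixel, for illustration only:
 *
 *     ref = (pix2[i] + pix2[i + line_size] + 1) >> 1;
 *     result += FFABS(pix1[i] - ref);
 */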

static int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    uint64_t p1_l, p1_r;
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}
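
/* pix_abs16x16_xy2_mvi() is the diagonal half-pel case: the reference is the
 * rounded average of a 2x2 neighbourhood, built with avg4() from the current
 * row, the row below, and each row shifted by one pixel (p2_x carries the
 * 17th pixel of a row into the top byte lane).  Per pixel, for illustration
 * only:
 *
 *     ref = (pix2[i]             + pix2[i + 1] +
 *            pix2[i + line_size] + pix2[i + line_size + 1] + 2) >> 2;
 *     result += FFABS(pix1[i] - ref);
 */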

av_cold void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx)
{
    /* amask clears all bits that correspond to present features. */
    if (amask(AMASK_MVI) == 0) {
        c->sad[0]        = pix_abs16x16_mvi_asm;
        c->sad[1]        = pix_abs8x8_mvi;
        c->pix_abs[0][0] = pix_abs16x16_mvi_asm;
        c->pix_abs[1][0] = pix_abs8x8_mvi;
        c->pix_abs[0][1] = pix_abs16x16_x2_mvi;
        c->pix_abs[0][2] = pix_abs16x16_y2_mvi;
        c->pix_abs[0][3] = pix_abs16x16_xy2_mvi;
    }
}
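
/* In MECmpContext, the first pix_abs index selects the block width
 * (0 = 16 pixels, 1 = 8 pixels) and the second the half-pel variant
 * (0 = full-pel, 1 = x, 2 = y, 3 = xy), which is why only the full-pel
 * entry is set for 8x8 blocks here. */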