FFmpeg
me_cmp_alpha.c
Go to the documentation of this file.
1 /*
2  * Alpha optimized DSP utils
3  * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "libavutil/attributes.h"
23 #include "libavcodec/me_cmp.h"
24 #include "asm.h"
25 
/**
 * 16-pixel-wide comparison routine that is only declared here; its
 * definition lives outside this file (presumably hand-written assembly,
 * going by the _asm suffix -- confirm against the build).
 *
 * It has the same signature as the static comparison functions below and
 * is installed in the sad[0] and pix_abs[0][0] slots by this file's
 * init code.
 */
int pix_abs16x16_mvi_asm(struct MpegEncContext *v,
                         const uint8_t *pix1,
                         const uint8_t *pix2,
                         ptrdiff_t line_size,
                         int h);
28 
/**
 * Lane-wise average of two packed byte vectors, rounding halves up.
 *
 * Uses the identity ceil((a + b) / 2) == (a | b) - ((a ^ b) >> 1).
 * The 0xfe mask (BYTE_VEC presumably replicates it into every byte lane)
 * clears the low bit of each lane before the shift so that no bit leaks
 * into the neighbouring lane.
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    const uint64_t any_set   = a | b;
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return any_set - half_diff;
}
33 
/**
 * Lane-wise rounded average of four packed byte vectors.
 *
 * Each input is split into its upper six bits and lower two bits per lane
 * (BYTE_VEC presumably replicates its argument into every byte lane).
 * The upper parts are pre-divided by four and summed; the lower parts are
 * summed together with a rounding bias of 2 and then divided by four.
 * The final mask removes bits shifted in from the adjacent lane.
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t lo_mask = BYTE_VEC(0x03);
    const uint64_t hi_mask = ~lo_mask;
    uint64_t hi_sum, lo_sum;

    hi_sum  = (l1 & hi_mask) >> 2;
    hi_sum += (l2 & hi_mask) >> 2;
    hi_sum += (l3 & hi_mask) >> 2;
    hi_sum += (l4 & hi_mask) >> 2;

    lo_sum  = (l1 & lo_mask) + (l2 & lo_mask)
            + (l3 & lo_mask) + (l4 & lo_mask)
            + BYTE_VEC(0x02);

    return hi_sum + ((lo_sum >> 2) & lo_mask);
}
47 
/**
 * Accumulate perr() between pix1 and pix2 over h rows, one quadword
 * (8 pixels) per row.
 *
 * @param v         unused
 * @param pix1      first block, read with ldq()
 * @param pix2      second block; its alignment is tested once up front to
 *                  choose between ldq() and uldq()
 * @param line_size byte distance between consecutive rows
 * @param h         number of rows; the loop body runs before the counter
 *                  is tested, so callers are expected to pass h >= 1
 * @return the accumulated perr() values
 */
static int pix_abs8x8_mvi(struct MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                          ptrdiff_t line_size, int h)
{
    int sum = 0;

    if (!((size_t) pix2 & 0x7)) {
        /* pix2 sits on an 8-byte boundary: plain loads on both sides */
        do {
            sum  += perr(ldq(pix1), ldq(pix2));
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);

        return sum;
    }

    /* Per the original author, this path works only when pix2 is
       actually unaligned. */
    do {
        sum  += perr(ldq(pix1), uldq(pix2));
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);

    return sum;
}
80 
/**
 * Accumulate perr() between pix1 and the horizontal half-pel
 * interpolation of pix2 (avg2 of each pixel and its right neighbour),
 * 16 pixels per row over h rows.
 *
 * The misalignment of pix2 is sampled once before the loops and selects
 * one of three strategies.
 *
 * @param v         unused
 * @param pix1      first block, read with ldq()
 * @param pix2      second block; 17 pixels per row are consumed
 * @param line_size byte distance between consecutive rows
 * @param h         number of rows; callers are expected to pass h >= 1
 * @return the accumulated perr() values
 */
static int pix_abs16x16_x2_mvi(struct MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                               ptrdiff_t line_size, int h)
{
    const uint64_t shift = (size_t) pix2 & 0x7;
    int sum = 0;

    if (shift == 0) {
        /* Aligned: build the "one pixel to the right" vectors by shifting
           and pulling in the following byte. */
        do {
            const uint64_t ref_l = ldq(pix1);
            const uint64_t ref_r = ldq(pix1 + 8);
            const uint64_t cur_l = ldq(pix2);
            const uint64_t cur_r = ldq(pix2 + 8);
            const uint64_t nxt_l = (cur_l >> 8) | ((uint64_t) cur_r << 56);
            const uint64_t nxt_r = (cur_r >> 8) | ((uint64_t) pix2[16] << 56);

            sum  += perr(ref_l, avg2(cur_l, nxt_l))
                  + perr(ref_r, avg2(cur_r, nxt_r));
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else if (shift == 7) {
        /* |.......l|lllllllr|rrrrrrr*|
           Handled separately because shift + 1 would be 8, which extqh
           treats as 0. The vectors starting at pix2 + 1 are exactly the
           second and third quadwords, so they are used directly (and it
           is a bit faster, too). */
        do {
            const uint64_t ref_l = ldq(pix1);
            const uint64_t ref_r = ldq(pix1 + 8);
            const uint64_t q0    = ldq_u(pix2);
            const uint64_t q1    = ldq_u(pix2 + 8);
            const uint64_t q2    = ldq_u(pix2 + 16);
            const uint64_t cur_l = extql(q0, shift) | extqh(q1, shift);
            const uint64_t cur_r = extql(q1, shift) | extqh(q2, shift);

            sum  += perr(ref_l, avg2(cur_l, q1))
                  + perr(ref_r, avg2(cur_r, q2));
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        /* General case: extract the vectors at offsets shift and
           shift + 1 from the three covering quadwords. */
        const uint64_t shift1 = shift + 1;

        do {
            const uint64_t ref_l = ldq(pix1);
            const uint64_t ref_r = ldq(pix1 + 8);
            const uint64_t q0    = ldq_u(pix2);
            const uint64_t q1    = ldq_u(pix2 + 8);
            const uint64_t q2    = ldq_u(pix2 + 16);
            const uint64_t cur_l = extql(q0, shift)  | extqh(q1, shift);
            const uint64_t nxt_l = extql(q0, shift1) | extqh(q1, shift1);
            const uint64_t cur_r = extql(q1, shift)  | extqh(q2, shift);
            const uint64_t nxt_r = extql(q1, shift1) | extqh(q2, shift1);

            sum  += perr(ref_l, avg2(cur_l, nxt_l))
                  + perr(ref_r, avg2(cur_r, nxt_r));
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return sum;
}
154 
/**
 * Accumulate perr() between pix1 and the vertical half-pel interpolation
 * of pix2 (avg2 of each row and the row below it), 16 pixels per row
 * over h rows.
 *
 * The row of pix2 loaded for one iteration is carried over as the "row
 * above" of the next, so h + 1 rows of pix2 are read in total.
 *
 * @param v         unused
 * @param pix1      first block, read with ldq()
 * @param pix2      second block; its alignment is tested once up front
 * @param line_size byte distance between consecutive rows
 * @param h         number of rows; callers are expected to pass h >= 1
 * @return the accumulated perr() values
 */
static int pix_abs16x16_y2_mvi(struct MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                               ptrdiff_t line_size, int h)
{
    int sum = 0;
    uint64_t above_l, above_r;

    if ((size_t) pix2 & 0x7) {
        /* Unaligned: assemble each half from the covering quadwords. */
        uint64_t mid = ldq_u(pix2 + 8);

        above_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
        above_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            const uint64_t ref_l = ldq(pix1);
            const uint64_t ref_r = ldq(pix1 + 8);
            uint64_t below_l, below_r;

            pix2   += line_size;
            mid     = ldq_u(pix2 + 8);
            below_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
            below_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            sum += perr(ref_l, avg2(above_l, below_l))
                 + perr(ref_r, avg2(above_r, below_r));

            pix1   += line_size;
            above_l = below_l;
            above_r = below_r;
        } while (--h);
    } else {
        above_l = ldq(pix2);
        above_r = ldq(pix2 + 8);

        do {
            const uint64_t ref_l = ldq(pix1);
            const uint64_t ref_r = ldq(pix1 + 8);
            uint64_t below_l, below_r;

            pix2   += line_size;
            below_l = ldq(pix2);
            below_r = ldq(pix2 + 8);

            sum += perr(ref_l, avg2(above_l, below_l))
                 + perr(ref_r, avg2(above_r, below_r));

            pix1   += line_size;
            above_l = below_l;
            above_r = below_r;
        } while (--h);
    }

    return sum;
}
208 
/**
 * Load one 17-pixel row for the xy2 comparison.
 *
 * @param row  start of the row
 * @param left receives pixels 0..7
 * @param right receives pixels 8..15
 * @param extra receives pixel 16, already moved into the top byte so it
 *              can be OR-ed onto (right >> 8)
 */
static inline void pix_abs16x16_xy2_load_row(const uint8_t *row, uint64_t *left,
                                             uint64_t *right, uint64_t *extra)
{
    if ((size_t) row & 0x7) { /* could be optimized a lot */
        *left  = uldq(row);
        *right = uldq(row + 8);
        *extra = (uint64_t) row[16] << 56;
    } else {
        *left  = ldq(row);
        *right = ldq(row + 8);
        /* The shift discards everything but the lowest byte of the load. */
        *extra = ldq(row + 16) << 56;
    }
}

/**
 * Accumulate perr() between pix1 and the diagonal half-pel interpolation
 * of pix2 (avg4 of each pixel, its right neighbour, and the two pixels
 * below them), 16 pixels per row over h rows.
 *
 * h + 1 rows of 17 pixels are read from pix2. Exactly h rows are read
 * from pix1: each pix1 row is loaded at the top of its own iteration, so
 * the row following the block is never touched.
 *
 * @param v         unused
 * @param pix1      first block, read with ldq()
 * @param pix2      second block; alignment is re-tested for every row
 * @param line_size byte distance between consecutive rows
 * @param h         number of rows; callers are expected to pass h >= 1
 * @return the accumulated perr() values
 */
static int pix_abs16x16_xy2_mvi(struct MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                                ptrdiff_t line_size, int h)
{
    int result = 0;
    uint64_t p2_l, p2_r, p2_x;

    pix_abs16x16_xy2_load_row(pix2, &p2_l, &p2_r, &p2_x);

    do {
        uint64_t p1_l, p1_r;
        uint64_t np2_l, np2_r, np2_x;

        /* Current pix1 row; loaded here rather than one iteration ahead
           so that no row beyond the block is read. */
        p1_l = ldq(pix1);
        p1_r = ldq(pix1 + 8);

        pix1 += line_size;
        pix2 += line_size;

        /* Row below the current one in pix2. */
        pix_abs16x16_xy2_load_row(pix2, &np2_l, &np2_r, &np2_x);

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ( p2_r << 56),
                            np2_l, (np2_l >> 8) | (np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) |  p2_x,
                            np2_r, (np2_r >> 8) | np2_x));

        /* The lower row becomes the upper row of the next iteration. */
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}
266 
268 {
269  /* amask clears all bits that correspond to present features. */
270  if (amask(AMASK_MVI) == 0) {
271  c->sad[0] = pix_abs16x16_mvi_asm;
272  c->sad[1] = pix_abs8x8_mvi;
273  c->pix_abs[0][0] = pix_abs16x16_mvi_asm;
274  c->pix_abs[1][0] = pix_abs8x8_mvi;
275  c->pix_abs[0][1] = pix_abs16x16_x2_mvi;
276  c->pix_abs[0][2] = pix_abs16x16_y2_mvi;
277  c->pix_abs[0][3] = pix_abs16x16_xy2_mvi;
278  }
279 }
pix_abs16x16_mvi_asm
int pix_abs16x16_mvi_asm(struct MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t line_size, int h)
pix_abs16x16_x2_mvi
static int pix_abs16x16_x2_mvi(struct MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t line_size, int h)
Definition: me_cmp_alpha.c:81
r
const char * r
Definition: vf_curves.c:127
extqh
#define extqh(a, b)
Definition: asm.h:110
b
#define b
Definition: input.c:41
ldq
#define ldq(p)
Definition: asm.h:59
AMASK_MVI
#define AMASK_MVI
Definition: asm.h:40
asm.h
extql
#define extql(a, b)
Definition: asm.h:108
pix_abs16x16_xy2_mvi
static int pix_abs16x16_xy2_mvi(struct MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t line_size, int h)
Definition: me_cmp_alpha.c:209
av_cold
#define av_cold
Definition: attributes.h:90
perr
#define perr(a, b)
Definition: asm.h:142
avg2
static uint64_t avg2(uint64_t a, uint64_t b)
Definition: me_cmp_alpha.c:29
avg4
static uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
Definition: me_cmp_alpha.c:34
MECmpContext
Definition: me_cmp.h:55
result
and forward the result(frame or status change) to the corresponding input. If nothing is possible
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
pix_abs8x8_mvi
static int pix_abs8x8_mvi(struct MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t line_size, int h)
Definition: me_cmp_alpha.c:48
ff_me_cmp_init_alpha
av_cold void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx)
Definition: me_cmp_alpha.c:267
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
attributes.h
amask
#define amask(a)
Definition: asm.h:113
uldq
#define uldq(a)
Definition: asm.h:85
ldq_u
#define ldq_u(p)
Definition: asm.h:84
pix_abs16x16_y2_mvi
static int pix_abs16x16_y2_mvi(struct MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t line_size, int h)
Definition: me_cmp_alpha.c:155
me_cmp.h
AVCodecContext
main external API structure.
Definition: avcodec.h:445
BYTE_VEC
static uint64_t BYTE_VEC(uint64_t x)
Definition: asm.h:42
h
h
Definition: vp9dsp_template.c:2038
MpegEncContext
MpegEncContext.
Definition: mpegvideo.h:73