motion_est_alpha.c

/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil_alpha.h"
#include "asm.h"

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

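/* For reference only (not in the original file): a minimal scalar sketch of
   what avg2() computes, assuming BYTE_VEC(x), from the included headers,
   replicates the byte x across all eight lanes of a 64-bit word.  The
   identity used above is (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1);
   masking with BYTE_VEC(0xfe) before the shift keeps the byte lanes from
   borrowing into each other. */
#if 0
static inline uint64_t avg2_ref(uint64_t a, uint64_t b)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t ba = (a >> (8 * i)) & 0xff;
        uint64_t bb = (b >> (8 * i)) & 0xff;
        /* per-byte average, rounding up */
        r |= ((ba + bb + 1) >> 1) << (8 * i);
    }
    return r;
}
#endif
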
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}

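/* For reference only (not in the original file): a scalar sketch of avg4().
   Per byte lane it computes the rounded average (b1 + b2 + b3 + b4 + 2) >> 2;
   the code above reaches the same result without cross-lane carries by
   summing the high six bits and the low two bits of each byte separately. */
#if 0
static inline uint64_t avg4_ref(uint64_t l1, uint64_t l2,
                                uint64_t l3, uint64_t l4)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t sum = ((l1 >> (8 * i)) & 0xff)
                     + ((l2 >> (8 * i)) & 0xff)
                     + ((l3 >> (8 * i)) & 0xff)
                     + ((l4 >> (8 * i)) & 0xff);
        /* rounded four-way average */
        r |= ((sum + 2) >> 2) << (8 * i);
    }
    return r;
}
#endif

/* Sum of absolute differences of an 8-pixel-wide block over h rows.
   pix1 must be 8-byte aligned (ldq); pix2 may be unaligned (uldq). */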
int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                        /* do 8 pixels at a time */
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = uldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = ldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return result;
}

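/* For reference only (not in the original file): perr is the MVI pixel-error
   operation; as used here it is assumed to return the sum of absolute
   differences of the eight byte lanes of its operands, i.e. the scalar
   sketch below. */
#if 0
static inline int perr_ref(uint64_t a, uint64_t b)
{
    int sum = 0;
    int i;

    for (i = 0; i < 8; i++) {
        int ba = (a >> (8 * i)) & 0xff;
        int bb = (b >> (8 * i)) & 0xff;
        sum += ba > bb ? ba - bb : bb - ba; /* |ba - bb| */
    }
    return sum;
}
#endif
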
#if 0 /* now done in assembly */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                        /* do 16 pixels at a time */
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            t    = ldq_u(pix2 + 8);
            p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else {
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = ldq(pix2);
            p2_r = ldq(pix2 + 8);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
#endif

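/* Sum of absolute differences against the horizontal half-pel position,
   i.e. each reference pixel is the rounded average of pix2[x] and
   pix2[x + 1].  The switch distinguishes how pix2 straddles quadword
   boundaries. */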
int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq(pix2);
            r    = ldq(pix2 + 8);
            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh. At least it is a bit faster
           that way :) */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign), r);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    default:
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign)  | extqh(m, disalign),
                        extql(l, disalign1) | extqh(m, disalign1));
            p2_r = avg2(extql(m, disalign)  | extqh(r, disalign),
                        extql(m, disalign1) | extqh(r, disalign1));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    }
    return result;
}

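/* For reference only (not in the original file): a scalar sketch of what the
   x2 variant computes for one 16-pixel row, independent of the alignment
   tricks above. */
#if 0
static int sad16_x2_ref_row(const uint8_t *pix1, const uint8_t *pix2)
{
    int sum = 0;
    int x;

    for (x = 0; x < 16; x++) {
        int ref = (pix2[x] + pix2[x + 1] + 1) >> 1; /* horizontal half-pel */
        int d   = pix1[x] - ref;
        sum += d < 0 ? -d : d;
    }
    return sum;
}
#endif

/* Sum of absolute differences against the vertical half-pel position, i.e.
   each reference pixel is the rounded average of a pixel and the one below
   it.  Each row of pix2 is loaded only once; the previous iteration's loads
   are carried over in p2_l/p2_r. */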
int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        uint64_t t, p2_l, p2_r;
        t    = ldq_u(pix2 + 8);
        p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            t     = ldq_u(pix2 + 8);
            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    } else {
        uint64_t p2_l, p2_r;
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    }
    return result;
}

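/* Sum of absolute differences against the diagonal half-pel position: each
   reference pixel is the rounded avg4() of a 2x2 neighbourhood.  p2_x keeps
   pixel 16 of the current row in the top byte lane so the right half can be
   shifted just like the left half. */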
int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    uint64_t p1_l, p1_r;
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}
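
/* For reference only (not in the original file): a scalar sketch of what the
   xy2 variant computes for one 16-pixel row, where pix2b points at the row
   below pix2. */
#if 0
static int sad16_xy2_ref_row(const uint8_t *pix1,
                             const uint8_t *pix2, const uint8_t *pix2b)
{
    int sum = 0;
    int x;

    for (x = 0; x < 16; x++) {
        /* diagonal half-pel: rounded average of a 2x2 neighbourhood */
        int ref = (pix2[x] + pix2[x + 1] + pix2b[x] + pix2b[x + 1] + 2) >> 2;
        int d   = pix1[x] - ref;
        sum += d < 0 ? -d : d;
    }
    return sum;
}
#endif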