dsputil_alpha.c
/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavcodec/dsputil.h"
#include "dsputil_alpha.h"
#include "asm.h"

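/* Pointers to the clamped put/add routines selected at init time; other
   Alpha code (e.g. the simple IDCT) calls them through these variables. */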
void (*put_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
                                 int line_size);
void (*add_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
                                 int line_size);

#if 0
/* These functions were the base for the optimized assembler routines,
   and remain here for documentation purposes. */
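/* maxsw4()/minsw4() clamp four signed 16-bit lanes at once, and
   pkwb()/unpkbw() pack/unpack between bytes and 16-bit words; they map onto
   the Alpha MVI (Motion Video Instructions) operations provided by asm.h. */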
static void put_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
                                   ptrdiff_t line_size)
{
    int i = 8;
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */

    do {
        uint64_t shorts0, shorts1;

        shorts0 = ldq(block);
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);
        stl(pkwb(shorts0), pixels);

        shorts1 = ldq(block + 4);
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--i);
}

void add_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
                            ptrdiff_t line_size)
{
    int h = 8;
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-). */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    uint64_t signmask  = zap(-1, 0x33);
    signmask ^= signmask >> 1;          /* 0x8000800080008000 */

    do {
        uint64_t shorts0, pix0, signs0;
        uint64_t shorts1, pix1, signs1;

        shorts0 = ldq(block);
        shorts1 = ldq(block + 4);

        pix0    = unpkbw(ldl(pixels));
        /* Signed subword add (MMX paddw). */
        signs0  = shorts0 & signmask;
        shorts0 &= ~signmask;
        shorts0 += pix0;
        shorts0 ^= signs0;
        /* Clamp. */
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);

        /* Next 4. */
        pix1    = unpkbw(ldl(pixels + 4));
        signs1  = shorts1 & signmask;
        shorts1 &= ~signmask;
        shorts1 += pix1;
        shorts1 ^= signs1;
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);

        stl(pkwb(shorts0), pixels);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
#endif
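
/* Clear the six 64-coefficient blocks of a macroblock (6 * 64 int16_t =
   768 bytes), zeroing eight quadwords (64 bytes) per iteration. */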
static void clear_blocks_axp(int16_t *blocks) {
    uint64_t *p = (uint64_t *) blocks;
    int n = sizeof(int16_t) * 6 * 64;

    do {
        p[0] = 0;
        p[1] = 0;
        p[2] = 0;
        p[3] = 0;
        p[4] = 0;
        p[5] = 0;
        p[6] = 0;
        p[7] = 0;
        p += 8;
        n -= 8 * 8;
    } while (n);
}
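
/* Per-byte averages without crossing byte lanes: since a + b == (a ^ b) +
   2 * (a & b) == 2 * (a | b) - (a ^ b), the truncating average (a + b) >> 1
   is (a & b) + ((a ^ b) >> 1) and the rounded average (a + b + 1) >> 1 is
   (a | b) - ((a ^ b) >> 1); masking the xor with 0xfe in every byte keeps
   the shifted-out bit from leaking into the neighbouring lane. */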
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

#if 0
/* The XY2 routines basically utilize this scheme, but reuse parts in
   each iteration. */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
#endif
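
/* The OP* macros generate one row loop per half-pel case: OP copies full-pel
   rows, OP_X2 averages each quadword with its one-pixel-right neighbour,
   OP_Y2 averages each row with the next one, and OP_XY2 combines both,
   carrying the per-row partial sums (pix_l/pix_h) over to the next
   iteration. */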
#define OP(LOAD, STORE)                         \
    do {                                        \
        STORE(LOAD(pixels), block);             \
        pixels += line_size;                    \
        block += line_size;                     \
    } while (--h)

#define OP_X2(LOAD, STORE)                                      \
    do {                                                        \
        uint64_t pix1, pix2;                                    \
                                                                \
        pix1 = LOAD(pixels);                                    \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
        STORE(AVG2(pix1, pix2), block);                         \
        pixels += line_size;                                    \
        block += line_size;                                     \
    } while (--h)

#define OP_Y2(LOAD, STORE)                      \
    do {                                        \
        uint64_t pix = LOAD(pixels);            \
        do {                                    \
            uint64_t next_pix;                  \
                                                \
            pixels += line_size;                \
            next_pix = LOAD(pixels);            \
            STORE(AVG2(pix, next_pix), block);  \
            block += line_size;                 \
            pix = next_pix;                     \
        } while (--h);                          \
    } while (0)

#define OP_XY2(LOAD, STORE)                                                 \
    do {                                                                    \
        uint64_t pix1 = LOAD(pixels);                                       \
        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
                       + (pix2 & BYTE_VEC(0x03));                           \
        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
                                                                            \
        do {                                                                \
            uint64_t npix1, npix2;                                          \
            uint64_t npix_l, npix_h;                                        \
            uint64_t avg;                                                   \
                                                                            \
            pixels += line_size;                                            \
            npix1 = LOAD(pixels);                                           \
            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56);              \
            npix_l = (npix1 & BYTE_VEC(0x03))                               \
                   + (npix2 & BYTE_VEC(0x03));                              \
            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
                + pix_h + npix_h;                                           \
            STORE(avg, block);                                              \
                                                                            \
            block += line_size;                                             \
            pix_l = npix_l;                                                 \
            pix_h = npix_h;                                                 \
        } while (--h);                                                      \
    } while (0)
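
/* MAKE_OP builds an 8-pixel-wide routine that dispatches on the alignment of
   the source (uldq() for unaligned loads, ldq() for aligned ones), plus a
   16-pixel-wide wrapper that handles the two 8-byte halves separately. */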
#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
static void OPNAME ## _pixels ## SUFF ## _axp                               \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    if ((size_t) pixels & 0x7) {                                            \
        OPKIND(uldq, STORE);                                                \
    } else {                                                                \
        OPKIND(ldq, STORE);                                                 \
    }                                                                       \
}                                                                           \
                                                                            \
static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
}

#define PIXOP(OPNAME, STORE)                    \
    MAKE_OP(OPNAME,     , OP,     STORE)        \
    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
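
/* Instantiate the put and avg families for both rounding modes: STORE is
   either a plain stq() (put) or an average with the bytes already at the
   destination (avg), while AVG2 and AVG4_ROUNDER select between the rounding
   and truncating averages. */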
/* Rounding primitives. */
#define AVG2 avg2
#define AVG4 avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
#define STORE(l, b) stq(l, b)
PIXOP(put, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg, STORE);

/* Non-rounding primitives. */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
#define STORE(l, b) stq(l, b)
PIXOP(put_no_rnd, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg_no_rnd, STORE);
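
/* The 16-pixel-wide full-pel copy is built from two calls to the
   hand-written 8-pixel-wide assembler routine. */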
static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    put_pixels_axp_asm(block,     pixels,     line_size, h);
    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
}
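
/* The DSPContext motion-compensation tables are indexed as tab[size][xy]:
   size 0 selects the 16x16 functions and size 1 the 8x8 ones, while xy 0-3
   selects the full-pel, x half-pel, y half-pel and xy half-pel positions. */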
av_cold void ff_dsputil_init_alpha(DSPContext *c, AVCodecContext *avctx)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
        c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
        c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;

        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;

        c->avg_pixels_tab[0][0] = avg_pixels16_axp;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;

        c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
        c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
        c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
        c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;

        c->put_pixels_tab[1][0] = put_pixels_axp_asm;
        c->put_pixels_tab[1][1] = put_pixels_x2_axp;
        c->put_pixels_tab[1][2] = put_pixels_y2_axp;
        c->put_pixels_tab[1][3] = put_pixels_xy2_axp;

        c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;

        c->avg_pixels_tab[1][0] = avg_pixels_axp;
        c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
        c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
        c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;

        c->clear_blocks = clear_blocks_axp;
    }

    /* amask clears all bits that correspond to present features. */
    if (amask(AMASK_MVI) == 0) {
        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;

        if (!high_bit_depth)
            c->get_pixels = get_pixels_mvi;
        c->diff_pixels = diff_pixels_mvi;
        c->sad[0] = pix_abs16x16_mvi_asm;
        c->sad[1] = pix_abs8x8_mvi;
        c->pix_abs[0][0] = pix_abs16x16_mvi_asm;
        c->pix_abs[1][0] = pix_abs8x8_mvi;
        c->pix_abs[0][1] = pix_abs16x16_x2_mvi;
        c->pix_abs[0][2] = pix_abs16x16_y2_mvi;
        c->pix_abs[0][3] = pix_abs16x16_xy2_mvi;
    }
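
    /* Remember the clamped put/add routines that were selected, so the Alpha
       simple IDCT code can call them through the pointers declared at the
       top of this file. */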
    put_pixels_clamped_axp_p = c->put_pixels_clamped;
    add_pixels_clamped_axp_p = c->add_pixels_clamped;

    if (!avctx->lowres && avctx->bits_per_raw_sample <= 8 &&
        (avctx->idct_algo == FF_IDCT_AUTO ||
         avctx->idct_algo == FF_IDCT_SIMPLEALPHA)) {
        c->idct_put = ff_simple_idct_put_axp;
        c->idct_add = ff_simple_idct_add_axp;
        c->idct     = ff_simple_idct_axp;
    }
}