FFmpeg
swscale_ppc_template.c
Go to the documentation of this file.
1 /*
2  * AltiVec-enhanced yuv2yuvX
3  *
4  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  * based on the equivalent C code in swscale.c
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "libavutil/attributes.h"
25 #include "libavutil/mem_internal.h"
26 
/**
 * Vertically filter 16 consecutive output pixels, starting at pixel index x,
 * to 8-bit samples using vector code.
 *
 * Each of the 16 accumulators starts at dither[(x + i + offset) & 7] << 12,
 * is updated once per filter tap through the yuv2planeX_8 macro, and is then
 * arithmetically shifted right by 19 and packed down to bytes before being
 * stored to dest.
 *
 * @param filter     filter coefficients, consumed via LOAD_FILTER
 * @param filterSize number of filter taps (iterations of the tap loop)
 * @param src        array of source line pointers, one per tap (src[j])
 * @param dest       destination of the 16 output bytes; the caller in this
 *                   file passes dest + x, chosen so that it is 16-byte aligned
 * @param dither     dither table, indexed modulo 8
 * @param offset     added to the pixel index when indexing dither
 * @param x          index of the first pixel of this group within the row
 */
27 static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
28  const int16_t **src, uint8_t *dest,
29  const uint8_t *dither, int offset, int x)
30 {
31  register int i, j;
32  LOCAL_ALIGNED(16, int, val, [16]);
33  vector signed int vo1, vo2, vo3, vo4;
34  vector unsigned short vs1, vs2;
35  vector unsigned char vf;
/* Shift count 19 in every lane, built as 10 + 9.
 * NOTE(review): presumably because vec_splat_u32 only takes a small
 * immediate -- confirm against the AltiVec documentation. */
36  vector unsigned int altivec_vectorShiftInt19 =
37  vec_add(vec_splat_u32(10), vec_splat_u32(9));
38 
/* Seed the 16 accumulators with the dither value for each pixel. */
39  for (i = 0; i < 16; i++)
40  val[i] = dither[(x + i + offset) & 7] << 12;
41 
/* val is 16 ints = 64 bytes: load it as four vectors of four ints. */
42  vo1 = vec_ld(0, val);
43  vo2 = vec_ld(16, val);
44  vo3 = vec_ld(32, val);
45  vo4 = vec_ld(48, val);
46 
47  for (j = 0; j < filterSize; j++) {
/* joffset / xoffset are byte offsets (index * 2) that are not referenced
 * directly below.
 * NOTE(review): presumably picked up by name inside LOAD_FILTER / LOAD_L1;
 * verify against the macro definitions before renaming or removing them. */
48  unsigned int joffset=j<<1;
49  unsigned int xoffset=x<<1;
50  vector unsigned char av_unused perm;
51  vector signed short l1,vLumFilter;
52  LOAD_FILTER(vLumFilter,filter);
/* Replicate element 0 of the loaded filter vector into every lane. */
53  vLumFilter = vec_splat(vLumFilter, 0);
54  LOAD_L1(l1,src[j],perm);
/* First call handles pixels x..x+7 (vo1/vo2), second x+8..x+15 (vo3/vo4). */
55  yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
56  yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
57  }
58 
/* Same >> 19 scaling as the scalar yuv2planeX_u path in this file. */
59  vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
60  vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
61  vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
62  vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
/* Narrow 4 x 4 ints -> 2 x 8 shorts -> 16 bytes with vec_packsu. */
63  vs1 = vec_packsu(vo1, vo2);
64  vs2 = vec_packsu(vo3, vo4);
65  vf = vec_packsu(vs1, vs2);
66  VEC_ST(vf, 0, dest);
67 }
68 
69 
/**
 * Scalar vertical filter for the output pixels in [x, dstW).
 *
 * For every pixel the accumulator starts at the dither value shifted left by
 * 12, gains src[tap][pixel] * filter[tap] for each tap, and is finally
 * shifted right by 19 and clipped to 8 bits.
 *
 * @param filter     filter coefficients, one per source line
 * @param filterSize number of coefficients / source lines
 * @param src        array of filterSize source line pointers
 * @param dest       output row (indexed by absolute pixel position)
 * @param dstW       one past the last pixel index to produce
 * @param dither     dither table, indexed modulo 8
 * @param offset     added to the pixel index when indexing dither
 * @param x          first pixel index to produce
 */
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
                                const int16_t **src, uint8_t *dest, int dstW,
                                const uint8_t *dither, int offset, int x)
{
    int px = x;

    while (px < dstW) {
        int acc = dither[(px + offset) & 7] << 12;
        int tap = 0;

        while (tap < filterSize) {
            acc += src[tap][px] * filter[tap];
            tap++;
        }

        dest[px] = av_clip_uint8(acc >> 19);
        px++;
    }
}
83 
84 static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize,
85  const int16_t **src, uint8_t *dest, int dstW,
86  const uint8_t *dither, int offset)
87 {
88  int dst_u = -(uintptr_t)dest & 15;
89  int i;
90 
91  yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
92 
93  for (i = dst_u; i < dstW - 15; i += 16)
94  FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
95  offset, i);
96 
97  yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
98 }
99 
/**
 * Horizontal scaling: for each of the dstW output samples, compute the dot
 * product of filterSize 8-bit source samples starting at src[filterPos[i]]
 * with the filterSize 16-bit coefficients belonging to output i, shift the
 * sum right by 7 and store it in dst[i], limited to at most (1 << 15) - 1.
 *
 * Code paths:
 *   - filterSize not a multiple of 4: plain scalar loop;
 *   - filterSize 4, 8, 16: dedicated vector loops;
 *   - any other multiple of 4: vector loop consuming 16 taps per iteration,
 *     followed by at most one 8-tap step.
 *
 * NOTE(review): in the generic path, taps beyond the last multiple of 8
 * (e.g. the final 4 taps of filterSize == 12) are never accumulated.
 * Presumably callers pad filterSize to a multiple of 8 on this platform --
 * confirm against the code that builds the filter.
 *
 * @param c          scaler context; not referenced directly in this function
 *                   (whether the macros used here touch it cannot be seen
 *                   from this file)
 * @param dst        output samples, dstW entries
 * @param dstW       number of output samples
 * @param src        8-bit source samples
 * @param filter     coefficients, filterSize per output sample, stored
 *                   contiguously (output i starts at filter[filterSize * i])
 * @param filterPos  index of the first source sample for each output sample
 * @param filterSize number of taps per output sample
 */
100 static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, int dstW,
101  const uint8_t *src, const int16_t *filter,
102  const int32_t *filterPos, int filterSize)
103 {
104  register int i;
/* Aligned scratch used to move the vector sum back to scalar code. */
105  LOCAL_ALIGNED(16, int, tempo, [4]);
106 
/* Scalar fallback for tap counts that are not a multiple of 4. */
107  if (filterSize % 4) {
108  for (i = 0; i < dstW; i++) {
109  register int j;
110  register int srcPos = filterPos[i];
111  register int val = 0;
112  for (j = 0; j < filterSize; j++)
113  val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
114  dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
115  }
116  } else
117  switch (filterSize) {
118  case 4:
119  for (i = 0; i < dstW; i++) {
120  register int srcPos = filterPos[i];
121 
122  vector unsigned char src_vF = unaligned_load(srcPos, src);
123  vector signed short src_v, filter_v;
124  vector signed int val_vEven, val_s;
/* Widen bytes to 16 bit by merging with zero (zero-extension). */
125  src_v = // vec_unpackh sign-extends...
126  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
127  // now put our elements in the even slots
128  src_v = vec_mergeh(src_v, (vector signed short)vzero);
129  GET_VF4(i, filter_v, filter);
130  val_vEven = vec_mule(src_v, filter_v);
131  val_s = vec_sums(val_vEven, vzero);
/* The total is read back from element 3 of the stored vector. */
132  vec_st(val_s, 0, tempo);
133  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
134  }
135  break;
136  case 8:
137  for (i = 0; i < dstW; i++) {
138  register int srcPos = filterPos[i];
139  vector unsigned char src_vF, av_unused src_v0, av_unused src_v1;
140  vector unsigned char av_unused permS;
141  vector signed short src_v, filter_v;
142  vector signed int val_v, val_s;
143  FIRST_LOAD(src_v0, srcPos, src, permS);
144  LOAD_SRCV8(srcPos, 0, src, permS, src_v0, src_v1, src_vF);
145  src_v = // vec_unpackh sign-extends...
146  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
/* 8 coefficients * 2 bytes = 16 bytes per output sample, hence i << 4. */
147  filter_v = vec_ld(i << 4, filter);
148  val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
149  val_s = vec_sums(val_v, vzero);
150  vec_st(val_s, 0, tempo);
151  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
152  }
153  break;
154 
155  case 16:
156  for (i = 0; i < dstW; i++) {
157  register int srcPos = filterPos[i];
158 
159  vector unsigned char src_vF = unaligned_load(srcPos, src);
/* Split the 16 source bytes into two vectors of 8 widened samples. */
160  vector signed short src_vA = // vec_unpackh sign-extends...
161  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
162  vector signed short src_vB = // vec_unpackh sign-extends...
163  (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
/* 16 coefficients * 2 bytes = 32 bytes per output sample, hence i << 5. */
164  vector signed short filter_v0 = vec_ld(i << 5, filter);
165  vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
166 
167  vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
168  vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
169 
170  vector signed int val_s = vec_sums(val_v, vzero);
171 
172  VEC_ST(val_s, 0, tempo);
173  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
174  }
175  break;
176 
/* Generic path for the remaining multiples of 4. */
177  default:
178  for (i = 0; i < dstW; i++) {
/* offset: byte offset of this output sample's coefficients in filter. */
179  register int j, av_unused offset = i * 2 * filterSize;
180  register int srcPos = filterPos[i];
181 
182  vector signed int val_s, val_v = (vector signed int)vzero;
183  vector signed short av_unused filter_v0R;
184  vector unsigned char av_unused permF, av_unused src_v0, av_unused permS;
185  FIRST_LOAD(filter_v0R, offset, filter, permF);
186  FIRST_LOAD(src_v0, srcPos, src, permS);
187 
/* Main loop: 16 taps per iteration, accumulated into val_v. */
188  for (j = 0; j < filterSize - 15; j += 16) {
189  vector unsigned char av_unused src_v1, src_vF;
190  vector signed short av_unused filter_v1R, av_unused filter_v2R,
191  filter_v0, filter_v1, src_vA, src_vB;
192  vector signed int val_acc;
193  LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
194  src_vA = // vec_unpackh sign-extends...
195  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
196  src_vB = // vec_unpackh sign-extends...
197  (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
198  GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0);
199  GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16);
200 
201  val_acc = vec_msums(src_vA, filter_v0, val_v);
202  val_v = vec_msums(src_vB, filter_v1, val_acc);
203  UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
204  }
205 
/* Remainder: a single 8-tap step when at least 8 taps are left. */
206  if (j < filterSize - 7) {
207  // loading src_v0 is useless, it's already done above
208  vector unsigned char av_unused src_v1, src_vF;
209  vector signed short src_v, av_unused filter_v1R, filter_v;
210  LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
211  src_v = // vec_unpackh sign-extends...
212  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
213  GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v, 0);
214  val_v = vec_msums(src_v, filter_v, val_v);
215  }
216  val_s = vec_sums(val_v, vzero);
217 
218  VEC_ST(val_s, 0, tempo);
219  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
220  }
221  }
222 }
mem_internal.h
av_unused
#define av_unused
Definition: attributes.h:131
yuv2planeX
static void FUNC() yuv2planeX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
Definition: swscale_ppc_template.c:84
filter
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
Definition: filter_design.txt:228
perm
perm
Definition: f_perms.c:74
val
static double val(void *priv, double ch)
Definition: aeval.c:76
LOCAL_ALIGNED
#define LOCAL_ALIGNED(a, t, v,...)
Definition: mem_internal.h:113
src
#define src
Definition: vp8dsp.c:255
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
yuv2planeX_u
static void yuv2planeX_u(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset, int x)
Definition: swscale_ppc_template.c:70
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
attributes.h
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:271
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
FUNC
#define FUNC(a)
Definition: bit_depth_template.c:104
hScale_real
static void FUNC() hScale_real(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
Definition: swscale_ppc_template.c:100
av_clip_uint8
#define av_clip_uint8
Definition: common.h:102
int32_t
int32_t
Definition: audioconvert.c:56
int
int
Definition: ffmpeg_filter.c:153
SwsContext
Definition: swscale_internal.h:300
short
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are short
Definition: writing_filters.txt:89
yuv2planeX_8_16
static void FUNC() yuv2planeX_8_16(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, const uint8_t *dither, int offset, int x)
Definition: swscale_ppc_template.c:27
dither
static const uint8_t dither[8][8]
Definition: vf_fspp.c:58