FFmpeg
swscale_ppc_template.c
Go to the documentation of this file.
1 /*
2  * AltiVec-enhanced yuv2yuvX
3  *
4  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  * based on the equivalent C code in swscale.c
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "libavutil/mem_internal.h"
25 
/**
 * Vertical scaling kernel: filter 16 consecutive columns, starting at column
 * x, across filterSize source lines and store the result as 16 bytes at dest.
 *
 * Each of the 16 accumulators starts at dither[(x + k + offset) & 7] << 12.
 * The yuv2planeX_8 invocations then fold in the contribution of every source
 * line (presumably src[j][x + k] * filter[j], as in the scalar yuv2planeX_u;
 * the macro is defined outside this file -- confirm there). Finally the sums
 * are shifted right by 19 and packed down to unsigned bytes with saturation.
 *
 * @param filter     vertical filter coefficients (presumably one per source
 *                   line, as in yuv2planeX_u)
 * @param filterSize number of source lines to accumulate
 * @param src        array of filterSize source line pointers
 * @param dest       where the 16 output bytes are stored; the caller in this
 *                   file only passes 16-byte aligned addresses
 * @param dither     dither table, read with an index masked to 0..7
 * @param offset     added to the column number before indexing dither
 * @param x          first column to process
 */
26 static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
27  const int16_t **src, uint8_t *dest,
28  const uint8_t *dither, int offset, int x)
29 {
30  register int i, j;
31  LOCAL_ALIGNED(16, int, val, [16]);
32  vector signed int vo1, vo2, vo3, vo4;
33  vector unsigned short vs1, vs2;
34  vector unsigned char vf;
/* Per-element shift count of 19, built as 10 + 9 because vec_splat_u32
 * only accepts a small (5-bit signed) literal. */
35  vector unsigned int altivec_vectorShiftInt19 =
36  vec_add(vec_splat_u32(10), vec_splat_u32(9));
37 
/* Seed the 16 accumulators with the dither pattern, pre-scaled by 1 << 12. */
38  for (i = 0; i < 16; i++)
39  val[i] = dither[(x + i + offset) & 7] << 12;
40 
/* val is 16-byte aligned, so four aligned loads (byte offsets 0, 16, 32, 48)
 * fetch all 16 ints: vo1/vo2 cover columns x..x+7, vo3/vo4 columns x+8..x+15. */
41  vo1 = vec_ld(0, val);
42  vo2 = vec_ld(16, val);
43  vo3 = vec_ld(32, val);
44  vo4 = vec_ld(48, val);
45 
46  for (j = 0; j < filterSize; j++) {
/* NOTE(review): joffset and xoffset are never referenced by name in this
 * function; presumably LOAD_FILTER and LOAD_L1 pick them up implicitly as
 * the byte offsets of filter[j] and src[j][x] (int16_t elements, hence
 * << 1) -- confirm against the macro definitions before renaming or
 * removing them. */
47  unsigned int joffset=j<<1;
48  unsigned int xoffset=x<<1;
49  vector unsigned char av_unused perm;
50  vector signed short l1,vLumFilter;
51  LOAD_FILTER(vLumFilter,filter);
/* Broadcast element 0 of the loaded vector to every lane. */
52  vLumFilter = vec_splat(vLumFilter, 0);
53  LOAD_L1(l1,src[j],perm);
/* Two invocations of 8 columns each: x..x+7 into vo1/vo2, then x+8..x+15
 * into vo3/vo4. l1 and perm are shared between them, so the order matters. */
54  yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
55  yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
56  }
57 
/* Scale the sums back down with an arithmetic right shift by 19 ... */
58  vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
59  vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
60  vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
61  vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
/* ... then narrow with unsigned saturation: 4 x int32 -> 2 x u16 -> 1 x u8. */
62  vs1 = vec_packsu(vo1, vo2);
63  vs2 = vec_packsu(vo3, vo4);
64  vf = vec_packsu(vs1, vs2);
65  VEC_ST(vf, 0, dest);
66 }
67 
68 
/**
 * Scalar vertical scaler for the column range [x, dstW).
 *
 * For every column in the range, the accumulator is seeded with
 * dither[(column + offset) & 7] << 12, the products src[line][column] *
 * filter[line] of all filterSize source lines are added, and the sum is
 * shifted right by 19 and clipped to 0..255 before being stored in dest.
 *
 * @param filter     one coefficient per source line
 * @param filterSize number of source lines / coefficients
 * @param src        array of filterSize source line pointers
 * @param dest       output row, indexed by absolute column number
 * @param dstW       end of the column range (exclusive)
 * @param dither     dither table, read with an index masked to 0..7
 * @param offset     added to the column number before indexing dither
 * @param x          first column to process
 */
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
                                const int16_t **src, uint8_t *dest, int dstW,
                                const uint8_t *dither, int offset, int x)
{
    int col = x;

    while (col < dstW) {
        int acc = dither[(col + offset) & 7] << 12;
        int tap = 0;

        /* Accumulate the weighted contribution of every source line. */
        while (tap < filterSize) {
            acc += src[tap][col] * filter[tap];
            tap++;
        }

        dest[col] = av_clip_uint8(acc >> 19);
        col++;
    }
}
82 
83 static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize,
84  const int16_t **src, uint8_t *dest, int dstW,
85  const uint8_t *dither, int offset)
86 {
87  int dst_u = -(uintptr_t)dest & 15;
88  int i;
89 
90  yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
91 
92  for (i = dst_u; i < dstW - 15; i += 16)
93  FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
94  offset, i);
95 
96  yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
97 }
98 
/**
 * Horizontal scaling: produce dstW 16-bit samples from 8-bit source pixels.
 *
 * As spelled out by the scalar loop, output sample i is
 *     min((sum over j < filterSize of
 *              src[filterPos[i] + j] * filter[filterSize * i + j]) >> 7,
 *         (1 << 15) - 1)
 * The scalar loop is used whenever filterSize is not a multiple of 4. The
 * other branches evaluate the sum with AltiVec intrinsics and helper macros
 * defined outside this file, with dedicated code for 4, 8 and 16 taps and a
 * generic loop for the remaining sizes.
 *
 * @param c          not referenced in this function
 * @param dst        output buffer, dstW elements are written
 * @param dstW       number of output samples
 * @param src        8-bit source pixels
 * @param filter     coefficients, filterSize consecutive values per output
 *                   sample
 * @param filterPos  for each output sample, index of the first source pixel
 *                   it uses
 * @param filterSize number of taps per output sample
 */
99 static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, int dstW,
100  const uint8_t *src, const int16_t *filter,
101  const int32_t *filterPos, int filterSize)
102 {
103  register int i;
/* 16-byte aligned scratch used to move a vector result into scalar code. */
104  LOCAL_ALIGNED(16, int, tempo, [4]);
105 
/* Scalar reference path for tap counts that are not a multiple of 4. */
106  if (filterSize % 4) {
107  for (i = 0; i < dstW; i++) {
108  register int j;
109  register int srcPos = filterPos[i];
110  register int val = 0;
111  for (j = 0; j < filterSize; j++)
112  val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
/* Only the upper bound is clamped; there is no lower clamp. */
113  dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
114  }
115  } else
116  switch (filterSize) {
117  case 4:
118  for (i = 0; i < dstW; i++) {
119  register int srcPos = filterPos[i];
120 
121  vector unsigned char src_vF = unaligned_load(srcPos, src);
122  vector signed short src_v, filter_v;
123  vector signed int val_vEven, val_s;
124  src_v = // zero-extend the bytes by merging with vzero; vec_unpackh would sign-extend
125  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
126  // now put our elements in the even slots (vec_mule below only multiplies those)
127  src_v = vec_mergeh(src_v, (vector signed short)vzero);
/* GET_VF4 is defined elsewhere; presumably it loads the 4 coefficients of
 * sample i spread over the even slots to line up with src_v -- confirm. */
128  GET_VF4(i, filter_v, filter);
129  val_vEven = vec_mule(src_v, filter_v);
/* vec_sums leaves the horizontal sum in the last element, hence tempo[3]. */
130  val_s = vec_sums(val_vEven, vzero);
131  vec_st(val_s, 0, tempo);
132  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
133  }
134  break;
135  case 8:
136  for (i = 0; i < dstW; i++) {
137  register int srcPos = filterPos[i];
138  vector unsigned char src_vF, av_unused src_v0, av_unused src_v1;
139  vector unsigned char av_unused permS;
140  vector signed short src_v, filter_v;
141  vector signed int val_v, val_s;
142  FIRST_LOAD(src_v0, srcPos, src, permS);
143  LOAD_SRCV8(srcPos, 0, src, permS, src_v0, src_v1, src_vF);
144  src_v = // zero-extend the bytes by merging with vzero; vec_unpackh would sign-extend
145  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
/* 8 int16_t coefficients per sample = 16 bytes, so i << 4 is the byte
 * offset of the coefficients of sample i. */
146  filter_v = vec_ld(i << 4, filter);
147  val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
148  val_s = vec_sums(val_v, vzero);
149  vec_st(val_s, 0, tempo);
150  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
151  }
152  break;
153 
154  case 16:
155  for (i = 0; i < dstW; i++) {
156  register int srcPos = filterPos[i];
157 
158  vector unsigned char src_vF = unaligned_load(srcPos, src);
/* Split the 16 source bytes into two vectors of 8 zero-extended shorts. */
159  vector signed short src_vA = // high half; vec_unpackh would sign-extend
160  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
161  vector signed short src_vB = // low half; vec_unpackl would sign-extend
162  (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
/* 16 int16_t coefficients per sample = 32 bytes, loaded as two vectors. */
163  vector signed short filter_v0 = vec_ld(i << 5, filter);
164  vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
165 
/* Chain the two multiply-sums so val_v accumulates all 16 products. */
166  vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
167  vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
168 
169  vector signed int val_s = vec_sums(val_v, vzero);
170 
171  VEC_ST(val_s, 0, tempo);
172  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
173  }
174  break;
175 
/* Generic path for the remaining multiples of 4: 16 taps per loop
 * iteration, followed by at most one block of 8 taps.
 * NOTE(review): as written, a filterSize that is a multiple of 4 but not of
 * 8 (12, 20, ...) would leave its last 4 taps unprocessed here; presumably
 * the caller never selects such sizes for this code -- confirm. */
176  default:
177  for (i = 0; i < dstW; i++) {
/* offset: byte offset of the coefficients of sample i (2 bytes per tap). */
178  register int j, av_unused offset = i * 2 * filterSize;
179  register int srcPos = filterPos[i];
180 
181  vector signed int val_s, val_v = (vector signed int)vzero;
182  vector signed short av_unused filter_v0R;
183  vector unsigned char av_unused permF, av_unused src_v0, av_unused permS;
184  FIRST_LOAD(filter_v0R, offset, filter, permF);
185  FIRST_LOAD(src_v0, srcPos, src, permS);
186 
187  for (j = 0; j < filterSize - 15; j += 16) {
188  vector unsigned char av_unused src_v1, src_vF;
189  vector signed short av_unused filter_v1R, av_unused filter_v2R,
190  filter_v0, filter_v1, src_vA, src_vB;
191  vector signed int val_acc;
192  LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
193  src_vA = // high half, zero-extended; vec_unpackh would sign-extend
194  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
195  src_vB = // low half, zero-extended; vec_unpackl would sign-extend
196  (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
197  GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0);
198  GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16);
199 
200  val_acc = vec_msums(src_vA, filter_v0, val_v);
201  val_v = vec_msums(src_vB, filter_v1, val_acc);
/* Presumably carries the last loaded filter/source vectors over as the
 * starting vectors of the next iteration -- confirm in the macro. */
202  UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
203  }
204 
/* At least 8 taps remain after the 16-tap loop: process one block of 8.
 * j is not advanced afterwards. */
205  if (j < filterSize - 7) {
206  // loading src_v0 is useless, it's already done above
207  vector unsigned char av_unused src_v1, src_vF;
208  vector signed short src_v, av_unused filter_v1R, filter_v;
209  LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
210  src_v = // zero-extend the bytes by merging with vzero; vec_unpackh would sign-extend
211  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
212  GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v, 0);
213  val_v = vec_msums(src_v, filter_v, val_v);
214  }
/* Horizontal sum of the four partial sums ends up in the last element. */
215  val_s = vec_sums(val_v, vzero);
216 
217  VEC_ST(val_s, 0, tempo);
218  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
219  }
220  }
221 }
mem_internal.h
av_unused
#define av_unused
Definition: attributes.h:131
yuv2planeX
static void FUNC() yuv2planeX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
Definition: swscale_ppc_template.c:83
filter
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
Definition: filter_design.txt:228
perm
perm
Definition: f_perms.c:74
val
static double val(void *priv, double ch)
Definition: aeval.c:76
LOCAL_ALIGNED
#define LOCAL_ALIGNED(a, t, v,...)
Definition: mem_internal.h:113
int32_t
int32_t
Definition: audio_convert.c:194
src
#define src
Definition: vp8dsp.c:255
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
yuv2planeX_u
static void yuv2planeX_u(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset, int x)
Definition: swscale_ppc_template.c:69
FFMIN
#define FFMIN(a, b)
Definition: common.h:105
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
i
int i
Definition: input.c:407
uint8_t
uint8_t
Definition: audio_convert.c:194
FUNC
#define FUNC(a)
Definition: bit_depth_template.c:104
hScale_real
static void FUNC() hScale_real(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
Definition: swscale_ppc_template.c:99
av_clip_uint8
#define av_clip_uint8
Definition: common.h:128
int
int
Definition: ffmpeg_filter.c:170
SwsContext
Definition: swscale_internal.h:283
short
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are short
Definition: writing_filters.txt:89
yuv2planeX_8_16
static void FUNC() yuv2planeX_8_16(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, const uint8_t *dither, int offset, int x)
Definition: swscale_ppc_template.c:26
dither
static const uint8_t dither[8][8]
Definition: vf_fspp.c:59