FFmpeg
mpegvideo.c
/*
 * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
 * H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mpegvideodata.h"
#include "libavcodec/mpegvideo_unquantize.h"

#if HAVE_SSE2_INLINE

/* Broadcast the low 16-bit word of reg into all eight word lanes. */
#define SPLATW(reg) "punpcklwd %%" #reg ", %%" #reg "\n\t" \
                    "pshufd $0, %%" #reg ", %%" #reg "\n\t"

#if HAVE_SSSE3_INLINE

static void dct_unquantize_h263_intra_ssse3(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
{
    x86_reg qmul = (unsigned)qscale << 1;
    int level, qadd;

    av_assert2(s->block_last_index[n] >= 0 || s->h263_aic);

    if (!s->h263_aic) {
        if (n < 4)
            level = block[0] * s->y_dc_scale;
        else
            level = block[0] * s->c_dc_scale;
        qadd = (qscale - 1) | 1;
    } else {
        qadd  = 0;
        level = block[0];
    }
    x86_reg offset = s->ac_pred ? 63 << 1 : s->intra_scantable.raster_end[s->block_last_index[n]] << 1;

__asm__ volatile(
        "movd %k1, %%xmm0               \n\t" // qmul
        "lea (%2, %0), %1               \n\t"
        "neg %0                         \n\t"
        "movd %3, %%xmm1                \n\t" // qadd
        SPLATW(xmm0)
        SPLATW(xmm1)

        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqa (%1, %0), %%xmm2        \n\t"
        "movdqa 16(%1, %0), %%xmm3      \n\t"

        "movdqa %%xmm1, %%xmm4          \n\t"
        "movdqa %%xmm1, %%xmm5          \n\t"

        "psignw %%xmm2, %%xmm4          \n\t" // sgn(block[i])*qadd
        "psignw %%xmm3, %%xmm5          \n\t" // sgn(block[i])*qadd

        "pmullw %%xmm0, %%xmm2          \n\t"
        "pmullw %%xmm0, %%xmm3          \n\t"

        "paddw %%xmm4, %%xmm2           \n\t"
        "paddw %%xmm5, %%xmm3           \n\t"

        "movdqa %%xmm2, (%1, %0)        \n\t"
        "movdqa %%xmm3, 16(%1, %0)      \n\t"

        "add $32, %0                    \n\t"
        "jng 1b                         \n\t"
        : "+r" (offset), "+r" (qmul)
        : "r" (block), "rm" (qadd)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",) "memory"
    );
    block[0] = level;
}
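
/*
 * Illustrative sketch, not part of the original file (helper name is
 * hypothetical): the scalar computation both H.263 SSSE3 loops here
 * vectorize. PSIGNW supplies sgn(block[i])*qadd and also zeroes lanes
 * where block[i] == 0, so zero coefficients survive unchanged without
 * a branch.
 */
static inline void h263_dequant_scalar_sketch(int16_t *block, int last,
                                              int qmul, int qadd)
{
    for (int i = 0; i <= last; i++) {
        int level = block[i];
        if (level > 0)
            level = level * qmul + qadd;
        else if (level < 0)
            level = level * qmul - qadd;
        block[i] = level; /* zero stays zero */
    }
}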

static void dct_unquantize_h263_inter_ssse3(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
{
    int qmul = qscale << 1;
    int qadd = (qscale - 1) | 1;

    av_assert2(s->block_last_index[n] >= 0 || s->h263_aic);

    x86_reg offset = s->inter_scantable.raster_end[s->block_last_index[n]] << 1;

__asm__ volatile(
        "movd %2, %%xmm0                \n\t" // qmul
        "movd %3, %%xmm1                \n\t" // qadd
        "add %1, %0                     \n\t"
        "neg %1                         \n\t"
        SPLATW(xmm0)
        SPLATW(xmm1)

        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqa (%0, %1), %%xmm2        \n\t"
        "movdqa 16(%0, %1), %%xmm3      \n\t"

        "movdqa %%xmm1, %%xmm4          \n\t"
        "movdqa %%xmm1, %%xmm5          \n\t"

        "psignw %%xmm2, %%xmm4          \n\t" // sgn(block[i])*qadd
        "psignw %%xmm3, %%xmm5          \n\t" // sgn(block[i])*qadd

        "pmullw %%xmm0, %%xmm2          \n\t"
        "pmullw %%xmm0, %%xmm3          \n\t"

        "paddw %%xmm4, %%xmm2           \n\t"
        "paddw %%xmm5, %%xmm3           \n\t"

        "movdqa %%xmm2, (%0, %1)        \n\t"
        "movdqa %%xmm3, 16(%0, %1)      \n\t"

        "add $32, %1                    \n\t"
        "jng 1b                         \n\t"
        : "+r" (block), "+r" (offset)
        : "rm" (qmul), "rm" (qadd)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",) "memory"
    );
}
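
/*
 * Added commentary (not in the original file): the inter variant needs no
 * DC fix-up. In the intra function above, block[0] is rescaled in C before
 * the asm runs and written back afterwards, because the SIMD loop starts
 * at block[0] and would otherwise clobber the DC coefficient.
 */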

static void dct_unquantize_mpeg1_intra_ssse3(const MPVContext *s,
                                             int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;
    int block0;

    av_assert2(s->block_last_index[n] >= 0);

    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;
    /* XXX: only MPEG-1 */
    quant_matrix = s->intra_matrix;
    x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
        "movd %3, %%xmm6                \n\t"
        "pcmpeqw %%xmm7, %%xmm7         \n\t"
        "psrlw $15, %%xmm7              \n\t"
        SPLATW(xmm6)
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqa (%2, %0), %%xmm4        \n\t"
        "movdqa 16(%2, %0), %%xmm5      \n\t"
        "movdqa (%1, %0), %%xmm0        \n\t"
        "movdqa 16(%1, %0), %%xmm1      \n\t"
        "pmullw %%xmm6, %%xmm4          \n\t" // q = qscale * quant_matrix[i]
        "pmullw %%xmm6, %%xmm5          \n\t" // q = qscale * quant_matrix[i]
        "pabsw %%xmm0, %%xmm2           \n\t" // abs(block[i])
        "pabsw %%xmm1, %%xmm3           \n\t" // abs(block[i])
        "pmullw %%xmm4, %%xmm2          \n\t" // abs(block[i])*q
        "pmullw %%xmm5, %%xmm3          \n\t" // abs(block[i])*q
        "psraw $3, %%xmm2               \n\t"
        "psraw $3, %%xmm3               \n\t"
        "psubw %%xmm7, %%xmm2           \n\t"
        "psubw %%xmm7, %%xmm3           \n\t"
        "por %%xmm7, %%xmm2             \n\t"
        "por %%xmm7, %%xmm3             \n\t"
        "psignw %%xmm0, %%xmm2          \n\t"
        "psignw %%xmm1, %%xmm3          \n\t"
        "movdqa %%xmm2, (%1, %0)        \n\t"
        "movdqa %%xmm3, 16(%1, %0)      \n\t"

        "add $32, %0                    \n\t"
        "js 1b                          \n\t"
        : "+r" (offset)
        : "r" (block + nCoeffs), "r" (quant_matrix + nCoeffs), "rm" (qscale)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
          "memory"
    );
    block[0] = block0;
}
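
/*
 * Illustrative sketch, not part of the original file (helper name is
 * hypothetical): the scalar MPEG-1 intra formula behind the loop above.
 * The psubw/por pair computes (level - 1) | 1, forcing an odd result,
 * and PSIGNW restores the sign while zeroing lanes where block[i] == 0.
 * The DC coefficient is handled separately, as in the function above.
 */
static inline void mpeg1_intra_dequant_scalar_sketch(int16_t *block, int nCoeffs,
                                                     int qscale,
                                                     const uint16_t *quant_matrix)
{
    for (int i = 0; i < nCoeffs; i++) {
        int level = block[i];
        if (level) {
            int sign = level < 0 ? -1 : 1;
            level = (FFABS(level) * qscale * quant_matrix[i]) >> 3;
            level = (level - 1) | 1;
            block[i] = sign * level;
        }
    }
}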

static void dct_unquantize_mpeg1_inter_ssse3(const MPVContext *s,
                                             int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;

    av_assert2(s->block_last_index[n] >= 0);

    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;

    quant_matrix = s->inter_matrix;
    x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
        "movd %3, %%xmm6                \n\t"
        "pcmpeqw %%xmm7, %%xmm7         \n\t"
        "psrlw $15, %%xmm7              \n\t"
        SPLATW(xmm6)
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqa (%2, %0), %%xmm4        \n\t"
        "movdqa 16(%2, %0), %%xmm5      \n\t"
        "movdqa (%1, %0), %%xmm0        \n\t"
        "movdqa 16(%1, %0), %%xmm1      \n\t"
        "pmullw %%xmm6, %%xmm4          \n\t" // q = qscale * quant_matrix[i]
        "pmullw %%xmm6, %%xmm5          \n\t" // q = qscale * quant_matrix[i]
        "pabsw %%xmm0, %%xmm2           \n\t" // abs(block[i])
        "pabsw %%xmm1, %%xmm3           \n\t" // abs(block[i])
        "paddw %%xmm2, %%xmm2           \n\t" // abs(block[i]) * 2
        "paddw %%xmm3, %%xmm3           \n\t" // abs(block[i]) * 2
        "paddw %%xmm7, %%xmm2           \n\t" // abs(block[i]) * 2 + 1
        "paddw %%xmm7, %%xmm3           \n\t" // abs(block[i]) * 2 + 1
        "pmullw %%xmm4, %%xmm2          \n\t" // (abs(block[i])*2 + 1)*q
        "pmullw %%xmm5, %%xmm3          \n\t" // (abs(block[i])*2 + 1)*q
        "psraw $4, %%xmm2               \n\t"
        "psraw $4, %%xmm3               \n\t"
        "psubw %%xmm7, %%xmm2           \n\t"
        "psubw %%xmm7, %%xmm3           \n\t"
        "por %%xmm7, %%xmm2             \n\t"
        "por %%xmm7, %%xmm3             \n\t"
        "psignw %%xmm0, %%xmm2          \n\t"
        "psignw %%xmm1, %%xmm3          \n\t"
        "movdqa %%xmm2, (%1, %0)        \n\t"
        "movdqa %%xmm3, 16(%1, %0)      \n\t"

        "add $32, %0                    \n\t"
        "js 1b                          \n\t"
        : "+r" (offset)
        : "r" (block + nCoeffs), "r" (quant_matrix + nCoeffs), "rm" (qscale)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
          "memory"
    );
}
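
/*
 * Illustrative sketch, not part of the original file (helper name is
 * hypothetical): the matching scalar MPEG-1 inter formula. Compared with
 * the intra case, reconstruction uses (2*|level| + 1) and a >> 4, again
 * forced odd via (level - 1) | 1 before the sign is reapplied.
 */
static inline void mpeg1_inter_dequant_scalar_sketch(int16_t *block, int nCoeffs,
                                                     int qscale,
                                                     const uint16_t *quant_matrix)
{
    for (int i = 0; i < nCoeffs; i++) {
        int level = block[i];
        if (level) {
            int sign = level < 0 ? -1 : 1;
            level = ((FFABS(level) * 2 + 1) * qscale * quant_matrix[i]) >> 4;
            level = (level - 1) | 1;
            block[i] = sign * level;
        }
    }
}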

#endif /* HAVE_SSSE3_INLINE */

static void dct_unquantize_mpeg2_intra_sse2(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;
    int block0;

    av_assert2(s->block_last_index[n] >= 0);

    if (s->q_scale_type)
        qscale = ff_mpeg2_non_linear_qscale[qscale];
    else
        qscale <<= 1;

    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;
    quant_matrix = s->intra_matrix;
    x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
        "movd %3, %%xmm6                \n\t"
        SPLATW(xmm6)
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqa (%1, %0), %%xmm0        \n\t"
        "movdqa 16(%1, %0), %%xmm1      \n\t"
        "movdqa (%2, %0), %%xmm4        \n\t"
        "movdqa 16(%2, %0), %%xmm5      \n\t"
        "pmullw %%xmm6, %%xmm4          \n\t" // q = qscale * quant_matrix[i]
        "pmullw %%xmm6, %%xmm5          \n\t" // q = qscale * quant_matrix[i]
        "movdqa %%xmm0, %%xmm2          \n\t"
        "movdqa %%xmm1, %%xmm3          \n\t"
        "psrlw $12, %%xmm2              \n\t" // block[i] < 0 ? 0xf : 0
        "psrlw $12, %%xmm3              \n\t" // (block[i] is in the -2048..2047 range)
        "pmullw %%xmm4, %%xmm0          \n\t" // block[i]*q
        "pmullw %%xmm5, %%xmm1          \n\t" // block[i]*q
        "paddw %%xmm2, %%xmm0           \n\t" // bias negative block[i]
        "paddw %%xmm3, %%xmm1           \n\t" // so that a right-shift
        "psraw $4, %%xmm0               \n\t" // is equivalent to divide
        "psraw $4, %%xmm1               \n\t" // with rounding towards zero
        "movdqa %%xmm0, (%1, %0)        \n\t"
        "movdqa %%xmm1, 16(%1, %0)      \n\t"

        "add $32, %0                    \n\t"
        "jng 1b                         \n\t"
        : "+r" (offset)
        : "r" (block + nCoeffs), "r" (quant_matrix + nCoeffs), "rm" (qscale)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",)
          "memory"
    );
    block[0] = block0;
    // Note: we do not do mismatch control for intra as errors cannot accumulate
}
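
/*
 * Illustrative sketch, not part of the original file (helper name is
 * hypothetical): the psrlw $12 / paddw / psraw $4 sequence above divides
 * by 16 rounding towards zero. For block[i] in -2048..2047 the top four
 * bits isolated by psrlw $12 are 15 for negative inputs and 0 otherwise;
 * adding that bias before an arithmetic shift turns flooring into
 * truncation:
 */
static inline int div16_toward_zero_sketch(int x)
{
    return (x + (x < 0 ? 15 : 0)) >> 4; /* == x / 16 (C truncating division) */
}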

#if HAVE_SSSE3_INLINE

static void dct_unquantize_mpeg2_inter_ssse3(const MPVContext *s,
                                             int16_t *block, int n, int qscale)
{
    av_assert2(s->block_last_index[n] >= 0);

    x86_reg qscale2 = s->q_scale_type ? ff_mpeg2_non_linear_qscale[qscale] : (unsigned)qscale << 1;
    x86_reg offset  = s->intra_scantable.raster_end[s->block_last_index[n]] << 1;
    const void *quant_matrix = (const char *)s->inter_matrix + offset;

__asm__ volatile(
        "movd %k1, %%xmm6               \n\t"
        "lea (%2, %0), %1               \n\t"
        "neg %0                         \n\t"
        SPLATW(xmm6)
        "pcmpeqw %%xmm7, %%xmm7         \n\t"
        "psrldq $14, %%xmm7             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqa (%3, %0), %%xmm4        \n\t"
        "movdqa 16(%3, %0), %%xmm5      \n\t"
        "movdqa (%1, %0), %%xmm0        \n\t"
        "movdqa 16(%1, %0), %%xmm1      \n\t"
        "pmullw %%xmm6, %%xmm4          \n\t" // q = qscale * quant_matrix[i]
        "pmullw %%xmm6, %%xmm5          \n\t" // q = qscale * quant_matrix[i]
        "pabsw %%xmm0, %%xmm2           \n\t" // abs(block[i])
        "pabsw %%xmm1, %%xmm3           \n\t" // abs(block[i])
        "paddw %%xmm2, %%xmm2           \n\t" // abs(block[i]) * 2
        "paddw %%xmm3, %%xmm3           \n\t" // abs(block[i]) * 2
        "pmullw %%xmm4, %%xmm2          \n\t" // abs(block[i])*2*q
        "pmullw %%xmm5, %%xmm3          \n\t" // abs(block[i])*2*q
        "paddw %%xmm4, %%xmm2           \n\t" // (abs(block[i])*2 + 1)*q
        "paddw %%xmm5, %%xmm3           \n\t" // (abs(block[i])*2 + 1)*q
        "psrlw $5, %%xmm2               \n\t"
        "psrlw $5, %%xmm3               \n\t"
        "psignw %%xmm0, %%xmm2          \n\t"
        "psignw %%xmm1, %%xmm3          \n\t"
        "movdqa %%xmm2, (%1, %0)        \n\t"
        "movdqa %%xmm3, 16(%1, %0)      \n\t"
        "pxor %%xmm2, %%xmm7            \n\t"
        "pxor %%xmm3, %%xmm7            \n\t"

        "add $32, %0                    \n\t"
        "jng 1b                         \n\t"
        "movd 124(%2), %%xmm0           \n\t"
        "movhlps %%xmm7, %%xmm6         \n\t"
        "pxor %%xmm6, %%xmm7            \n\t"
        "pshufd $1, %%xmm7, %%xmm6      \n\t"
        "pxor %%xmm6, %%xmm7            \n\t"
        "pshuflw $1, %%xmm7, %%xmm6     \n\t"
        "pxor %%xmm6, %%xmm7            \n\t"
        "pslld $31, %%xmm7              \n\t"
        "psrld $15, %%xmm7              \n\t"
        "pxor %%xmm7, %%xmm0            \n\t"
        "movd %%xmm0, 124(%2)           \n\t"

        : "+r" (offset), "+r" (qscale2)
        : "r" (block), "r" (quant_matrix)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
          "memory"
    );
}
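
/*
 * Illustrative sketch, not part of the original file (helper name is
 * hypothetical): the xmm7 chain above is MPEG-2 mismatch control. The
 * loop XOR-accumulates every output word (seeded with one 0xFFFF word),
 * the tail horizontally folds that to a single parity bit, and the LSB
 * of block[63] is toggled whenever the coefficient sum would otherwise
 * be even, which is what this scalar version does:
 */
static inline void mpeg2_mismatch_scalar_sketch(int16_t block[64])
{
    int sum = 0;
    for (int i = 0; i < 64; i++)
        sum += block[i];
    if (!(sum & 1))
        block[63] ^= 1;
}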

#endif /* HAVE_SSSE3_INLINE */
#endif /* HAVE_SSE2_INLINE */

av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
{
#if HAVE_SSE2_INLINE
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_SSE2(cpu_flags)) {
        if (!bitexact)
            s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_sse2;
    }
#if HAVE_SSSE3_INLINE
    if (INLINE_SSSE3(cpu_flags)) {
        s->dct_unquantize_h263_intra  = dct_unquantize_h263_intra_ssse3;
        s->dct_unquantize_h263_inter  = dct_unquantize_h263_inter_ssse3;
        s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_ssse3;
        s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_ssse3;
        s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_ssse3;
    }
#endif /* HAVE_SSSE3_INLINE */
#endif /* HAVE_SSE2_INLINE */
}
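
/*
 * Usage sketch (hypothetical, illustration only): callers are expected to
 * install the C defaults first, then let this function override whatever
 * the running CPU supports. The pointer names are the ones assigned above.
 */
#if 0
    MPVUnquantDSPContext dsp = { 0 }; /* assume C defaults installed here */
    ff_mpv_unquantize_init_x86(&dsp, 0 /* bitexact */);
    if (dsp.dct_unquantize_h263_inter)
        dsp.dct_unquantize_h263_inter(s, block, n, qscale);
#endif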