/* Broadcast the low 16-bit word of an XMM register to all eight word
 * lanes: punpcklwd with itself duplicates word 0 into dword 0, then
 * pshufd $0 replicates that dword across the whole register. */
33 #define SPLATW(reg) "punpcklwd %%" #reg ", %%" #reg "\n\t" \
34 "pshufd $0, %%" #reg ", %%" #reg "\n\t"
/*
 * H.263 intra dequantization, SSSE3 path.
 * Visible per-coefficient math: level' = level * qmul + sign(level) * qadd
 * (pmullw by splatted qmul, psignw applies each coefficient's sign to the
 * splatted qadd — psignw also zeroes lanes whose coefficient is 0, so zero
 * coefficients stay zero).
 * NOTE(review): this chunk appears to be a garbled extraction — the asm
 * statement wrapper, loop label/branch, declarations of qadd and the output
 * operand list are not visible here; comments describe only the lines shown.
 */
38 static void dct_unquantize_h263_intra_ssse3(
const MPVContext *
s,
39 int16_t *
block,
int n,
int qscale)
/* qmul = 2*qscale; unsigned cast before the shift avoids signed-shift UB */
41 x86_reg qmul = (unsigned)qscale << 1;
/* qadd = qscale - 1, forced odd */
51 qadd = (qscale - 1) | 1;
/* Byte offset of the last coefficients to process: with AC prediction all
 * 64 coefficients, otherwise up to the raster position of the last nonzero
 * index, times sizeof(int16_t). */
56 x86_reg offset =
s->ac_pred ? 63 << 1 :
s->intra_scantable.raster_end[
s->block_last_index[n]] << 1;
/* xmm0 = qmul (splatted elsewhere, presumably via SPLATW — not visible);
 * %1 = block + offset so the loop can count %0 up/down through the data */
59 "movd %k1, %%xmm0 \n\t"
60 "lea (%2, %0), %1 \n\t"
/* xmm1 = qadd */
62 "movd %3, %%xmm1 \n\t"
/* load 16 coefficients */
68 "movdqa (%1, %0), %%xmm2 \n\t"
69 "movdqa 16(%1, %0), %%xmm3 \n\t"
/* xmm4/xmm5 = qadd carrying the sign of each coefficient (0 where coeff 0) */
71 "movdqa %%xmm1, %%xmm4 \n\t"
72 "movdqa %%xmm1, %%xmm5 \n\t"
74 "psignw %%xmm2, %%xmm4 \n\t"
75 "psignw %%xmm3, %%xmm5 \n\t"
/* level * qmul (low 16 bits) */
77 "pmullw %%xmm0, %%xmm2 \n\t"
78 "pmullw %%xmm0, %%xmm3 \n\t"
/* + sign(level) * qadd */
80 "paddw %%xmm4, %%xmm2 \n\t"
81 "paddw %%xmm5, %%xmm3 \n\t"
/* store dequantized coefficients back in place */
83 "movdqa %%xmm2, (%1, %0) \n\t"
84 "movdqa %%xmm3, 16(%1, %0) \n\t"
/* input operands */
89 :
"r" (
block),
"rm" (qadd)
/* clobbers: xmm0-xmm5 plus memory (block is modified through a pointer) */
90 :
XMM_CLOBBERS(
"%xmm0",
"%xmm1",
"%xmm2",
"%xmm3",
"%xmm4",
"%xmm5",)
"memory"
/*
 * H.263 inter dequantization, SSSE3 path.
 * Same visible math as the intra variant:
 *   level' = level * qmul + sign(level) * qadd
 * but over the inter scantable, and with qmul/qadd passed as plain "rm"
 * operands instead of pre-loaded registers.
 * NOTE(review): garbled extraction — asm wrapper, loop control and the
 * output operand list are missing from this view.
 */
96 static void dct_unquantize_h263_inter_ssse3(
const MPVContext *
s,
97 int16_t *
block,
int n,
int qscale)
99 int qmul = qscale << 1;
/* qadd = qscale - 1, forced odd */
100 int qadd = (qscale - 1) | 1;
/* byte offset of the raster position of the last nonzero coefficient */
104 x86_reg offset =
s->inter_scantable.raster_end[
s->block_last_index[n]] << 1;
/* xmm0 = qmul, xmm1 = qadd (splatting presumably done elsewhere) */
107 "movd %2, %%xmm0 \n\t"
108 "movd %3, %%xmm1 \n\t"
/* load 16 coefficients */
116 "movdqa (%0, %1), %%xmm2 \n\t"
117 "movdqa 16(%0, %1), %%xmm3 \n\t"
/* xmm4/xmm5 = qadd with the sign of each coefficient (0 stays 0) */
119 "movdqa %%xmm1, %%xmm4 \n\t"
120 "movdqa %%xmm1, %%xmm5 \n\t"
122 "psignw %%xmm2, %%xmm4 \n\t"
123 "psignw %%xmm3, %%xmm5 \n\t"
/* level * qmul */
125 "pmullw %%xmm0, %%xmm2 \n\t"
126 "pmullw %%xmm0, %%xmm3 \n\t"
/* + sign(level) * qadd */
128 "paddw %%xmm4, %%xmm2 \n\t"
129 "paddw %%xmm5, %%xmm3 \n\t"
/* store back in place */
131 "movdqa %%xmm2, (%0, %1) \n\t"
132 "movdqa %%xmm3, 16(%0, %1) \n\t"
/* input operands */
137 :
"rm"(qmul),
"rm" (qadd)
/* clobbers */
138 :
XMM_CLOBBERS(
"%xmm0",
"%xmm1",
"%xmm2",
"%xmm3",
"%xmm4",
"%xmm5",)
"memory"
/*
 * MPEG-1 intra dequantization, SSSE3 path.
 * Visible per-coefficient math:
 *   level' = sign(level) * odd((abs(level) * quant_matrix * qscale) >> 3)
 * where odd(x) = (x - 1) | 1 (the psubw/por pair with xmm7 = 0x0001 per
 * word). Zero coefficients end up zero because the final psignw zeroes
 * lanes whose source coefficient is zero.
 * NOTE(review): garbled extraction — the asm wrapper, loop control, the
 * declarations of nCoeffs/block0, the n<4 luma/chroma branch around the
 * two block0 assignments, and the DC restore are not visible here.
 */
142 static void dct_unquantize_mpeg1_intra_ssse3(
const MPVContext *
s,
143 int16_t *
block,
int n,
int qscale)
146 const uint16_t *quant_matrix;
/* number of coefficients up to and including the last nonzero one */
151 nCoeffs=
s->intra_scantable.raster_end[
s->block_last_index[n] ]+1;
/* DC is scaled separately; presumably luma vs chroma selected by n — the
 * guarding condition is outside this view */
154 block0 =
block[0] *
s->y_dc_scale;
156 block0 =
block[0] *
s->c_dc_scale;
158 quant_matrix =
s->intra_matrix;
/* xmm6 = qscale; xmm7 = 0x0001 in every word lane (all-ones >> 15) */
161 "movd %3, %%xmm6 \n\t"
162 "pcmpeqw %%xmm7, %%xmm7 \n\t"
163 "psrlw $15, %%xmm7 \n\t"
/* load 16 matrix entries and 16 coefficients */
167 "movdqa (%2, %0), %%xmm4 \n\t"
168 "movdqa 16(%2, %0), %%xmm5 \n\t"
169 "movdqa (%1, %0), %%xmm0 \n\t"
170 "movdqa 16(%1, %0), %%xmm1 \n\t"
/* xmm4/xmm5 = quant_matrix * qscale */
171 "pmullw %%xmm6, %%xmm4 \n\t"
172 "pmullw %%xmm6, %%xmm5 \n\t"
/* xmm2/xmm3 = abs(level) */
173 "pabsw %%xmm0, %%xmm2 \n\t"
174 "pabsw %%xmm1, %%xmm3 \n\t"
/* abs(level) * qm * qscale, then >> 3 */
175 "pmullw %%xmm4, %%xmm2 \n\t"
176 "pmullw %%xmm5, %%xmm3 \n\t"
177 "psraw $3, %%xmm2 \n\t"
178 "psraw $3, %%xmm3 \n\t"
/* force the magnitude odd: (x - 1) | 1 */
179 "psubw %%xmm7, %%xmm2 \n\t"
180 "psubw %%xmm7, %%xmm3 \n\t"
181 "por %%xmm7, %%xmm2 \n\t"
182 "por %%xmm7, %%xmm3 \n\t"
/* restore the original sign (and zero out zero coefficients) */
183 "psignw %%xmm0, %%xmm2 \n\t"
184 "psignw %%xmm1, %%xmm3 \n\t"
185 "movdqa %%xmm2, (%1, %0) \n\t"
186 "movdqa %%xmm3, 16(%1, %0) \n\t"
/* input operands: block and matrix are pre-advanced by nCoeffs so the
 * loop counter %0 can run negative up to zero */
191 :
"r" (
block+nCoeffs),
"r"(quant_matrix+nCoeffs),
"rm" (qscale)
/* clobbers */
192 :
XMM_CLOBBERS(
"%xmm0",
"%xmm1",
"%xmm2",
"%xmm3",
"%xmm4",
"%xmm5",
"%xmm6",
"%xmm7",)
/*
 * MPEG-1 inter dequantization, SSSE3 path.
 * Visible per-coefficient math:
 *   level' = sign(level) * odd(((2*abs(level) + 1) * quant_matrix * qscale) >> 4)
 * with odd(x) = (x - 1) | 1 via the psubw/por pair against xmm7 = 0x0001.
 * NOTE(review): garbled extraction — asm wrapper, loop control and the
 * nCoeffs declaration are missing from this view. Also note this inter
 * path indexes s->intra_scantable.raster_end; presumably both scantables
 * share raster_end — confirm against the surrounding file.
 */
198 static void dct_unquantize_mpeg1_inter_ssse3(
const MPVContext *
s,
199 int16_t *
block,
int n,
int qscale)
202 const uint16_t *quant_matrix;
/* number of coefficients up to and including the last nonzero one */
206 nCoeffs=
s->intra_scantable.raster_end[
s->block_last_index[n] ]+1;
208 quant_matrix =
s->inter_matrix;
/* xmm6 = qscale; xmm7 = 0x0001 in every word lane */
211 "movd %3, %%xmm6 \n\t"
212 "pcmpeqw %%xmm7, %%xmm7 \n\t"
213 "psrlw $15, %%xmm7 \n\t"
/* load 16 matrix entries and 16 coefficients */
217 "movdqa (%2, %0), %%xmm4 \n\t"
218 "movdqa 16(%2, %0), %%xmm5 \n\t"
219 "movdqa (%1, %0), %%xmm0 \n\t"
220 "movdqa 16(%1, %0), %%xmm1 \n\t"
/* xmm4/xmm5 = quant_matrix * qscale */
221 "pmullw %%xmm6, %%xmm4 \n\t"
222 "pmullw %%xmm6, %%xmm5 \n\t"
/* xmm2/xmm3 = 2*abs(level) + 1 */
223 "pabsw %%xmm0, %%xmm2 \n\t"
224 "pabsw %%xmm1, %%xmm3 \n\t"
225 "paddw %%xmm2, %%xmm2 \n\t"
226 "paddw %%xmm3, %%xmm3 \n\t"
227 "paddw %%xmm7, %%xmm2 \n\t"
228 "paddw %%xmm7, %%xmm3 \n\t"
/* * qm * qscale, then >> 4 */
229 "pmullw %%xmm4, %%xmm2 \n\t"
230 "pmullw %%xmm5, %%xmm3 \n\t"
231 "psraw $4, %%xmm2 \n\t"
232 "psraw $4, %%xmm3 \n\t"
/* force odd: (x - 1) | 1 */
233 "psubw %%xmm7, %%xmm2 \n\t"
234 "psubw %%xmm7, %%xmm3 \n\t"
235 "por %%xmm7, %%xmm2 \n\t"
236 "por %%xmm7, %%xmm3 \n\t"
/* restore sign; zero coefficients become zero again via psignw */
237 "psignw %%xmm0, %%xmm2 \n\t"
238 "psignw %%xmm1, %%xmm3 \n\t"
239 "movdqa %%xmm2, (%1, %0) \n\t"
240 "movdqa %%xmm3, 16(%1, %0) \n\t"
/* inputs pre-advanced by nCoeffs so the counter can run negative to zero */
245 :
"r" (
block+nCoeffs),
"r"(quant_matrix+nCoeffs),
"rm" (qscale)
/* clobbers */
246 :
XMM_CLOBBERS(
"%xmm0",
"%xmm1",
"%xmm2",
"%xmm3",
"%xmm4",
"%xmm5",
"%xmm6",
"%xmm7",)
/*
 * MPEG-2 intra dequantization, SSE2 path (no SSSE3 pabsw/psignw; it works
 * on the signed values directly).
 * Visible per-coefficient math:
 *   level' = (level * quant_matrix * qscale + (level >>logical 12)) >> 4
 * NOTE(review): the psrlw $12 term appears to bias negative products so
 * the arithmetic >>4 rounds toward zero — confirm against the reference
 * C implementation. No odd-forcing here (MPEG-2 mismatch control for the
 * intra path is presumably handled elsewhere). Garbled extraction: asm
 * wrapper, loop control, nCoeffs/block0 declarations, the n<4 branch and
 * the DC restore are not visible.
 */
253 static void dct_unquantize_mpeg2_intra_sse2(
const MPVContext *
s,
254 int16_t *
block,
int n,
int qscale)
257 const uint16_t *quant_matrix;
/* raster index of the last nonzero coefficient (no +1 here, unlike the
 * MPEG-1 paths — the loop bound handling is outside this view) */
265 nCoeffs=
s->intra_scantable.raster_end[
s->block_last_index[n] ];
/* DC scaled separately; luma/chroma selection (by n) is outside this view */
268 block0 =
block[0] *
s->y_dc_scale;
270 block0 =
block[0] *
s->c_dc_scale;
271 quant_matrix =
s->intra_matrix;
/* xmm6 = qscale */
274 "movd %3, %%xmm6 \n\t"
/* load 16 coefficients and 16 matrix entries */
278 "movdqa (%1, %0), %%xmm0 \n\t"
279 "movdqa 16(%1, %0), %%xmm1 \n\t"
280 "movdqa (%2, %0), %%xmm4 \n\t"
281 "movdqa 16(%2, %0), %%xmm5 \n\t"
/* xmm4/xmm5 = quant_matrix * qscale */
282 "pmullw %%xmm6, %%xmm4 \n\t"
283 "pmullw %%xmm6, %%xmm5 \n\t"
/* xmm2/xmm3 = level logically shifted right by 12 (rounding correction) */
284 "movdqa %%xmm0, %%xmm2 \n\t"
285 "movdqa %%xmm1, %%xmm3 \n\t"
286 "psrlw $12, %%xmm2 \n\t"
287 "psrlw $12, %%xmm3 \n\t"
/* level * qm * qscale (signed, low 16 bits) */
288 "pmullw %%xmm4, %%xmm0 \n\t"
289 "pmullw %%xmm5, %%xmm1 \n\t"
/* add correction, then arithmetic >> 4 */
290 "paddw %%xmm2, %%xmm0 \n\t"
291 "paddw %%xmm3, %%xmm1 \n\t"
292 "psraw $4, %%xmm0 \n\t"
293 "psraw $4, %%xmm1 \n\t"
/* store back in place */
294 "movdqa %%xmm0, (%1, %0) \n\t"
295 "movdqa %%xmm1, 16(%1, %0) \n\t"
/* inputs pre-advanced by nCoeffs */
300 :
"r" (
block+nCoeffs),
"r"(quant_matrix+nCoeffs),
"rm" (qscale)
/* clobbers */
301 :
XMM_CLOBBERS(
"%xmm0",
"%xmm1",
"%xmm2",
"%xmm3",
"%xmm4",
"%xmm5",
"%xmm6",)
308 #if HAVE_SSSE3_INLINE
/*
 * MPEG-2 inter dequantization, SSSE3 path.
 * Visible per-coefficient math:
 *   level' = sign(level) * (((2*abs(level) + 1) * quant_matrix * qscale) >> 5)
 * (the extra paddw of xmm4/xmm5 after the multiply supplies the "+1" term
 * already scaled by qm*qscale).
 * All results are additionally XOR-accumulated into xmm7, horizontally
 * reduced after the loop, and the resulting parity bit is XORed into the
 * dword at byte offset 124 of block — the shifted bit lands in the high
 * word, i.e. it appears to toggle the LSB of block[63] (MPEG-2 mismatch
 * control — confirm against the spec/reference C code).
 * NOTE(review): garbled extraction — asm wrapper, loop control, the
 * qscale2 declaration and the closing of the statement are not visible.
 * Also note the intra scantable is indexed in this inter path; presumably
 * raster_end is shared between the two scantables — confirm.
 */
310 static void dct_unquantize_mpeg2_inter_ssse3(
const MPVContext *
s,
311 int16_t *
block,
int n,
int qscale)
/* byte offset of the raster position of the last nonzero coefficient */
316 x86_reg offset =
s->intra_scantable.raster_end[
s->block_last_index[n]] << 1;
/* matrix pointer pre-advanced by the same byte offset as the block so one
 * negative counter can index both */
317 const void *quant_matrix = (
const char*)
s->inter_matrix +
offset;
/* xmm6 = qscale (splatted elsewhere); %1 = block + offset base */
321 "movd %k1, %%xmm6 \n\t"
322 "lea (%2, %0), %1 \n\t"
/* xmm7 = 0xFFFF in word lane 0 only (parity accumulator seed) */
325 "pcmpeqw %%xmm7, %%xmm7 \n\t"
326 "psrldq $14, %%xmm7 \n\t"
/* load 16 matrix entries and 16 coefficients */
329 "movdqa (%3, %0), %%xmm4 \n\t"
330 "movdqa 16(%3, %0), %%xmm5 \n\t"
331 "movdqa (%1, %0), %%xmm0 \n\t"
332 "movdqa 16(%1, %0), %%xmm1 \n\t"
/* xmm4/xmm5 = quant_matrix * qscale */
333 "pmullw %%xmm6, %%xmm4 \n\t"
334 "pmullw %%xmm6, %%xmm5 \n\t"
/* xmm2/xmm3 = 2 * abs(level) */
335 "pabsw %%xmm0, %%xmm2 \n\t"
336 "pabsw %%xmm1, %%xmm3 \n\t"
337 "paddw %%xmm2, %%xmm2 \n\t"
338 "paddw %%xmm3, %%xmm3 \n\t"
/* (2*abs(level)) * qm*qscale, then + qm*qscale -> (2*abs(level)+1)*qm*qscale */
339 "pmullw %%xmm4, %%xmm2 \n\t"
340 "pmullw %%xmm5, %%xmm3 \n\t"
341 "paddw %%xmm4, %%xmm2 \n\t"
342 "paddw %%xmm5, %%xmm3 \n\t"
/* >> 5 (logical; values are non-negative here) */
343 "psrlw $5, %%xmm2 \n\t"
344 "psrlw $5, %%xmm3 \n\t"
/* restore sign; zero coefficients stay zero */
345 "psignw %%xmm0, %%xmm2 \n\t"
346 "psignw %%xmm1, %%xmm3 \n\t"
347 "movdqa %%xmm2, (%1, %0) \n\t"
348 "movdqa %%xmm3, 16(%1, %0) \n\t"
/* fold the results into the running XOR parity */
349 "pxor %%xmm2, %%xmm7 \n\t"
350 "pxor %%xmm3, %%xmm7 \n\t"
/* after the loop: load last dword of block, horizontally XOR-reduce xmm7 */
354 "movd 124(%2), %%xmm0 \n\t"
355 "movhlps %%xmm7, %%xmm6 \n\t"
356 "pxor %%xmm6, %%xmm7 \n\t"
357 "pshufd $1, %%xmm7, %%xmm6 \n\t"
358 "pxor %%xmm6, %%xmm7 \n\t"
359 "pshuflw $1, %%xmm7, %%xmm6 \n\t"
360 "pxor %%xmm6, %%xmm7 \n\t"
/* isolate the parity bit and position it at bit 16 of the dword */
361 "pslld $31, %%xmm7 \n\t"
362 "psrld $15, %%xmm7 \n\t"
/* flip that bit in the last dword of the block and store it back */
363 "pxor %%xmm7, %%xmm0 \n\t"
364 "movd %%xmm0, 124(%2) \n\t"
/* in/out operands: the loop consumes offset and reuses the qscale register */
366 :
"+r"(
offset),
"+r" (qscale2)
/* inputs */
367 :
"r" (
block),
"r"(quant_matrix)
/* clobbers */
368 :
XMM_CLOBBERS(
"%xmm0",
"%xmm1",
"%xmm2",
"%xmm3",
"%xmm4",
"%xmm5",
"%xmm6",
"%xmm7",)
/* Install the SIMD dequantizers defined above. NOTE(review): these
 * assignments belong to an init function whose header (and any CPU-flag
 * checks guarding them) is outside this view. */
383 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_sse2;
/* SSSE3 variants are compile-time gated; a runtime cpuflag check is
 * presumably applied in the lines not visible here */
385 #if HAVE_SSSE3_INLINE
387 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_ssse3;
388 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_ssse3;
389 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_ssse3;
390 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_ssse3;
391 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_ssse3;