FFmpeg
vp8dsp_mmi.c
Go to the documentation of this file.
1 /*
2  * Loongson SIMD optimized vp8dsp
3  *
4  * Copyright (c) 2016 Loongson Technology Corporation Limited
5  * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "vp8dsp_mips.h"
25 #include "constants.h"
26 #include "libavutil/attributes.h"
28 #include "libavutil/mem_internal.h"
29 
30 #define DECLARE_DOUBLE_1 double db_1
31 #define DECLARE_DOUBLE_2 double db_2
32 #define DECLARE_UINT32_T uint32_t it_1
33 #define RESTRICT_ASM_DOUBLE_1 [db_1]"=&f"(db_1)
34 #define RESTRICT_ASM_DOUBLE_2 [db_2]"=&f"(db_2)
35 #define RESTRICT_ASM_UINT32_T [it_1]"=&r"(it_1)
36 
37 #define MMI_PCMPGTUB(dst, src1, src2) \
38  "pcmpeqb %[db_1], "#src1", "#src2" \n\t" \
39  "pmaxub %[db_2], "#src1", "#src2" \n\t" \
40  "pcmpeqb %[db_2], %[db_2], "#src1" \n\t" \
41  "pxor "#dst", %[db_2], %[db_1] \n\t"
42 
43 #define MMI_BTOH(dst_l, dst_r, src) \
44  "pxor %[db_1], %[db_1], %[db_1] \n\t" \
45  "pcmpgtb %[db_2], %[db_1], "#src" \n\t" \
46  "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \
47  "punpckhbh "#dst_l", "#src", %[db_2] \n\t"
48 
49 #define MMI_VP8_LOOP_FILTER \
50  /* Calculation of hev */ \
51  "dmtc1 %[thresh], %[ftmp3] \n\t" \
52  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
53  "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
54  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
55  "pasubub %[ftmp0], %[p1], %[p0] \n\t" \
56  "pasubub %[ftmp1], %[q1], %[q0] \n\t" \
57  "pmaxub %[ftmp0], %[ftmp0], %[ftmp1] \n\t" \
58  MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3]) \
59  /* Calculation of mask */ \
60  "pasubub %[ftmp1], %[p0], %[q0] \n\t" \
61  "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
62  "pasubub %[ftmp2], %[p1], %[q1] \n\t" \
63  "li %[tmp0], 0x09 \n\t" \
64  "dmtc1 %[tmp0], %[ftmp3] \n\t" \
65  PSRLB_MMI(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], %[ftmp2]) \
66  "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
67  "dmtc1 %[e], %[ftmp3] \n\t" \
68  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
69  "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
70  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
71  MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3]) \
72  "pmaxub %[mask], %[mask], %[ftmp0] \n\t" \
73  "pasubub %[ftmp1], %[p3], %[p2] \n\t" \
74  "pasubub %[ftmp2], %[p2], %[p1] \n\t" \
75  "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
76  "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
77  "pasubub %[ftmp1], %[q3], %[q2] \n\t" \
78  "pasubub %[ftmp2], %[q2], %[q1] \n\t" \
79  "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
80  "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
81  "dmtc1 %[i], %[ftmp3] \n\t" \
82  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
83  "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
84  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
85  MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3]) \
86  "pcmpeqw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
87  "pxor %[mask], %[mask], %[ftmp3] \n\t" \
88  /* VP8_MBFILTER */ \
89  "li %[tmp0], 0x80808080 \n\t" \
90  "dmtc1 %[tmp0], %[ftmp7] \n\t" \
91  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
92  "pxor %[p2], %[p2], %[ftmp7] \n\t" \
93  "pxor %[p1], %[p1], %[ftmp7] \n\t" \
94  "pxor %[p0], %[p0], %[ftmp7] \n\t" \
95  "pxor %[q0], %[q0], %[ftmp7] \n\t" \
96  "pxor %[q1], %[q1], %[ftmp7] \n\t" \
97  "pxor %[q2], %[q2], %[ftmp7] \n\t" \
98  "psubsb %[ftmp4], %[p1], %[q1] \n\t" \
99  "psubb %[ftmp5], %[q0], %[p0] \n\t" \
100  MMI_BTOH(%[ftmp1], %[ftmp0], %[ftmp5]) \
101  MMI_BTOH(%[ftmp3], %[ftmp2], %[ftmp4]) \
102  /* Right part */ \
103  "paddh %[ftmp5], %[ftmp0], %[ftmp0] \n\t" \
104  "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" \
105  "paddh %[ftmp0], %[ftmp2], %[ftmp0] \n\t" \
106  /* Left part */ \
107  "paddh %[ftmp5], %[ftmp1], %[ftmp1] \n\t" \
108  "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
109  "paddh %[ftmp1], %[ftmp3], %[ftmp1] \n\t" \
110  /* Combine left and right part */ \
111  "packsshb %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
112  "pand %[ftmp1], %[ftmp1], %[mask] \n\t" \
113  "pand %[ftmp2], %[ftmp1], %[hev] \n\t" \
114  "li %[tmp0], 0x04040404 \n\t" \
115  "dmtc1 %[tmp0], %[ftmp0] \n\t" \
116  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
117  "paddsb %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \
118  "li %[tmp0], 0x0B \n\t" \
119  "dmtc1 %[tmp0], %[ftmp4] \n\t" \
120  PSRAB_MMI(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], %[ftmp3]) \
121  "li %[tmp0], 0x03030303 \n\t" \
122  "dmtc1 %[tmp0], %[ftmp0] \n\t" \
123  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
124  "paddsb %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
125  "li %[tmp0], 0x0B \n\t" \
126  "dmtc1 %[tmp0], %[ftmp2] \n\t" \
127  PSRAB_MMI(%[ftmp4], %[ftmp2], %[ftmp5], %[ftmp6], %[ftmp4]) \
128  "psubsb %[q0], %[q0], %[ftmp3] \n\t" \
129  "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
130  /* filt_val &= ~hev */ \
131  "pcmpeqw %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
132  "pxor %[hev], %[hev], %[ftmp0] \n\t" \
133  "pand %[ftmp1], %[ftmp1], %[hev] \n\t" \
134  MMI_BTOH(%[ftmp5], %[ftmp6], %[ftmp1]) \
135  "li %[tmp0], 0x07 \n\t" \
136  "dmtc1 %[tmp0], %[ftmp2] \n\t" \
137  "li %[tmp0], 0x001b001b \n\t" \
138  "dmtc1 %[tmp0], %[ftmp1] \n\t" \
139  "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
140  "li %[tmp0], 0x003f003f \n\t" \
141  "dmtc1 %[tmp0], %[ftmp0] \n\t" \
142  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
143  /* Right part */ \
144  "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
145  "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
146  "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
147  /* Left part */ \
148  "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
149  "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
150  "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
151  /* Combine left and right part */ \
152  "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
153  "psubsb %[q0], %[q0], %[ftmp4] \n\t" \
154  "pxor %[q0], %[q0], %[ftmp7] \n\t" \
155  "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
156  "pxor %[p0], %[p0], %[ftmp7] \n\t" \
157  "li %[tmp0], 0x00120012 \n\t" \
158  "dmtc1 %[tmp0], %[ftmp1] \n\t" \
159  "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
160  /* Right part */ \
161  "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
162  "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
163  "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
164  /* Left part */ \
165  "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
166  "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
167  "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
168  /* Combine left and right part */ \
169  "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
170  "psubsb %[q1], %[q1], %[ftmp4] \n\t" \
171  "pxor %[q1], %[q1], %[ftmp7] \n\t" \
172  "paddsb %[p1], %[p1], %[ftmp4] \n\t" \
173  "pxor %[p1], %[p1], %[ftmp7] \n\t" \
174  "li %[tmp0], 0x03 \n\t" \
175  "dmtc1 %[tmp0], %[ftmp1] \n\t" \
176  /* Right part */ \
177  "psllh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
178  "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" \
179  "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
180  "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
181  /* Left part */ \
182  "psllh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
183  "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
184  "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
185  "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
186  /* Combine left and right part */ \
187  "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
188  "psubsb %[q2], %[q2], %[ftmp4] \n\t" \
189  "pxor %[q2], %[q2], %[ftmp7] \n\t" \
190  "paddsb %[p2], %[p2], %[ftmp4] \n\t" \
191  "pxor %[p2], %[p2], %[ftmp7] \n\t"
192 
193 #define PUT_VP8_EPEL4_H6_MMI(src, dst) \
194  MMI_ULWC1(%[ftmp1], src, 0x00) \
195  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
196  "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
197  \
198  MMI_ULWC1(%[ftmp1], src, -0x01) \
199  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
200  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
201  "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
202  \
203  MMI_ULWC1(%[ftmp1], src, -0x02) \
204  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
205  "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
206  "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
207  \
208  MMI_ULWC1(%[ftmp1], src, 0x01) \
209  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
210  "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
211  \
212  MMI_ULWC1(%[ftmp1], src, 0x02) \
213  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
214  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
215  "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
216  \
217  MMI_ULWC1(%[ftmp1], src, 0x03) \
218  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
219  "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
220  "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
221  \
222  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
223  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
224  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
225  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
226  \
227  MMI_SWC1(%[ftmp1], dst, 0x00)
228 
229 
230 #define PUT_VP8_EPEL4_H4_MMI(src, dst) \
231  MMI_ULWC1(%[ftmp1], src, 0x00) \
232  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
233  "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
234  \
235  MMI_ULWC1(%[ftmp1], src, -0x01) \
236  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
237  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
238  "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
239  \
240  MMI_ULWC1(%[ftmp1], src, 0x01) \
241  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
242  "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
243  \
244  MMI_ULWC1(%[ftmp1], src, 0x02) \
245  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
246  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
247  "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
248  \
249  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
250  \
251  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
252  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
253  \
254  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
255  MMI_SWC1(%[ftmp1], dst, 0x00)
256 
257 
258 #define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \
259  MMI_ULWC1(%[ftmp1], src, 0x00) \
260  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
261  "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
262  \
263  PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
264  MMI_ULWC1(%[ftmp1], src1, 0x00) \
265  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
266  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
267  "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
268  \
269  PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
270  MMI_ULWC1(%[ftmp1], src1, 0x00) \
271  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
272  "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
273  "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
274  \
275  PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
276  MMI_ULWC1(%[ftmp1], src1, 0x00) \
277  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
278  "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
279  \
280  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
281  MMI_ULWC1(%[ftmp1], src1, 0x00) \
282  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
283  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
284  "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
285  \
286  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
287  MMI_ULWC1(%[ftmp1], src1, 0x00) \
288  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
289  "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
290  "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
291  \
292  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
293  \
294  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
295  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
296  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
297  \
298  MMI_SWC1(%[ftmp1], dst, 0x00)
299 
300 
301 #define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \
302  MMI_ULWC1(%[ftmp1], src, 0x00) \
303  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
304  "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
305  \
306  PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
307  MMI_ULWC1(%[ftmp1], src1, 0x00) \
308  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
309  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
310  "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
311  \
312  PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
313  MMI_ULWC1(%[ftmp1], src1, 0x00) \
314  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
315  "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
316  \
317  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
318  MMI_ULWC1(%[ftmp1], src1, 0x00) \
319  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
320  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
321  "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
322  \
323  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
324  \
325  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
326  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
327  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
328  \
329  MMI_SWC1(%[ftmp1], dst, 0x00)
330 
331 
332 #define PUT_VP8_EPEL8_H6_MMI(src, dst) \
333  MMI_ULDC1(%[ftmp1], src, 0x00) \
334  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
335  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
336  "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
337  "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
338  \
339  MMI_ULDC1(%[ftmp1], src, -0x01) \
340  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
341  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
342  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
343  "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
344  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
345  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
346  \
347  MMI_ULDC1(%[ftmp1], src, -0x02) \
348  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
349  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
350  "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
351  "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
352  "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
353  "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
354  \
355  MMI_ULDC1(%[ftmp1], src, 0x01) \
356  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
357  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
358  "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
359  "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
360  \
361  MMI_ULDC1(%[ftmp1], src, 0x02) \
362  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
363  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
364  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
365  "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
366  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
367  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
368  \
369  MMI_ULDC1(%[ftmp1], src, 0x03) \
370  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
371  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
372  "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
373  "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
374  "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
375  "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
376  \
377  "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
378  "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
379  \
380  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
381  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
382  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
383  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
384  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
385  \
386  MMI_SDC1(%[ftmp1], dst, 0x00)
387 
388 
389 #define PUT_VP8_EPEL8_H4_MMI(src, dst) \
390  MMI_ULDC1(%[ftmp1], src, 0x00) \
391  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
392  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
393  "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
394  "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
395  \
396  MMI_ULDC1(%[ftmp1], src, -0x01) \
397  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
398  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
399  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
400  "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
401  "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
402  "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
403  \
404  MMI_ULDC1(%[ftmp1], src, 0x01) \
405  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
406  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
407  "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
408  "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
409  \
410  MMI_ULDC1(%[ftmp1], src, 0x02) \
411  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
412  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
413  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
414  "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
415  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
416  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
417  \
418  "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
419  "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
420  \
421  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
422  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
423  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
424  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
425  \
426  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
427  MMI_SDC1(%[ftmp1], dst, 0x00)
428 
429 
430 #define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \
431  MMI_ULDC1(%[ftmp1], src, 0x00) \
432  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
433  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
434  "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
435  "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
436  \
437  PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
438  MMI_ULDC1(%[ftmp1], src1, 0x00) \
439  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
440  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
441  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
442  "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
443  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
444  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
445  \
446  PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
447  MMI_ULDC1(%[ftmp1], src1, 0x00) \
448  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
449  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
450  "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
451  "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
452  "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
453  "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
454  \
455  PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
456  MMI_ULDC1(%[ftmp1], src1, 0x00) \
457  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
458  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
459  "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
460  "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
461  \
462  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
463  MMI_ULDC1(%[ftmp1], src1, 0x00) \
464  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
465  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
466  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
467  "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
468  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
469  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
470  \
471  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
472  MMI_ULDC1(%[ftmp1], src1, 0x00) \
473  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
474  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
475  "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
476  "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
477  "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
478  "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
479  \
480  "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
481  "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
482  \
483  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
484  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
485  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
486  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
487  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
488  \
489  MMI_SDC1(%[ftmp1], dst, 0x00)
490 
491 
492 #define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \
493  MMI_ULDC1(%[ftmp1], src, 0x00) \
494  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
495  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
496  "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
497  "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
498  \
499  PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
500  MMI_ULDC1(%[ftmp1], src1, 0x00) \
501  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
502  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
503  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
504  "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
505  "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
506  "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
507  \
508  PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
509  MMI_ULDC1(%[ftmp1], src1, 0x00) \
510  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
511  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
512  "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
513  "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
514  \
515  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
516  MMI_ULDC1(%[ftmp1], src1, 0x00) \
517  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
518  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
519  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
520  "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
521  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
522  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
523  \
524  "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
525  "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
526  \
527  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
528  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
529  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
530  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
531  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
532  \
533  MMI_SDC1(%[ftmp1], dst, 0x00)
534 
535 
536 #define PUT_VP8_BILINEAR8_H_MMI(src, dst) \
537  MMI_ULDC1(%[ftmp1], src, 0x00) \
538  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
539  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
540  "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \
541  "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \
542  \
543  MMI_ULDC1(%[ftmp1], src, 0x01) \
544  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
545  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
546  "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
547  "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \
548  "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
549  "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
550  \
551  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
552  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
553  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
554  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
555  \
556  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
557  MMI_SDC1(%[ftmp1], dst, 0x00)
558 
559 
560 #define PUT_VP8_BILINEAR4_H_MMI(src, dst) \
561  MMI_ULWC1(%[ftmp1], src, 0x00) \
562  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
563  "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \
564  \
565  MMI_ULWC1(%[ftmp1], src, 0x01) \
566  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
567  "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
568  "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
569  \
570  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
571  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
572  \
573  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
574  MMI_SWC1(%[ftmp1], dst, 0x00)
575 
576 
577 #define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \
578  MMI_ULDC1(%[ftmp1], src, 0x00) \
579  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
580  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
581  "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \
582  "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \
583  \
584  PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
585  MMI_ULDC1(%[ftmp1], src1, 0x00) \
586  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
587  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
588  "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
589  "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \
590  "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
591  "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
592  \
593  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
594  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
595  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
596  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
597  \
598  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
599  MMI_SDC1(%[ftmp1], dst, 0x00)
600 
601 
602 #define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \
603  MMI_ULWC1(%[ftmp1], src, 0x00) \
604  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
605  "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \
606  \
607  PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
608  MMI_ULWC1(%[ftmp1], src1, 0x00) \
609  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
610  "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
611  "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
612  \
613  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
614  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
615  \
616  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
617  MMI_SWC1(%[ftmp1], dst, 0x00)
618 
619 
620 DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
621  {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
622  0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},
623 
624  {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
625  0x0024002400240024, 0x0008000800080008, 0x0001000100010001},
626 
627  {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
628  0x0032003200320032, 0x0006000600060006, 0x0000000000000000},
629 
630  {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
631  0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},
632 
633  {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
634  0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},
635 
636  {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
637  0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},
638 
639  {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
640  0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
641 };
642 
643 #if 0
644 #define FILTER_6TAP(src, F, stride) \
645  cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
646  F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \
647  F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]
648 
649 #define FILTER_4TAP(src, F, stride) \
650  cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
651  F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]
652 
653 static const uint8_t subpel_filters[7][6] = {
654  { 0, 6, 123, 12, 1, 0 },
655  { 2, 11, 108, 36, 8, 1 },
656  { 0, 9, 93, 50, 6, 0 },
657  { 3, 16, 77, 77, 16, 3 },
658  { 0, 6, 50, 93, 9, 0 },
659  { 1, 8, 36, 108, 11, 2 },
660  { 0, 1, 12, 123, 6, 0 },
661 };
662 
663 #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
664 #define MUL_35468(a) (((a) * 35468) >> 16)
665 #endif
666 
667 #define clip_int8(n) (cm[(n) + 0x80] - 0x80)
669  ptrdiff_t stride)
670 {
671  int av_unused p1 = p[-2 * stride];
672  int av_unused p0 = p[-1 * stride];
673  int av_unused q0 = p[ 0 * stride];
674  int av_unused q1 = p[ 1 * stride];
675  int a, f1, f2;
676  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
677 
678  a = 3 * (q0 - p0);
679  a += clip_int8(p1 - q1);
680  a = clip_int8(a);
681 
682  // We deviate from the spec here with c(a+3) >> 3
683  // since that's what libvpx does.
684  f1 = FFMIN(a + 4, 127) >> 3;
685  f2 = FFMIN(a + 3, 127) >> 3;
686 
687  // Despite what the spec says, we do need to clamp here to
688  // be bitexact with libvpx.
689  p[-1 * stride] = cm[p0 + f2];
690  p[ 0 * stride] = cm[q0 - f1];
691 }
692 
694  ptrdiff_t stride)
695 {
696  int av_unused p1 = p[-2 * stride];
697  int av_unused p0 = p[-1 * stride];
698  int av_unused q0 = p[ 0 * stride];
699  int av_unused q1 = p[ 1 * stride];
700  int a, f1, f2;
701  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
702 
703  a = 3 * (q0 - p0);
704  a = clip_int8(a);
705 
706  // We deviate from the spec here with c(a+3) >> 3
707  // since that's what libvpx does.
708  f1 = FFMIN(a + 4, 127) >> 3;
709  f2 = FFMIN(a + 3, 127) >> 3;
710 
711  // Despite what the spec says, we do need to clamp here to
712  // be bitexact with libvpx.
713  p[-1 * stride] = cm[p0 + f2];
714  p[ 0 * stride] = cm[q0 - f1];
715  a = (f1 + 1) >> 1;
716  p[-2 * stride] = cm[p1 + a];
717  p[ 1 * stride] = cm[q1 - a];
718 }
719 
720 static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
721  int flim)
722 {
723  int av_unused p1 = p[-2 * stride];
724  int av_unused p0 = p[-1 * stride];
725  int av_unused q0 = p[ 0 * stride];
726  int av_unused q1 = p[ 1 * stride];
727 
728  return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
729 }
730 
731 static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
732 {
733  int av_unused p1 = p[-2 * stride];
734  int av_unused p0 = p[-1 * stride];
735  int av_unused q0 = p[ 0 * stride];
736  int av_unused q1 = p[ 1 * stride];
737 
738  return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
739 }
740 
741 static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
742 {
743  int a0, a1, a2, w;
744  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
745 
746  int av_unused p2 = p[-3 * stride];
747  int av_unused p1 = p[-2 * stride];
748  int av_unused p0 = p[-1 * stride];
749  int av_unused q0 = p[ 0 * stride];
750  int av_unused q1 = p[ 1 * stride];
751  int av_unused q2 = p[ 2 * stride];
752 
753  w = clip_int8(p1 - q1);
754  w = clip_int8(w + 3 * (q0 - p0));
755 
756  a0 = (27 * w + 63) >> 7;
757  a1 = (18 * w + 63) >> 7;
758  a2 = (9 * w + 63) >> 7;
759 
760  p[-3 * stride] = cm[p2 + a2];
761  p[-2 * stride] = cm[p1 + a1];
762  p[-1 * stride] = cm[p0 + a0];
763  p[ 0 * stride] = cm[q0 - a0];
764  p[ 1 * stride] = cm[q1 - a1];
765  p[ 2 * stride] = cm[q2 - a2];
766 }
767 
768 static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
769  int E, int I)
770 {
771  int av_unused p3 = p[-4 * stride];
772  int av_unused p2 = p[-3 * stride];
773  int av_unused p1 = p[-2 * stride];
774  int av_unused p0 = p[-1 * stride];
775  int av_unused q0 = p[ 0 * stride];
776  int av_unused q1 = p[ 1 * stride];
777  int av_unused q2 = p[ 2 * stride];
778  int av_unused q3 = p[ 3 * stride];
779 
780  return vp8_simple_limit(p, stride, E) &&
781  FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
782  FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
783  FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
784 }
785 
/**
 * Vertical loop filter over an 8-pixel-wide horizontal macroblock edge,
 * using Loongson MMI inline assembly: loads rows p3..q3 around dst (p-rows
 * above, q-rows at/below), runs the shared filter core, then stores the
 * filtered rows p2..q2 back.
 * NOTE(review): several lines (DECLARE_* temporaries, the
 * MMI_VP8_LOOP_FILTER core, RESTRICT_ASM_* entries and output-list
 * separators) appear elided in this extraction — verify against upstream
 * vp8dsp_mmi.c before relying on this listing.
 */
786 static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
787  ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
788 {
789  double ftmp[18];
790  uint32_t tmp[1];
795 
796  __asm__ volatile(
797  /* Get data from dst */
798  MMI_ULDC1(%[q0], %[dst], 0x0)
799  PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
800  MMI_ULDC1(%[p0], %[tmp0], 0x0)
801  PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
802  MMI_ULDC1(%[p1], %[tmp0], 0x0)
803  PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
804  MMI_ULDC1(%[p2], %[tmp0], 0x0)
805  PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
806  MMI_ULDC1(%[p3], %[tmp0], 0x0)
807  PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
808  MMI_ULDC1(%[q1], %[tmp0], 0x0)
809  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
810  MMI_ULDC1(%[q2], %[tmp0], 0x0)
811  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
812  MMI_ULDC1(%[q3], %[tmp0], 0x0)
/* NOTE(review): the MMI_VP8_LOOP_FILTER filter core appears elided here. */
814  /* Move to dst */
815  MMI_USDC1(%[q0], %[dst], 0x0)
816  PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
817  MMI_USDC1(%[p0], %[tmp0], 0x0)
818  PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
819  MMI_USDC1(%[p1], %[tmp0], 0x0)
820  PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
821  MMI_USDC1(%[p2], %[tmp0], 0x0)
822  PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
823  MMI_USDC1(%[q1], %[tmp0], 0x0)
824  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
825  MMI_USDC1(%[q2], %[tmp0], 0x0)
827  [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
828  [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
829  [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
830  [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
831  [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
832  [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
833  [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
834  [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
835  [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
836  [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
839  : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
840  [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
841  : "memory"
842  );
843 }
844 
846  ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
847 {
    /* Scalar inner vertical loop filter over an 8-pixel-wide edge: per
     * column, apply the normal-limit test, then pick the 4-tap or
     * not-4-tap common filter depending on high edge variance (hev).
     * NOTE(review): the function-name line above and the else-branch call
     * below appear elided in this extraction — verify against upstream. */
848  int i;
849 
850  for (i = 0; i < 8; i++)
851  if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
852  int hv = hev(dst + i * 1, stride, hev_thresh);
853  if (hv)
854  vp8_filter_common_is4tap(dst + i * 1, stride);
855  else
    /* NOTE(review): vp8_filter_common_isnot4tap call appears elided here. */
857  }
858 }
859 
/**
 * Horizontal loop filter over an 8-row-high vertical macroblock edge,
 * using Loongson MMI inline assembly: loads 8 bytes starting 4 pixels left
 * of dst on each of 8 rows, transposes so the filter core can operate on
 * columns, filters, transposes back, and stores the rows.
 * NOTE(review): several lines (DECLARE_* temporaries, the filter core
 * between the two transposes, RESTRICT_ASM_* entries and output-list
 * separators) appear elided in this extraction — verify against upstream.
 */
860 static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
861  ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
862 {
863  double ftmp[18];
864  uint32_t tmp[1];
869 
870  __asm__ volatile(
871  /* Get data from dst */
872  MMI_ULDC1(%[p3], %[dst], -0x04)
873  PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
874  MMI_ULDC1(%[p2], %[tmp0], -0x04)
875  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
876  MMI_ULDC1(%[p1], %[tmp0], -0x04)
877  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
878  MMI_ULDC1(%[p0], %[tmp0], -0x04)
879  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
880  MMI_ULDC1(%[q0], %[tmp0], -0x04)
881  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
882  MMI_ULDC1(%[q1], %[tmp0], -0x04)
883  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
884  MMI_ULDC1(%[q2], %[tmp0], -0x04)
885  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
886  MMI_ULDC1(%[q3], %[tmp0], -0x04)
887  /* Matrix transpose */
888  TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
889  %[q0], %[q1], %[q2], %[q3],
890  %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
/* NOTE(review): the loop-filter core appears elided between the transposes. */
892  /* Matrix transpose */
893  TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
894  %[q0], %[q1], %[q2], %[q3],
895  %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
896  /* Move to dst */
897  MMI_USDC1(%[p3], %[dst], -0x04)
898  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
899  MMI_USDC1(%[p2], %[dst], -0x04)
900  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
901  MMI_USDC1(%[p1], %[dst], -0x04)
902  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
903  MMI_USDC1(%[p0], %[dst], -0x04)
904  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
905  MMI_USDC1(%[q0], %[dst], -0x04)
906  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
907  MMI_USDC1(%[q1], %[dst], -0x04)
908  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
909  MMI_USDC1(%[q2], %[dst], -0x04)
910  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
911  MMI_USDC1(%[q3], %[dst], -0x04)
913  [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
914  [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
915  [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
916  [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
917  [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
918  [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
919  [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
920  [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
921  [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
922  [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
925  : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
926  [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
927  : "memory"
928  );
929 }
930 
932  ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
933 {
    /* Scalar inner horizontal loop filter over an 8-row-high vertical
     * edge: per row, apply the normal-limit test, then pick the 4-tap or
     * not-4-tap common filter depending on high edge variance (hev).
     * NOTE(review): the function-name line above and the else-branch call
     * below appear elided in this extraction — verify against upstream. */
934  int i;
935 
936  for (i = 0; i < 8; i++)
937  if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
938  int hv = hev(dst + i * stride, 1, hev_thresh);
939  if (hv)
940  vp8_filter_common_is4tap(dst + i * stride, 1);
941  else
    /* NOTE(review): vp8_filter_common_isnot4tap call appears elided here. */
943  }
944 }
945 
/**
 * Inverse Walsh-Hadamard transform of the 16 luma DC coefficients:
 * the MMI path does the column (vertical) pass of the WHT in assembly on
 * dc[], the row pass plus the (x + 3) >> 3 rounding in scalar C while
 * scattering each result into block[i][j][0], then zeroes dc[] with a
 * second asm block.  The #else branch is the pure-C reference version.
 * NOTE(review): some lines (local declarations, output-list separators)
 * appear elided in this extraction — verify against upstream.
 */
946 void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
947 {
948 #if 1
949  double ftmp[8];
951 
952  __asm__ volatile (
953  MMI_LDC1(%[ftmp0], %[dc], 0x00)
954  MMI_LDC1(%[ftmp1], %[dc], 0x08)
955  MMI_LDC1(%[ftmp2], %[dc], 0x10)
956  MMI_LDC1(%[ftmp3], %[dc], 0x18)
957  "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t"
958  "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t"
959  "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
960  "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
961  "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t"
962  "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
963  "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
964  "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
965  MMI_SDC1(%[ftmp0], %[dc], 0x00)
966  MMI_SDC1(%[ftmp1], %[dc], 0x08)
967  MMI_SDC1(%[ftmp2], %[dc], 0x10)
968  MMI_SDC1(%[ftmp3], %[dc], 0x18)
969  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
970  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
971  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
972  [ftmp6]"=&f"(ftmp[6]),
974  [ftmp7]"=&f"(ftmp[7])
975  : [dc]"r"((uint8_t*)dc)
976  : "memory"
977  );
978 
/* Row pass of the WHT plus rounding, one 4-coefficient group per block row. */
979  block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
980  block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
981  block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
982  block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
983 
984  block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
985  block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
986  block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
987  block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
988 
989  block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
990  block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
991  block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
992  block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
993 
994  block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
995  block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
996  block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
997  block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
998 
/* Clear all 16 dc coefficients (32 bytes) now that they are consumed. */
999  __asm__ volatile (
1000  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1001  MMI_SDC1(%[ftmp0], %[dc], 0x00)
1002  MMI_SDC1(%[ftmp0], %[dc], 0x08)
1003  MMI_SDC1(%[ftmp0], %[dc], 0x10)
1004  MMI_SDC1(%[ftmp0], %[dc], 0x18)
1006  [ftmp0]"=&f"(ftmp[0])
1007  : [dc]"r"((uint8_t *)dc)
1008  : "memory"
1009  );
1010 #else
1011  int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;
1012 
1013  t00 = dc[0] + dc[12];
1014  t10 = dc[1] + dc[13];
1015  t20 = dc[2] + dc[14];
1016  t30 = dc[3] + dc[15];
1017 
1018  t03 = dc[0] - dc[12];
1019  t13 = dc[1] - dc[13];
1020  t23 = dc[2] - dc[14];
1021  t33 = dc[3] - dc[15];
1022 
1023  t01 = dc[4] + dc[ 8];
1024  t11 = dc[5] + dc[ 9];
1025  t21 = dc[6] + dc[10];
1026  t31 = dc[7] + dc[11];
1027 
1028  t02 = dc[4] - dc[ 8];
1029  t12 = dc[5] - dc[ 9];
1030  t22 = dc[6] - dc[10];
1031  t32 = dc[7] - dc[11];
1032 
1033  dc[ 0] = t00 + t01;
1034  dc[ 1] = t10 + t11;
1035  dc[ 2] = t20 + t21;
1036  dc[ 3] = t30 + t31;
1037 
1038  dc[ 4] = t03 + t02;
1039  dc[ 5] = t13 + t12;
1040  dc[ 6] = t23 + t22;
1041  dc[ 7] = t33 + t32;
1042 
1043  dc[ 8] = t00 - t01;
1044  dc[ 9] = t10 - t11;
1045  dc[10] = t20 - t21;
1046  dc[11] = t30 - t31;
1047 
1048  dc[12] = t03 - t02;
1049  dc[13] = t13 - t12;
1050  dc[14] = t23 - t22;
1051  dc[15] = t33 - t32;
1052 
1053  block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1054  block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1055  block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1056  block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1057 
1058  block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1059  block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1060  block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1061  block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1062 
1063  block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1064  block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1065  block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1066  block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1067 
1068  block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1069  block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1070  block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1071  block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
1072 
1073  AV_ZERO64(dc + 0);
1074  AV_ZERO64(dc + 4);
1075  AV_ZERO64(dc + 8);
1076  AV_ZERO64(dc + 12);
1077 #endif
1078 }
1079 
void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
{
    /* DC-only inverse WHT: every output DC is the same rounded value,
     * so broadcast (dc[0] + 3) >> 3 into the DC slot of all 16 sub-blocks
     * and clear the consumed coefficient. */
    int val = (dc[0] + 3) >> 3;
    int i, j;

    dc[0] = 0;

    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++)
            block[i][j][0] = val;
}
1103 
/**
 * 4x4 VP8 inverse DCT + add: transforms block[16], clears it, and adds the
 * clipped result to the four dst rows.  The MMI path implements
 * MUL_20091/MUL_35468 via pmulhh with the 0x4e7b/0x22a3 fixed-point
 * constants (plus a <<2 pre-shift), with a TRANSPOSE_4H between the two
 * 1-D passes.  The #else branch is the pure-C reference version.
 * NOTE(review): some lines (local declarations, output-list separators)
 * appear elided in this extraction — verify against upstream.
 */
1104 void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1105 {
1106 #if 1
1107  double ftmp[12];
1108  uint32_t tmp[1];
1109  union av_intfloat64 ff_ph_4e7b_u;
1110  union av_intfloat64 ff_ph_22a3_u;
1113  ff_ph_4e7b_u.i = 0x4e7b4e7b4e7b4e7bULL;
1114  ff_ph_22a3_u.i = 0x22a322a322a322a3ULL;
1115 
1116  __asm__ volatile (
1117  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1118  MMI_LDC1(%[ftmp1], %[block], 0x00)
1119  MMI_LDC1(%[ftmp2], %[block], 0x08)
1120  MMI_LDC1(%[ftmp3], %[block], 0x10)
1121  MMI_LDC1(%[ftmp4], %[block], 0x18)
1122 
1123  "li %[tmp0], 0x02 \n\t"
1124  "mtc1 %[tmp0], %[ftmp11] \n\t"
1125 
1126  // block[0...3] + block[8...11]
1127  "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
1128  // block[0...3] - block[8...11]
1129  "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
1130  // MUL_35468(block[12...15])
1131  "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
1132  "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
1133  // MUL_35468(block[4...7])
1134  "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
1135  "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
1136  // MUL_20091(block[4...7]
1137  "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
1138  "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
1139  // MUL_20091(block[12...15])
1140  "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
1141  "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
1142 
1143  // tmp[0 4 8 12]
1144  "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
1145  "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
1146  // tmp[1 5 9 13]
1147  "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
1148  "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
1149  // tmp[2 6 10 14]
1150  "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
1151  "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
1152  // tmp[3 7 11 15]
1153  "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
1154  "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
1155 
/* Clear the coefficient block as soon as it has been consumed. */
1156  MMI_SDC1(%[ftmp0], %[block], 0x00)
1157  MMI_SDC1(%[ftmp0], %[block], 0x08)
1158  MMI_SDC1(%[ftmp0], %[block], 0x10)
1159  MMI_SDC1(%[ftmp0], %[block], 0x18)
1160 
1161  TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1162  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1163 
1164  // t[0 4 8 12]
1165  "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
1166  // t[1 5 9 13]
1167  "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
1168  // t[2 6 10 14]
1169  "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
1170  "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
1171  "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
1172  "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
1173  "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1174  // t[3 7 11 15]
1175  "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
1176  "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
1177  "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
1178  "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
1179  "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1180 
/* Second pass output: add rounding bias 4 and arithmetic-shift right by 3. */
1181  "li %[tmp0], 0x03 \n\t"
1182  "mtc1 %[tmp0], %[ftmp11] \n\t"
1183  "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
1184  "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t"
1185  "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
1186  "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
1187  "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t"
1188  "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
1189  "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
1190  "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t"
1191  "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
1192  "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
1193  "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t"
1194  "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
1195 
1196  TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1197  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1198 
/* Widen dst rows to 16 bit, add residual, saturate-pack back to bytes. */
1199  MMI_LWC1(%[ftmp5], %[dst0], 0x00)
1200  MMI_LWC1(%[ftmp6], %[dst1], 0x00)
1201  MMI_LWC1(%[ftmp7], %[dst2], 0x00)
1202  MMI_LWC1(%[ftmp8], %[dst3], 0x00)
1203 
1204  "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1205  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1206  "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1207  "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1208 
1209  "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1210  "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
1211  "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1212  "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
1213 
1214  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1215  "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1216  "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1217  "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1218 
1219  MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1220  MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1221  MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1222  MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1223  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1224  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1225  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1226  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1227  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
1228  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
1231  [tmp0]"=&r"(tmp[0])
1232  : [dst0]"r"(dst), [dst1]"r"(dst+stride),
1233  [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
1234  [block]"r"(block), [ff_pw_4]"f"(ff_pw_4.f),
1235  [ff_ph_4e7b]"f"(ff_ph_4e7b_u.f), [ff_ph_22a3]"f"(ff_ph_22a3_u.f)
1236  : "memory"
1237  );
1238 #else
1239  int i, t0, t1, t2, t3;
1240  int16_t tmp[16];
1241 
1242  for (i = 0; i < 4; i++) {
1243  t0 = block[0 + i] + block[8 + i];
1244  t1 = block[0 + i] - block[8 + i];
1245  t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
1246  t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
1247  block[ 0 + i] = 0;
1248  block[ 4 + i] = 0;
1249  block[ 8 + i] = 0;
1250  block[12 + i] = 0;
1251 
1252  tmp[i * 4 + 0] = t0 + t3;
1253  tmp[i * 4 + 1] = t1 + t2;
1254  tmp[i * 4 + 2] = t1 - t2;
1255  tmp[i * 4 + 3] = t0 - t3;
1256  }
1257 
1258  for (i = 0; i < 4; i++) {
1259  t0 = tmp[0 + i] + tmp[8 + i];
1260  t1 = tmp[0 + i] - tmp[8 + i];
1261  t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
1262  t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);
1263 
1264  dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
1265  dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
1266  dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
1267  dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
1268  dst += stride;
1269  }
1270 #endif
1271 }
1272 
/**
 * DC-only 4x4 inverse transform + add: computes the rounded DC value,
 * clears block[0], then saturate-adds the value to all 16 pixels of the
 * 4x4 dst area (rows loaded as 32-bit words, widened, added, packed).
 * The #else branch is the pure-C reference version.
 * NOTE(review): some lines (local declarations, output-list separators)
 * appear elided in this extraction — verify against upstream.
 */
1273 void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1274 {
1275 #if 1
1276  int dc = (block[0] + 4) >> 3;
1277  double ftmp[6];
1279 
1280  block[0] = 0;
1281 
1282  __asm__ volatile (
1283  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1284  "mtc1 %[dc], %[ftmp5] \n\t"
1285  MMI_LWC1(%[ftmp1], %[dst0], 0x00)
1286  MMI_LWC1(%[ftmp2], %[dst1], 0x00)
1287  MMI_LWC1(%[ftmp3], %[dst2], 0x00)
1288  MMI_LWC1(%[ftmp4], %[dst3], 0x00)
1289  "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1290  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1291  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1292  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1293  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1294  "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1295  "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
1296  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
1297  "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1298  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1299  "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1300  "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1301  "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1302  MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1303  MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1304  MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1305  MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1306  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1307  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1308  [ftmp4]"=&f"(ftmp[4]),
1310  [ftmp5]"=&f"(ftmp[5])
1311  : [dst0]"r"(dst), [dst1]"r"(dst+stride),
1312  [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
1313  [dc]"r"(dc)
1314  : "memory"
1315  );
1316 #else
1317  int i, dc = (block[0] + 4) >> 3;
1318 
1319  block[0] = 0;
1320 
1321  for (i = 0; i < 4; i++) {
1322  dst[0] = av_clip_uint8(dst[0] + dc);
1323  dst[1] = av_clip_uint8(dst[1] + dc);
1324  dst[2] = av_clip_uint8(dst[2] + dc);
1325  dst[3] = av_clip_uint8(dst[3] + dc);
1326  dst += stride;
1327  }
1328 #endif
1329 }
1330 
void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    int i;

    /* Four DC-only 4x4 transforms, laid out side by side across a
     * 16-pixel-wide luma strip. */
    for (i = 0; i < 4; i++)
        ff_vp8_idct_dc_add_mmi(dst + 4 * i, block[i], stride);
}
1339 
void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    int row, col;

    /* Four DC-only 4x4 transforms in a 2x2 arrangement covering an
     * 8x8 chroma area. */
    for (row = 0; row < 2; row++)
        for (col = 0; col < 2; col++)
            ff_vp8_idct_dc_add_mmi(dst + 4 * (stride * row + col),
                                   block[2 * row + col], stride);
}
1348 
1349 // loop filter applied to edges between macroblocks
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    int half;

    /* Filter the 16-pixel-wide horizontal edge as two 8-pixel halves. */
    for (half = 0; half < 2; half++)
        vp8_v_loop_filter8_mmi(dst + 8 * half, stride, flim_E, flim_I,
                               hev_thresh);
}
1356 
void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    int half;

    /* Filter the 16-row-high vertical edge as two 8-row halves. */
    for (half = 0; half < 2; half++)
        vp8_h_loop_filter8_mmi(dst + 8 * half * stride, stride, flim_E,
                               flim_I, hev_thresh);
}
1364 
void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    uint8_t *plane[2] = { dstU, dstV };
    int i;

    /* Run the 8-wide vertical edge filter once per chroma plane (U first). */
    for (i = 0; i < 2; i++)
        vp8_v_loop_filter8_mmi(plane[i], stride, flim_E, flim_I, hev_thresh);
}
1371 
void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    uint8_t *plane[2] = { dstU, dstV };
    int i;

    /* Run the 8-high horizontal edge filter once per chroma plane (U first). */
    for (i = 0; i < 2; i++)
        vp8_h_loop_filter8_mmi(plane[i], stride, flim_E, flim_I, hev_thresh);
}
1378 
1379 // loop filter applied to inner macroblock edges
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    uint8_t *col = dst;
    int i;

    /* Scalar inner vertical filter: handle each of the 16 columns of the
     * horizontal edge independently, choosing the 4-tap or not-4-tap
     * common filter based on high edge variance. */
    for (i = 0; i < 16; i++, col++) {
        if (!vp8_normal_limit(col, stride, flim_E, flim_I))
            continue;
        if (hev(col, stride, hev_thresh))
            vp8_filter_common_is4tap(col, stride);
        else
            vp8_filter_common_isnot4tap(col, stride);
    }
}
1394 
void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    uint8_t *row = dst;
    int i;

    /* Scalar inner horizontal filter: handle each of the 16 rows of the
     * vertical edge independently (pixel step 1 along a row), choosing
     * the 4-tap or not-4-tap common filter based on high edge variance. */
    for (i = 0; i < 16; i++, row += stride) {
        if (!vp8_normal_limit(row, 1, flim_E, flim_I))
            continue;
        if (hev(row, 1, hev_thresh))
            vp8_filter_common_is4tap(row, 1);
        else
            vp8_filter_common_isnot4tap(row, 1);
    }
}
1409 
void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    uint8_t *plane[2] = { dstU, dstV };
    int i;

    /* Inner-edge variant: run the vertical 8-wide filter per chroma plane. */
    for (i = 0; i < 2; i++)
        vp8_v_loop_filter8_inner_mmi(plane[i], stride, flim_E, flim_I,
                                     hev_thresh);
}
1416 
void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    uint8_t *plane[2] = { dstU, dstV };
    int i;

    /* Inner-edge variant: run the horizontal 8-high filter per chroma plane. */
    for (i = 0; i < 2; i++)
        vp8_h_loop_filter8_inner_mmi(plane[i], stride, flim_E, flim_I,
                                     hev_thresh);
}
1423 
/**
 * Simple vertical loop filter over a 16-pixel-wide horizontal edge:
 * per column, apply only the simple edge limit test.
 * NOTE(review): the filtered-case call (vp8_filter_common_is4tap) appears
 * elided from the if-body in this extraction — verify against upstream.
 */
1424 void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1425 {
1426  int i;
1427 
1428  for (i = 0; i < 16; i++)
1429  if (vp8_simple_limit(dst + i, stride, flim))
    /* NOTE(review): vp8_filter_common_is4tap(dst + i, stride) elided here. */
1431 }
1432 
void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    uint8_t *row = dst;
    int i;

    /* Simple horizontal loop filter over a 16-row vertical edge: one
     * 4-tap common-filter adjustment per row that passes the simple
     * edge-limit test (pixel step 1 along the row). */
    for (i = 0; i < 16; i++, row += stride)
        if (vp8_simple_limit(row, 1, flim))
            vp8_filter_common_is4tap(row, 1);
}
1441 
/**
 * Copy a 16-pixel-wide block of h rows from src to dst, two rows per loop
 * iteration: the low 8 bytes via an unaligned FP load/store, the high
 * 8 bytes via ldl/ldr + sdl/sdr unaligned GPR accesses.  x and y are
 * unused (no sub-pel interpolation).  The #else branch is the memcpy
 * reference.  Note: h is assumed even by the -2 loop step.
 * NOTE(review): some lines (DECLARE_VAR/RESTRICT_ASM entries) appear
 * elided in this extraction — verify against upstream.
 */
1442 void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1443  ptrdiff_t srcstride, int h, int x, int y)
1444 {
1445 #if 1
1446  double ftmp[2];
1447  uint64_t tmp[2];
1448  mips_reg addr[2];
1450 
1451  __asm__ volatile (
1452  "1: \n\t"
1453  PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1454  MMI_ULDC1(%[ftmp0], %[src], 0x00)
1455  "ldl %[tmp0], 0x0f(%[src]) \n\t"
1456  "ldr %[tmp0], 0x08(%[src]) \n\t"
1457  MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
1458  "ldl %[tmp1], 0x0f(%[addr0]) \n\t"
1459  "ldr %[tmp1], 0x08(%[addr0]) \n\t"
1460  PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1461  MMI_SDC1(%[ftmp0], %[dst], 0x00)
1462  "sdl %[tmp0], 0x0f(%[dst]) \n\t"
1463  "sdr %[tmp0], 0x08(%[dst]) \n\t"
1464  "addiu %[h], %[h], -0x02 \n\t"
1465  MMI_SDC1(%[ftmp1], %[addr1], 0x00)
1466  PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1467  "sdl %[tmp1], 0x0f(%[addr1]) \n\t"
1468  "sdr %[tmp1], 0x08(%[addr1]) \n\t"
1469  PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1470  "bnez %[h], 1b \n\t"
1471  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1472  [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
1474  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1475  [dst]"+&r"(dst), [src]"+&r"(src),
1476  [h]"+&r"(h)
1477  : [dststride]"r"((mips_reg)dststride),
1478  [srcstride]"r"((mips_reg)srcstride)
1479  : "memory"
1480  );
1481 #else
1482  int i;
1483 
1484  for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1485  memcpy(dst, src, 16);
1486 #endif
1487 }
1488 
/**
 * Copy an 8-pixel-wide block of h rows from src to dst, two rows per
 * iteration: the first row via an unaligned FP load/store, the second via
 * ldl/ldr + sdl/sdr.  x and y are unused.  The #else branch is the memcpy
 * reference.  Note: h is assumed even by the -2 loop step.
 * NOTE(review): some lines (DECLARE_VAR/RESTRICT_ASM entries) appear
 * elided in this extraction — verify against upstream.
 */
1489 void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1490  ptrdiff_t srcstride, int h, int x, int y)
1491 {
1492 #if 1
1493  double ftmp[1];
1494  uint64_t tmp[1];
1495  mips_reg addr[2];
1497 
1498  __asm__ volatile (
1499  "1: \n\t"
1500  PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1501  MMI_ULDC1(%[ftmp0], %[src], 0x00)
1502  "ldl %[tmp0], 0x07(%[addr0]) \n\t"
1503  "ldr %[tmp0], 0x00(%[addr0]) \n\t"
1504  PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1505  MMI_SDC1(%[ftmp0], %[dst], 0x00)
1506  "addiu %[h], %[h], -0x02 \n\t"
1507  "sdl %[tmp0], 0x07(%[addr1]) \n\t"
1508  "sdr %[tmp0], 0x00(%[addr1]) \n\t"
1509  PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1510  PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1511  "bnez %[h], 1b \n\t"
1512  : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1514  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1515  [dst]"+&r"(dst), [src]"+&r"(src),
1516  [h]"+&r"(h)
1517  : [dststride]"r"((mips_reg)dststride),
1518  [srcstride]"r"((mips_reg)srcstride)
1519  : "memory"
1520  );
1521 #else
1522  int i;
1523 
1524  for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1525  memcpy(dst, src, 8);
1526 #endif
1527 }
1528 
/**
 * Copy a 4-pixel-wide block of h rows from src to dst, two rows per
 * iteration: the first row via a 32-bit FP load/store, the second via
 * lwl/lwr + swl/swr unaligned word accesses.  x and y are unused.
 * The #else branch is the memcpy reference.  Note: h is assumed even by
 * the -2 loop step.
 * NOTE(review): some lines (DECLARE_VAR/RESTRICT_ASM entries) appear
 * elided in this extraction — verify against upstream.
 */
1529 void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1530  ptrdiff_t srcstride, int h, int x, int y)
1531 {
1532 #if 1
1533  double ftmp[1];
1534  uint64_t tmp[1];
1535  mips_reg addr[2];
1537 
1538  __asm__ volatile (
1539  "1: \n\t"
1540  PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1541  MMI_LWC1(%[ftmp0], %[src], 0x00)
1542  "lwl %[tmp0], 0x03(%[addr0]) \n\t"
1543  "lwr %[tmp0], 0x00(%[addr0]) \n\t"
1544  PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1545  MMI_SWC1(%[ftmp0], %[dst], 0x00)
1546  "addiu %[h], %[h], -0x02 \n\t"
1547  "swl %[tmp0], 0x03(%[addr1]) \n\t"
1548  "swr %[tmp0], 0x00(%[addr1]) \n\t"
1549  PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1550  PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1551  "bnez %[h], 1b \n\t"
1552  : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1554  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1555  [dst]"+&r"(dst), [src]"+&r"(src),
1556  [h]"+&r"(h)
1557  : [dststride]"r"((mips_reg)dststride),
1558  [srcstride]"r"((mips_reg)srcstride)
1559  : "memory"
1560  );
1561 #else
1562  int i;
1563 
1564  for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1565  memcpy(dst, src, 4);
1566 #endif
1567 }
1568 
/**
 * 16-wide horizontal 4-tap sub-pel interpolation (epel), mx selects the
 * filter from fourtap_subpel_filters; my is unused.  Each row is processed
 * as two 8-pixel halves with PUT_VP8_EPEL8_H4_MMI.  The large comment in
 * the body spells out the per-pixel reference formula; the #else branch is
 * the pure-C FILTER_4TAP reference.
 * NOTE(review): some lines (DECLARE_VAR/RESTRICT_ASM entries) appear
 * elided in this extraction — verify against upstream.
 */
1573 void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1574  ptrdiff_t srcstride, int h, int mx, int my)
1575 {
1576 #if 1
1577  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1578  double ftmp[9];
1579  uint32_t tmp[1];
1580  union av_intfloat64 filter1;
1581  union av_intfloat64 filter2;
1582  union av_intfloat64 filter3;
1583  union av_intfloat64 filter4;
1584  mips_reg src1, dst1;
1586  filter1.i = filter[1];
1587  filter2.i = filter[2];
1588  filter3.i = filter[3];
1589  filter4.i = filter[4];
1590 
1591  /*
1592  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1593  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1594  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1595  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1596  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1597  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1598  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1599  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1600 
1601  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1602  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1603  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1604  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1605  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1606  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1607  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1608  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
1609  */
1610  __asm__ volatile (
1611  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1612  "li %[tmp0], 0x07 \n\t"
1613  "mtc1 %[tmp0], %[ftmp4] \n\t"
1614 
1615  "1: \n\t"
1616  // 0 - 7
1617  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1618  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1619  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1620  // 8 - 15
1621  PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1622 
1623  "addiu %[h], %[h], -0x01 \n\t"
1624  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1625  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1626  "bnez %[h], 1b \n\t"
1627  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1628  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1629  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1630  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1631  [ftmp8]"=&f"(ftmp[8]),
1632  [tmp0]"=&r"(tmp[0]),
1634  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1635  [h]"+&r"(h),
1636  [dst]"+&r"(dst), [src]"+&r"(src)
1637  : [ff_pw_64]"f"(ff_pw_64.f),
1638  [srcstride]"r"((mips_reg)srcstride),
1639  [dststride]"r"((mips_reg)dststride),
1640  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
1641  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
1642  : "memory"
1643  );
1644 #else
1645  const uint8_t *filter = subpel_filters[mx - 1];
1646  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1647  int x, y;
1648 
1649  for (y = 0; y < h; y++) {
1650  for (x = 0; x < 16; x++)
1651  dst[x] = FILTER_4TAP(src, filter, 1);
1652  dst += dststride;
1653  src += srcstride;
1654  }
1655 #endif
1656 }
1653 
/**
 * 8-wide horizontal 4-tap sub-pel interpolation (epel), mx selects the
 * filter from fourtap_subpel_filters; my is unused.  One
 * PUT_VP8_EPEL8_H4_MMI invocation per row.  The body comment spells out
 * the per-pixel reference formula; the #else branch is the pure-C
 * FILTER_4TAP reference.
 * NOTE(review): some lines (DECLARE_VAR/RESTRICT_ASM entries) appear
 * elided in this extraction — verify against upstream.
 */
1654 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1655  ptrdiff_t srcstride, int h, int mx, int my)
1656 {
1657 #if 1
1658  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1659  double ftmp[9];
1660  uint32_t tmp[1];
1661  union av_intfloat64 filter1;
1662  union av_intfloat64 filter2;
1663  union av_intfloat64 filter3;
1664  union av_intfloat64 filter4;
1666  filter1.i = filter[1];
1667  filter2.i = filter[2];
1668  filter3.i = filter[3];
1669  filter4.i = filter[4];
1670 
1671 
1672  /*
1673  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1674  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1675  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1676  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1677  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1678  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1679  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1680  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1681  */
1682  __asm__ volatile (
1683  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1684  "li %[tmp0], 0x07 \n\t"
1685  "mtc1 %[tmp0], %[ftmp4] \n\t"
1686 
1687  "1: \n\t"
1688  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1689 
1690  "addiu %[h], %[h], -0x01 \n\t"
1691  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1692  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1693  "bnez %[h], 1b \n\t"
1694  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1695  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1696  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1697  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1698  [ftmp8]"=&f"(ftmp[8]),
1699  [tmp0]"=&r"(tmp[0]),
1701  [h]"+&r"(h),
1702  [dst]"+&r"(dst), [src]"+&r"(src)
1703  : [ff_pw_64]"f"(ff_pw_64.f),
1704  [srcstride]"r"((mips_reg)srcstride),
1705  [dststride]"r"((mips_reg)dststride),
1706  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
1707  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
1708  : "memory"
1709  );
1710 #else
1711  const uint8_t *filter = subpel_filters[mx - 1];
1712  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1713  int x, y;
1714 
1715  for (y = 0; y < h; y++) {
1716  for (x = 0; x < 8; x++)
1717  dst[x] = FILTER_4TAP(src, filter, 1);
1718  dst += dststride;
1719  src += srcstride;
1720  }
1721 #endif
1722 }
1723 
/*
 * VP8 epel MC: 4-pixel-wide, 4-tap HORIZONTAL subpel filter (Loongson MMI).
 * For each of the h rows computes dst[x] = FILTER_4TAP(src, filter, 1) for
 * x = 0..3 (see the reference formulas below and the #else C fallback),
 * then steps dst/src by dststride/srcstride.
 * mx - 1 indexes the filter table; my is unused in this horizontal variant.
 */
1724 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1725  ptrdiff_t srcstride, int h, int mx, int my)
1726 {
1727 #if 1
 /* Hand-written MMI path; flip to 0 for the reference C implementation. */
1728  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1729  double ftmp[6]; /* asm temporaries bound to FP/MMI regs via "f" constraints */
1730  uint32_t tmp[1];
 /* Raw 64-bit coefficients reinterpreted as double so they can be passed
  * through "f" input constraints without violating strict aliasing. */
1731  union av_intfloat64 filter1;
1732  union av_intfloat64 filter2;
1733  union av_intfloat64 filter3;
1734  union av_intfloat64 filter4;
1736  filter1.i = filter[1];
1737  filter2.i = filter[2];
1738  filter3.i = filter[3];
1739  filter4.i = filter[4];
1740 
1741  /*
1742  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1743  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1744  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1745  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1746  */
1747  __asm__ volatile (
 /* ftmp0 = 0 */
1748  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
 /* ftmp4 = 7, matching the ">> 7" in the formulas above */
1749  "li %[tmp0], 0x07 \n\t"
1750  "mtc1 %[tmp0], %[ftmp4] \n\t"
1751 
 /* per-row loop */
1752  "1: \n\t"
1753  PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1754 
1755  "addiu %[h], %[h], -0x01 \n\t"
1756  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1757  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1758  "bnez %[h], 1b \n\t"
1759  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1760  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1761  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1762  [tmp0]"=&r"(tmp[0]),
1764  [h]"+&r"(h),
1765  [dst]"+&r"(dst), [src]"+&r"(src)
1766  : [ff_pw_64]"f"(ff_pw_64.f),
1767  [srcstride]"r"((mips_reg)srcstride),
1768  [dststride]"r"((mips_reg)dststride),
1769  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
1770  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
1771  : "memory" /* dst is written through a pointer inside the asm */
1772  );
1773 #else
 /* Reference C implementation (kept for documentation/bring-up). */
1774  const uint8_t *filter = subpel_filters[mx - 1];
1775  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1776  int x, y;
1777 
1778  for (y = 0; y < h; y++) {
1779  for (x = 0; x < 4; x++)
1780  dst[x] = FILTER_4TAP(src, filter, 1);
1781  dst += dststride;
1782  src += srcstride;
1783  }
1784 #endif
1785 }
1786 
/*
 * VP8 epel MC: 16-pixel-wide, 6-tap HORIZONTAL subpel filter (Loongson MMI).
 * Each row is processed as two 8-pixel halves: columns 0-7 via src/dst and
 * columns 8-15 via src1/dst1 = src/dst + 8. Per pixel this computes the
 * 6-tap formulas shown below (equivalent to FILTER_6TAP(src, filter, 1)
 * in the #else fallback). mx - 1 indexes the filter table; my is unused.
 */
1787 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1788  ptrdiff_t srcstride, int h, int mx, int my)
1789 {
1790 #if 1
 /* Hand-written MMI path; flip to 0 for the reference C implementation. */
1791  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1792  double ftmp[9]; /* asm temporaries bound to FP/MMI regs via "f" constraints */
1793  uint32_t tmp[1];
1794  mips_reg src1, dst1; /* second-half (columns 8-15) pointers */
 /* Raw 64-bit coefficients reinterpreted as double for "f" constraints. */
1795  union av_intfloat64 filter0;
1796  union av_intfloat64 filter1;
1797  union av_intfloat64 filter2;
1798  union av_intfloat64 filter3;
1799  union av_intfloat64 filter4;
1800  union av_intfloat64 filter5;
1802  filter0.i = filter[0];
1803  filter1.i = filter[1];
1804  filter2.i = filter[2];
1805  filter3.i = filter[3];
1806  filter4.i = filter[4];
1807  filter5.i = filter[5];
1808 
1809  /*
1810  dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1811  dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1812  dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1813  dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1814  dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1815  dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1816  dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1817  dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1818 
1819  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1820  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1821  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1822  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1823  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1824  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1825  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1826  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1827  */
1828  __asm__ volatile (
 /* ftmp0 = 0 */
1829  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
 /* ftmp4 = 7, matching the ">> 7" in the formulas above */
1830  "li %[tmp0], 0x07 \n\t"
1831  "mtc1 %[tmp0], %[ftmp4] \n\t"
1832 
 /* per-row loop */
1833  "1: \n\t"
1834  // 0 - 7
1835  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1836  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1837  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1838  // 8 - 15
1839  PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1840 
1841  "addiu %[h], %[h], -0x01 \n\t"
1842  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1843  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1844  "bnez %[h], 1b \n\t"
1845  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1846  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1847  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1848  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1849  [ftmp8]"=&f"(ftmp[8]),
1850  [tmp0]"=&r"(tmp[0]),
1852  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1853  [h]"+&r"(h),
1854  [dst]"+&r"(dst), [src]"+&r"(src)
1855  : [ff_pw_64]"f"(ff_pw_64.f),
1856  [srcstride]"r"((mips_reg)srcstride),
1857  [dststride]"r"((mips_reg)dststride),
1858  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
1859  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
1860  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
1861  : "memory" /* dst is written through a pointer inside the asm */
1862  );
1863 #else
 /* Reference C implementation (kept for documentation/bring-up). */
1864  const uint8_t *filter = subpel_filters[mx - 1];
1865  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1866  int x, y;
1867 
1868  for (y = 0; y < h; y++) {
1869  for (x = 0; x < 16; x++)
1870  dst[x] = FILTER_6TAP(src, filter, 1);
1871  dst += dststride;
1872  src += srcstride;
1873  }
1874 #endif
1875 }
1876 
/*
 * VP8 epel MC: 8-pixel-wide, 6-tap HORIZONTAL subpel filter (Loongson MMI).
 * For each of the h rows computes dst[x] = FILTER_6TAP(src, filter, 1) for
 * x = 0..7 (see formulas below / #else fallback), then steps dst/src by
 * their strides. mx - 1 indexes the filter table; my is unused here.
 */
1877 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1878  ptrdiff_t srcstride, int h, int mx, int my)
1879 {
1880 #if 1
 /* Hand-written MMI path; flip to 0 for the reference C implementation. */
1881  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1882  double ftmp[9]; /* asm temporaries bound to FP/MMI regs via "f" constraints */
1883  uint32_t tmp[1];
 /* Raw 64-bit coefficients reinterpreted as double for "f" constraints. */
1884  union av_intfloat64 filter0;
1885  union av_intfloat64 filter1;
1886  union av_intfloat64 filter2;
1887  union av_intfloat64 filter3;
1888  union av_intfloat64 filter4;
1889  union av_intfloat64 filter5;
1891  filter0.i = filter[0];
1892  filter1.i = filter[1];
1893  filter2.i = filter[2];
1894  filter3.i = filter[3];
1895  filter4.i = filter[4];
1896  filter5.i = filter[5];
1897 
1898  /*
1899  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1900  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1901  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1902  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1903  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1904  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1905  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1906  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
1907  */
1908  __asm__ volatile (
 /* ftmp0 = 0 */
1909  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
 /* ftmp4 = 7, matching the ">> 7" in the formulas above */
1910  "li %[tmp0], 0x07 \n\t"
1911  "mtc1 %[tmp0], %[ftmp4] \n\t"
1912 
 /* per-row loop */
1913  "1: \n\t"
1914  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1915 
1916  "addiu %[h], %[h], -0x01 \n\t"
1917  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1918  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1919  "bnez %[h], 1b \n\t"
1920  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1921  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1922  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1923  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1924  [ftmp8]"=&f"(ftmp[8]),
1925  [tmp0]"=&r"(tmp[0]),
1927  [h]"+&r"(h),
1928  [dst]"+&r"(dst), [src]"+&r"(src)
1929  : [ff_pw_64]"f"(ff_pw_64.f),
1930  [srcstride]"r"((mips_reg)srcstride),
1931  [dststride]"r"((mips_reg)dststride),
1932  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
1933  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
1934  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
1935  : "memory" /* dst is written through a pointer inside the asm */
1936  );
1937 #else
 /* Reference C implementation (kept for documentation/bring-up). */
1938  const uint8_t *filter = subpel_filters[mx - 1];
1939  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1940  int x, y;
1941 
1942  for (y = 0; y < h; y++) {
1943  for (x = 0; x < 8; x++)
1944  dst[x] = FILTER_6TAP(src, filter, 1);
1945  dst += dststride;
1946  src += srcstride;
1947  }
1948 #endif
1949 }
1950 
/*
 * VP8 epel MC: 4-pixel-wide, 6-tap HORIZONTAL subpel filter (Loongson MMI).
 * For each of the h rows computes dst[x] = FILTER_6TAP(src, filter, 1) for
 * x = 0..3 (see formulas below / #else fallback), then steps dst/src by
 * their strides. mx - 1 indexes the filter table; my is unused here.
 */
1951 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1952  ptrdiff_t srcstride, int h, int mx, int my)
1953 {
1954 #if 1
 /* Hand-written MMI path; flip to 0 for the reference C implementation. */
1955  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1956  double ftmp[6]; /* asm temporaries bound to FP/MMI regs via "f" constraints */
1957  uint32_t tmp[1];
 /* Raw 64-bit coefficients reinterpreted as double for "f" constraints. */
1958  union av_intfloat64 filter0;
1959  union av_intfloat64 filter1;
1960  union av_intfloat64 filter2;
1961  union av_intfloat64 filter3;
1962  union av_intfloat64 filter4;
1963  union av_intfloat64 filter5;
1965  filter0.i = filter[0];
1966  filter1.i = filter[1];
1967  filter2.i = filter[2];
1968  filter3.i = filter[3];
1969  filter4.i = filter[4];
1970  filter5.i = filter[5];
1971 
1972  /*
1973  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1974  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1975  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1976  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1977  */
1978  __asm__ volatile (
 /* ftmp0 = 0 */
1979  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
 /* ftmp4 = 7, matching the ">> 7" in the formulas above */
1980  "li %[tmp0], 0x07 \n\t"
1981  "mtc1 %[tmp0], %[ftmp4] \n\t"
1982 
 /* per-row loop */
1983  "1: \n\t"
1984  PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
1985 
1986  "addiu %[h], %[h], -0x01 \n\t"
1987  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1988  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1989  "bnez %[h], 1b \n\t"
1990  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1991  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1992  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1993  [tmp0]"=&r"(tmp[0]),
1995  [h]"+&r"(h),
1996  [dst]"+&r"(dst), [src]"+&r"(src)
1997  : [ff_pw_64]"f"(ff_pw_64.f),
1998  [srcstride]"r"((mips_reg)srcstride),
1999  [dststride]"r"((mips_reg)dststride),
2000  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
2001  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
2002  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
2003  : "memory" /* dst is written through a pointer inside the asm */
2004  );
2005 #else
 /* Reference C implementation (kept for documentation/bring-up). */
2006  const uint8_t *filter = subpel_filters[mx - 1];
2007  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2008  int x, y;
2009 
2010  for (y = 0; y < h; y++) {
2011  for (x = 0; x < 4; x++)
2012  dst[x] = FILTER_6TAP(src, filter, 1);
2013  dst += dststride;
2014  src += srcstride;
2015  }
2016 #endif
2017 }
2018 
/*
 * VP8 epel MC: 16-pixel-wide, 4-tap VERTICAL subpel filter (Loongson MMI).
 * Each row is processed as two 8-pixel halves: columns 0-7 via src/dst and
 * columns 8-15 via src0/dst0 = src/dst + 8 (same scheme as the h6/v6
 * 16-wide siblings). Per pixel this is FILTER_4TAP(src, filter, srcstride),
 * see the formulas below / the #else fallback.
 * my - 1 indexes the filter table; mx is unused in this vertical variant.
 *
 * FIX: the second PUT_VP8_EPEL8_V4_MMI invocation previously wrote its
 * result through %[dst] (clobbering columns 0-7) while %[dst0] was computed
 * but never used; it now writes columns 8-15 through %[dst0], matching
 * ff_put_vp8_epel16_v6_mmi and ff_put_vp8_epel16_h6_mmi.
 */
2019 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2020  ptrdiff_t srcstride, int h, int mx, int my)
2021 {
2022 #if 1
 /* Hand-written MMI path; flip to 0 for the reference C implementation. */
2023  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2024  double ftmp[9]; /* asm temporaries bound to FP/MMI regs via "f" constraints */
2025  uint32_t tmp[1];
2026  mips_reg src0, src1, dst0; /* src0/dst0: second-half pointers; src1: asm scratch */
 /* Raw 64-bit coefficients reinterpreted as double for "f" constraints. */
2027  union av_intfloat64 filter1;
2028  union av_intfloat64 filter2;
2029  union av_intfloat64 filter3;
2030  union av_intfloat64 filter4;
2032  filter1.i = filter[1];
2033  filter2.i = filter[2];
2034  filter3.i = filter[3];
2035  filter4.i = filter[4];
2036 
2037  /*
2038  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2039  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2040  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2041  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2042  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2043  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2044  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2045  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2046 
2047  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
2048  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2049  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2050  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2051  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2052  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2053  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2054  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
2055  */
2056  __asm__ volatile (
 /* ftmp0 = 0 */
2057  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
 /* ftmp4 = 7, matching the ">> 7" in the formulas above */
2058  "li %[tmp0], 0x07 \n\t"
2059  "mtc1 %[tmp0], %[ftmp4] \n\t"
2060 
 /* per-row loop */
2061  "1: \n\t"
2062  // 0 - 7
2063  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2064  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2065  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2066  // 8 - 15 (write through dst0, NOT dst — see FIX note above)
2067  PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2068 
2069  "addiu %[h], %[h], -0x01 \n\t"
2070  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2071  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2072  "bnez %[h], 1b \n\t"
2073  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2074  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2075  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2076  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2077  [ftmp8]"=&f"(ftmp[8]),
2078  [tmp0]"=&r"(tmp[0]),
2080  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2081  [src1]"=&r"(src1),
2082  [h]"+&r"(h),
2083  [dst]"+&r"(dst), [src]"+&r"(src)
2084  : [ff_pw_64]"f"(ff_pw_64.f),
2085  [srcstride]"r"((mips_reg)srcstride),
2086  [dststride]"r"((mips_reg)dststride),
2087  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
2088  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
2089  : "memory" /* dst is written through a pointer inside the asm */
2090  );
2091 #else
 /* Reference C implementation (kept for documentation/bring-up). */
2092  const uint8_t *filter = subpel_filters[my - 1];
2093  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2094  int x, y;
2095 
2096  for (y = 0; y < h; y++) {
2097  for (x = 0; x < 16; x++)
2098  dst[x] = FILTER_4TAP(src, filter, srcstride);
2099  dst += dststride;
2100  src += srcstride;
2101  }
2102 #endif
2103 }
2104 
/*
 * VP8 epel MC: 8-pixel-wide, 4-tap VERTICAL subpel filter (Loongson MMI).
 * For each of the h rows computes dst[x] = FILTER_4TAP(src, filter,
 * srcstride) for x = 0..7 (formulas below / #else fallback), then steps
 * dst/src by their strides. my - 1 indexes the filter table; mx is unused
 * in this vertical variant.
 */
2105 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2106  ptrdiff_t srcstride, int h, int mx, int my)
2107 {
2108 #if 1
 /* Hand-written MMI path; flip to 0 for the reference C implementation. */
2109  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2110  double ftmp[9]; /* asm temporaries bound to FP/MMI regs via "f" constraints */
2111  uint32_t tmp[1];
2112  mips_reg src1; /* scratch pointer used inside the filter macro */
 /* Raw 64-bit coefficients reinterpreted as double for "f" constraints. */
2113  union av_intfloat64 filter1;
2114  union av_intfloat64 filter2;
2115  union av_intfloat64 filter3;
2116  union av_intfloat64 filter4;
2118  filter1.i = filter[1];
2119  filter2.i = filter[2];
2120  filter3.i = filter[3];
2121  filter4.i = filter[4];
2122 
2123  /*
2124  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2125  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2126  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2127  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2128  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2129  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2130  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2131  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2132  */
2133  __asm__ volatile (
 /* ftmp0 = 0 */
2134  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
 /* ftmp4 = 7, matching the ">> 7" in the formulas above */
2135  "li %[tmp0], 0x07 \n\t"
2136  "mtc1 %[tmp0], %[ftmp4] \n\t"
2137 
 /* per-row loop */
2138  "1: \n\t"
2139  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2140 
2141  "addiu %[h], %[h], -0x01 \n\t"
2142  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2143  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2144  "bnez %[h], 1b \n\t"
2145  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2146  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2147  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2148  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2149  [ftmp8]"=&f"(ftmp[8]),
2150  [tmp0]"=&r"(tmp[0]),
2152  [src1]"=&r"(src1),
2153  [h]"+&r"(h),
2154  [dst]"+&r"(dst), [src]"+&r"(src)
2155  : [ff_pw_64]"f"(ff_pw_64.f),
2156  [srcstride]"r"((mips_reg)srcstride),
2157  [dststride]"r"((mips_reg)dststride),
2158  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
2159  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
2160  : "memory" /* dst is written through a pointer inside the asm */
2161  );
2162 #else
 /* Reference C implementation (kept for documentation/bring-up). */
2163  const uint8_t *filter = subpel_filters[my - 1];
2164  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2165  int x, y;
2166 
2167  for (y = 0; y < h; y++) {
2168  for (x = 0; x < 8; x++)
2169  dst[x] = FILTER_4TAP(src, filter, srcstride);
2170  dst += dststride;
2171  src += srcstride;
2172  }
2173 #endif
2174 }
2175 
/*
 * VP8 epel MC: 4-pixel-wide, 4-tap VERTICAL subpel filter (Loongson MMI).
 * For each of the h rows computes dst[x] = FILTER_4TAP(src, filter,
 * srcstride) for x = 0..3 (formulas below / #else fallback), then steps
 * dst/src by their strides. my - 1 indexes the filter table; mx is unused
 * in this vertical variant.
 */
2176 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2177  ptrdiff_t srcstride, int h, int mx, int my)
2178 {
2179 #if 1
 /* Hand-written MMI path; flip to 0 for the reference C implementation. */
2180  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2181  double ftmp[6]; /* asm temporaries bound to FP/MMI regs via "f" constraints */
2182  uint32_t tmp[1];
2183  mips_reg src1; /* scratch pointer used inside the filter macro */
 /* Raw 64-bit coefficients reinterpreted as double for "f" constraints. */
2184  union av_intfloat64 filter1;
2185  union av_intfloat64 filter2;
2186  union av_intfloat64 filter3;
2187  union av_intfloat64 filter4;
2189  filter1.i = filter[1];
2190  filter2.i = filter[2];
2191  filter3.i = filter[3];
2192  filter4.i = filter[4];
2193 
2194  /*
2195  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2196  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2197  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2198  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2199  */
2200  __asm__ volatile (
 /* ftmp0 = 0 */
2201  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
 /* ftmp4 = 7, matching the ">> 7" in the formulas above */
2202  "li %[tmp0], 0x07 \n\t"
2203  "mtc1 %[tmp0], %[ftmp4] \n\t"
2204 
 /* per-row loop */
2205  "1: \n\t"
2206  PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2207 
2208  "addiu %[h], %[h], -0x01 \n\t"
2209  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2210  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2211  "bnez %[h], 1b \n\t"
2212  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2213  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2214  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2215  [tmp0]"=&r"(tmp[0]),
2217  [src1]"=&r"(src1),
2218  [h]"+&r"(h),
2219  [dst]"+&r"(dst), [src]"+&r"(src)
2220  : [ff_pw_64]"f"(ff_pw_64.f),
2221  [srcstride]"r"((mips_reg)srcstride),
2222  [dststride]"r"((mips_reg)dststride),
2223  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
2224  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
2225  : "memory" /* dst is written through a pointer inside the asm */
2226  );
2227 #else
 /* Reference C implementation (kept for documentation/bring-up). */
2228  const uint8_t *filter = subpel_filters[my - 1];
2229  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2230  int x, y;
2231 
2232  for (y = 0; y < h; y++) {
2233  for (x = 0; x < 4; x++)
2234  dst[x] = FILTER_4TAP(src, filter, srcstride);
2235  dst += dststride;
2236  src += srcstride;
2237  }
2238 #endif
2239 }
2240 
/*
 * VP8 epel MC: 16-pixel-wide, 6-tap VERTICAL subpel filter (Loongson MMI).
 * Each row is processed as two 8-pixel halves: columns 0-7 via src/dst and
 * columns 8-15 via src0/dst0 = src/dst + 8. Per pixel this is
 * FILTER_6TAP(src, filter, srcstride), see formulas below / #else fallback.
 * my - 1 indexes the filter table; mx is unused in this vertical variant.
 */
2241 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2242  ptrdiff_t srcstride, int h, int mx, int my)
2243 {
2244 #if 1
 /* Hand-written MMI path; flip to 0 for the reference C implementation. */
2245  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2246  double ftmp[9]; /* asm temporaries bound to FP/MMI regs via "f" constraints */
2247  uint32_t tmp[1];
2248  mips_reg src0, src1, dst0; /* src0/dst0: second-half pointers; src1: asm scratch */
 /* Raw 64-bit coefficients reinterpreted as double for "f" constraints. */
2249  union av_intfloat64 filter0;
2250  union av_intfloat64 filter1;
2251  union av_intfloat64 filter2;
2252  union av_intfloat64 filter3;
2253  union av_intfloat64 filter4;
2254  union av_intfloat64 filter5;
2256  filter0.i = filter[0];
2257  filter1.i = filter[1];
2258  filter2.i = filter[2];
2259  filter3.i = filter[3];
2260  filter4.i = filter[4];
2261  filter5.i = filter[5];
2262 
2263  /*
2264  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2265  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2266  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2267  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2268  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2269  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2270  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2271  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2272 
2273  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2274  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2275  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2276  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2277  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2278  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2279  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2280  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
2281  */
2282  __asm__ volatile (
 /* ftmp0 = 0 */
2283  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
 /* ftmp4 = 7, matching the ">> 7" in the formulas above */
2284  "li %[tmp0], 0x07 \n\t"
2285  "mtc1 %[tmp0], %[ftmp4] \n\t"
2286 
 /* per-row loop */
2287  "1: \n\t"
2288  // 0 - 7
2289  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2290  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2291  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2292  // 8 - 15
2293  PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2294 
2295  "addiu %[h], %[h], -0x01 \n\t"
2296  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2297  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2298  "bnez %[h], 1b \n\t"
2299  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2300  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2301  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2302  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2303  [ftmp8]"=&f"(ftmp[8]),
2304  [tmp0]"=&r"(tmp[0]),
2306  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2307  [src1]"=&r"(src1),
2308  [h]"+&r"(h),
2309  [dst]"+&r"(dst), [src]"+&r"(src)
2310  : [ff_pw_64]"f"(ff_pw_64.f),
2311  [srcstride]"r"((mips_reg)srcstride),
2312  [dststride]"r"((mips_reg)dststride),
2313  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
2314  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
2315  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
2316  : "memory" /* dst is written through a pointer inside the asm */
2317  );
2318 #else
 /* Reference C implementation (kept for documentation/bring-up). */
2319  const uint8_t *filter = subpel_filters[my - 1];
2320  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2321  int x, y;
2322 
2323  for (y = 0; y < h; y++) {
2324  for (x = 0; x < 16; x++)
2325  dst[x] = FILTER_6TAP(src, filter, srcstride);
2326  dst += dststride;
2327  src += srcstride;
2328  }
2329 #endif
2330 }
2331 
/*
 * VP8 epel MC: 8-pixel-wide, 6-tap VERTICAL subpel filter (Loongson MMI).
 * For each of the h rows computes dst[x] = FILTER_6TAP(src, filter,
 * srcstride) for x = 0..7 (formulas below / #else fallback), then steps
 * dst/src by their strides. my - 1 indexes the filter table; mx is unused
 * in this vertical variant.
 */
2332 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2333  ptrdiff_t srcstride, int h, int mx, int my)
2334 {
2335 #if 1
 /* Hand-written MMI path; flip to 0 for the reference C implementation. */
2336  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2337  double ftmp[9]; /* asm temporaries bound to FP/MMI regs via "f" constraints */
2338  uint32_t tmp[1];
2339  mips_reg src1; /* scratch pointer used inside the filter macro */
 /* Raw 64-bit coefficients reinterpreted as double for "f" constraints. */
2340  union av_intfloat64 filter0;
2341  union av_intfloat64 filter1;
2342  union av_intfloat64 filter2;
2343  union av_intfloat64 filter3;
2344  union av_intfloat64 filter4;
2345  union av_intfloat64 filter5;
2347  filter0.i = filter[0];
2348  filter1.i = filter[1];
2349  filter2.i = filter[2];
2350  filter3.i = filter[3];
2351  filter4.i = filter[4];
2352  filter5.i = filter[5];
2353 
2354  /*
2355  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2356  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2357  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2358  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2359  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2360  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2361  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2362  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2363  */
2364  __asm__ volatile (
 /* ftmp0 = 0 */
2365  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
 /* ftmp4 = 7, matching the ">> 7" in the formulas above */
2366  "li %[tmp0], 0x07 \n\t"
2367  "mtc1 %[tmp0], %[ftmp4] \n\t"
2368 
 /* per-row loop */
2369  "1: \n\t"
2370  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2371 
2372  "addiu %[h], %[h], -0x01 \n\t"
2373  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2374  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2375  "bnez %[h], 1b \n\t"
2376  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2377  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2378  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2379  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2380  [ftmp8]"=&f"(ftmp[8]),
2381  [tmp0]"=&r"(tmp[0]),
2383  [src1]"=&r"(src1),
2384  [h]"+&r"(h),
2385  [dst]"+&r"(dst), [src]"+&r"(src)
2386  : [ff_pw_64]"f"(ff_pw_64.f),
2387  [srcstride]"r"((mips_reg)srcstride),
2388  [dststride]"r"((mips_reg)dststride),
2389  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
2390  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
2391  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
2392  : "memory" /* dst is written through a pointer inside the asm */
2393  );
2394 #else
 /* Reference C implementation (kept for documentation/bring-up). */
2395  const uint8_t *filter = subpel_filters[my - 1];
2396  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2397  int x, y;
2398 
2399  for (y = 0; y < h; y++) {
2400  for (x = 0; x < 8; x++)
2401  dst[x] = FILTER_6TAP(src, filter, srcstride);
2402  dst += dststride;
2403  src += srcstride;
2404  }
2405 #endif
2406 }
2407 
/**
 * 4-pixel-wide, 6-tap vertical VP8 subpel filter, Loongson MMI version.
 *
 * For each of h rows:
 *   dst[x] = cm[(filter[2]*src[x] - filter[1]*src[x-srcstride]
 *              + filter[0]*src[x-2*srcstride] + filter[3]*src[x+srcstride]
 *              - filter[4]*src[x+2*srcstride] + filter[5]*src[x+3*srcstride]
 *              + 64) >> 7]   for x = 0..3
 * (see the reference C implementation in the #else branch).
 * mx is unused in the vertical pass; my selects the filter row.
 */
void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    /* NOTE(review): despite the "fourtap" name, each table row is indexed
     * 0..5 below, so it appears to carry all six packed tap values --
     * confirm against the table definition. */
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[6];
    uint32_t tmp[1];
    mips_reg src1;
    /* one 64-bit FP-register image per filter tap */
    union av_intfloat64 filter0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    union av_intfloat64 filter5;
    filter0.i = filter[0];
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];
    filter5.i = filter[5];

    __asm__ volatile (
        /* ftmp0 = 0; ftmp4 = 7, the shift count for the final ">> 7" */
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
        "li         %[tmp0],    0x07                            \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"

        "1:                                                     \n\t"
        /* filter one 4-pixel row; %[src1] is a scratch address register */
        PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])

        /* advance one row; loop until h reaches zero */
        "addiu      %[h],       %[h],           -0x01           \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]    \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]    \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),   /* +64 rounding constant */
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
          [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
          [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
        : "memory"
    );
#else
    /* plain C reference implementation */
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_6TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}
2477 
2478 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2479  ptrdiff_t srcstride, int h, int mx, int my)
2480 {
2481 #if 1
2482  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2483  uint8_t *tmp = tmp_array;
2484 
2485  src -= srcstride;
2486  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2487  tmp = tmp_array + 16;
2488  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2489 #else
2490  const uint8_t *filter = subpel_filters[mx - 1];
2491  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2492  int x, y;
2493  uint8_t tmp_array[560];
2494  uint8_t *tmp = tmp_array;
2495 
2496  src -= srcstride;
2497 
2498  for (y = 0; y < h + 3; y++) {
2499  for (x = 0; x < 16; x++)
2500  tmp[x] = FILTER_4TAP(src, filter, 1);
2501  tmp += 16;
2502  src += srcstride;
2503  }
2504 
2505  tmp = tmp_array + 16;
2506  filter = subpel_filters[my - 1];
2507 
2508  for (y = 0; y < h; y++) {
2509  for (x = 0; x < 16; x++)
2510  dst[x] = FILTER_4TAP(tmp, filter, 16);
2511  dst += dststride;
2512  tmp += 16;
2513  }
2514 #endif
2515 }
2516 
2517 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2518  ptrdiff_t srcstride, int h, int mx, int my)
2519 {
2520 #if 1
2521  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2522  uint8_t *tmp = tmp_array;
2523 
2524  src -= srcstride;
2525  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2526  tmp = tmp_array + 8;
2527  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2528 #else
2529  const uint8_t *filter = subpel_filters[mx - 1];
2530  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2531  int x, y;
2532  uint8_t tmp_array[152];
2533  uint8_t *tmp = tmp_array;
2534 
2535  src -= srcstride;
2536 
2537  for (y = 0; y < h + 3; y++) {
2538  for (x = 0; x < 8; x++)
2539  tmp[x] = FILTER_4TAP(src, filter, 1);
2540  tmp += 8;
2541  src += srcstride;
2542  }
2543 
2544  tmp = tmp_array + 8;
2545  filter = subpel_filters[my - 1];
2546 
2547  for (y = 0; y < h; y++) {
2548  for (x = 0; x < 8; x++)
2549  dst[x] = FILTER_4TAP(tmp, filter, 8);
2550  dst += dststride;
2551  tmp += 8;
2552  }
2553 #endif
2554 }
2555 
2556 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2557  ptrdiff_t srcstride, int h, int mx, int my)
2558 {
2559 #if 1
2560  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2561  uint8_t *tmp = tmp_array;
2562 
2563  src -= srcstride;
2564  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2565  tmp = tmp_array + 4;
2566  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2567 #else
2568  const uint8_t *filter = subpel_filters[mx - 1];
2569  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2570  int x, y;
2571  uint8_t tmp_array[44];
2572  uint8_t *tmp = tmp_array;
2573 
2574  src -= srcstride;
2575 
2576  for (y = 0; y < h + 3; y++) {
2577  for (x = 0; x < 4; x++)
2578  tmp[x] = FILTER_4TAP(src, filter, 1);
2579  tmp += 4;
2580  src += srcstride;
2581  }
2582  tmp = tmp_array + 4;
2583  filter = subpel_filters[my - 1];
2584 
2585  for (y = 0; y < h; y++) {
2586  for (x = 0; x < 4; x++)
2587  dst[x] = FILTER_4TAP(tmp, filter, 4);
2588  dst += dststride;
2589  tmp += 4;
2590  }
2591 #endif
2592 }
2593 
2594 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2595  ptrdiff_t srcstride, int h, int mx, int my)
2596 {
2597 #if 1
2598  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2599  uint8_t *tmp = tmp_array;
2600 
2601  src -= 2 * srcstride;
2602  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2603  tmp = tmp_array + 32;
2604  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2605 #else
2606  const uint8_t *filter = subpel_filters[mx - 1];
2607  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2608  int x, y;
2609  uint8_t tmp_array[592];
2610  uint8_t *tmp = tmp_array;
2611 
2612  src -= 2 * srcstride;
2613 
2614  for (y = 0; y < h + 5; y++) {
2615  for (x = 0; x < 16; x++)
2616  tmp[x] = FILTER_4TAP(src, filter, 1);
2617  tmp += 16;
2618  src += srcstride;
2619  }
2620 
2621  tmp = tmp_array + 32;
2622  filter = subpel_filters[my - 1];
2623 
2624  for (y = 0; y < h; y++) {
2625  for (x = 0; x < 16; x++)
2626  dst[x] = FILTER_6TAP(tmp, filter, 16);
2627  dst += dststride;
2628  tmp += 16;
2629  }
2630 #endif
2631 }
2632 
2633 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2634  ptrdiff_t srcstride, int h, int mx, int my)
2635 {
2636 #if 1
2637  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2638  uint8_t *tmp = tmp_array;
2639 
2640  src -= 2 * srcstride;
2641  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2642  tmp = tmp_array + 16;
2643  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2644 #else
2645  const uint8_t *filter = subpel_filters[mx - 1];
2646  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2647  int x, y;
2648  uint8_t tmp_array[168];
2649  uint8_t *tmp = tmp_array;
2650 
2651  src -= 2 * srcstride;
2652 
2653  for (y = 0; y < h + 5; y++) {
2654  for (x = 0; x < 8; x++)
2655  tmp[x] = FILTER_4TAP(src, filter, 1);
2656  tmp += 8;
2657  src += srcstride;
2658  }
2659 
2660  tmp = tmp_array + 16;
2661  filter = subpel_filters[my - 1];
2662 
2663  for (y = 0; y < h; y++) {
2664  for (x = 0; x < 8; x++)
2665  dst[x] = FILTER_6TAP(tmp, filter, 8);
2666  dst += dststride;
2667  tmp += 8;
2668  }
2669 #endif
2670 }
2671 
2672 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2673  ptrdiff_t srcstride, int h, int mx, int my)
2674 {
2675 #if 1
2676  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2677  uint8_t *tmp = tmp_array;
2678 
2679  src -= 2 * srcstride;
2680  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2681  tmp = tmp_array + 8;
2682  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2683 #else
2684  const uint8_t *filter = subpel_filters[mx - 1];
2685  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2686  int x, y;
2687  uint8_t tmp_array[52];
2688  uint8_t *tmp = tmp_array;
2689 
2690  src -= 2 * srcstride;
2691 
2692  for (y = 0; y < h + 5; y++) {
2693  for (x = 0; x < 4; x++)
2694  tmp[x] = FILTER_4TAP(src, filter, 1);
2695  tmp += 4;
2696  src += srcstride;
2697  }
2698 
2699  tmp = tmp_array + 8;
2700  filter = subpel_filters[my - 1];
2701 
2702  for (y = 0; y < h; y++) {
2703  for (x = 0; x < 4; x++)
2704  dst[x] = FILTER_6TAP(tmp, filter, 4);
2705  dst += dststride;
2706  tmp += 4;
2707  }
2708 #endif
2709 }
2710 
2711 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2712  ptrdiff_t srcstride, int h, int mx, int my)
2713 {
2714 #if 1
2715  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2716  uint8_t *tmp = tmp_array;
2717 
2718  src -= srcstride;
2719  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2720  tmp = tmp_array + 16;
2721  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2722 #else
2723  const uint8_t *filter = subpel_filters[mx - 1];
2724  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2725  int x, y;
2726  uint8_t tmp_array[560];
2727  uint8_t *tmp = tmp_array;
2728 
2729  src -= srcstride;
2730 
2731  for (y = 0; y < h + 3; y++) {
2732  for (x = 0; x < 16; x++)
2733  tmp[x] = FILTER_6TAP(src, filter, 1);
2734  tmp += 16;
2735  src += srcstride;
2736  }
2737 
2738  tmp = tmp_array + 16;
2739  filter = subpel_filters[my - 1];
2740 
2741  for (y = 0; y < h; y++) {
2742  for (x = 0; x < 16; x++)
2743  dst[x] = FILTER_4TAP(tmp, filter, 16);
2744  dst += dststride;
2745  tmp += 16;
2746  }
2747 #endif
2748 }
2749 
2750 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2751  ptrdiff_t srcstride, int h, int mx, int my)
2752 {
2753 #if 1
2754  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2755  uint8_t *tmp = tmp_array;
2756 
2757  src -= srcstride;
2758  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2759  tmp = tmp_array + 8;
2760  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2761 #else
2762  const uint8_t *filter = subpel_filters[mx - 1];
2763  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2764  int x, y;
2765  uint8_t tmp_array[152];
2766  uint8_t *tmp = tmp_array;
2767 
2768  src -= srcstride;
2769 
2770  for (y = 0; y < h + 3; y++) {
2771  for (x = 0; x < 8; x++)
2772  tmp[x] = FILTER_6TAP(src, filter, 1);
2773  tmp += 8;
2774  src += srcstride;
2775  }
2776 
2777  tmp = tmp_array + 8;
2778  filter = subpel_filters[my - 1];
2779 
2780  for (y = 0; y < h; y++) {
2781  for (x = 0; x < 8; x++)
2782  dst[x] = FILTER_4TAP(tmp, filter, 8);
2783  dst += dststride;
2784  tmp += 8;
2785  }
2786 #endif
2787 }
2788 
2789 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2790  ptrdiff_t srcstride, int h, int mx, int my)
2791 {
2792 #if 1
2793  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2794  uint8_t *tmp = tmp_array;
2795 
2796  src -= srcstride;
2797  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2798  tmp = tmp_array + 4;
2799  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2800 #else
2801  const uint8_t *filter = subpel_filters[mx - 1];
2802  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2803  int x, y;
2804  uint8_t tmp_array[44];
2805  uint8_t *tmp = tmp_array;
2806 
2807  src -= srcstride;
2808 
2809  for (y = 0; y < h + 3; y++) {
2810  for (x = 0; x < 4; x++)
2811  tmp[x] = FILTER_6TAP(src, filter, 1);
2812  tmp += 4;
2813  src += srcstride;
2814  }
2815 
2816  tmp = tmp_array + 4;
2817  filter = subpel_filters[my - 1];
2818 
2819  for (y = 0; y < h; y++) {
2820  for (x = 0; x < 4; x++)
2821  dst[x] = FILTER_4TAP(tmp, filter, 4);
2822  dst += dststride;
2823  tmp += 4;
2824  }
2825 #endif
2826 }
2827 
2828 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2829  ptrdiff_t srcstride, int h, int mx, int my)
2830 {
2831 #if 1
2832  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2833  uint8_t *tmp = tmp_array;
2834 
2835  src -= 2 * srcstride;
2836  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2837  tmp = tmp_array + 32;
2838  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2839 #else
2840  const uint8_t *filter = subpel_filters[mx - 1];
2841  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2842  int x, y;
2843  uint8_t tmp_array[592];
2844  uint8_t *tmp = tmp_array;
2845 
2846  src -= 2 * srcstride;
2847 
2848  for (y = 0; y < h + 5; y++) {
2849  for (x = 0; x < 16; x++)
2850  tmp[x] = FILTER_6TAP(src, filter, 1);
2851  tmp += 16;
2852  src += srcstride;
2853  }
2854 
2855  tmp = tmp_array + 32;
2856  filter = subpel_filters[my - 1];
2857 
2858  for (y = 0; y < h; y++) {
2859  for (x = 0; x < 16; x++)
2860  dst[x] = FILTER_6TAP(tmp, filter, 16);
2861  dst += dststride;
2862  tmp += 16;
2863  }
2864 #endif
2865 }
2866 
2867 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2868  ptrdiff_t srcstride, int h, int mx, int my)
2869 {
2870 #if 1
2871  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2872  uint8_t *tmp = tmp_array;
2873 
2874  src -= 2 * srcstride;
2875  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2876  tmp = tmp_array + 16;
2877  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2878 #else
2879  const uint8_t *filter = subpel_filters[mx - 1];
2880  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2881  int x, y;
2882  uint8_t tmp_array[168];
2883  uint8_t *tmp = tmp_array;
2884 
2885  src -= 2 * srcstride;
2886 
2887  for (y = 0; y < h + 5; y++) {
2888  for (x = 0; x < 8; x++)
2889  tmp[x] = FILTER_6TAP(src, filter, 1);
2890  tmp += 8;
2891  src += srcstride;
2892  }
2893 
2894  tmp = tmp_array + 16;
2895  filter = subpel_filters[my - 1];
2896 
2897  for (y = 0; y < h; y++) {
2898  for (x = 0; x < 8; x++)
2899  dst[x] = FILTER_6TAP(tmp, filter, 8);
2900  dst += dststride;
2901  tmp += 8;
2902  }
2903 #endif
2904 }
2905 
2906 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2907  ptrdiff_t srcstride, int h, int mx, int my)
2908 {
2909 #if 1
2910  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2911  uint8_t *tmp = tmp_array;
2912 
2913  src -= 2 * srcstride;
2914  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2915  tmp = tmp_array + 8;
2916  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2917 #else
2918  const uint8_t *filter = subpel_filters[mx - 1];
2919  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2920  int x, y;
2921  uint8_t tmp_array[52];
2922  uint8_t *tmp = tmp_array;
2923 
2924  src -= 2 * srcstride;
2925 
2926  for (y = 0; y < h + 5; y++) {
2927  for (x = 0; x < 4; x++)
2928  tmp[x] = FILTER_6TAP(src, filter, 1);
2929  tmp += 4;
2930  src += srcstride;
2931  }
2932 
2933  tmp = tmp_array + 8;
2934  filter = subpel_filters[my - 1];
2935 
2936  for (y = 0; y < h; y++) {
2937  for (x = 0; x < 4; x++)
2938  dst[x] = FILTER_6TAP(tmp, filter, 4);
2939  dst += dststride;
2940  tmp += 4;
2941  }
2942 #endif
2943 }
2944 
/**
 * 16-pixel-wide horizontal bilinear MC, Loongson MMI version.
 * For each of h rows:
 *   dst[x] = ((8 - mx) * src[x] + mx * src[x + 1] + 4) >> 3, x = 0..15
 * (see the reference C implementation in the #else branch). my is unused.
 */
void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 a, b;   /* the two bilinear weights: 8 - mx and mx */
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg dst0, src0;         /* addresses of the second 8-pixel half */
    a.i = 8 - mx;
    b.i = mx;

    __asm__ volatile (
        /* ftmp0 = 0; ftmp4 = 3, the shift count for the final ">> 3" */
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        /* replicate each weight across the four 16-bit lanes */
        "pshufh     %[a],       %[a],           %[ftmp0]        \n\t"
        "pshufh     %[b],       %[b],           %[ftmp0]        \n\t"

        "1:                                                     \n\t"
        // 0 - 7
        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
        PTR_ADDIU  "%[src0],    %[src],         0x08            \n\t"
        PTR_ADDIU  "%[dst0],    %[dst],         0x08            \n\t"
        // 8 - 15
        PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])

        /* advance one row; loop until h reaches zero */
        "addiu      %[h],       %[h],           -0x01           \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]      \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]      \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          [dst0]"=&r"(dst0), [src0]"=&r"(src0),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [a]"+&f"(a.f), [b]"+&f"(b.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)   /* +4 rounding constant */
        : "memory"
    );
#else
    /* plain C reference implementation */
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}
3022 
/**
 * 16-pixel-wide vertical bilinear MC, Loongson MMI version.
 * For each of h rows:
 *   dst[x] = ((8 - my) * src[x] + my * src[x + sstride] + 4) >> 3, x = 0..15
 * (see the reference C implementation in the #else branch). mx is unused.
 */
void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 c, d;   /* the two bilinear weights: 8 - my and my */
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src0, src1, dst0;   /* src0/dst0: second 8-pixel half;
                                  * src1: scratch for the next-row address */
    c.i = 8 - my;
    d.i = my;

    __asm__ volatile (
        /* ftmp0 = 0; ftmp4 = 3, the shift count for the final ">> 3" */
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        /* replicate each weight across the four 16-bit lanes */
        "pshufh     %[c],       %[c],           %[ftmp0]        \n\t"
        "pshufh     %[d],       %[d],           %[ftmp0]        \n\t"

        "1:                                                     \n\t"
        // 0 - 7
        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
        PTR_ADDIU  "%[src0],    %[src],         0x08            \n\t"
        PTR_ADDIU  "%[dst0],    %[dst],         0x08            \n\t"
        // 8 - 15
        PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])

        /* advance one row; loop until h reaches zero */
        "addiu      %[h],       %[h],           -0x01           \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]      \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]      \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          [src0]"=&r"(src0), [dst0]"=&r"(dst0),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c.f), [d]"+&f"(d.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)   /* +4 rounding constant */
        : "memory"
    );
#else
    /* plain C reference implementation */
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}
3092 
3093 void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3094  ptrdiff_t sstride, int h, int mx, int my)
3095 {
3096 #if 1
3097  DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
3098  uint8_t *tmp = tmp_array;
3099 
3100  ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
3101  ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
3102 #else
3103  int a = 8 - mx, b = mx;
3104  int c = 8 - my, d = my;
3105  int x, y;
3106  uint8_t tmp_array[528];
3107  uint8_t *tmp = tmp_array;
3108 
3109  for (y = 0; y < h + 1; y++) {
3110  for (x = 0; x < 16; x++)
3111  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3112  tmp += 16;
3113  src += sstride;
3114  }
3115 
3116  tmp = tmp_array;
3117 
3118  for (y = 0; y < h; y++) {
3119  for (x = 0; x < 16; x++)
3120  dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
3121  dst += dstride;
3122  tmp += 16;
3123  }
3124 #endif
3125 }
3126 
/**
 * 8-pixel-wide horizontal bilinear MC, Loongson MMI version.
 * For each of h rows:
 *   dst[x] = ((8 - mx) * src[x] + mx * src[x + 1] + 4) >> 3, x = 0..7
 * (see the reference C implementation in the #else branch). my is unused.
 */
void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 a, b;   /* the two bilinear weights: 8 - mx and mx */
    double ftmp[7];
    uint32_t tmp[1];
    a.i = 8 - mx;
    b.i = mx;

    __asm__ volatile (
        /* ftmp0 = 0; ftmp4 = 3, the shift count for the final ">> 3" */
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        /* replicate each weight across the four 16-bit lanes */
        "pshufh     %[a],       %[a],           %[ftmp0]        \n\t"
        "pshufh     %[b],       %[b],           %[ftmp0]        \n\t"

        "1:                                                     \n\t"
        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])

        /* advance one row; loop until h reaches zero */
        "addiu      %[h],       %[h],           -0x01           \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]      \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]      \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [a]"+&f"(a.f), [b]"+&f"(b.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)   /* +4 rounding constant */
        : "memory"
    );
#else
    /* plain C reference implementation */
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}
3188 
/**
 * 8-pixel-wide vertical bilinear MC, Loongson MMI version.
 * For each of h rows:
 *   dst[x] = ((8 - my) * src[x] + my * src[x + sstride] + 4) >> 3, x = 0..7
 * (see the reference C implementation in the #else branch). mx is unused.
 */
void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 c, d;   /* the two bilinear weights: 8 - my and my */
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;               /* scratch for the next-row address */
    c.i = 8 - my;
    d.i = my;

    __asm__ volatile (
        /* ftmp0 = 0; ftmp4 = 3, the shift count for the final ">> 3" */
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        /* replicate each weight across the four 16-bit lanes */
        "pshufh     %[c],       %[c],           %[ftmp0]        \n\t"
        "pshufh     %[d],       %[d],           %[ftmp0]        \n\t"

        "1:                                                     \n\t"
        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])

        /* advance one row; loop until h reaches zero */
        "addiu      %[h],       %[h],           -0x01           \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]      \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]      \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c.f), [d]"+&f"(d.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)   /* +4 rounding constant */
        : "memory"
    );
#else
    /* plain C reference implementation */
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}
3252 
3253 void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3254  ptrdiff_t sstride, int h, int mx, int my)
3255 {
3256 #if 1
3257  DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
3258  uint8_t *tmp = tmp_array;
3259 
3260  ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
3261  ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
3262 #else
3263  int a = 8 - mx, b = mx;
3264  int c = 8 - my, d = my;
3265  int x, y;
3266  uint8_t tmp_array[136];
3267  uint8_t *tmp = tmp_array;
3268 
3269  for (y = 0; y < h + 1; y++) {
3270  for (x = 0; x < 8; x++)
3271  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3272  tmp += 8;
3273  src += sstride;
3274  }
3275 
3276  tmp = tmp_array;
3277 
3278  for (y = 0; y < h; y++) {
3279  for (x = 0; x < 8; x++)
3280  dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
3281  dst += dstride;
3282  tmp += 8;
3283  }
3284 #endif
3285 }
3286 
/**
 * 4-pixel-wide horizontal bilinear MC, Loongson MMI version.
 * For each of h rows:
 *   dst[x] = ((8 - mx) * src[x] + mx * src[x + 1] + 4) >> 3, x = 0..3
 * (see the reference C implementation in the #else branch). my is unused.
 */
void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 a, b;   /* the two bilinear weights: 8 - mx and mx */
    double ftmp[5];
    uint32_t tmp[1];
    a.i = 8 - mx;
    b.i = mx;

    __asm__ volatile (
        /* ftmp0 = 0; ftmp4 = 3, the shift count for the final ">> 3" */
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        /* replicate each weight across the four 16-bit lanes */
        "pshufh     %[a],       %[a],           %[ftmp0]        \n\t"
        "pshufh     %[b],       %[b],           %[ftmp0]        \n\t"

        "1:                                                     \n\t"
        PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])

        /* advance one row; loop until h reaches zero */
        "addiu      %[h],       %[h],           -0x01           \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]      \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]      \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [a]"+&f"(a.f), [b]"+&f"(b.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)   /* +4 rounding constant */
        : "memory"
    );
#else
    /* plain C reference implementation */
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}
3345 
/**
 * 4-pixel-wide vertical bilinear MC, Loongson MMI version.
 * For each of h rows:
 *   dst[x] = ((8 - my) * src[x] + my * src[x + sstride] + 4) >> 3, x = 0..3
 * (see the reference C implementation in the #else branch). mx is unused.
 */
void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 c, d;   /* the two bilinear weights: 8 - my and my */
    /* NOTE(review): only ftmp0..ftmp4 appear in the constraint list below,
     * so ftmp[7] looks oversized (harmless) -- possibly left over from an
     * earlier revision; confirm before shrinking. */
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;               /* scratch for the next-row address */
    c.i = 8 - my;
    d.i = my;

    __asm__ volatile (
        /* ftmp0 = 0; ftmp4 = 3, the shift count for the final ">> 3" */
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        /* replicate each weight across the four 16-bit lanes */
        "pshufh     %[c],       %[c],           %[ftmp0]        \n\t"
        "pshufh     %[d],       %[d],           %[ftmp0]        \n\t"

        "1:                                                     \n\t"
        PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])

        /* advance one row; loop until h reaches zero */
        "addiu      %[h],       %[h],           -0x01           \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]      \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]      \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c.f), [d]"+&f"(d.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)   /* +4 rounding constant */
        : "memory"
    );
#else
    /* plain C reference implementation */
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}
3406 
3407 void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3408  ptrdiff_t sstride, int h, int mx, int my)
3409 {
3410 #if 1
3411  DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
3412  uint8_t *tmp = tmp_array;
3413 
3414  ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
3415  ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
3416 #else
3417  int a = 8 - mx, b = mx;
3418  int c = 8 - my, d = my;
3419  int x, y;
3420  uint8_t tmp_array[36];
3421  uint8_t *tmp = tmp_array;
3422 
3423  for (y = 0; y < h + 1; y++) {
3424  for (x = 0; x < 4; x++)
3425  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3426  tmp += 4;
3427  src += sstride;
3428  }
3429 
3430  tmp = tmp_array;
3431 
3432  for (y = 0; y < h; y++) {
3433  for (x = 0; x < 4; x++)
3434  dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
3435  dst += dstride;
3436  tmp += 4;
3437  }
3438 #endif
3439 }
DECLARE_UINT32_T
#define DECLARE_UINT32_T
Definition: vp8dsp_mmi.c:32
PUT_VP8_EPEL4_V6_MMI
#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride)
Definition: vp8dsp_mmi.c:258
q1
static const uint8_t q1[256]
Definition: twofish.c:100
mem_internal.h
ff_put_vp8_epel4_h4_mmi
void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:1724
FILTER_4TAP
#define FILTER_4TAP(src, F, stride)
Definition: vp8dsp.c:488
vp8_filter_common_isnot4tap
static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p, ptrdiff_t stride)
Definition: vp8dsp_mmi.c:693
ff_vp8_h_loop_filter16_mmi
void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:1357
filter1
static void filter1(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
Definition: dcadsp.c:360
src1
const pixel * src1
Definition: h264pred_template.c:421
ff_pw_4
const union av_intfloat64 ff_pw_4
Definition: constants.c:28
ff_vp8_v_loop_filter_simple_mmi
void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
Definition: vp8dsp_mmi.c:1424
DECLARE_VAR_LOW32
#define DECLARE_VAR_LOW32
Definition: mmiutils.h:37
av_unused
#define av_unused
Definition: attributes.h:131
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
w
uint8_t w
Definition: llviddspenc.c:38
t0
#define t0
Definition: regdef.h:28
b
#define b
Definition: input.c:41
ff_put_vp8_pixels16_mmi
void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int x, int y)
Definition: vp8dsp_mmi.c:1442
ff_put_vp8_bilinear16_v_mmi
void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:3023
t1
#define t1
Definition: regdef.h:29
filter
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
Definition: filter_design.txt:228
mips_reg
#define mips_reg
Definition: asmdefs.h:46
RESTRICT_ASM_DOUBLE_1
#define RESTRICT_ASM_DOUBLE_1
Definition: vp8dsp_mmi.c:33
ff_crop_tab
#define ff_crop_tab
Definition: motionpixels_tablegen.c:26
ff_vp8_h_loop_filter_simple_mmi
void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
Definition: vp8dsp_mmi.c:1433
PUT_VP8_EPEL8_V4_MMI
#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride)
Definition: vp8dsp_mmi.c:492
t10
#define t10
Definition: regdef.h:55
ff_vp8_luma_dc_wht_mmi
void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
Definition: vp8dsp_mmi.c:946
vp8_simple_limit
static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride, int flim)
Definition: vp8dsp_mmi.c:720
ff_pw_64
const union av_intfloat64 ff_pw_64
Definition: constants.c:44
PUT_VP8_BILINEAR4_H_MMI
#define PUT_VP8_BILINEAR4_H_MMI(src, dst)
Definition: vp8dsp_mmi.c:560
TRANSPOSE_4H
#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3, fr_t0, fr_t1, fr_t2, fr_t3)
brief: Transpose 4X4 half word packaged data.
Definition: mmiutils.h:295
ff_put_vp8_bilinear16_h_mmi
void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2945
ff_put_vp8_bilinear8_v_mmi
void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:3189
vp8_filter_common_is4tap
static av_always_inline void vp8_filter_common_is4tap(uint8_t *p, ptrdiff_t stride)
Definition: vp8dsp_mmi.c:668
ff_put_vp8_epel8_h4v4_mmi
void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2517
constants.h
val
static double val(void *priv, double ch)
Definition: aeval.c:78
fourtap_subpel_filters
static const uint64_t fourtap_subpel_filters[7][6]
Definition: vp8dsp_mmi.c:620
DECLARE_DOUBLE_2
#define DECLARE_DOUBLE_2
Definition: vp8dsp_mmi.c:31
mmiutils.h
a1
#define a1
Definition: regdef.h:47
ff_put_vp8_epel16_h6v6_mmi
void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2828
ff_put_vp8_epel8_h4v6_mmi
void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2633
mask
static const uint16_t mask[17]
Definition: lzw.c:38
vp8_v_loop_filter8_mmi
static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:786
ff_put_vp8_epel4_h4v6_mmi
void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2672
AV_ZERO64
#define AV_ZERO64(d)
Definition: intreadwrite.h:629
ff_put_vp8_epel16_v6_mmi
void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2241
PUT_VP8_BILINEAR4_V_MMI
#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride)
Definition: vp8dsp_mmi.c:602
PUT_VP8_BILINEAR8_V_MMI
#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride)
Definition: vp8dsp_mmi.c:577
ff_vp8_h_loop_filter16_inner_mmi
void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:1395
ff_put_vp8_epel8_v4_mmi
void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2105
ff_vp8_idct_add_mmi
void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
Definition: vp8dsp_mmi.c:1104
av_intfloat64
Definition: intfloat.h:32
ff_put_vp8_epel4_h4v4_mmi
void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2556
q0
static const uint8_t q0[256]
Definition: twofish.c:81
E
#define E
Definition: avdct.c:32
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:72
FILTER_6TAP
#define FILTER_6TAP(src, F, stride)
Definition: vp8dsp.c:483
ff_put_vp8_epel4_h6_mmi
void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:1951
PUT_VP8_BILINEAR8_H_MMI
#define PUT_VP8_BILINEAR8_H_MMI(src, dst)
Definition: vp8dsp_mmi.c:536
RESTRICT_ASM_UINT32_T
#define RESTRICT_ASM_UINT32_T
Definition: vp8dsp_mmi.c:35
PUT_VP8_EPEL4_V4_MMI
#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride)
Definition: vp8dsp_mmi.c:301
ff_put_vp8_bilinear4_h_mmi
void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:3287
ff_put_vp8_epel8_h6v6_mmi
void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2867
ff_put_vp8_epel8_h4_mmi
void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:1654
MMI_VP8_LOOP_FILTER
#define MMI_VP8_LOOP_FILTER
Definition: vp8dsp_mmi.c:49
av_intfloat64::i
uint64_t i
Definition: intfloat.h:33
ff_put_vp8_epel4_h6v6_mmi
void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2906
vp8_v_loop_filter8_inner_mmi
static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:845
ff_put_vp8_epel16_h6_mmi
void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:1787
PUT_VP8_EPEL4_H4_MMI
#define PUT_VP8_EPEL4_H4_MMI(src, dst)
Definition: vp8dsp_mmi.c:230
ff_vp8_v_loop_filter16_inner_mmi
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:1380
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
RESTRICT_ASM_DOUBLE_2
#define RESTRICT_ASM_DOUBLE_2
Definition: vp8dsp_mmi.c:34
ff_vp8_v_loop_filter8uv_mmi
void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:1365
t11
#define t11
Definition: regdef.h:56
vp8_h_loop_filter8_mmi
static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:860
dc
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
ff_vp8_idct_dc_add4y_mmi
void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
Definition: vp8dsp_mmi.c:1331
ff_put_vp8_epel8_h6v4_mmi
void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2750
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem_internal.h:109
ff_put_vp8_epel16_v4_mmi
void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2019
ff_put_vp8_bilinear8_hv_mmi
void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:3253
ff_put_vp8_bilinear4_v_mmi
void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:3346
ff_put_vp8_epel8_h6_mmi
void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:1877
t12
#define t12
Definition: regdef.h:58
ff_vp8_idct_dc_add_mmi
void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
Definition: vp8dsp_mmi.c:1273
ff_vp8_idct_dc_add4uv_mmi
void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
Definition: vp8dsp_mmi.c:1340
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
PTR_SUBU
#define PTR_SUBU
Definition: asmdefs.h:52
DECLARE_VAR_ALL64
#define DECLARE_VAR_ALL64
Definition: mmiutils.h:39
attributes.h
vp8_normal_limit
static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride, int E, int I)
Definition: vp8dsp_mmi.c:768
a0
#define a0
Definition: regdef.h:46
ff_put_vp8_epel16_h4v4_mmi
void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2478
ff_put_vp8_epel4_v4_mmi
void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2176
ff_put_vp8_bilinear16_hv_mmi
void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:3093
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:255
ff_put_vp8_bilinear8_h_mmi
void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:3127
t3
#define t3
Definition: regdef.h:31
ff_vp8_v_loop_filter8uv_inner_mmi
void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:1410
ff_vp8_v_loop_filter16_mmi
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:1350
ff_put_vp8_epel16_h4_mmi
void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:1569
clip_int8
#define clip_int8(n)
Definition: vp8dsp_mmi.c:667
ff_put_vp8_bilinear4_hv_mmi
void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:3407
a2
#define a2
Definition: regdef.h:48
DECLARE_DOUBLE_1
#define DECLARE_DOUBLE_1
Definition: vp8dsp_mmi.c:30
av_always_inline
#define av_always_inline
Definition: attributes.h:49
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
PUT_VP8_EPEL8_V6_MMI
#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride)
Definition: vp8dsp_mmi.c:430
ff_put_vp8_epel16_h6v4_mmi
void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2711
PUT_VP8_EPEL8_H4_MMI
#define PUT_VP8_EPEL8_H4_MMI(src, dst)
Definition: vp8dsp_mmi.c:389
av_intfloat64::f
double f
Definition: intfloat.h:34
stride
#define stride
Definition: h264pred_template.c:537
ff_put_vp8_epel8_v6_mmi
void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2332
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
vp8_h_loop_filter8_inner_mmi
static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:931
ff_vp8_h_loop_filter8uv_inner_mmi
void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:1417
vp8dsp_mips.h
PTR_ADDU
#define PTR_ADDU
Definition: asmdefs.h:49
ff_vp8_luma_dc_wht_dc_mmi
void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
Definition: vp8dsp_mmi.c:1080
RESTRICT_ASM_LOW32
#define RESTRICT_ASM_LOW32
Definition: mmiutils.h:38
ff_put_vp8_epel16_h4v6_mmi
void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2594
t2
#define t2
Definition: regdef.h:30
ff_put_vp8_pixels4_mmi
void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int x, int y)
Definition: vp8dsp_mmi.c:1529
TRANSPOSE_8B
#define TRANSPOSE_8B(fr_i0, fr_i1, fr_i2, fr_i3, fr_i4, fr_i5, fr_i6, fr_i7, fr_t0, fr_t1, fr_t2, fr_t3)
brief: Transpose 8x8 byte packaged data.
Definition: mmiutils.h:311
PUT_VP8_EPEL4_H6_MMI
#define PUT_VP8_EPEL4_H6_MMI(src, dst)
Definition: vp8dsp_mmi.c:193
cm
#define cm
Definition: dvbsubdec.c:39
PTR_ADDIU
#define PTR_ADDIU
Definition: asmdefs.h:50
ff_put_vp8_epel4_h6v4_mmi
void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2789
av_clip_uint8
#define av_clip_uint8
Definition: common.h:104
src0
const pixel *const src0
Definition: h264pred_template.c:420
filter0
static void filter0(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
Definition: dcadsp.c:352
filter_mbedge
static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
Definition: vp8dsp_mmi.c:741
mmi_intfloat64
Definition: asmdefs.h:103
hev
static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
Definition: vp8dsp_mmi.c:731
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
ff_put_vp8_epel4_v6_mmi
void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my)
Definition: vp8dsp_mmi.c:2408
d
d
Definition: ffmpeg_filter.c:409
ff_put_vp8_pixels8_mmi
void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int h, int x, int y)
Definition: vp8dsp_mmi.c:1489
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
PUT_VP8_EPEL8_H6_MMI
#define PUT_VP8_EPEL8_H6_MMI(src, dst)
Definition: vp8dsp_mmi.c:332
h
h
Definition: vp9dsp_template.c:2038
MAX_NEG_CROP
#define MAX_NEG_CROP
Definition: mathops.h:31
RESTRICT_ASM_ALL64
#define RESTRICT_ASM_ALL64
Definition: mmiutils.h:40
ff_vp8_h_loop_filter8uv_mmi
void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
Definition: vp8dsp_mmi.c:1372
subpel_filters
static const uint8_t subpel_filters[7][6]
Definition: vp8dsp.c:459