FFmpeg
simple_idct_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
22 #include "idctdsp_mips.h"
23 
/*
 * In-place 8x8 inverse DCT on `block`, implemented with MIPS MSA vectors.
 *
 * Eight v8i16 vectors are read from `block` (LD_SH8, stride argument 8),
 * transposed, run through a first 1-D pass, transposed again, run through
 * a second 1-D pass and written back to `block` (ST_SW8, stride argument 8).
 *
 * The two passes share the same arithmetic and differ only in:
 *   - bias / shift: pass 1 adds 1 << 10 and shifts right by 11,
 *                   pass 2 adds 16383 * ((1 << 19) / 16383) and shifts by 20;
 *   - pass 1 has a shortcut for lanes where in1..in7 are all zero.
 *
 * block: coefficient buffer, read and overwritten.
 *        NOTE(review): alignment requirements come from LD_SH8 / ST_SW8,
 *        which are defined outside this file - confirm against callers.
 */
24 static void simple_idct_msa(int16_t *block)
25 {
26  int32_t const_val;
    /* weights[k] is splatted below as wk (k = 1..7); lane 0 is never read. */
27  v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
28  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
29  v8i16 w1, w3, w5, w7;
30  v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
31  v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
32  v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
33  v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
34  v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
35  v4i32 w2, w4, w6;
36  v8i16 select_vec, temp;
37  v8i16 zero = { 0 };
38  v4i32 const_val0 = __msa_ldi_w(1);
39  v4i32 const_val1 = __msa_ldi_w(1);
40 
41  LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    /* const_val0: 1 << 10 in every 32-bit lane; half of the 1 << 11
     * divisor applied by the pass-1 shift, i.e. a rounding bias. */
42  const_val0 <<= 10;
    /* const_val1: 16383 * 32 = 524256 broadcast to every lane; the bias
     * added before the pass-2 shift by 20 (close to, but not exactly,
     * 1 << 19 because it is a multiple of weights[4]). */
43  const_val = 16383 * ((1 << 19) / 16383);
44  const_val1 = __msa_insert_w(const_val0, 0, const_val);
45  const_val1 = __msa_splati_w(const_val1, 0);
46  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
47  in0, in1, in2, in3, in4, in5, in6, in7);
    /* select_vec: all-ones in each halfword lane where in1..in7 are all
     * zero (OR of the seven vectors compared unsigned-less-than 1). */
48  select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
49  select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
50  UNPCK_SH_SW(in0, a0_r, a0_l);
51  UNPCK_SH_SW(in2, temp3_r, temp3_l);
    /* Shortcut value for the lanes flagged in select_vec: in0 * 8. */
52  temp = in0 << 3;
    /* Splat weights[2], [4], [6] and pair each halfword with a zero
     * halfword so that w2/w4/w6 hold the weight in 32-bit lanes. */
53  w2 = (v4i32) __msa_splati_h(weights, 2);
54  w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
55  w4 = (v4i32) __msa_splati_h(weights, 4);
56  w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
57  w6 = (v4i32) __msa_splati_h(weights, 6);
58  w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    /* Even part, pass 1: a0..a3 are built from in0, in2, in4 and in6.
     * temp0 = w4 * in0 + bias, temp1 = w2 * in2, temp2 = w6 * in2. */
59  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
60  ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
61  MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
62  temp1_r, temp1_l, temp2_r, temp2_l);
63  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
64  temp2_l, temp2_r, temp1_l, temp1_r,
65  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    /* Fold in in4 and in6: temp0 = w4 * in4, temp2 = w2 * in6,
     * temp1 = w6 * in6 (note temp1/temp2 swap roles versus above). */
66  UNPCK_SH_SW(in4, temp0_r, temp0_l);
67  UNPCK_SH_SW(in6, temp3_r, temp3_l);
68  MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
69  MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
70  temp2_r, temp2_l, temp1_r, temp1_l);
71  ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
72  SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
73  a1_r, a1_l, a2_r, a2_l);
74  ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
75  a3_r, a3_l, a0_r, a0_l);
76  SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
77  ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
78  SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    /* Odd part, pass 1: b0..b3 are dot products of interleaved (in1, in3)
     * pairs with the weight pairs const0..const3, then (in5, in7) pairs
     * are accumulated with const4..const7. */
79  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
80  SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
81  ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
82  ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
83  const0, const1, const2, const3);
84  ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
85  const5 = __msa_ilvod_h(-w1, -w5);
86  const7 = __msa_ilvod_h(w3, -w1);
87  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
88  b0_r, b1_r, b2_r, b3_r);
89  DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
90  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
91  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
92  b0_l, b1_l, b2_l, b3_l);
93  DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
94  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    /* Combine even (a) and odd (b) parts. Results land in temp0..temp3
     * and are written back into a3..a0; presumably sums and differences,
     * BUTTERFLY_16 is defined elsewhere - confirm. */
95  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
96  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
97  temp0_r, temp0_l, temp1_r, temp1_l,
98  temp2_r, temp2_l, temp3_r, temp3_l,
99  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    /* Scale pass-1 results by >> 11 and pack back to halfwords. */
100  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
101  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
102  PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
103  temp2_l, temp2_r, temp3_l, temp3_r,
104  temp0_r, temp1_r, temp2_r, temp3_r);
    /* Where select_vec is set, replace the computed value by `temp`
     * (in0 << 3); other lanes keep the full computation. */
105  in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
106  (v16u8) select_vec);
107  in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
108  (v16u8) select_vec);
109  in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
110  (v16u8) select_vec);
111  in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
112  (v16u8) select_vec);
113  SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
114  SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
115  PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
116  a0_r, a1_r, a2_r, a3_r);
    /* Vectors 4..7 take a3, a2, a1, a0 in that (reversed) order. */
117  in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
118  in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
119  in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
120  in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
121  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
122  in0, in1, in2, in3, in4, in5, in6, in7);
123 
    /* Pass 2: same even/odd computation on the transposed data, using
     * const_val1 as bias and a shift of 20; no zero-lane shortcut. */
124  UNPCK_SH_SW(in0, a0_r, a0_l);
125  UNPCK_SH_SW(in2, temp3_r, temp3_l);
126  w2 = (v4i32) __msa_splati_h(weights, 2);
127  w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
128  w4 = (v4i32) __msa_splati_h(weights, 4);
129  w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
130  w6 = (v4i32) __msa_splati_h(weights, 6);
131  w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
132  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
133  ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
134  MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
135  temp1_r, temp1_l, temp2_r, temp2_l);
136  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
137  temp2_l, temp2_r, temp1_l, temp1_r,
138  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
139  UNPCK_SH_SW(in4, temp0_r, temp0_l);
140  UNPCK_SH_SW(in6, temp3_r, temp3_l);
141  MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
142  MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
143  temp2_r, temp2_l, temp1_r, temp1_l);
144  ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
145  SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
146  a1_r, a1_l, a2_r, a2_l);
147  ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
148  a3_r, a3_l, a0_r, a0_l);
149  SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
150  ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
151  SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
152  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
153  SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
154  ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
155  const0, const1, const2, const3);
156  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
157  b0_r, b1_r, b2_r, b3_r);
158  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
159  b0_l, b1_l, b2_l, b3_l);
160  ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
161  ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
162  const5 = __msa_ilvod_h(-w1, -w5);
163  const7 = __msa_ilvod_h(w3, -w1);
164  DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
165  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
166  DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
167  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
168  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
169  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
170  temp0_r, temp0_l, temp1_r, temp1_l,
171  temp2_r, temp2_l, temp3_r, temp3_l,
172  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
173  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
174  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
175  PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
176  temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
177  SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
178  SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
179  PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
180  a0_r, a1_r, a2_r, a3_r);
    /* Write back in output order: temp0..temp3 followed by a3, a2, a1, a0. */
181  ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r,
182  block, 8);
183 }
184 
/*
 * 8x8 inverse DCT of `block` whose result is clipped to 0..255 and written
 * as bytes to `dst` ("put": the destination is overwritten, not read).
 *
 * The transform itself is two 1-D passes separated by 8x8 transposes:
 * pass 1 adds 1 << 10 and shifts right by 11 (with a shortcut for lanes
 * where in1..in7 are all zero); pass 2 adds 16383 * ((1 << 19) / 16383)
 * and shifts right by 20.
 *
 * dst:        destination; eight 64-bit stores are issued, four starting at
 *             dst and four starting at dst + 4 * dst_stride.
 * dst_stride: stride passed to SD4 between consecutive stores
 *             (presumably bytes per row - SD4 is defined elsewhere).
 * block:      coefficient buffer; only read by this function.
 */
185 static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
186  int16_t *block)
187 {
188  int32_t const_val;
189  uint64_t tmp0, tmp1, tmp2, tmp3;
    /* weights[k] is splatted below as wk (k = 1..7); lane 0 is never read. */
190  v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
191  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
192  v8i16 w1, w3, w5, w7;
193  v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
194  v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
195  v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
196  v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
197  v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
198  v4i32 w2, w4, w6;
199  v8i16 select_vec, temp;
200  v8i16 zero = { 0 };
201  v4i32 const_val0 = __msa_ldi_w(1);
202  v4i32 const_val1 = __msa_ldi_w(1);
203 
204  LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    /* const_val0 = 1 << 10 per lane: rounding bias for the pass-1 >> 11. */
205  const_val0 <<= 10;
    /* const_val1 = 16383 * 32 = 524256 per lane: bias for the pass-2 >> 20. */
206  const_val = 16383 * ((1 << 19) / 16383);
207  const_val1 = __msa_insert_w(const_val0, 0, const_val);
208  const_val1 = __msa_splati_w(const_val1, 0);
209  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
210  in0, in1, in2, in3, in4, in5, in6, in7);
    /* select_vec: all-ones in each halfword lane where in1..in7 are all
     * zero; those lanes later take the shortcut value temp = in0 << 3. */
211  select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
212  select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
213  UNPCK_SH_SW(in0, a0_r, a0_l);
214  UNPCK_SH_SW(in2, temp3_r, temp3_l);
215  temp = in0 << 3;
    /* w2/w4/w6: weights[2], [4], [6] widened to 32-bit lanes by pairing
     * each splatted halfword with a zero halfword. */
216  w2 = (v4i32) __msa_splati_h(weights, 2);
217  w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
218  w4 = (v4i32) __msa_splati_h(weights, 4);
219  w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
220  w6 = (v4i32) __msa_splati_h(weights, 6);
221  w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    /* Even part, pass 1: temp0 = w4 * in0 + bias, temp1 = w2 * in2,
     * temp2 = w6 * in2; BUTTERFLY_8 derives a0..a3 from them. */
222  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
223  ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
224  MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
225  MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
226  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
227  temp2_l, temp2_r, temp1_l, temp1_r,
228  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    /* Fold in in4 and in6: temp0 = w4 * in4, temp2 = w2 * in6,
     * temp1 = w6 * in6. */
229  UNPCK_SH_SW(in4, temp0_r, temp0_l);
230  UNPCK_SH_SW(in6, temp3_r, temp3_l);
231  MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
232  MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
233  MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
234  ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
235  SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
236  SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
237  ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
238  ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
239  SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
240  ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
241  SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    /* Odd part, pass 1: b0..b3 = dot products of interleaved (in1, in3)
     * pairs with const0..const3, plus (in5, in7) pairs with const4..const7. */
242  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
243  SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
244  ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
245  ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
246  const0, const1, const2, const3);
247  ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
248  const5 = __msa_ilvod_h(-w1, -w5);
249  const7 = __msa_ilvod_h(w3, -w1);
250  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
251  b0_r, b1_r, b2_r, b3_r);
252  DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
253  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
254  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
255  b0_l, b1_l, b2_l, b3_l);
256  DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
257  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    /* Combine even and odd parts into temp0..temp3 and a3..a0. */
258  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
259  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
260  temp0_r, temp0_l, temp1_r, temp1_l,
261  temp2_r, temp2_l, temp3_r, temp3_l,
262  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
263  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
264  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
265  PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
266  temp2_l, temp2_r, temp3_l, temp3_r,
267  temp0_r, temp1_r, temp2_r, temp3_r);
    /* Lanes flagged in select_vec take `temp` instead of the computed value. */
268  in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
269  (v16u8) select_vec);
270  in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
271  (v16u8) select_vec);
272  in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
273  (v16u8) select_vec);
274  in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
275  (v16u8) select_vec);
276  SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
277  SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
278  PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
279  a0_r, a1_r, a2_r, a3_r);
280  in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
281  in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
282  in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
283  in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
284  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
285  in0, in1, in2, in3, in4, in5, in6, in7);
    /* Pass 2: same even/odd computation on the transposed data, with
     * const_val1 as bias and a shift of 20; no zero-lane shortcut. */
286  UNPCK_SH_SW(in0, a0_r, a0_l);
287  UNPCK_SH_SW(in2, temp3_r, temp3_l);
288  w2 = (v4i32) __msa_splati_h(weights, 2);
289  w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
290  w4 = (v4i32) __msa_splati_h(weights, 4);
291  w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
292  w6 = (v4i32) __msa_splati_h(weights, 6);
293  w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
294  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
295  ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
296  MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
297  MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
298  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
299  temp2_l, temp2_r, temp1_l, temp1_r,
300  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
301  UNPCK_SH_SW(in4, temp0_r, temp0_l);
302  UNPCK_SH_SW(in6, temp3_r, temp3_l);
303  MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
304  MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
305  MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
306  ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
307  SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
308  SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
309  ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
310  ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
311  SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
312  ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
313  SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
314  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
315  SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
316  ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
317  const0, const1, const2, const3);
318  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
319  b0_r, b1_r, b2_r, b3_r);
320  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
321  b0_l, b1_l, b2_l, b3_l);
322  ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
323  ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
324  const5 = __msa_ilvod_h(-w1, -w5);
325  const7 = __msa_ilvod_h(w3, -w1);
326  DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
327  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
328  DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
329  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
330  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
331  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
332  temp0_r, temp0_l, temp1_r, temp1_l,
333  temp2_r, temp2_l, temp3_r, temp3_l,
334  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
335  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
336  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
337  SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
338  SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    /* Pack to halfwords: in0..in3 from temp0..temp3, in4..in7 from a0..a3. */
339  PCKEV_H4_SH(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
340  temp3_l, temp3_r, in0, in1, in2, in3);
341  PCKEV_H4_SH(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
342  in4, in5, in6, in7);
    /* First four stores: clip to 0..255, pack to bytes (each vector is
     * packed with itself), and store one 64-bit element per vector. */
343  CLIP_SH4_0_255(in0, in1, in2, in3);
344  PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
345  in0, in1, in2, in3);
346  tmp0 = __msa_copy_u_d((v2i64) in0, 1);
347  tmp1 = __msa_copy_u_d((v2i64) in1, 1);
348  tmp2 = __msa_copy_u_d((v2i64) in2, 1);
349  tmp3 = __msa_copy_u_d((v2i64) in3, 1);
350  SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
351  CLIP_SH4_0_255(in4, in5, in6, in7);
352  PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
353  in4, in5, in6, in7);
    /* in4..in7 were packed from a0..a3, but the output order is
     * a3, a2, a1, a0 - hence the reversed tmp assignment below. */
354  tmp3 = __msa_copy_u_d((v2i64) in4, 1);
355  tmp2 = __msa_copy_u_d((v2i64) in5, 1);
356  tmp1 = __msa_copy_u_d((v2i64) in6, 1);
357  tmp0 = __msa_copy_u_d((v2i64) in7, 1);
358  SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
359 }
360 
/*
 * 8x8 inverse DCT of `block` whose result is added to the bytes already in
 * `dst`, clipped to 0..255 and written back to `dst` ("add").
 *
 * The transform is two 1-D passes separated by 8x8 transposes: pass 1 adds
 * 1 << 10 and shifts right by 11 (with a shortcut for lanes where in1..in7
 * are all zero); pass 2 adds 16383 * ((1 << 19) / 16383) and shifts right
 * by 20. Unlike pass 1, pass 2 does not rebuild w2/w4/w6 or const0..const7;
 * it reuses the values computed for pass 1.
 *
 * dst:        destination; loaded with LD_SH4 and stored with SD4, four
 *             vectors at dst and four at dst + 4 * dst_stride.
 *             NOTE(review): LD_SH4 loads whole v8i16 vectors while only a
 *             64-bit element per vector is stored back - confirm that
 *             reading past the 8 stored bytes is acceptable for callers.
 * dst_stride: stride passed to LD_SH4 / SD4 (presumably bytes per row).
 * block:      coefficient buffer; only read by this function.
 */
361 static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
362  int16_t *block)
363 {
364  int32_t const_val;
365  uint64_t tmp0, tmp1, tmp2, tmp3;
    /* weights[k] is splatted below as wk (k = 1..7); lane 0 is never read. */
366  v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
367  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
368  v8i16 w1, w3, w5, w7;
369  v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
370  v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
371  v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r;
372  v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
373  v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l;
374  v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
375  v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
376  v4i32 w2, w4, w6;
377  v8i16 select_vec, temp;
378  v8i16 zero = { 0 };
379  v4i32 const_val0 = __msa_ldi_w(1);
380  v4i32 const_val1 = __msa_ldi_w(1);
381 
    /* const_val0 = 1 << 10 per lane: rounding bias for the pass-1 >> 11.
     * const_val1 = 16383 * 32 = 524256 per lane: bias for the pass-2 >> 20. */
382  const_val0 <<= 10;
383  const_val = 16383 * ((1 << 19) / 16383);
384  const_val1 = __msa_insert_w(const_val0, 0, const_val);
385  const_val1 = __msa_splati_w(const_val1, 0);
386  LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
387  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
388  in0, in1, in2, in3, in4, in5, in6, in7);
389 
    /* select_vec: all-ones in each halfword lane where in1..in7 are all
     * zero; those lanes later take the shortcut value temp = in0 << 3. */
390  select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
391  select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    /* Pass 1 unpacks all eight inputs up front: in0 -> a0, in2 -> temp3,
     * (in1, in3) -> b3, in4 -> temp4, in6 -> temp7, (in5, in7) -> temp8. */
392  UNPCK_SH_SW(in0, a0_r, a0_l);
393  UNPCK_SH_SW(in2, temp3_r, temp3_l);
394  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
395  UNPCK_SH_SW(in4, temp4_r, temp4_l);
396  UNPCK_SH_SW(in6, temp7_r, temp7_l);
397  ILVRL_H2_SW(in5, in7, temp8_r, temp8_l);
398  temp = in0 << 3;
    /* Odd part, pass 1: b0..b3 = dot products of the (in1, in3) pairs with
     * const0..const3, plus the (in5, in7) pairs with const4..const7.
     * const0..const7 are kept and reused unchanged by pass 2. */
399  SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
400  ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
401  const0, const1, const2, const3);
402  ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
403  const5 = __msa_ilvod_h(-w1, -w5);
404  const7 = __msa_ilvod_h(w3, -w1);
405  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
406  b0_r, b1_r, b2_r, b3_r);
407  DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r,
408  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
409  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
410  b0_l, b1_l, b2_l, b3_l);
411  DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l,
412  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    /* w2/w4/w6: weights[2], [4], [6] widened to 32-bit lanes by pairing
     * each splatted halfword with a zero halfword; also reused by pass 2. */
413  w2 = (v4i32) __msa_splati_h(weights, 2);
414  w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
415  w4 = (v4i32) __msa_splati_h(weights, 4);
416  w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
417  w6 = (v4i32) __msa_splati_h(weights, 6);
418  w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    /* Even part, pass 1: temp0 = w4 * in0 + bias, temp1 = w2 * in2,
     * temp2 = w6 * in2; then temp4 = w4 * in4, temp6 = w2 * in6,
     * temp5 = w6 * in6 are folded into a0..a3. */
419  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
420  ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
421  MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
422  MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
423  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
424  temp2_l, temp2_r, temp1_l, temp1_r,
425  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
426  MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l);
427  MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l);
428  MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l);
429  ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l);
430  SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l);
431  SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l);
432  ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l);
433  ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l);
434  SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l);
435  ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l);
436  SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l);
    /* Combine even and odd parts into temp0..temp3 and a3..a0. */
437  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
438  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
439  temp0_r, temp0_l, temp1_r, temp1_l,
440  temp2_r, temp2_l, temp3_r, temp3_l,
441  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
442  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
443  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
444  PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
445  temp2_l, temp2_r, temp3_l, temp3_r,
446  temp0_r, temp1_r, temp2_r, temp3_r);
    /* Lanes flagged in select_vec take `temp` instead of the computed value. */
447  in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
448  (v16u8) select_vec);
449  in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
450  (v16u8) select_vec);
451  in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
452  (v16u8) select_vec);
453  in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
454  (v16u8) select_vec);
455  SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
456  SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
457  PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
458  a0_r, a1_r, a2_r, a3_r);
459  in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
460  in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
461  in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
462  in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
463  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
464  in0, in1, in2, in3, in4, in5, in6, in7);
465 
    /* Pass 2: same even/odd computation on the transposed data, with
     * const_val1 as bias and a shift of 20; no zero-lane shortcut. */
466  UNPCK_SH_SW(in0, a0_r, a0_l);
467  UNPCK_SH_SW(in2, temp3_r, temp3_l);
468  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
469  ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
470  MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
471  MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
472  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
473  temp2_l, temp2_r, temp1_l, temp1_r,
474  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
475  UNPCK_SH_SW(in4, temp0_r, temp0_l);
476  UNPCK_SH_SW(in6, temp3_r, temp3_l);
477  MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
478  MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
479  MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
480  ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
481  SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
482  SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
483  ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
484  ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
485  SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
486  ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
487  SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
488  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
489  ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
490  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
491  b0_r, b1_r, b2_r, b3_r);
492  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
493  b0_l, b1_l, b2_l, b3_l);
494  DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
495  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
496  DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
497  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
498  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
499  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
500  temp0_r, temp0_l, temp1_r, temp1_l,
501  temp2_r, temp2_l, temp3_r, temp3_l,
502  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
503  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
504  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
    /* First four vectors: load the existing dst data, pack the IDCT result
     * to halfwords (tempN_r), interleave the dst bytes with zero bytes
     * (tempN_l, presumably widening them to halfwords), add, clip to
     * 0..255, pack to bytes and store one 64-bit element per vector. */
505  LD_SH4(dst, dst_stride, in0, in1, in2, in3);
506  PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
507  temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
508  ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
509  temp0_l, temp1_l, temp2_l, temp3_l);
510  in0 = (v8i16) (temp0_r) + (v8i16) (temp0_l);
511  in1 = (v8i16) (temp1_r) + (v8i16) (temp1_l);
512  in2 = (v8i16) (temp2_r) + (v8i16) (temp2_l);
513  in3 = (v8i16) (temp3_r) + (v8i16) (temp3_l);
514  CLIP_SH4_0_255(in0, in1, in2, in3);
515  PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
516  in0, in1, in2, in3);
517  tmp0 = __msa_copy_u_d((v2i64) in0, 1);
518  tmp1 = __msa_copy_u_d((v2i64) in1, 1);
519  tmp2 = __msa_copy_u_d((v2i64) in2, 1);
520  tmp3 = __msa_copy_u_d((v2i64) in3, 1);
521  SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
522 
    /* Last four vectors: same sequence at dst + 4 * dst_stride. The IDCT
     * results are consumed in the order a3, a2, a1, a0, so the dst data
     * in4..in7 is widened into a3_l, a2_l, a1_l, a0_l to match. */
523  SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
524  SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
525  LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7);
526  PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
527  a0_r, a1_r, a2_r, a3_r);
528  ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
529  a3_l, a2_l, a1_l, a0_l);
530  in4 = (v8i16) (a3_r) + (v8i16) (a3_l);
531  in5 = (v8i16) (a2_r) + (v8i16) (a2_l);
532  in6 = (v8i16) (a1_r) + (v8i16) (a1_l);
533  in7 = (v8i16) (a0_r) + (v8i16) (a0_l);
534  CLIP_SH4_0_255(in4, in5, in6, in7);
535  PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
536  in4, in5, in6, in7);
537  tmp0 = __msa_copy_u_d((v2i64) in4, 1);
538  tmp1 = __msa_copy_u_d((v2i64) in5, 1);
539  tmp2 = __msa_copy_u_d((v2i64) in6, 1);
540  tmp3 = __msa_copy_u_d((v2i64) in7, 1);
541  SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
542 }
543 
/*
 * Exported entry point for the in-place MSA IDCT.
 * Forwards `block` unchanged to the file-local simple_idct_msa().
 */
544 void ff_simple_idct_msa(int16_t *block)
545 {
546  simple_idct_msa(block);
547 }
548 
/*
 * Exported entry point for the MSA "IDCT and put" operation.
 * Forwards all arguments to the file-local simple_idct_put_msa().
 *
 * NOTE(review): dst_stride is ptrdiff_t here but the helper declares it as
 * int32_t, so the value is implicitly converted (narrowed on targets where
 * ptrdiff_t is wider than 32 bits).
 */
549 void ff_simple_idct_put_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
550 {
551  simple_idct_put_msa(dst, dst_stride, block);
552 }
553 
/*
 * Exported entry point for the MSA "IDCT and add" operation.
 * Forwards all arguments to the file-local simple_idct_add_msa().
 *
 * NOTE(review): dst_stride is ptrdiff_t here but the helper declares it as
 * int32_t, so the value is implicitly converted (narrowed on targets where
 * ptrdiff_t is wider than 32 bits).
 */
554 void ff_simple_idct_add_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
555 {
556  simple_idct_add_msa(dst, dst_stride, block);
557 }
else temp
Definition: vf_mcdeint.c:256
#define ILVR_H4_SH(...)
#define MUL2(in0, in1, in2, in3, out0, out1)
void ff_simple_idct_msa(int16_t *block)
#define ILVRL_H2_SW(...)
#define DOTP_SH4_SW(...)
#define SRA_4V(in0, in1, in2, in3, shift)
The exact code depends on how similar the blocks are and how related they are to the block
uint8_t
#define SPLATI_H4_SH(...)
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
static void simple_idct_msa(int16_t *block)
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
#define DPADD_SH4_SW(...)
#define zero
Definition: regdef.h:64
#define TRANSPOSE8x8_SH_SH(...)
#define CLIP_SH4_0_255(in0, in1, in2, in3)
#define LD_SH8(...)
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,in8, in9,in10, in11, in12, in13, in14, in15,out0, out1, out2, out3, out4, out5, out6, out7,out8, out9, out10, out11, out12, out13, out14, out15)
#define PCKEV_B4_SH(...)
int32_t
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,out0, out1, out2, out3, out4, out5, out6, out7)
#define UNPCK_SH_SW(in, out0, out1)
static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
#define ILVR_B4_SW(...)
#define ADD2(in0, in1, in2, in3, out0, out1)
#define SD4(in0, in1, in2, in3, pdst, stride)
void ff_simple_idct_add_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
#define ST_SW8(...)
#define PCKEV_H4_SW(...)
#define SUB2(in0, in1, in2, in3, out0, out1)
#define ILVR_H2_SH(...)
#define LD_SH4(...)
#define PCKEV_H4_SH(...)
static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
void ff_simple_idct_put_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)