00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00023 {
00024 int stride = line_size;
00025 __asm__ volatile (
00026 "and r12, %[pixels], #7 \n\t"
00027 "bic %[pixels], %[pixels], #7 \n\t"
00028 "tmcr wcgr1, r12 \n\t"
00029 "add r4, %[pixels], %[line_size] \n\t"
00030 "add r5, %[block], %[line_size] \n\t"
00031 "mov %[line_size], %[line_size], lsl #1 \n\t"
00032 "1: \n\t"
00033 "wldrd wr0, [%[pixels]] \n\t"
00034 "subs %[h], %[h], #2 \n\t"
00035 "wldrd wr1, [%[pixels], #8] \n\t"
00036 "add %[pixels], %[pixels], %[line_size] \n\t"
00037 "wldrd wr3, [r4] \n\t"
00038 "pld [%[pixels]] \n\t"
00039 "pld [%[pixels], #32] \n\t"
00040 "wldrd wr4, [r4, #8] \n\t"
00041 "add r4, r4, %[line_size] \n\t"
00042 "walignr1 wr8, wr0, wr1 \n\t"
00043 "pld [r4] \n\t"
00044 "pld [r4, #32] \n\t"
00045 "walignr1 wr10, wr3, wr4 \n\t"
00046 "wstrd wr8, [%[block]] \n\t"
00047 "add %[block], %[block], %[line_size] \n\t"
00048 "wstrd wr10, [r5] \n\t"
00049 "add r5, r5, %[line_size] \n\t"
00050 "bne 1b \n\t"
00051 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00052 :
00053 : "memory", "r4", "r5", "r12");
00054 }
00055
00056 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00057 {
00058 int stride = line_size;
00059 __asm__ volatile (
00060 "and r12, %[pixels], #7 \n\t"
00061 "bic %[pixels], %[pixels], #7 \n\t"
00062 "tmcr wcgr1, r12 \n\t"
00063 "add r4, %[pixels], %[line_size] \n\t"
00064 "add r5, %[block], %[line_size] \n\t"
00065 "mov %[line_size], %[line_size], lsl #1 \n\t"
00066 "1: \n\t"
00067 "wldrd wr0, [%[pixels]] \n\t"
00068 "subs %[h], %[h], #2 \n\t"
00069 "wldrd wr1, [%[pixels], #8] \n\t"
00070 "add %[pixels], %[pixels], %[line_size] \n\t"
00071 "wldrd wr3, [r4] \n\t"
00072 "pld [%[pixels]] \n\t"
00073 "pld [%[pixels], #32] \n\t"
00074 "wldrd wr4, [r4, #8] \n\t"
00075 "add r4, r4, %[line_size] \n\t"
00076 "walignr1 wr8, wr0, wr1 \n\t"
00077 "wldrd wr0, [%[block]] \n\t"
00078 "wldrd wr2, [r5] \n\t"
00079 "pld [r4] \n\t"
00080 "pld [r4, #32] \n\t"
00081 "walignr1 wr10, wr3, wr4 \n\t"
00082 WAVG2B" wr8, wr8, wr0 \n\t"
00083 WAVG2B" wr10, wr10, wr2 \n\t"
00084 "wstrd wr8, [%[block]] \n\t"
00085 "add %[block], %[block], %[line_size] \n\t"
00086 "wstrd wr10, [r5] \n\t"
00087 "pld [%[block]] \n\t"
00088 "pld [%[block], #32] \n\t"
00089 "add r5, r5, %[line_size] \n\t"
00090 "pld [r5] \n\t"
00091 "pld [r5, #32] \n\t"
00092 "bne 1b \n\t"
00093 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00094 :
00095 : "memory", "r4", "r5", "r12");
00096 }
00097
00098 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00099 {
00100 int stride = line_size;
00101 __asm__ volatile (
00102 "and r12, %[pixels], #7 \n\t"
00103 "bic %[pixels], %[pixels], #7 \n\t"
00104 "tmcr wcgr1, r12 \n\t"
00105 "add r4, %[pixels], %[line_size] \n\t"
00106 "add r5, %[block], %[line_size] \n\t"
00107 "mov %[line_size], %[line_size], lsl #1 \n\t"
00108 "1: \n\t"
00109 "wldrd wr0, [%[pixels]] \n\t"
00110 "wldrd wr1, [%[pixels], #8] \n\t"
00111 "subs %[h], %[h], #2 \n\t"
00112 "wldrd wr2, [%[pixels], #16] \n\t"
00113 "add %[pixels], %[pixels], %[line_size] \n\t"
00114 "wldrd wr3, [r4] \n\t"
00115 "pld [%[pixels]] \n\t"
00116 "pld [%[pixels], #32] \n\t"
00117 "walignr1 wr8, wr0, wr1 \n\t"
00118 "wldrd wr4, [r4, #8] \n\t"
00119 "walignr1 wr9, wr1, wr2 \n\t"
00120 "wldrd wr5, [r4, #16] \n\t"
00121 "add r4, r4, %[line_size] \n\t"
00122 "pld [r4] \n\t"
00123 "pld [r4, #32] \n\t"
00124 "walignr1 wr10, wr3, wr4 \n\t"
00125 "wstrd wr8, [%[block]] \n\t"
00126 "walignr1 wr11, wr4, wr5 \n\t"
00127 "wstrd wr9, [%[block], #8] \n\t"
00128 "add %[block], %[block], %[line_size] \n\t"
00129 "wstrd wr10, [r5] \n\t"
00130 "wstrd wr11, [r5, #8] \n\t"
00131 "add r5, r5, %[line_size] \n\t"
00132 "bne 1b \n\t"
00133 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00134 :
00135 : "memory", "r4", "r5", "r12");
00136 }
00137
00138 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00139 {
00140 int stride = line_size;
00141 __asm__ volatile (
00142 "pld [%[pixels]] \n\t"
00143 "pld [%[pixels], #32] \n\t"
00144 "pld [%[block]] \n\t"
00145 "pld [%[block], #32] \n\t"
00146 "and r12, %[pixels], #7 \n\t"
00147 "bic %[pixels], %[pixels], #7 \n\t"
00148 "tmcr wcgr1, r12 \n\t"
00149 "add r4, %[pixels], %[line_size]\n\t"
00150 "add r5, %[block], %[line_size] \n\t"
00151 "mov %[line_size], %[line_size], lsl #1 \n\t"
00152 "1: \n\t"
00153 "wldrd wr0, [%[pixels]] \n\t"
00154 "wldrd wr1, [%[pixels], #8] \n\t"
00155 "subs %[h], %[h], #2 \n\t"
00156 "wldrd wr2, [%[pixels], #16] \n\t"
00157 "add %[pixels], %[pixels], %[line_size] \n\t"
00158 "wldrd wr3, [r4] \n\t"
00159 "pld [%[pixels]] \n\t"
00160 "pld [%[pixels], #32] \n\t"
00161 "walignr1 wr8, wr0, wr1 \n\t"
00162 "wldrd wr4, [r4, #8] \n\t"
00163 "walignr1 wr9, wr1, wr2 \n\t"
00164 "wldrd wr5, [r4, #16] \n\t"
00165 "add r4, r4, %[line_size] \n\t"
00166 "wldrd wr0, [%[block]] \n\t"
00167 "pld [r4] \n\t"
00168 "wldrd wr1, [%[block], #8] \n\t"
00169 "pld [r4, #32] \n\t"
00170 "wldrd wr2, [r5] \n\t"
00171 "walignr1 wr10, wr3, wr4 \n\t"
00172 "wldrd wr3, [r5, #8] \n\t"
00173 WAVG2B" wr8, wr8, wr0 \n\t"
00174 WAVG2B" wr9, wr9, wr1 \n\t"
00175 WAVG2B" wr10, wr10, wr2 \n\t"
00176 "wstrd wr8, [%[block]] \n\t"
00177 "walignr1 wr11, wr4, wr5 \n\t"
00178 WAVG2B" wr11, wr11, wr3 \n\t"
00179 "wstrd wr9, [%[block], #8] \n\t"
00180 "add %[block], %[block], %[line_size] \n\t"
00181 "wstrd wr10, [r5] \n\t"
00182 "pld [%[block]] \n\t"
00183 "pld [%[block], #32] \n\t"
00184 "wstrd wr11, [r5, #8] \n\t"
00185 "add r5, r5, %[line_size] \n\t"
00186 "pld [r5] \n\t"
00187 "pld [r5, #32] \n\t"
00188 "bne 1b \n\t"
00189 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00190 :
00191 : "memory", "r4", "r5", "r12");
00192 }
00193
00194 void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00195 {
00196 int stride = line_size;
00197
00198
00199 SET_RND(wr15);
00200 __asm__ volatile(
00201 "pld [%[pixels]] \n\t"
00202 "pld [%[pixels], #32] \n\t"
00203 "and r12, %[pixels], #7 \n\t"
00204 "bic %[pixels], %[pixels], #7 \n\t"
00205 "tmcr wcgr1, r12 \n\t"
00206 "add r12, r12, #1 \n\t"
00207 "add r4, %[pixels], %[line_size]\n\t"
00208 "tmcr wcgr2, r12 \n\t"
00209 "add r5, %[block], %[line_size] \n\t"
00210 "mov %[line_size], %[line_size], lsl #1 \n\t"
00211
00212 "1: \n\t"
00213 "wldrd wr10, [%[pixels]] \n\t"
00214 "cmp r12, #8 \n\t"
00215 "wldrd wr11, [%[pixels], #8] \n\t"
00216 "add %[pixels], %[pixels], %[line_size] \n\t"
00217 "wldrd wr13, [r4] \n\t"
00218 "pld [%[pixels]] \n\t"
00219 "wldrd wr14, [r4, #8] \n\t"
00220 "pld [%[pixels], #32] \n\t"
00221 "add r4, r4, %[line_size] \n\t"
00222 "walignr1 wr0, wr10, wr11 \n\t"
00223 "pld [r4] \n\t"
00224 "pld [r4, #32] \n\t"
00225 "walignr1 wr2, wr13, wr14 \n\t"
00226 "wmoveq wr4, wr11 \n\t"
00227 "wmoveq wr6, wr14 \n\t"
00228 "walignr2ne wr4, wr10, wr11 \n\t"
00229 "walignr2ne wr6, wr13, wr14 \n\t"
00230 WAVG2B" wr0, wr0, wr4 \n\t"
00231 WAVG2B" wr2, wr2, wr6 \n\t"
00232 "wstrd wr0, [%[block]] \n\t"
00233 "subs %[h], %[h], #2 \n\t"
00234 "wstrd wr2, [r5] \n\t"
00235 "add %[block], %[block], %[line_size] \n\t"
00236 "add r5, r5, %[line_size] \n\t"
00237 "bne 1b \n\t"
00238 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00239 :
00240 : "r4", "r5", "r12", "memory");
00241 }
00242
00243 void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00244 {
00245 int stride = line_size;
00246
00247
00248 SET_RND(wr15);
00249 __asm__ volatile(
00250 "pld [%[pixels]] \n\t"
00251 "pld [%[pixels], #32] \n\t"
00252 "and r12, %[pixels], #7 \n\t"
00253 "bic %[pixels], %[pixels], #7 \n\t"
00254 "tmcr wcgr1, r12 \n\t"
00255 "add r12, r12, #1 \n\t"
00256 "add r4, %[pixels], %[line_size]\n\t"
00257 "tmcr wcgr2, r12 \n\t"
00258 "add r5, %[block], %[line_size] \n\t"
00259 "mov %[line_size], %[line_size], lsl #1 \n\t"
00260
00261 "1: \n\t"
00262 "wldrd wr10, [%[pixels]] \n\t"
00263 "cmp r12, #8 \n\t"
00264 "wldrd wr11, [%[pixels], #8] \n\t"
00265 "wldrd wr12, [%[pixels], #16] \n\t"
00266 "add %[pixels], %[pixels], %[line_size] \n\t"
00267 "wldrd wr13, [r4] \n\t"
00268 "pld [%[pixels]] \n\t"
00269 "wldrd wr14, [r4, #8] \n\t"
00270 "pld [%[pixels], #32] \n\t"
00271 "wldrd wr15, [r4, #16] \n\t"
00272 "add r4, r4, %[line_size] \n\t"
00273 "walignr1 wr0, wr10, wr11 \n\t"
00274 "pld [r4] \n\t"
00275 "pld [r4, #32] \n\t"
00276 "walignr1 wr1, wr11, wr12 \n\t"
00277 "walignr1 wr2, wr13, wr14 \n\t"
00278 "walignr1 wr3, wr14, wr15 \n\t"
00279 "wmoveq wr4, wr11 \n\t"
00280 "wmoveq wr5, wr12 \n\t"
00281 "wmoveq wr6, wr14 \n\t"
00282 "wmoveq wr7, wr15 \n\t"
00283 "walignr2ne wr4, wr10, wr11 \n\t"
00284 "walignr2ne wr5, wr11, wr12 \n\t"
00285 "walignr2ne wr6, wr13, wr14 \n\t"
00286 "walignr2ne wr7, wr14, wr15 \n\t"
00287 WAVG2B" wr0, wr0, wr4 \n\t"
00288 WAVG2B" wr1, wr1, wr5 \n\t"
00289 "wstrd wr0, [%[block]] \n\t"
00290 WAVG2B" wr2, wr2, wr6 \n\t"
00291 "wstrd wr1, [%[block], #8] \n\t"
00292 WAVG2B" wr3, wr3, wr7 \n\t"
00293 "add %[block], %[block], %[line_size] \n\t"
00294 "wstrd wr2, [r5] \n\t"
00295 "subs %[h], %[h], #2 \n\t"
00296 "wstrd wr3, [r5, #8] \n\t"
00297 "add r5, r5, %[line_size] \n\t"
00298 "bne 1b \n\t"
00299 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00300 :
00301 : "r4", "r5", "r12", "memory");
00302 }
00303
00304 void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00305 {
00306 int stride = line_size;
00307
00308
00309 SET_RND(wr15);
00310 __asm__ volatile(
00311 "pld [%[pixels]] \n\t"
00312 "pld [%[pixels], #32] \n\t"
00313 "pld [%[block]] \n\t"
00314 "pld [%[block], #32] \n\t"
00315 "and r12, %[pixels], #7 \n\t"
00316 "bic %[pixels], %[pixels], #7 \n\t"
00317 "tmcr wcgr1, r12 \n\t"
00318 "add r12, r12, #1 \n\t"
00319 "add r4, %[pixels], %[line_size]\n\t"
00320 "tmcr wcgr2, r12 \n\t"
00321 "add r5, %[block], %[line_size] \n\t"
00322 "mov %[line_size], %[line_size], lsl #1 \n\t"
00323 "pld [r5] \n\t"
00324 "pld [r5, #32] \n\t"
00325
00326 "1: \n\t"
00327 "wldrd wr10, [%[pixels]] \n\t"
00328 "cmp r12, #8 \n\t"
00329 "wldrd wr11, [%[pixels], #8] \n\t"
00330 "add %[pixels], %[pixels], %[line_size] \n\t"
00331 "wldrd wr13, [r4] \n\t"
00332 "pld [%[pixels]] \n\t"
00333 "wldrd wr14, [r4, #8] \n\t"
00334 "pld [%[pixels], #32] \n\t"
00335 "add r4, r4, %[line_size] \n\t"
00336 "walignr1 wr0, wr10, wr11 \n\t"
00337 "pld [r4] \n\t"
00338 "pld [r4, #32] \n\t"
00339 "walignr1 wr2, wr13, wr14 \n\t"
00340 "wmoveq wr4, wr11 \n\t"
00341 "wmoveq wr6, wr14 \n\t"
00342 "walignr2ne wr4, wr10, wr11 \n\t"
00343 "wldrd wr10, [%[block]] \n\t"
00344 "walignr2ne wr6, wr13, wr14 \n\t"
00345 "wldrd wr12, [r5] \n\t"
00346 WAVG2B" wr0, wr0, wr4 \n\t"
00347 WAVG2B" wr2, wr2, wr6 \n\t"
00348 WAVG2B" wr0, wr0, wr10 \n\t"
00349 WAVG2B" wr2, wr2, wr12 \n\t"
00350 "wstrd wr0, [%[block]] \n\t"
00351 "subs %[h], %[h], #2 \n\t"
00352 "wstrd wr2, [r5] \n\t"
00353 "add %[block], %[block], %[line_size] \n\t"
00354 "add r5, r5, %[line_size] \n\t"
00355 "pld [%[block]] \n\t"
00356 "pld [%[block], #32] \n\t"
00357 "pld [r5] \n\t"
00358 "pld [r5, #32] \n\t"
00359 "bne 1b \n\t"
00360 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00361 :
00362 : "r4", "r5", "r12", "memory");
00363 }
00364
00365 void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00366 {
00367 int stride = line_size;
00368
00369
00370 SET_RND(wr15);
00371 __asm__ volatile(
00372 "pld [%[pixels]] \n\t"
00373 "pld [%[pixels], #32] \n\t"
00374 "pld [%[block]] \n\t"
00375 "pld [%[block], #32] \n\t"
00376 "and r12, %[pixels], #7 \n\t"
00377 "bic %[pixels], %[pixels], #7 \n\t"
00378 "tmcr wcgr1, r12 \n\t"
00379 "add r12, r12, #1 \n\t"
00380 "add r4, %[pixels], %[line_size]\n\t"
00381 "tmcr wcgr2, r12 \n\t"
00382 "add r5, %[block], %[line_size] \n\t"
00383 "mov %[line_size], %[line_size], lsl #1 \n\t"
00384 "pld [r5] \n\t"
00385 "pld [r5, #32] \n\t"
00386
00387 "1: \n\t"
00388 "wldrd wr10, [%[pixels]] \n\t"
00389 "cmp r12, #8 \n\t"
00390 "wldrd wr11, [%[pixels], #8] \n\t"
00391 "wldrd wr12, [%[pixels], #16] \n\t"
00392 "add %[pixels], %[pixels], %[line_size] \n\t"
00393 "wldrd wr13, [r4] \n\t"
00394 "pld [%[pixels]] \n\t"
00395 "wldrd wr14, [r4, #8] \n\t"
00396 "pld [%[pixels], #32] \n\t"
00397 "wldrd wr15, [r4, #16] \n\t"
00398 "add r4, r4, %[line_size] \n\t"
00399 "walignr1 wr0, wr10, wr11 \n\t"
00400 "pld [r4] \n\t"
00401 "pld [r4, #32] \n\t"
00402 "walignr1 wr1, wr11, wr12 \n\t"
00403 "walignr1 wr2, wr13, wr14 \n\t"
00404 "walignr1 wr3, wr14, wr15 \n\t"
00405 "wmoveq wr4, wr11 \n\t"
00406 "wmoveq wr5, wr12 \n\t"
00407 "wmoveq wr6, wr14 \n\t"
00408 "wmoveq wr7, wr15 \n\t"
00409 "walignr2ne wr4, wr10, wr11 \n\t"
00410 "walignr2ne wr5, wr11, wr12 \n\t"
00411 "walignr2ne wr6, wr13, wr14 \n\t"
00412 "walignr2ne wr7, wr14, wr15 \n\t"
00413 "wldrd wr10, [%[block]] \n\t"
00414 WAVG2B" wr0, wr0, wr4 \n\t"
00415 "wldrd wr11, [%[block], #8] \n\t"
00416 WAVG2B" wr1, wr1, wr5 \n\t"
00417 "wldrd wr12, [r5] \n\t"
00418 WAVG2B" wr2, wr2, wr6 \n\t"
00419 "wldrd wr13, [r5, #8] \n\t"
00420 WAVG2B" wr3, wr3, wr7 \n\t"
00421 WAVG2B" wr0, wr0, wr10 \n\t"
00422 WAVG2B" wr1, wr1, wr11 \n\t"
00423 WAVG2B" wr2, wr2, wr12 \n\t"
00424 WAVG2B" wr3, wr3, wr13 \n\t"
00425 "wstrd wr0, [%[block]] \n\t"
00426 "subs %[h], %[h], #2 \n\t"
00427 "wstrd wr1, [%[block], #8] \n\t"
00428 "add %[block], %[block], %[line_size] \n\t"
00429 "wstrd wr2, [r5] \n\t"
00430 "pld [%[block]] \n\t"
00431 "wstrd wr3, [r5, #8] \n\t"
00432 "add r5, r5, %[line_size] \n\t"
00433 "pld [%[block], #32] \n\t"
00434 "pld [r5] \n\t"
00435 "pld [r5, #32] \n\t"
00436 "bne 1b \n\t"
00437 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00438 :
00439 :"r4", "r5", "r12", "memory");
00440 }
00441
00442 void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00443 {
00444 int stride = line_size;
00445
00446
00447 __asm__ volatile(
00448 "pld [%[pixels]] \n\t"
00449 "pld [%[pixels], #32] \n\t"
00450 "and r12, %[pixels], #7 \n\t"
00451 "tmcr wcgr1, r12 \n\t"
00452 "bic %[pixels], %[pixels], #7 \n\t"
00453
00454 "wldrd wr10, [%[pixels]] \n\t"
00455 "wldrd wr11, [%[pixels], #8] \n\t"
00456 "pld [%[block]] \n\t"
00457 "add %[pixels], %[pixels], %[line_size] \n\t"
00458 "walignr1 wr0, wr10, wr11 \n\t"
00459 "pld [%[pixels]] \n\t"
00460 "pld [%[pixels], #32] \n\t"
00461
00462 "1: \n\t"
00463 "wldrd wr10, [%[pixels]] \n\t"
00464 "wldrd wr11, [%[pixels], #8] \n\t"
00465 "add %[pixels], %[pixels], %[line_size] \n\t"
00466 "pld [%[pixels]] \n\t"
00467 "pld [%[pixels], #32] \n\t"
00468 "walignr1 wr4, wr10, wr11 \n\t"
00469 "wldrd wr10, [%[block]] \n\t"
00470 WAVG2B" wr8, wr0, wr4 \n\t"
00471 WAVG2B" wr8, wr8, wr10 \n\t"
00472 "wstrd wr8, [%[block]] \n\t"
00473 "add %[block], %[block], %[line_size] \n\t"
00474
00475 "wldrd wr10, [%[pixels]] \n\t"
00476 "wldrd wr11, [%[pixels], #8] \n\t"
00477 "pld [%[block]] \n\t"
00478 "add %[pixels], %[pixels], %[line_size] \n\t"
00479 "pld [%[pixels]] \n\t"
00480 "pld [%[pixels], #32] \n\t"
00481 "walignr1 wr0, wr10, wr11 \n\t"
00482 "wldrd wr10, [%[block]] \n\t"
00483 WAVG2B" wr8, wr0, wr4 \n\t"
00484 WAVG2B" wr8, wr8, wr10 \n\t"
00485 "wstrd wr8, [%[block]] \n\t"
00486 "add %[block], %[block], %[line_size] \n\t"
00487
00488 "subs %[h], %[h], #2 \n\t"
00489 "pld [%[block]] \n\t"
00490 "bne 1b \n\t"
00491 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00492 :
00493 : "cc", "memory", "r12");
00494 }
00495
00496 void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00497 {
00498 int stride = line_size;
00499
00500
00501 __asm__ volatile(
00502 "pld [%[pixels]] \n\t"
00503 "pld [%[pixels], #32] \n\t"
00504 "and r12, %[pixels], #7 \n\t"
00505 "tmcr wcgr1, r12 \n\t"
00506 "bic %[pixels], %[pixels], #7 \n\t"
00507
00508 "wldrd wr10, [%[pixels]] \n\t"
00509 "wldrd wr11, [%[pixels], #8] \n\t"
00510 "wldrd wr12, [%[pixels], #16] \n\t"
00511 "add %[pixels], %[pixels], %[line_size] \n\t"
00512 "pld [%[pixels]] \n\t"
00513 "pld [%[pixels], #32] \n\t"
00514 "walignr1 wr0, wr10, wr11 \n\t"
00515 "walignr1 wr1, wr11, wr12 \n\t"
00516
00517 "1: \n\t"
00518 "wldrd wr10, [%[pixels]] \n\t"
00519 "wldrd wr11, [%[pixels], #8] \n\t"
00520 "wldrd wr12, [%[pixels], #16] \n\t"
00521 "add %[pixels], %[pixels], %[line_size] \n\t"
00522 "pld [%[pixels]] \n\t"
00523 "pld [%[pixels], #32] \n\t"
00524 "walignr1 wr4, wr10, wr11 \n\t"
00525 "walignr1 wr5, wr11, wr12 \n\t"
00526 WAVG2B" wr8, wr0, wr4 \n\t"
00527 WAVG2B" wr9, wr1, wr5 \n\t"
00528 "wstrd wr8, [%[block]] \n\t"
00529 "wstrd wr9, [%[block], #8] \n\t"
00530 "add %[block], %[block], %[line_size] \n\t"
00531
00532 "wldrd wr10, [%[pixels]] \n\t"
00533 "wldrd wr11, [%[pixels], #8] \n\t"
00534 "wldrd wr12, [%[pixels], #16] \n\t"
00535 "add %[pixels], %[pixels], %[line_size] \n\t"
00536 "pld [%[pixels]] \n\t"
00537 "pld [%[pixels], #32] \n\t"
00538 "walignr1 wr0, wr10, wr11 \n\t"
00539 "walignr1 wr1, wr11, wr12 \n\t"
00540 WAVG2B" wr8, wr0, wr4 \n\t"
00541 WAVG2B" wr9, wr1, wr5 \n\t"
00542 "wstrd wr8, [%[block]] \n\t"
00543 "wstrd wr9, [%[block], #8] \n\t"
00544 "add %[block], %[block], %[line_size] \n\t"
00545
00546 "subs %[h], %[h], #2 \n\t"
00547 "bne 1b \n\t"
00548 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00549 :
00550 : "r4", "r5", "r12", "memory");
00551 }
00552
00553 void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00554 {
00555 int stride = line_size;
00556
00557
00558 __asm__ volatile(
00559 "pld [%[pixels]] \n\t"
00560 "pld [%[pixels], #32] \n\t"
00561 "and r12, %[pixels], #7 \n\t"
00562 "tmcr wcgr1, r12 \n\t"
00563 "bic %[pixels], %[pixels], #7 \n\t"
00564
00565 "wldrd wr10, [%[pixels]] \n\t"
00566 "wldrd wr11, [%[pixels], #8] \n\t"
00567 "pld [%[block]] \n\t"
00568 "wldrd wr12, [%[pixels], #16] \n\t"
00569 "add %[pixels], %[pixels], %[line_size] \n\t"
00570 "pld [%[pixels]] \n\t"
00571 "pld [%[pixels], #32] \n\t"
00572 "walignr1 wr0, wr10, wr11 \n\t"
00573 "walignr1 wr1, wr11, wr12 \n\t"
00574
00575 "1: \n\t"
00576 "wldrd wr10, [%[pixels]] \n\t"
00577 "wldrd wr11, [%[pixels], #8] \n\t"
00578 "wldrd wr12, [%[pixels], #16] \n\t"
00579 "add %[pixels], %[pixels], %[line_size] \n\t"
00580 "pld [%[pixels]] \n\t"
00581 "pld [%[pixels], #32] \n\t"
00582 "walignr1 wr4, wr10, wr11 \n\t"
00583 "walignr1 wr5, wr11, wr12 \n\t"
00584 "wldrd wr10, [%[block]] \n\t"
00585 "wldrd wr11, [%[block], #8] \n\t"
00586 WAVG2B" wr8, wr0, wr4 \n\t"
00587 WAVG2B" wr9, wr1, wr5 \n\t"
00588 WAVG2B" wr8, wr8, wr10 \n\t"
00589 WAVG2B" wr9, wr9, wr11 \n\t"
00590 "wstrd wr8, [%[block]] \n\t"
00591 "wstrd wr9, [%[block], #8] \n\t"
00592 "add %[block], %[block], %[line_size] \n\t"
00593
00594 "wldrd wr10, [%[pixels]] \n\t"
00595 "wldrd wr11, [%[pixels], #8] \n\t"
00596 "pld [%[block]] \n\t"
00597 "wldrd wr12, [%[pixels], #16] \n\t"
00598 "add %[pixels], %[pixels], %[line_size] \n\t"
00599 "pld [%[pixels]] \n\t"
00600 "pld [%[pixels], #32] \n\t"
00601 "walignr1 wr0, wr10, wr11 \n\t"
00602 "walignr1 wr1, wr11, wr12 \n\t"
00603 "wldrd wr10, [%[block]] \n\t"
00604 "wldrd wr11, [%[block], #8] \n\t"
00605 WAVG2B" wr8, wr0, wr4 \n\t"
00606 WAVG2B" wr9, wr1, wr5 \n\t"
00607 WAVG2B" wr8, wr8, wr10 \n\t"
00608 WAVG2B" wr9, wr9, wr11 \n\t"
00609 "wstrd wr8, [%[block]] \n\t"
00610 "wstrd wr9, [%[block], #8] \n\t"
00611 "add %[block], %[block], %[line_size] \n\t"
00612
00613 "subs %[h], %[h], #2 \n\t"
00614 "pld [%[block]] \n\t"
00615 "bne 1b \n\t"
00616 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00617 :
00618 : "r4", "r5", "r12", "memory");
00619 }
00620
00621 void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00622 {
00623
00624
00625 SET_RND(wr15);
00626 __asm__ volatile(
00627 "pld [%[pixels]] \n\t"
00628 "mov r12, #2 \n\t"
00629 "pld [%[pixels], #32] \n\t"
00630 "tmcr wcgr0, r12 \n\t"
00631 "and r12, %[pixels], #7 \n\t"
00632 "bic %[pixels], %[pixels], #7 \n\t"
00633 "tmcr wcgr1, r12 \n\t"
00634
00635
00636
00637 "wldrd wr12, [%[pixels]] \n\t"
00638 "add r12, r12, #1 \n\t"
00639 "wldrd wr13, [%[pixels], #8] \n\t"
00640 "tmcr wcgr2, r12 \n\t"
00641 "add %[pixels], %[pixels], %[line_size] \n\t"
00642 "cmp r12, #8 \n\t"
00643 "pld [%[pixels]] \n\t"
00644 "pld [%[pixels], #32] \n\t"
00645 "walignr1 wr2, wr12, wr13 \n\t"
00646 "wmoveq wr10, wr13 \n\t"
00647 "walignr2ne wr10, wr12, wr13 \n\t"
00648 "wunpckelub wr0, wr2 \n\t"
00649 "wunpckehub wr1, wr2 \n\t"
00650 "wunpckelub wr8, wr10 \n\t"
00651 "wunpckehub wr9, wr10 \n\t"
00652 "waddhus wr0, wr0, wr8 \n\t"
00653 "waddhus wr1, wr1, wr9 \n\t"
00654
00655 "1: \n\t"
00656
00657
00658 "wldrd wr12, [%[pixels]] \n\t"
00659 "cmp r12, #8 \n\t"
00660 "wldrd wr13, [%[pixels], #8] \n\t"
00661 "add %[pixels], %[pixels], %[line_size] \n\t"
00662 "walignr1 wr6, wr12, wr13 \n\t"
00663 "pld [%[pixels]] \n\t"
00664 "pld [%[pixels], #32] \n\t"
00665 "wmoveq wr10, wr13 \n\t"
00666 "walignr2ne wr10, wr12, wr13 \n\t"
00667 "wunpckelub wr4, wr6 \n\t"
00668 "wunpckehub wr5, wr6 \n\t"
00669 "wunpckelub wr8, wr10 \n\t"
00670 "wunpckehub wr9, wr10 \n\t"
00671 "waddhus wr4, wr4, wr8 \n\t"
00672 "waddhus wr5, wr5, wr9 \n\t"
00673 "waddhus wr8, wr0, wr4 \n\t"
00674 "waddhus wr9, wr1, wr5 \n\t"
00675 "waddhus wr8, wr8, wr15 \n\t"
00676 "waddhus wr9, wr9, wr15 \n\t"
00677 "wsrlhg wr8, wr8, wcgr0 \n\t"
00678 "wsrlhg wr9, wr9, wcgr0 \n\t"
00679 "wpackhus wr8, wr8, wr9 \n\t"
00680 "wstrd wr8, [%[block]] \n\t"
00681 "add %[block], %[block], %[line_size] \n\t"
00682
00683
00684
00685 "wldrd wr12, [%[pixels]] \n\t"
00686 "wldrd wr13, [%[pixels], #8] \n\t"
00687 "add %[pixels], %[pixels], %[line_size] \n\t"
00688 "walignr1 wr2, wr12, wr13 \n\t"
00689 "pld [%[pixels]] \n\t"
00690 "pld [%[pixels], #32] \n\t"
00691 "wmoveq wr10, wr13 \n\t"
00692 "walignr2ne wr10, wr12, wr13 \n\t"
00693 "wunpckelub wr0, wr2 \n\t"
00694 "wunpckehub wr1, wr2 \n\t"
00695 "wunpckelub wr8, wr10 \n\t"
00696 "wunpckehub wr9, wr10 \n\t"
00697 "waddhus wr0, wr0, wr8 \n\t"
00698 "waddhus wr1, wr1, wr9 \n\t"
00699 "waddhus wr8, wr0, wr4 \n\t"
00700 "waddhus wr9, wr1, wr5 \n\t"
00701 "waddhus wr8, wr8, wr15 \n\t"
00702 "waddhus wr9, wr9, wr15 \n\t"
00703 "wsrlhg wr8, wr8, wcgr0 \n\t"
00704 "wsrlhg wr9, wr9, wcgr0 \n\t"
00705 "wpackhus wr8, wr8, wr9 \n\t"
00706 "subs %[h], %[h], #2 \n\t"
00707 "wstrd wr8, [%[block]] \n\t"
00708 "add %[block], %[block], %[line_size] \n\t"
00709 "bne 1b \n\t"
00710 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00711 : [line_size]"r"(line_size)
00712 : "r12", "memory");
00713 }
00714
00715 void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00716 {
00717
00718
00719 SET_RND(wr15);
00720 __asm__ volatile(
00721 "pld [%[pixels]] \n\t"
00722 "mov r12, #2 \n\t"
00723 "pld [%[pixels], #32] \n\t"
00724 "tmcr wcgr0, r12 \n\t"
00725
00726 "and r12, %[pixels], #7 \n\t"
00727 "bic %[pixels], %[pixels], #7 \n\t"
00728 "tmcr wcgr1, r12 \n\t"
00729 "add r12, r12, #1 \n\t"
00730 "tmcr wcgr2, r12 \n\t"
00731
00732
00733
00734 "wldrd wr12, [%[pixels]] \n\t"
00735 "cmp r12, #8 \n\t"
00736 "wldrd wr13, [%[pixels], #8] \n\t"
00737 "wldrd wr14, [%[pixels], #16] \n\t"
00738 "add %[pixels], %[pixels], %[line_size] \n\t"
00739 "pld [%[pixels]] \n\t"
00740 "walignr1 wr2, wr12, wr13 \n\t"
00741 "pld [%[pixels], #32] \n\t"
00742 "walignr1 wr3, wr13, wr14 \n\t"
00743 "wmoveq wr10, wr13 \n\t"
00744 "wmoveq wr11, wr14 \n\t"
00745 "walignr2ne wr10, wr12, wr13 \n\t"
00746 "walignr2ne wr11, wr13, wr14 \n\t"
00747 "wunpckelub wr0, wr2 \n\t"
00748 "wunpckehub wr1, wr2 \n\t"
00749 "wunpckelub wr2, wr3 \n\t"
00750 "wunpckehub wr3, wr3 \n\t"
00751 "wunpckelub wr8, wr10 \n\t"
00752 "wunpckehub wr9, wr10 \n\t"
00753 "wunpckelub wr10, wr11 \n\t"
00754 "wunpckehub wr11, wr11 \n\t"
00755 "waddhus wr0, wr0, wr8 \n\t"
00756 "waddhus wr1, wr1, wr9 \n\t"
00757 "waddhus wr2, wr2, wr10 \n\t"
00758 "waddhus wr3, wr3, wr11 \n\t"
00759
00760 "1: \n\t"
00761
00762
00763 "wldrd wr12, [%[pixels]] \n\t"
00764 "cmp r12, #8 \n\t"
00765 "wldrd wr13, [%[pixels], #8] \n\t"
00766 "wldrd wr14, [%[pixels], #16] \n\t"
00767 "add %[pixels], %[pixels], %[line_size] \n\t"
00768 "walignr1 wr6, wr12, wr13 \n\t"
00769 "pld [%[pixels]] \n\t"
00770 "pld [%[pixels], #32] \n\t"
00771 "walignr1 wr7, wr13, wr14 \n\t"
00772 "wmoveq wr10, wr13 \n\t"
00773 "wmoveq wr11, wr14 \n\t"
00774 "walignr2ne wr10, wr12, wr13 \n\t"
00775 "walignr2ne wr11, wr13, wr14 \n\t"
00776 "wunpckelub wr4, wr6 \n\t"
00777 "wunpckehub wr5, wr6 \n\t"
00778 "wunpckelub wr6, wr7 \n\t"
00779 "wunpckehub wr7, wr7 \n\t"
00780 "wunpckelub wr8, wr10 \n\t"
00781 "wunpckehub wr9, wr10 \n\t"
00782 "wunpckelub wr10, wr11 \n\t"
00783 "wunpckehub wr11, wr11 \n\t"
00784 "waddhus wr4, wr4, wr8 \n\t"
00785 "waddhus wr5, wr5, wr9 \n\t"
00786 "waddhus wr6, wr6, wr10 \n\t"
00787 "waddhus wr7, wr7, wr11 \n\t"
00788 "waddhus wr8, wr0, wr4 \n\t"
00789 "waddhus wr9, wr1, wr5 \n\t"
00790 "waddhus wr10, wr2, wr6 \n\t"
00791 "waddhus wr11, wr3, wr7 \n\t"
00792 "waddhus wr8, wr8, wr15 \n\t"
00793 "waddhus wr9, wr9, wr15 \n\t"
00794 "waddhus wr10, wr10, wr15 \n\t"
00795 "waddhus wr11, wr11, wr15 \n\t"
00796 "wsrlhg wr8, wr8, wcgr0 \n\t"
00797 "wsrlhg wr9, wr9, wcgr0 \n\t"
00798 "wsrlhg wr10, wr10, wcgr0 \n\t"
00799 "wsrlhg wr11, wr11, wcgr0 \n\t"
00800 "wpackhus wr8, wr8, wr9 \n\t"
00801 "wpackhus wr9, wr10, wr11 \n\t"
00802 "wstrd wr8, [%[block]] \n\t"
00803 "wstrd wr9, [%[block], #8] \n\t"
00804 "add %[block], %[block], %[line_size] \n\t"
00805
00806
00807
00808 "wldrd wr12, [%[pixels]] \n\t"
00809 "wldrd wr13, [%[pixels], #8] \n\t"
00810 "wldrd wr14, [%[pixels], #16] \n\t"
00811 "add %[pixels], %[pixels], %[line_size] \n\t"
00812 "walignr1 wr2, wr12, wr13 \n\t"
00813 "pld [%[pixels]] \n\t"
00814 "pld [%[pixels], #32] \n\t"
00815 "walignr1 wr3, wr13, wr14 \n\t"
00816 "wmoveq wr10, wr13 \n\t"
00817 "wmoveq wr11, wr14 \n\t"
00818 "walignr2ne wr10, wr12, wr13 \n\t"
00819 "walignr2ne wr11, wr13, wr14 \n\t"
00820 "wunpckelub wr0, wr2 \n\t"
00821 "wunpckehub wr1, wr2 \n\t"
00822 "wunpckelub wr2, wr3 \n\t"
00823 "wunpckehub wr3, wr3 \n\t"
00824 "wunpckelub wr8, wr10 \n\t"
00825 "wunpckehub wr9, wr10 \n\t"
00826 "wunpckelub wr10, wr11 \n\t"
00827 "wunpckehub wr11, wr11 \n\t"
00828 "waddhus wr0, wr0, wr8 \n\t"
00829 "waddhus wr1, wr1, wr9 \n\t"
00830 "waddhus wr2, wr2, wr10 \n\t"
00831 "waddhus wr3, wr3, wr11 \n\t"
00832 "waddhus wr8, wr0, wr4 \n\t"
00833 "waddhus wr9, wr1, wr5 \n\t"
00834 "waddhus wr10, wr2, wr6 \n\t"
00835 "waddhus wr11, wr3, wr7 \n\t"
00836 "waddhus wr8, wr8, wr15 \n\t"
00837 "waddhus wr9, wr9, wr15 \n\t"
00838 "waddhus wr10, wr10, wr15 \n\t"
00839 "waddhus wr11, wr11, wr15 \n\t"
00840 "wsrlhg wr8, wr8, wcgr0 \n\t"
00841 "wsrlhg wr9, wr9, wcgr0 \n\t"
00842 "wsrlhg wr10, wr10, wcgr0 \n\t"
00843 "wsrlhg wr11, wr11, wcgr0 \n\t"
00844 "wpackhus wr8, wr8, wr9 \n\t"
00845 "wpackhus wr9, wr10, wr11 \n\t"
00846 "wstrd wr8, [%[block]] \n\t"
00847 "wstrd wr9, [%[block], #8] \n\t"
00848 "add %[block], %[block], %[line_size] \n\t"
00849
00850 "subs %[h], %[h], #2 \n\t"
00851 "bne 1b \n\t"
00852 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00853 : [line_size]"r"(line_size)
00854 : "r12", "memory");
00855 }
00856
00857 void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00858 {
00859
00860
00861 SET_RND(wr15);
00862 __asm__ volatile(
00863 "pld [%[block]] \n\t"
00864 "pld [%[block], #32] \n\t"
00865 "pld [%[pixels]] \n\t"
00866 "mov r12, #2 \n\t"
00867 "pld [%[pixels], #32] \n\t"
00868 "tmcr wcgr0, r12 \n\t"
00869 "and r12, %[pixels], #7 \n\t"
00870 "bic %[pixels], %[pixels], #7 \n\t"
00871 "tmcr wcgr1, r12 \n\t"
00872
00873
00874
00875 "wldrd wr12, [%[pixels]] \n\t"
00876 "add r12, r12, #1 \n\t"
00877 "wldrd wr13, [%[pixels], #8] \n\t"
00878 "tmcr wcgr2, r12 \n\t"
00879 "add %[pixels], %[pixels], %[line_size] \n\t"
00880 "cmp r12, #8 \n\t"
00881 "pld [%[pixels]] \n\t"
00882 "pld [%[pixels], #32] \n\t"
00883 "walignr1 wr2, wr12, wr13 \n\t"
00884 "wmoveq wr10, wr13 \n\t"
00885 "walignr2ne wr10, wr12, wr13 \n\t"
00886 "wunpckelub wr0, wr2 \n\t"
00887 "wunpckehub wr1, wr2 \n\t"
00888 "wunpckelub wr8, wr10 \n\t"
00889 "wunpckehub wr9, wr10 \n\t"
00890 "waddhus wr0, wr0, wr8 \n\t"
00891 "waddhus wr1, wr1, wr9 \n\t"
00892
00893 "1: \n\t"
00894
00895
00896 "wldrd wr12, [%[pixels]] \n\t"
00897 "cmp r12, #8 \n\t"
00898 "wldrd wr13, [%[pixels], #8] \n\t"
00899 "add %[pixels], %[pixels], %[line_size] \n\t"
00900 "walignr1 wr6, wr12, wr13 \n\t"
00901 "pld [%[pixels]] \n\t"
00902 "pld [%[pixels], #32] \n\t"
00903 "wmoveq wr10, wr13 \n\t"
00904 "walignr2ne wr10, wr12, wr13 \n\t"
00905 "wunpckelub wr4, wr6 \n\t"
00906 "wunpckehub wr5, wr6 \n\t"
00907 "wunpckelub wr8, wr10 \n\t"
00908 "wunpckehub wr9, wr10 \n\t"
00909 "waddhus wr4, wr4, wr8 \n\t"
00910 "waddhus wr5, wr5, wr9 \n\t"
00911 "waddhus wr8, wr0, wr4 \n\t"
00912 "waddhus wr9, wr1, wr5 \n\t"
00913 "waddhus wr8, wr8, wr15 \n\t"
00914 "waddhus wr9, wr9, wr15 \n\t"
00915 "wldrd wr12, [%[block]] \n\t"
00916 "wsrlhg wr8, wr8, wcgr0 \n\t"
00917 "wsrlhg wr9, wr9, wcgr0 \n\t"
00918 "wpackhus wr8, wr8, wr9 \n\t"
00919 WAVG2B" wr8, wr8, wr12 \n\t"
00920 "wstrd wr8, [%[block]] \n\t"
00921 "add %[block], %[block], %[line_size] \n\t"
00922 "wldrd wr12, [%[pixels]] \n\t"
00923 "pld [%[block]] \n\t"
00924 "pld [%[block], #32] \n\t"
00925
00926
00927
00928 "wldrd wr13, [%[pixels], #8] \n\t"
00929 "add %[pixels], %[pixels], %[line_size] \n\t"
00930 "walignr1 wr2, wr12, wr13 \n\t"
00931 "pld [%[pixels]] \n\t"
00932 "pld [%[pixels], #32] \n\t"
00933 "wmoveq wr10, wr13 \n\t"
00934 "walignr2ne wr10, wr12, wr13 \n\t"
00935 "wunpckelub wr0, wr2 \n\t"
00936 "wunpckehub wr1, wr2 \n\t"
00937 "wunpckelub wr8, wr10 \n\t"
00938 "wunpckehub wr9, wr10 \n\t"
00939 "waddhus wr0, wr0, wr8 \n\t"
00940 "waddhus wr1, wr1, wr9 \n\t"
00941 "waddhus wr8, wr0, wr4 \n\t"
00942 "waddhus wr9, wr1, wr5 \n\t"
00943 "waddhus wr8, wr8, wr15 \n\t"
00944 "waddhus wr9, wr9, wr15 \n\t"
00945 "wldrd wr12, [%[block]] \n\t"
00946 "wsrlhg wr8, wr8, wcgr0 \n\t"
00947 "wsrlhg wr9, wr9, wcgr0 \n\t"
00948 "wpackhus wr8, wr8, wr9 \n\t"
00949 "subs %[h], %[h], #2 \n\t"
00950 WAVG2B" wr8, wr8, wr12 \n\t"
00951 "wstrd wr8, [%[block]] \n\t"
00952 "add %[block], %[block], %[line_size] \n\t"
00953 "pld [%[block]] \n\t"
00954 "pld [%[block], #32] \n\t"
00955 "bne 1b \n\t"
00956 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00957 : [line_size]"r"(line_size)
00958 : "r12", "memory");
00959 }
00960
/**
 * Blend a 16-byte-wide, h-row block derived from pixels into block (iwMMXt).
 *
 * For every output byte the code forms
 *     t = (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + bias) >> 2
 * (bias is the value held in each halfword of wr15), packs t back to a byte
 * and combines it with the byte already stored in block through the WAVG2B
 * instruction. WAVG2B is a macro defined elsewhere; it is expected to expand
 * to a byte-wise averaging mnemonic -- confirm against its definition.
 *
 * @param block      destination; 16 bytes per row are read, blended and rewritten
 * @param pixels     source; may have any byte alignment. The pointer is rounded
 *                   down to a multiple of 8 and 24 bytes are loaded per row,
 *                   for h + 1 consecutive rows.
 * @param line_size  byte distance between rows, applied to both block and pixels
 * @param h          number of output rows. Two rows are produced per loop pass
 *                   and the loop ends only when the counter reaches exactly 0,
 *                   so h must be a positive even number.
 *
 * The asm overwrites r12, the condition flags (cmp/subs), wcgr0..wcgr2 and
 * wr0..wr15. "cc" is declared as a clobber because GCC does not assume that
 * ARM inline asm changes the flags.
 * NOTE(review): the iwMMXt registers are not declared as clobbers, and wr15 is
 * expected to survive between SET_RND() and the asm statement -- confirm that
 * the compiler never allocates these registers in this translation unit.
 */
00961 void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00962 {
00963 /* wr0..wr3: horizontal pair sums of one source row, wr4..wr7: of the adjacent row */
00964 /* wr15: bias added before the shift; SET_RND is defined elsewhere, presumably it loads that constant -- TODO confirm */
00965 SET_RND(wr15);
00966 __asm__ volatile(
00967 "pld [%[block]] \n\t"
00968 "pld [%[block], #32] \n\t"
00969 "pld [%[pixels]] \n\t"
00970 "mov r12, #2 \n\t"
00971 "pld [%[pixels], #32] \n\t"
00972 "tmcr wcgr0, r12 \n\t" /* wcgr0 = 2: shift count used by wsrlhg below */
00973 /* split pixels into an 8-byte aligned base and a byte offset */
00974 "and r12, %[pixels], #7 \n\t"
00975 "bic %[pixels], %[pixels], #7 \n\t"
00976 "tmcr wcgr1, r12 \n\t" /* wcgr1 = offset: walignr1 yields p[x] */
00977 "add r12, r12, #1 \n\t"
00978 "tmcr wcgr2, r12 \n\t" /* wcgr2 = offset + 1: walignr2 yields p[x+1] */
00979 /* r12 keeps offset + 1 for the rest of the asm; it is compared with 8 before each row */
00980 /* prologue: build the pair sums of the first source row in wr0..wr3 */
00981 /* (wr12..wr14 hold the 24 raw bytes of the row being processed) */
00982 "wldrd wr12, [%[pixels]] \n\t"
00983 "cmp r12, #8 \n\t" /* offset + 1 == 8: p[x+1] is exactly the next doubleword */
00984 "wldrd wr13, [%[pixels], #8] \n\t"
00985 "wldrd wr14, [%[pixels], #16] \n\t"
00986 "add %[pixels], %[pixels], %[line_size] \n\t"
00987 "pld [%[pixels]] \n\t"
00988 "walignr1 wr2, wr12, wr13 \n\t"
00989 "pld [%[pixels], #32] \n\t"
00990 "walignr1 wr3, wr13, wr14 \n\t"
00991 "wmoveq wr10, wr13 \n\t" /* eq: plain copy instead of an alignment by 8 */
00992 "wmoveq wr11, wr14 \n\t"
00993 "walignr2ne wr10, wr12, wr13 \n\t"
00994 "walignr2ne wr11, wr13, wr14 \n\t"
00995 "wunpckelub wr0, wr2 \n\t" /* widen bytes to halfwords: p[x] -> wr0..wr3 */
00996 "wunpckehub wr1, wr2 \n\t"
00997 "wunpckelub wr2, wr3 \n\t"
00998 "wunpckehub wr3, wr3 \n\t"
00999 "wunpckelub wr8, wr10 \n\t" /* p[x+1] -> wr8..wr11 */
01000 "wunpckehub wr9, wr10 \n\t"
01001 "wunpckelub wr10, wr11 \n\t"
01002 "wunpckehub wr11, wr11 \n\t"
01003 "waddhus wr0, wr0, wr8 \n\t" /* wr0..wr3 = p[x] + p[x+1] */
01004 "waddhus wr1, wr1, wr9 \n\t"
01005 "waddhus wr2, wr2, wr10 \n\t"
01006 "waddhus wr3, wr3, wr11 \n\t"
01007 /* main loop: every pass emits two output rows */
01008 "1: \n\t"
01009 /* first row of the pass: pair sums of the next source row go to wr4..wr7 */
01010 /* and are combined with the sums kept in wr0..wr3 */
01011 "wldrd wr12, [%[pixels]] \n\t"
01012 "cmp r12, #8 \n\t" /* flags must be rebuilt: the subs at the loop end overwrote them */
01013 "wldrd wr13, [%[pixels], #8] \n\t"
01014 "wldrd wr14, [%[pixels], #16] \n\t"
01015 "add %[pixels], %[pixels], %[line_size] \n\t"
01016 "walignr1 wr6, wr12, wr13 \n\t"
01017 "pld [%[pixels]] \n\t"
01018 "pld [%[pixels], #32] \n\t"
01019 "walignr1 wr7, wr13, wr14 \n\t"
01020 "wmoveq wr10, wr13 \n\t"
01021 "wmoveq wr11, wr14 \n\t"
01022 "walignr2ne wr10, wr12, wr13 \n\t"
01023 "walignr2ne wr11, wr13, wr14 \n\t"
01024 "wunpckelub wr4, wr6 \n\t"
01025 "wunpckehub wr5, wr6 \n\t"
01026 "wunpckelub wr6, wr7 \n\t"
01027 "wunpckehub wr7, wr7 \n\t"
01028 "wunpckelub wr8, wr10 \n\t"
01029 "wunpckehub wr9, wr10 \n\t"
01030 "wunpckelub wr10, wr11 \n\t"
01031 "wunpckehub wr11, wr11 \n\t"
01032 "waddhus wr4, wr4, wr8 \n\t" /* wr4..wr7 = p[x] + p[x+1] of this row */
01033 "waddhus wr5, wr5, wr9 \n\t"
01034 "waddhus wr6, wr6, wr10 \n\t"
01035 "waddhus wr7, wr7, wr11 \n\t"
01036 "waddhus wr8, wr0, wr4 \n\t" /* add the sums of the two adjacent rows */
01037 "waddhus wr9, wr1, wr5 \n\t"
01038 "waddhus wr10, wr2, wr6 \n\t"
01039 "waddhus wr11, wr3, wr7 \n\t"
01040 "waddhus wr8, wr8, wr15 \n\t" /* add the bias from wr15 */
01041 "waddhus wr9, wr9, wr15 \n\t"
01042 "waddhus wr10, wr10, wr15 \n\t"
01043 "waddhus wr11, wr11, wr15 \n\t"
01044 "wsrlhg wr8, wr8, wcgr0 \n\t" /* >> 2 */
01045 "wsrlhg wr9, wr9, wcgr0 \n\t"
01046 "wldrd wr12, [%[block]] \n\t" /* current destination bytes */
01047 "wldrd wr13, [%[block], #8] \n\t"
01048 "wsrlhg wr10, wr10, wcgr0 \n\t"
01049 "wsrlhg wr11, wr11, wcgr0 \n\t"
01050 "wpackhus wr8, wr8, wr9 \n\t" /* halfwords back to 16 bytes */
01051 "wpackhus wr9, wr10, wr11 \n\t"
01052 WAVG2B" wr8, wr8, wr12 \n\t" /* blend with the destination */
01053 WAVG2B" wr9, wr9, wr13 \n\t"
01054 "wstrd wr8, [%[block]] \n\t"
01055 "wstrd wr9, [%[block], #8] \n\t"
01056 "add %[block], %[block], %[line_size] \n\t"
01057 /* second row of the pass: the next source row goes to wr0..wr3 and is */
01058 /* combined with wr4..wr7. No new cmp is issued: nothing since the cmp at */
01059 /* the top of the loop has changed the flags, so eq/ne are still valid. */
01060 "wldrd wr12, [%[pixels]] \n\t"
01061 "pld [%[block]] \n\t"
01062 "wldrd wr13, [%[pixels], #8] \n\t"
01063 "pld [%[block], #32] \n\t"
01064 "wldrd wr14, [%[pixels], #16] \n\t"
01065 "add %[pixels], %[pixels], %[line_size] \n\t"
01066 "walignr1 wr2, wr12, wr13 \n\t"
01067 "pld [%[pixels]] \n\t"
01068 "pld [%[pixels], #32] \n\t"
01069 "walignr1 wr3, wr13, wr14 \n\t"
01070 "wmoveq wr10, wr13 \n\t"
01071 "wmoveq wr11, wr14 \n\t"
01072 "walignr2ne wr10, wr12, wr13 \n\t"
01073 "walignr2ne wr11, wr13, wr14 \n\t"
01074 "wunpckelub wr0, wr2 \n\t"
01075 "wunpckehub wr1, wr2 \n\t"
01076 "wunpckelub wr2, wr3 \n\t"
01077 "wunpckehub wr3, wr3 \n\t"
01078 "wunpckelub wr8, wr10 \n\t"
01079 "wunpckehub wr9, wr10 \n\t"
01080 "wunpckelub wr10, wr11 \n\t"
01081 "wunpckehub wr11, wr11 \n\t"
01082 "waddhus wr0, wr0, wr8 \n\t" /* wr0..wr3 are ready for the next pass as well */
01083 "waddhus wr1, wr1, wr9 \n\t"
01084 "waddhus wr2, wr2, wr10 \n\t"
01085 "waddhus wr3, wr3, wr11 \n\t"
01086 "waddhus wr8, wr0, wr4 \n\t"
01087 "waddhus wr9, wr1, wr5 \n\t"
01088 "waddhus wr10, wr2, wr6 \n\t"
01089 "waddhus wr11, wr3, wr7 \n\t"
01090 "waddhus wr8, wr8, wr15 \n\t"
01091 "waddhus wr9, wr9, wr15 \n\t"
01092 "waddhus wr10, wr10, wr15 \n\t"
01093 "waddhus wr11, wr11, wr15 \n\t"
01094 "wsrlhg wr8, wr8, wcgr0 \n\t"
01095 "wsrlhg wr9, wr9, wcgr0 \n\t"
01096 "wldrd wr12, [%[block]] \n\t"
01097 "wldrd wr13, [%[block], #8] \n\t"
01098 "wsrlhg wr10, wr10, wcgr0 \n\t"
01099 "wsrlhg wr11, wr11, wcgr0 \n\t"
01100 "wpackhus wr8, wr8, wr9 \n\t"
01101 "wpackhus wr9, wr10, wr11 \n\t"
01102 WAVG2B" wr8, wr8, wr12 \n\t"
01103 WAVG2B" wr9, wr9, wr13 \n\t"
01104 "wstrd wr8, [%[block]] \n\t"
01105 "wstrd wr9, [%[block], #8] \n\t"
01106 "add %[block], %[block], %[line_size] \n\t"
01107 "subs %[h], %[h], #2 \n\t" /* two rows done; sets Z when the counter hits 0 */
01108 "pld [%[block]] \n\t"
01109 "pld [%[block], #32] \n\t"
01110 "bne 1b \n\t"
01111 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
01112 : [line_size]"r"(line_size)
01113 : "r12", "cc", "memory"); /* "cc": cmp and subs overwrite the condition flags */
01114 }