From e387d69a178a627222905ef9d6dd88dcabd1d143 Mon Sep 17 00:00:00 2001
From: bruce-wu <bruce-wu@hotmail.com>
Date: Sat, 22 Sep 2012 13:02:15 +0800
Subject: [PATCH] modify-macro-h264_chroma_mc8-and-h264_chroma_mc4-patch
---
libavcodec/arm/h264cmc_neon.S | 62 +++++++++++++++++-----------------------
1 file changed, 26 insertions(+), 36 deletions(-)
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index c7e5460..3aeb545 100644
| a | b | T cmp r7, #0 |
| 58 | 58 | vdup.8 d1, r12 |
| 59 | 59 | vld1.8 {d4, d5}, [r1], r4 |
| 60 | 60 | vdup.8 d2, r6 |
| 61 | | vld1.8 {d6, d7}, [r5], r4 |
| 62 | 61 | vdup.8 d3, r7 |
| 63 | 62 | |
| 64 | 63 | vext.8 d5, d4, d5, #1 |
| 65 | | vext.8 d7, d6, d7, #1 |
| 66 | 64 | |
| 67 | | 1: pld [r5] |
| | 65 | 1: |
| | 66 | vld1.64 {d6, d7}, [r5], r4 |
| | 67 | pld [r5] |
| 68 | 68 | vmull.u8 q8, d4, d0 |
| | 69 | vext.8 d7, d6, d7, #1 |
| 69 | 70 | vmlal.u8 q8, d5, d1 |
| 70 | 71 | vld1.8 {d4, d5}, [r1], r4 |
| 71 | 72 | vmlal.u8 q8, d6, d2 |
| … | … | T cmp r7, #0 |
| 76 | 77 | vmlal.u8 q9, d7, d1 |
| 77 | 78 | vmlal.u8 q9, d4, d2 |
| 78 | 79 | vmlal.u8 q9, d5, d3 |
| 79 | | vld1.8 {d6, d7}, [r5], r4 |
| 80 | 80 | pld [r1] |
| 81 | 81 | .ifc \codec,h264 |
| 82 | 82 | vrshrn.u16 d16, q8, #6 |
| … | … | T cmp r7, #0 |
| 92 | 92 | vld1.8 {d21}, [lr,:64], r2 |
| 93 | 93 | vrhadd.u8 q8, q8, q10 |
| 94 | 94 | .endif |
| 95 | | vext.8 d7, d6, d7, #1 |
| 96 | 95 | vst1.8 {d16}, [r0,:64], r2 |
| 97 | 96 | vst1.8 {d17}, [r0,:64], r2 |
| 98 | 97 | bgt 1b |
| … | … | T cmp r7, #0 |
| 108 | 107 | |
| 109 | 108 | add r5, r1, r2 |
| 110 | 109 | lsl r4, r2, #1 |
| | 110 | |
| | 111 | 3: |
| 111 | 112 | vld1.8 {d4}, [r1], r4 |
| 112 | 113 | vld1.8 {d6}, [r5], r4 |
| 113 | 114 | |
| 114 | | 3: pld [r5] |
| | 115 | pld [r5] |
| 115 | 116 | vmull.u8 q8, d4, d0 |
| 116 | 117 | vmlal.u8 q8, d6, d1 |
| 117 | | vld1.8 {d4}, [r1], r4 |
| 118 | 118 | vmull.u8 q9, d6, d0 |
| 119 | 119 | vmlal.u8 q9, d4, d1 |
| 120 | | vld1.8 {d6}, [r5], r4 |
| 121 | 120 | .ifc \codec,h264 |
| 122 | 121 | vrshrn.u16 d16, q8, #6 |
| 123 | 122 | vrshrn.u16 d17, q9, #6 |
| … | … | T cmp r7, #0 |
| 145 | 144 | vext.8 d5, d4, d5, #1 |
| 146 | 145 | vext.8 d7, d6, d7, #1 |
| 147 | 146 | |
| 148 | | 5: pld [r1] |
| | 147 | pld [r1] |
| 149 | 148 | subs r3, r3, #2 |
| 150 | 149 | vmull.u8 q8, d4, d0 |
| 151 | 150 | vmlal.u8 q8, d5, d1 |
| 152 | | vld1.8 {d4, d5}, [r1], r2 |
| 153 | 151 | vmull.u8 q9, d6, d0 |
| 154 | 152 | vmlal.u8 q9, d7, d1 |
| 155 | 153 | pld [r1] |
| 156 | | vext.8 d5, d4, d5, #1 |
| 157 | 154 | .ifc \codec,h264 |
| 158 | 155 | vrshrn.u16 d16, q8, #6 |
| 159 | 156 | vrshrn.u16 d17, q9, #6 |
| … | … | T cmp r7, #0 |
| 168 | 165 | vld1.8 {d21}, [lr,:64], r2 |
| 169 | 166 | vrhadd.u8 q8, q8, q10 |
| 170 | 167 | .endif |
| 171 | | vld1.8 {d6, d7}, [r1], r2 |
| 172 | | vext.8 d7, d6, d7, #1 |
| 173 | 168 | vst1.8 {d16}, [r0,:64], r2 |
| 174 | 169 | vst1.8 {d17}, [r0,:64], r2 |
| 175 | | bgt 5b |
| | 170 | bgt 4b |
| 176 | 171 | |
| 177 | 172 | pop {r4-r7, pc} |
| 178 | 173 | endfunc |
| … | … | endfunc |
| 182 | 177 | .macro h264_chroma_mc4 type, codec=h264 |
| 183 | 178 | function ff_\type\()_\codec\()_chroma_mc4_neon, export=1 |
| 184 | 179 | push {r4-r7, lr} |
| 185 | | ldrd r4, r5, [sp, #20] |
| | 180 | ldrd r4, [sp, #20] |
| 186 | 181 | .ifc \type,avg |
| 187 | 182 | mov lr, r0 |
| 188 | 183 | .endif |
| … | … | T cmp r7, #0 |
| 216 | 211 | vdup.8 d1, r12 |
| 217 | 212 | vld1.8 {d4}, [r1], r4 |
| 218 | 213 | vdup.8 d2, r6 |
| 219 | | vld1.8 {d6}, [r5], r4 |
| 220 | 214 | vdup.8 d3, r7 |
| 221 | 215 | |
| 222 | 216 | vext.8 d5, d4, d5, #1 |
| 223 | | vext.8 d7, d6, d7, #1 |
| 224 | | vtrn.32 d4, d5 |
| 225 | | vtrn.32 d6, d7 |
| 226 | | |
| 227 | 217 | vtrn.32 d0, d1 |
| 228 | 218 | vtrn.32 d2, d3 |
| | 219 | vtrn.32 d4, d5 |
| 229 | 220 | |
| 230 | | 1: pld [r5] |
| | 221 | 1: |
| | 222 | vld1.8 {d6}, [r5], r4 |
| | 223 | pld [r5] |
| | 224 | vext.8 d7, d6, d7, #1 |
| 231 | 225 | vmull.u8 q8, d4, d0 |
| 232 | | vmlal.u8 q8, d6, d2 |
| | 226 | vtrn.32 d6, d7 |
| | 227 | |
| 233 | 228 | vld1.8 {d4}, [r1], r4 |
| | 229 | vmlal.u8 q8, d6, d2 |
| 234 | 230 | vext.8 d5, d4, d5, #1 |
| 235 | | vtrn.32 d4, d5 |
| 236 | 231 | vmull.u8 q9, d6, d0 |
| | 232 | vtrn.32 d4, d5 |
| 237 | 233 | vmlal.u8 q9, d4, d2 |
| 238 | | vld1.8 {d6}, [r5], r4 |
| | 234 | |
| 239 | 235 | vadd.i16 d16, d16, d17 |
| 240 | 236 | vadd.i16 d17, d18, d19 |
| 241 | 237 | .ifc \codec,h264 |
| … | … | T cmp r7, #0 |
| 251 | 247 | vld1.32 {d20[1]}, [lr,:32], r2 |
| 252 | 248 | vrhadd.u8 d16, d16, d20 |
| 253 | 249 | .endif |
| 254 | | vext.8 d7, d6, d7, #1 |
| 255 | | vtrn.32 d6, d7 |
| 256 | 250 | vst1.32 {d16[0]}, [r0,:32], r2 |
| 257 | 251 | vst1.32 {d16[1]}, [r0,:32], r2 |
| 258 | 252 | bgt 1b |
| … | … | T cmp r7, #0 |
| 271 | 265 | add r5, r1, r2 |
| 272 | 266 | lsl r4, r2, #1 |
| 273 | 267 | vld1.32 {d4[0]}, [r1], r4 |
| | 268 | 3: |
| 274 | 269 | vld1.32 {d4[1]}, [r5], r4 |
| 275 | 270 | |
| 276 | | 3: pld [r5] |
| | 271 | pld [r5] |
| 277 | 272 | vmull.u8 q8, d4, d0 |
| 278 | 273 | vld1.32 {d4[0]}, [r1], r4 |
| 279 | 274 | vmull.u8 q9, d4, d1 |
| 280 | | vld1.32 {d4[1]}, [r5], r4 |
| | 275 | |
| 281 | 276 | vadd.i16 d16, d16, d17 |
| 282 | 277 | vadd.i16 d17, d18, d19 |
| 283 | 278 | .ifc \codec,h264 |
| … | … | T cmp r7, #0 |
| 306 | 301 | vtrn.32 d4, d5 |
| 307 | 302 | vtrn.32 d6, d7 |
| 308 | 303 | |
| 309 | | 5: vmull.u8 q8, d4, d0 |
| | 304 | vmull.u8 q8, d4, d0 |
| 310 | 305 | vmull.u8 q9, d6, d0 |
| 311 | 306 | subs r3, r3, #2 |
| 312 | | vld1.8 {d4}, [r1], r2 |
| 313 | | vext.8 d5, d4, d5, #1 |
| 314 | | vtrn.32 d4, d5 |
| | 307 | |
| 315 | 308 | vadd.i16 d16, d16, d17 |
| 316 | 309 | vadd.i16 d17, d18, d19 |
| 317 | 310 | pld [r1] |
| … | … | T cmp r7, #0 |
| 326 | 319 | vld1.32 {d20[1]}, [lr,:32], r2 |
| 327 | 320 | vrhadd.u8 d16, d16, d20 |
| 328 | 321 | .endif |
| 329 | | vld1.8 {d6}, [r1], r2 |
| 330 | | vext.8 d7, d6, d7, #1 |
| 331 | | vtrn.32 d6, d7 |
| 332 | 322 | pld [r1] |
| 333 | 323 | vst1.32 {d16[0]}, [r0,:32], r2 |
| 334 | 324 | vst1.32 {d16[1]}, [r0,:32], r2 |
| 335 | | bgt 5b |
| | 325 | bgt 4b |
| 336 | 326 | |
| 337 | 327 | pop {r4-r7, pc} |
| 338 | 328 | endfunc |