[FFmpeg-devel] [PATCH 2/2] lavc/aarch64: h264, add idct for 10bit
Mikhail Nitenko
mnitenko at gmail.com
Fri Aug 20 00:31:02 EEST 2021
Benchmarks (checkasm cycle counts; columns are Cortex-A53 / Cortex-A72):
h264_idct4_add_10bpp_c: 187.7 115.2
h264_idct4_add_10bpp_neon: 72.5 45.0
h264_idct4_add_dc_10bpp_c: 96.0 61.2
h264_idct4_add_dc_10bpp_neon: 36.0 19.5
h264_idct8_add4_10bpp_c: 2115.5 1424.2
h264_idct8_add4_10bpp_neon: 734.0 459.5
h264_idct8_add_10bpp_c: 1017.5 709.0
h264_idct8_add_10bpp_neon: 345.5 216.5
h264_idct8_add_dc_10bpp_c: 316.0 235.5
h264_idct8_add_dc_10bpp_neon: 69.7 44.0
h264_idct_add16_10bpp_c: 2540.2 1498.5
h264_idct_add16_10bpp_neon: 1080.5 616.0
h264_idct_add16intra_10bpp_c: 784.7 439.5
h264_idct_add16intra_10bpp_neon: 641.0 462.2
Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
---
one function (ff_h264_idct_add8_neon_10) is not covered by checkasm tests;
I verified it against sample videos, but I am not sure how best to add test coverage for it
libavcodec/aarch64/h264dsp_init_aarch64.c | 28 ++
libavcodec/aarch64/h264idct_neon.S | 524 ++++++++++++++++++++++
2 files changed, 552 insertions(+)
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index 6bf3ecb8a1..78ed9d06cd 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -106,6 +106,24 @@ void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t strid
void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
+void ff_h264_idct_add_neon_10(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_dc_add_neon_10(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_add16_neon_10(uint8_t *dst, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_neon_10(uint8_t *dst, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_neon_10(uint8_t **dest, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+
+void ff_h264_idct8_add_neon_10(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_neon_10(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add4_neon_10(uint8_t *dst, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+
av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
@@ -162,5 +180,15 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10;
c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
}
+
+ c->h264_idct_add = ff_h264_idct_add_neon_10;
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_neon_10;
+ c->h264_idct_add16 = ff_h264_idct_add16_neon_10;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_neon_10;
+ if (chroma_format_idc <= 1)
+ c->h264_idct_add8 = ff_h264_idct_add8_neon_10;
+ c->h264_idct8_add = ff_h264_idct8_add_neon_10;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon_10;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_neon_10;
}
}
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 7de44205d3..f238a2cd3f 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -411,3 +411,527 @@ const scan8
.byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
.byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst
+
+// void ff_h264_idct_add_neon_10(uint8_t *dst, int16_t *block, int stride)
+// Full 4x4 inverse transform + add for 10-bit pixels.
+// x0 = dst (16-bit pixels), x1 = block (32-bit coefficients at this bit
+// depth, cleared on return), w2 = stride in bytes.
+// Fix vs. previous revision: clamp the reconstructed pixels to 0x3FF.
+// sqxtun only saturates to [0, 65535], but 10-bit output must obey
+// Clip1() to [0, 1023] — the DC-only variants already do this with
+// mvni/smin; do the same here.
+function ff_h264_idct_add_neon_10, export=1
+.L_ff_h264_idct_add_neon_10:
+ ld1 {v0.4S, v1.4S, v2.4S, v3.4S}, [x1]
+
+ sxtw x2, w2 // stride is passed as int
+ movi v30.8H, #0
+
+ // first (column) pass, interleaved with clearing the coefficients
+ add v4.4S, v0.4S, v2.4S
+ sshr v16.4S, v1.4S, #1
+ st1 {v30.8H}, [x1], #16
+ st1 {v30.8H}, [x1], #16
+ sshr v17.4S, v3.4S, #1
+ st1 {v30.8H}, [x1], #16
+ st1 {v30.8H}, [x1], #16
+ sub v5.4S, v0.4S, v2.4S
+ sub v6.4S, v16.4S, v3.4S
+ add v7.4S, v1.4S, v17.4S
+ add v0.4S, v4.4S, v7.4S
+ add v1.4S, v5.4S, v6.4S
+ sub v2.4S, v5.4S, v6.4S
+ sub v3.4S, v4.4S, v7.4S
+
+ transpose_4x4S v0, v1, v2, v3, v4, v5, v6, v7
+
+ // second (row) pass, interleaved with loading the destination rows;
+ // rows 2/3 land in swapped lanes of v19 and are stored back in the
+ // same swapped order below, so the net effect is in-order.
+ add v4.4S, v0.4S, v2.4S
+ ld1 {v18.D}[0], [x0], x2
+ sshr v16.4S, v3.4S, #1
+ sshr v17.4S, v1.4S, #1
+ ld1 {v18.D}[1], [x0], x2
+ sub v5.4S, v0.4S, v2.4S
+ ld1 {v19.D}[1], [x0], x2
+ add v6.4S, v16.4S, v1.4S
+ sub v7.4S, v17.4S, v3.4S
+ ld1 {v19.D}[0], [x0], x2
+ sub x0, x0, x2, lsl #2 // rewind dst to row 0
+ add v0.4S, v4.4S, v6.4S
+ add v1.4S, v5.4S, v7.4S
+ sub v2.4S, v4.4S, v6.4S
+ sub v3.4S, v5.4S, v7.4S
+
+ srshr v0.4S, v0.4S, #6 // (x + 32) >> 6
+ srshr v1.4S, v1.4S, #6
+ srshr v2.4S, v2.4S, #6
+ srshr v3.4S, v3.4S, #6
+
+ uaddw v0.4S, v0.4S, v18.4H // residual + pixel
+ uaddw2 v1.4S, v1.4S, v18.8H
+ uaddw v2.4S, v2.4S, v19.4H
+ uaddw2 v3.4S, v3.4S, v19.8H
+
+ sqxtun v0.4H, v0.4S // clamp low side to 0
+ sqxtun2 v0.8H, v1.4S
+ sqxtun v1.4H, v2.4S
+ sqxtun2 v1.8H, v3.4S
+
+ mvni v4.8H, #0xFC, lsl #8 // 0x03FF = (1 << 10) - 1
+ smin v0.8H, v0.8H, v4.8H // clamp high side to 10-bit max
+ smin v1.8H, v1.8H, v4.8H
+
+ st1 {v0.D}[0], [x0], x2
+ st1 {v0.D}[1], [x0], x2
+ st1 {v1.D}[1], [x0], x2 // swapped, matches the load order above
+ st1 {v1.D}[0], [x0], x2
+
+ sub x1, x1, #64 // restore block pointer for callers that reuse it
+ ret
+endfunc
+
+// void ff_h264_idct_dc_add_neon_10(uint8_t *dst, int16_t *block, int stride)
+// DC-only inverse transform: adds the rounded DC coefficient block[0] to a
+// 4x4 region of 10-bit pixels and zeroes the coefficient.
+// x0 = dst (16-bit pixels), x1 = block (32-bit coefficients at this bit
+// depth), w2 = stride in bytes.
+function ff_h264_idct_dc_add_neon_10, export=1
+.L_ff_h264_idct_dc_add_neon_10:
+ sxtw x2, w2 // stride is passed as int
+ mov x3, #0
+ ld1r {v2.4S}, [x1] // broadcast block[0] to all lanes
+ dup v3.4S, v2.S[0] // NOTE(review): v3 == v2 already (ld1r replicates) — looks redundant
+ str x3, [x1] // clear block[0] (and block[1])
+ srshr v2.4S, v2.4S, #6 // dc = (dc + 32) >> 6
+ srshr v3.4S, v3.4S, #6
+ ld1 {v0.D}[0], [x0], x2 // load 4 rows of 4 16-bit pixels
+ ld1 {v0.D}[1], [x0], x2
+ uaddw v4.4S, v2.4S, v0.4H // dc + pixel, widened to 32 bit
+ uaddw2 v5.4S, v3.4S, v0.8H
+ ld1 {v1.D}[0], [x0], x2
+ ld1 {v1.D}[1], [x0], x2
+ uaddw v6.4S, v2.4S, v1.4H
+ uaddw2 v7.4S, v3.4S, v1.8H
+ sqxtun v0.4H, v4.4S // clamp low side to 0
+ sqxtun2 v0.8H, v5.4S
+ sqxtun v1.4H, v6.4S
+ sqxtun2 v1.8H, v7.4S
+ sub x0, x0, x2, lsl #2 // rewind dst to row 0
+
+ mvni v4.8H, #0xFC, lsl #8 // 0x03FF = 10-bit pixel max
+ smin v0.8H, v0.8H, v4.8H // clamp high side to 1023
+ smin v1.8H, v1.8H, v4.8H
+
+ st1 {v0.D}[0], [x0], x2
+ st1 {v0.D}[1], [x0], x2
+ st1 {v1.D}[0], [x0], x2
+ st1 {v1.D}[1], [x0], x2
+ ret
+endfunc
+
+// void ff_h264_idct_add16_neon_10(uint8_t *dst, const int *block_offset,
+//                                 int16_t *block, int stride,
+//                                 const uint8_t nnzc[6*8])
+// For each of the 16 luma 4x4 blocks: skip if nnzc[scan8[i]] == 0;
+// use the DC-only add when nnzc == 1 and block[i*16] != 0, otherwise the
+// full idct+add.
+function ff_h264_idct_add16_neon_10, export=1
+ mov x12, x30 // save LR; the blr below clobbers it
+ mov x6, x0 // dest
+ mov x5, x1 // block_offset
+ mov x1, x2 // block
+ mov w9, w3 // stride
+ movrel x7, scan8
+ mov x10, #16 // loop counter, 16 blocks
+ movrel x13, .L_ff_h264_idct_dc_add_neon_10
+ movrel x14, .L_ff_h264_idct_add_neon_10
+1: mov w2, w9
+ ldrb w3, [x7], #1
+ ldrsw x0, [x5], #4
+ ldrb w3, [x4, w3, uxtw] // nnzc[scan8[i]]
+ subs w3, w3, #1
+ b.lt 2f // nnzc == 0: nothing to add
+ ldrsh w3, [x1] // block[i*16]; NOTE(review): coefficients are
+ // 32-bit at 10-bit depth — ldrsh reads only the low halfword (the
+ // add8 variant uses ldrsw); should this be a 32-bit load? TODO confirm
+ add x0, x0, x6
+ ccmp w3, #0, #4, eq // if nnzc == 1, test block[i*16]; else force Z
+ csel x15, x13, x14, ne // dc-only iff nnzc == 1 && block[i*16] != 0
+ blr x15
+2: subs x10, x10, #1
+ add x1, x1, #64 // 16 coefficients * 4 bytes
+ b.ne 1b
+ ret x12
+endfunc
+
+// void ff_h264_idct_add16intra_neon_10(uint8_t *dst, const int *block_offset,
+//                                      int16_t *block, int stride,
+//                                      const uint8_t nnzc[6*8])
+// Intra variant: full idct+add when nnzc[scan8[i]] != 0, DC-only add when
+// nnzc == 0 but block[i*16] != 0, skip when both are zero.
+function ff_h264_idct_add16intra_neon_10, export=1
+ mov x12, x30 // save LR; the blr below clobbers it
+ mov x6, x0 // dest
+ mov x5, x1 // block_offset
+ mov x1, x2 // block
+ mov w9, w3 // stride
+ movrel x7, scan8
+ mov x10, #16 // loop counter, 16 blocks
+ movrel x13, .L_ff_h264_idct_dc_add_neon_10
+ movrel x14, .L_ff_h264_idct_add_neon_10
+1: mov w2, w9
+ ldrb w3, [x7], #1
+ ldrsw x0, [x5], #4
+ ldrb w3, [x4, w3, uxtw] // nnzc[scan8[i]]
+ add x0, x0, x6
+ cmp w3, #0
+ ldrsh w3, [x1] // block[i*16]; NOTE(review): coefficients are
+ // 32-bit at 10-bit depth — ldrsh reads only the low halfword (the
+ // add8 variant uses ldrsw); should this be a 32-bit load? TODO confirm
+ csel x15, x13, x14, eq // dc-only path iff nnzc == 0
+ ccmp w3, #0, #0, eq // when nnzc == 0, also require block[i*16] != 0
+ b.eq 2f // nnzc == 0 && block[i*16] == 0: skip
+ blr x15
+2: subs x10, x10, #1
+ add x1, x1, #64 // 16 coefficients * 4 bytes
+ b.ne 1b
+ ret x12
+endfunc
+
+// void ff_h264_idct_add8_neon_10(uint8_t **dest, const int *block_offset,
+//                                int16_t *block, int stride,
+//                                const uint8_t nnzc[6*8])
+// Chroma variant: processes blocks 16..19 (Cb, dest[0]) then 32..35
+// (Cr, dest[1]). Not covered by checkasm; verified with sample videos.
+function ff_h264_idct_add8_neon_10, export=1
+ sub sp, sp, #0x40 // NOTE(review): 64 bytes reserved but only 16 used below
+ stp x19, x20, [sp] // x19/x20 are callee-saved
+ mov x12, x30 // save LR; the blr below clobbers it
+ ldp x6, x15, [x0] // dest[0], dest[1]
+ add x5, x1, #16*4 // &block_offset[16]
+ add x9, x2, #32*32 // &block[16*16] (32-bit coefficients)
+ mov w19, w3 // stride
+ movrel x13, .L_ff_h264_idct_dc_add_neon_10
+ movrel x14, .L_ff_h264_idct_add_neon_10
+ movrel x7, scan8, 16 // &scan8[16]
+ mov x10, #0 // i
+ mov x11, #16
+1: mov w2, w19
+ ldrb w3, [x7, x10] // scan8[16 + i]
+ ldrsw x0, [x5, x10, lsl #2] // block_offset[16 + i]
+ ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ]
+ add x0, x0, x6 // dest[j] + block_offset[i]
+ add x1, x9, x10, lsl #6 // block + i * 16 * sizeof(int32_t)
+ cmp w3, #0
+ ldrsw x3, [x1] // block[i*16] (32-bit coefficient)
+ csel x20, x13, x14, eq // dc-only path iff nnzc == 0
+ ccmp x3, #0, #0, eq // when nnzc == 0, also require block[i*16] != 0
+ b.eq 2f // nothing to add for this block
+ blr x20
+2: add x10, x10, #1
+ cmp x10, #4
+ csel x10, x11, x10, eq // after i == 3, jump to i = 16 ...
+ csel x6, x15, x6, eq // ... and switch to dest[1]
+ cmp x10, #20
+ b.lt 1b
+ ldp x19, x20, [sp]
+ add sp, sp, #0x40
+ ret x12
+endfunc
+
+// One butterfly pass of the 8x8 inverse transform on 32-bit coefficients.
+// The eight rows live in v16..v31 (two .4S registers per row). Pass 0 also
+// loads rows 4..7 from [x1] and zeroes the coefficient block — v7 must hold
+// zero on entry to pass 0 (the caller sets it with movi). The va/vaa/vb/vbb
+// .req aliases swap which registers carry intermediates between the passes
+// so that pass 1 can reuse the layout produced by pass 0's transpose.
+.macro idct8x8_cols_10 pass
+ .if \pass == 0
+ va .req v0
+ vaa .req v1
+ vb .req v28
+ vbb .req v29
+
+ sshr v0.4S, v20.4S, #1
+ sshr v1.4S, v21.4S, #1
+ add v2.4S, v16.4S, v24.4S // a0
+ add v3.4S, v17.4S, v25.4S
+
+ // load rows 4..7 and clear them in the block (v7 == 0 here)
+ ld1 {v28.4S, v29.4S, v30.4S, v31.4S}, [x1]
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+
+ sub v4.4S, v16.4S, v24.4S
+ sub v5.4S, v17.4S, v25.4S
+ sshr v6.4S, v28.4S, #1
+ sshr v7.4S, v29.4S, #1
+ sub v0.4S, v0.4S, v28.4S
+ sub v1.4S, v1.4S, v29.4S
+ add v6.4S, v6.4S, v20.4S // a6
+ add v7.4S, v7.4S, v21.4S
+ .else
+ va .req v28
+ vaa .req v29
+ vb .req v0
+ vbb .req v1
+
+ sshr v28.4S, v20.4S, #1
+ sshr v29.4S, v21.4S, #1
+ sshr v6.4S, v0.4S, #1
+ sshr v7.4S, v1.4S, #1
+ add v2.4S, v16.4S, v24.4S
+ add v3.4S, v17.4S, v25.4S
+ sub v4.4S, v16.4S, v24.4S
+ sub v5.4S, v17.4S, v25.4S
+ sub v28.4S, v28.4S, v0.4S
+ sub v29.4S, v29.4S, v1.4S
+ add v6.4S, v6.4S, v20.4S
+ add v7.4S, v7.4S, v21.4S
+ .endif
+ // even half: combine a0/a2/a4/a6 into the even outputs
+ add v20.4S, v4.4S, va.4S
+ add v21.4S, v5.4S, vaa.4S
+ sub v24.4S, v4.4S, va.4S
+ sub v25.4S, v5.4S, vaa.4S
+ add v16.4S, v2.4S, v6.4S // b0
+ add v17.4S, v3.4S, v7.4S
+ sub vb.4S, v2.4S, v6.4S
+ sub vbb.4S, v3.4S, v7.4S
+ // odd half: a1/a3/a5/a7 from rows 1, 3, 5, 7
+ sub v2.4S, v26.4S, v22.4S
+ sub v3.4S, v27.4S, v23.4S
+ add v4.4S, v30.4S, v18.4S
+ add v5.4S, v31.4S, v19.4S
+ sub va.4S, v30.4S, v18.4S
+ sub vaa.4S, v31.4S, v19.4S
+ add v6.4S, v26.4S, v22.4S
+ add v7.4S, v27.4S, v23.4S
+ sub v2.4S, v2.4S, v30.4S
+ sub v3.4S, v3.4S, v31.4S
+ sub v4.4S, v4.4S, v22.4S
+ sub v5.4S, v5.4S, v23.4S
+ add va.4S, va.4S, v26.4S
+ add vaa.4S, vaa.4S, v27.4S
+ add v6.4S, v6.4S, v18.4S
+ add v7.4S, v7.4S, v19.4S
+ sshr v18.4S, v18.4S, #1
+ sshr v19.4S, v19.4S, #1
+ sshr v22.4S, v22.4S, #1
+ sshr v23.4S, v23.4S, #1
+ sshr v26.4S, v26.4S, #1
+ sshr v27.4S, v27.4S, #1
+ sshr v30.4S, v30.4S, #1
+ sshr v31.4S, v31.4S, #1
+ sub v2.4S, v2.4S, v30.4S
+ sub v3.4S, v3.4S, v31.4S
+ sub v4.4S, v4.4S, v22.4S
+ sub v5.4S, v5.4S, v23.4S
+ add va.4S, va.4S, v26.4S
+ add vaa.4S, vaa.4S, v27.4S
+ add v6.4S, v6.4S, v18.4S // a7
+ add v7.4S, v7.4S, v19.4S
+ // b1 = a7>>2 + ..., etc.: fold in the quarter terms
+ sshr v18.4S, v2.4S, #2
+ sshr v19.4S, v3.4S, #2
+ sshr v22.4S, v4.4S, #2
+ sshr v23.4S, v5.4S, #2
+ sshr v26.4S, va.4S, #2
+ sshr v27.4S, vaa.4S, #2
+ sshr v30.4S, v6.4S, #2
+ sshr v31.4S, v7.4S, #2
+ sub v6.4S, v6.4S, v18.4S
+ sub v7.4S, v7.4S, v19.4S
+ sub va.4S, v22.4S, va.4S
+ sub vaa.4S, v23.4S, vaa.4S
+ add v4.4S, v4.4S, v26.4S
+ add v5.4S, v5.4S, v27.4S
+ add v2.4S, v2.4S, v30.4S
+ add v3.4S, v3.4S, v31.4S
+ // final combine: outputs row_k = even_k +/- odd_(7-k)
+ .if \pass == 0
+ sub v30.4S, v16.4S, v6.4S
+ sub v31.4S, v17.4S, v7.4S
+ add v16.4S, v16.4S, v6.4S
+ add v17.4S, v17.4S, v7.4S
+ add v18.4S, v20.4S, v0.4S
+ add v19.4S, v21.4S, v1.4S
+ sub v0.4S, v20.4S, v0.4S
+ sub v1.4S, v21.4S, v1.4S
+ add v20.4S, v24.4S, v4.4S
+ add v21.4S, v25.4S, v5.4S
+ add v22.4S, v28.4S, v2.4S
+ add v23.4S, v29.4S, v3.4S
+ sub v26.4S, v24.4S, v4.4S
+ sub v27.4S, v25.4S, v5.4S
+ sub v24.4S, v28.4S, v2.4S
+ sub v25.4S, v29.4S, v3.4S
+ .else
+ sub v30.4S, v16.4S, v6.4S
+ sub v31.4S, v17.4S, v7.4S
+ add v16.4S, v16.4S, v6.4S
+ add v17.4S, v17.4S, v7.4S
+ add v18.4S, v20.4S, v28.4S
+ add v19.4S, v21.4S, v29.4S
+ sub v28.4S, v20.4S, v28.4S
+ sub v29.4S, v21.4S, v29.4S
+ add v20.4S, v24.4S, v4.4S
+ add v21.4S, v25.4S, v5.4S
+ sub v26.4S, v24.4S, v4.4S
+ sub v27.4S, v25.4S, v5.4S
+ add v22.4S, v0.4S, v2.4S
+ add v23.4S, v1.4S, v3.4S
+ sub v24.4S, v0.4S, v2.4S
+ sub v25.4S, v1.4S, v3.4S
+ .endif
+ .unreq va
+ .unreq vaa
+ .unreq vb
+ .unreq vbb
+.endm
+
+// void ff_h264_idct8_add_neon_10(uint8_t *dst, int16_t *block, int stride)
+// Full 8x8 inverse transform + add for 10-bit pixels.
+// x0 = dst (16-bit pixels), x1 = block (32-bit coefficients at this bit
+// depth, cleared on return), w2 = stride in bytes.
+// Fix vs. previous revision: clamp the reconstructed pixels to 0x3FF.
+// sqxtun only saturates to [0, 65535], but 10-bit output must obey
+// Clip1() to [0, 1023] — mirror the mvni/smin clamp that the DC-only
+// variants already perform. v16 is dead after its sqxtun, so it holds
+// the clamp constant.
+function ff_h264_idct8_add_neon_10, export=1
+.L_ff_h264_idct8_add_neon_10:
+ movi v7.4S, #0 // zero for clearing the block (also used by the macro)
+ sxtw x2, w2 // stride is passed as int
+
+ ld1 {v16.4S, v17.4S, v18.4S, v19.4S}, [x1]
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+
+ ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1]
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+
+ ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1]
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+ st1 {v7.4S}, [x1], #16
+
+ idct8x8_cols_10 0
+
+ transpose_8x8S v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v0, v1, v30, v31, v2, v3, v4, v5
+ idct8x8_cols_10 1
+
+ mov x3, x0 // x0 loads, x3 stores
+ srshr v16.4S, v16.4S, #6 // (x + 32) >> 6, interleaved with dst loads
+ srshr v17.4S, v17.4S, #6
+ ld1 {v0.8H}, [x0], x2
+ srshr v18.4S, v18.4S, #6
+ srshr v19.4S, v19.4S, #6
+ ld1 {v1.8H}, [x0], x2
+ srshr v20.4S, v20.4S, #6
+ srshr v21.4S, v21.4S, #6
+ ld1 {v2.8H}, [x0], x2
+ srshr v22.4S, v22.4S, #6
+ srshr v23.4S, v23.4S, #6
+ ld1 {v3.8H}, [x0], x2
+ srshr v24.4S, v24.4S, #6
+ srshr v25.4S, v25.4S, #6
+ ld1 {v4.8H}, [x0], x2
+ srshr v26.4S, v26.4S, #6
+ srshr v27.4S, v27.4S, #6
+ ld1 {v5.8H}, [x0], x2
+ srshr v28.4S, v28.4S, #6
+ srshr v29.4S, v29.4S, #6
+ ld1 {v6.8H}, [x0], x2
+ srshr v30.4S, v30.4S, #6
+ srshr v31.4S, v31.4S, #6
+ ld1 {v7.8H}, [x0], x2
+ uaddw v16.4S, v16.4S, v0.4H // residual + pixel
+ uaddw2 v17.4S, v17.4S, v0.8H
+ uaddw v18.4S, v18.4S, v1.4H
+ uaddw2 v19.4S, v19.4S, v1.8H
+ uaddw v20.4S, v20.4S, v2.4H
+ uaddw2 v21.4S, v21.4S, v2.8H
+ sqxtun v0.4H, v16.4S // clamp low side to 0
+ sqxtun2 v0.8H, v17.4S
+ mvni v16.8H, #0xFC, lsl #8 // 0x03FF = 10-bit pixel max
+ uaddw v22.4S, v22.4S, v3.4H
+ uaddw2 v23.4S, v23.4S, v3.8H
+ sqxtun v1.4H, v18.4S
+ sqxtun2 v1.8H, v19.4S
+ uaddw v24.4S, v24.4S, v4.4H
+ uaddw2 v25.4S, v25.4S, v4.8H
+ sqxtun v2.4H, v20.4S
+ sqxtun2 v2.8H, v21.4S
+ smin v0.8H, v0.8H, v16.8H // clamp high side to 1023
+ st1 {v0.8H}, [x3], x2
+ uaddw v26.4S, v26.4S, v5.4H
+ uaddw2 v27.4S, v27.4S, v5.8H
+ sqxtun v3.4H, v22.4S
+ sqxtun2 v3.8H, v23.4S
+ smin v1.8H, v1.8H, v16.8H
+ st1 {v1.8H}, [x3], x2
+ uaddw v28.4S, v28.4S, v6.4H
+ uaddw2 v29.4S, v29.4S, v6.8H
+ sqxtun v4.4H, v24.4S
+ sqxtun2 v4.8H, v25.4S
+ smin v2.8H, v2.8H, v16.8H
+ st1 {v2.8H}, [x3], x2
+ uaddw v30.4S, v30.4S, v7.4H
+ uaddw2 v31.4S, v31.4S, v7.8H
+ sqxtun v5.4H, v26.4S
+ sqxtun2 v5.8H, v27.4S
+ smin v3.8H, v3.8H, v16.8H
+ st1 {v3.8H}, [x3], x2
+ sqxtun v6.4H, v28.4S
+ sqxtun2 v6.8H, v29.4S
+ sqxtun v7.4H, v30.4S
+ sqxtun2 v7.8H, v31.4S
+ smin v4.8H, v4.8H, v16.8H
+ st1 {v4.8H}, [x3], x2
+ smin v5.8H, v5.8H, v16.8H
+ st1 {v5.8H}, [x3], x2
+ smin v6.8H, v6.8H, v16.8H
+ st1 {v6.8H}, [x3], x2
+ smin v7.8H, v7.8H, v16.8H
+ st1 {v7.8H}, [x3], x2
+
+ sub x1, x1, #256 // restore block pointer (64 coeffs * 4 bytes)
+ ret
+endfunc
+
+// void ff_h264_idct8_dc_add_neon_10(uint8_t *dst, int16_t *block, int stride)
+// DC-only 8x8 inverse transform: adds the rounded DC coefficient block[0]
+// to an 8x8 region of 10-bit pixels and zeroes the coefficient.
+// x0 = dst (16-bit pixels), x1 = block (32-bit coefficients), w2 = stride
+// in bytes.
+function ff_h264_idct8_dc_add_neon_10, export=1
+.L_ff_h264_idct8_dc_add_neon_10:
+ mov x3, #0
+ sxtw x2, w2 // stride is passed as int
+ ld1r {v31.4S}, [x1] // broadcast block[0] to all lanes
+ str x3, [x1] // clear block[0] (and block[1])
+ ld1 {v0.8H}, [x0], x2 // load 8 rows of 8 16-bit pixels
+ srshr v30.4S, v31.4S, #6 // NOTE(review): dead — v30 is overwritten
+ // by the uaddw below before it is ever read
+ srshr v31.4S, v31.4S, #6 // dc = (dc + 32) >> 6
+ ld1 {v1.8H}, [x0], x2
+ ld1 {v2.8H}, [x0], x2
+ uaddw v16.4S, v31.4S, v0.4H // dc + pixel, widened to 32 bit
+ uaddw2 v17.4S, v31.4S, v0.8H
+ ld1 {v3.8H}, [x0], x2
+ uaddw v18.4S, v31.4S, v1.4H
+ uaddw2 v19.4S, v31.4S, v1.8H
+ ld1 {v4.8H}, [x0], x2
+ uaddw v20.4S, v31.4S, v2.4H
+ uaddw2 v21.4S, v31.4S, v2.8H
+ ld1 {v5.8H}, [x0], x2
+ uaddw v22.4S, v31.4S, v3.4H
+ uaddw2 v23.4S, v31.4S, v3.8H
+ ld1 {v6.8H}, [x0], x2
+ uaddw v24.4S, v31.4S, v4.4H
+ uaddw2 v25.4S, v31.4S, v4.8H
+ ld1 {v7.8H}, [x0], x2
+ uaddw v26.4S, v31.4S, v5.4H
+ uaddw2 v27.4S, v31.4S, v5.8H
+ uaddw v28.4S, v31.4S, v6.4H
+ uaddw2 v29.4S, v31.4S, v6.8H
+ uaddw v30.4S, v31.4S, v7.4H
+ uaddw2 v31.4S, v31.4S, v7.8H // v31 consumed last, then overwritten
+ sqxtun v0.4H, v16.4S // clamp low side to 0
+ sqxtun2 v0.8H, v17.4S
+ sqxtun v1.4H, v18.4S
+ sqxtun2 v1.8H, v19.4S
+ sqxtun v2.4H, v20.4S
+ sqxtun2 v2.8H, v21.4S
+ sqxtun v3.4H, v22.4S
+ sqxtun2 v3.8H, v23.4S
+ sub x0, x0, x2, lsl #3 // rewind dst to row 0
+
+ mvni v16.8H, #0xFC, lsl #8 // 0x03FF = 10-bit pixel max
+ smin v0.8H, v0.8H, v16.8H // clamp high side to 1023
+ st1 {v0.8H}, [x0], x2
+ sqxtun v4.4H, v24.4S
+ sqxtun2 v4.8H, v25.4S
+ smin v1.8H, v1.8H, v16.8H
+ st1 {v1.8H}, [x0], x2
+ sqxtun v5.4H, v26.4S
+ sqxtun2 v5.8H, v27.4S
+ smin v2.8H, v2.8H, v16.8H
+ st1 {v2.8H}, [x0], x2
+ sqxtun v6.4H, v28.4S
+ sqxtun2 v6.8H, v29.4S
+ smin v3.8H, v3.8H, v16.8H
+ st1 {v3.8H}, [x0], x2
+ sqxtun v7.4H, v30.4S
+ sqxtun2 v7.8H, v31.4S
+ smin v4.8H, v4.8H, v16.8H
+ st1 {v4.8H}, [x0], x2
+ smin v5.8H, v5.8H, v16.8H
+ st1 {v5.8H}, [x0], x2
+ smin v6.8H, v6.8H, v16.8H
+ st1 {v6.8H}, [x0], x2
+ smin v7.8H, v7.8H, v16.8H
+ st1 {v7.8H}, [x0], x2
+ ret
+endfunc
+
+// void ff_h264_idct8_add4_neon_10(uint8_t *dst, const int *block_offset,
+//                                 int16_t *block, int stride,
+//                                 const uint8_t nnzc[6*8])
+// For each of the four luma 8x8 blocks: skip if nnzc[scan8[i]] == 0;
+// use the DC-only add when nnzc == 1 and block[i*16] != 0, otherwise the
+// full 8x8 idct+add. Coefficient test uses a 32-bit load (ldr w11),
+// matching the 32-bit coefficients at this bit depth.
+function ff_h264_idct8_add4_neon_10, export=1
+ mov x12, x30 // save LR; the blr below clobbers it
+ mov x6, x0 // dest
+ mov x5, x1 // block_offset
+ mov x1, x2 // block
+ mov w2, w3 // stride
+ movrel x7, scan8
+ mov w10, #16 // loop counter, stepped by 4
+ movrel x13, .L_ff_h264_idct8_dc_add_neon_10
+ movrel x14, .L_ff_h264_idct8_add_neon_10
+1: ldrb w9, [x7], #4 // scan8[i], i += 4 per 8x8 block
+ ldrsw x0, [x5], #16 // block_offset
+ ldrb w9, [x4, w9, UXTW] // nnz
+ subs w9, w9, #1
+ b.lt 2f // nnzc == 0: nothing to add
+ ldr w11, [x1] // block[i*16], 32-bit coefficient
+ add x0, x6, x0
+ ccmp w11, #0, #4, eq // if nnzc == 1, test block[i*16]; else force Z
+ csel x15, x13, x14, ne // dc-only iff nnzc == 1 && block[i*16] != 0
+ blr x15
+2: subs w10, w10, #4
+ add x1, x1, #256 // 64 coefficients * 4 bytes
+ b.ne 1b
+ ret x12
+endfunc
--
2.32.0
More information about the ffmpeg-devel
mailing list