[FFmpeg-devel] [PATCH] lavc/aarch64: h264qpel, add lowpass_8 based functions
Mikhail Nitenko
mnitenko at gmail.com
Thu Aug 19 23:53:18 EEST 2021
Benchmarks: A53 A72
avg_h264_qpel_8_mc01_10_c: 932.7 638.5
avg_h264_qpel_8_mc01_10_neon: 397.7 212.2
avg_h264_qpel_8_mc02_10_c: 946.2 691.2
avg_h264_qpel_8_mc02_10_neon: 365.0 199.0
avg_h264_qpel_8_mc03_10_c: 932.7 639.5
avg_h264_qpel_8_mc03_10_neon: 399.2 214.0
avg_h264_qpel_8_mc10_10_c: 1441.7 810.2
avg_h264_qpel_8_mc10_10_neon: 341.7 156.0
avg_h264_qpel_8_mc11_10_c: 2158.0 1330.0
avg_h264_qpel_8_mc11_10_neon: 671.0 343.5
avg_h264_qpel_8_mc13_10_c: 2163.7 1327.7
avg_h264_qpel_8_mc13_10_neon: 673.0 335.0
avg_h264_qpel_8_mc20_10_c: 1434.0 769.5
avg_h264_qpel_8_mc20_10_neon: 309.7 140.5
avg_h264_qpel_8_mc30_10_c: 1448.2 802.0
avg_h264_qpel_8_mc30_10_neon: 357.7 156.7
avg_h264_qpel_8_mc31_10_c: 2188.5 1329.2
avg_h264_qpel_8_mc31_10_neon: 699.0 346.2
avg_h264_qpel_8_mc33_10_c: 2192.2 1337.5
avg_h264_qpel_8_mc33_10_neon: 700.0 349.0
avg_h264_qpel_16_mc01_10_c: 3768.5 2583.5
avg_h264_qpel_16_mc01_10_neon: 1572.5 854.5
avg_h264_qpel_16_mc02_10_c: 3783.0 2736.2
avg_h264_qpel_16_mc02_10_neon: 1442.7 796.7
avg_h264_qpel_16_mc03_10_c: 3789.5 2572.5
avg_h264_qpel_16_mc03_10_neon: 1574.0 854.2
avg_h264_qpel_16_mc10_10_c: 5879.0 3276.0
avg_h264_qpel_16_mc10_10_neon: 1331.5 611.0
avg_h264_qpel_16_mc11_10_c: 8711.7 5344.0
avg_h264_qpel_16_mc11_10_neon: 2634.0 1349.0
avg_h264_qpel_16_mc13_10_c: 8645.0 5309.2
avg_h264_qpel_16_mc13_10_neon: 2630.7 1356.5
avg_h264_qpel_16_mc20_10_c: 5722.5 3111.0
avg_h264_qpel_16_mc20_10_neon: 1203.5 561.0
avg_h264_qpel_16_mc30_10_c: 5926.0 3252.0
avg_h264_qpel_16_mc30_10_neon: 1395.5 613.5
avg_h264_qpel_16_mc31_10_c: 8722.2 5310.2
avg_h264_qpel_16_mc31_10_neon: 2739.7 1382.2
avg_h264_qpel_16_mc33_10_c: 8754.7 5312.7
avg_h264_qpel_16_mc33_10_neon: 2735.7 1402.7
put_h264_qpel_8_mc01_10_c: 854.7 589.0
put_h264_qpel_8_mc01_10_neon: 356.7 196.2
put_h264_qpel_8_mc02_10_c: 780.0 548.5
put_h264_qpel_8_mc02_10_neon: 324.0 181.2
put_h264_qpel_8_mc03_10_c: 854.7 591.7
put_h264_qpel_8_mc03_10_neon: 358.2 199.0
put_h264_qpel_8_mc10_10_c: 1364.7 754.2
put_h264_qpel_8_mc10_10_neon: 305.7 140.7
put_h264_qpel_8_mc11_10_c: 2079.0 1282.2
put_h264_qpel_8_mc11_10_neon: 630.0 328.2
put_h264_qpel_8_mc13_10_c: 2078.5 1279.0
put_h264_qpel_8_mc13_10_neon: 632.0 322.5
put_h264_qpel_8_mc20_10_c: 1221.5 683.7
put_h264_qpel_8_mc20_10_neon: 273.7 125.0
put_h264_qpel_8_mc30_10_c: 1377.2 758.0
put_h264_qpel_8_mc30_10_neon: 326.7 141.5
put_h264_qpel_8_mc31_10_c: 2107.0 1278.5
put_h264_qpel_8_mc31_10_neon: 658.0 331.2
put_h264_qpel_8_mc33_10_c: 2107.0 1285.0
put_h264_qpel_8_mc33_10_neon: 659.0 332.0
put_h264_qpel_16_mc01_10_c: 3529.7 2412.5
put_h264_qpel_16_mc01_10_neon: 1408.5 786.5
put_h264_qpel_16_mc02_10_c: 3151.5 2121.0
put_h264_qpel_16_mc02_10_neon: 1278.7 725.5
put_h264_qpel_16_mc03_10_c: 3546.5 2375.5
put_h264_qpel_16_mc03_10_neon: 1410.0 787.7
put_h264_qpel_16_mc10_10_c: 5511.5 2999.0
put_h264_qpel_16_mc10_10_neon: 1187.5 558.2
put_h264_qpel_16_mc11_10_c: 8424.2 5137.7
put_h264_qpel_16_mc11_10_neon: 2465.0 1277.7
put_h264_qpel_16_mc13_10_c: 8597.2 5127.7
put_h264_qpel_16_mc13_10_neon: 2466.7 1290.5
put_h264_qpel_16_mc20_10_c: 4894.5 2745.7
put_h264_qpel_16_mc20_10_neon: 1059.5 494.2
put_h264_qpel_16_mc30_10_c: 5576.5 3035.0
put_h264_qpel_16_mc30_10_neon: 1251.5 558.2
put_h264_qpel_16_mc31_10_c: 8695.5 5150.5
put_h264_qpel_16_mc31_10_neon: 2570.7 1320.5
put_h264_qpel_16_mc33_10_c: 8702.5 5131.2
put_h264_qpel_16_mc33_10_neon: 2571.7 1337.0
Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
---
libavcodec/aarch64/h264qpel_init_aarch64.c | 91 +++-
libavcodec/aarch64/h264qpel_neon.S | 515 +++++++++++++++++++++
2 files changed, 604 insertions(+), 2 deletions(-)
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
index 77f41d9a21..93fa5246c4 100644
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -95,12 +95,55 @@ void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t str
void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
{
- const int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
- if (have_neon(cpu_flags) && !high_bit_depth) {
+ if (have_neon(cpu_flags) && bit_depth <= 8) {
c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
@@ -168,5 +211,49 @@ av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
+ } else if (have_neon(cpu_flags) && bit_depth == 10) {
+ c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon_10;
+ c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon_10;
+ c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon_10;
+ c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon_10;
+
+ c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon_10;
+ c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon_10;
+ c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon_10;
+ c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon_10;
+
+ c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon_10;
+
+ c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon_10;
}
}
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index d27cfac494..eb18469b7f 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -932,3 +932,518 @@ endfunc
h264_qpel16 put
h264_qpel16 avg
+
+//trashes v0-v5 (v6 is only read: H[0] scales the subtracted pair, H[1] the added pair, presumably 5 and 20 from lowpass_const)
+.macro lowpass_8_10 r0, r1, r2, r3, d0, d1
+ ext v2.16B, \r0\().16B, \r1\().16B, #4 // first row, shifted by 2 samples (16-bit lanes)
+ ext v3.16B, \r0\().16B, \r1\().16B, #6 // shifted by 3 samples
+ add v2.8H, v2.8H, v3.8H // inner pair, later scaled by v6.H[1]
+ ext v4.16B, \r0\().16B, \r1\().16B, #2 // shifted by 1 sample
+ ext v5.16B, \r0\().16B, \r1\().16B, #8 // shifted by 4 samples
+ add v4.8H, v4.8H, v5.8H // middle pair, later scaled by v6.H[0]
+ ext v1.16B, \r0\().16B, \r1\().16B, #10 // shifted by 5 samples
+ uaddl2 \d1\().4S, \r0\().8H, v1.8H // outer pair, high half widened to 32 bit (d1 may alias r1: all ext reads are done)
+ uaddl \d0\().4S, \r0\().4H, v1.4H // outer pair, low half, issued last so d0 may alias r0
+ ext v0.16B, \r2\().16B, \r3\().16B, #4 // second row starts here, interleaved with the first
+ umlal \d0\().4S, v2.4H, v6.H[1]
+ umlal2 \d1\().4S, v2.8H, v6.H[1]
+ ext v1.16B, \r2\().16B, \r3\().16B, #6
+ add v0.8H, v0.8H, v1.8H
+ ext v1.16B, \r2\().16B, \r3\().16B, #2
+ umlsl \d0\().4S, v4.4H, v6.H[0]
+ umlsl2 \d1\().4S, v4.8H, v6.H[0]
+ sqrshrun \d0\().4H, \d0\().4S, #5 // rounding shift by 5, saturated to unsigned 16 bit
+ sqrshrun2 \d0\().8H, \d1\().4S, #5 // first row result is now complete in d0
+ ext v3.16B, \r2\().16B, \r3\().16B, #8
+ add v1.8H, v1.8H, v3.8H
+ ext v2.16B, \r2\().16B, \r3\().16B, #10
+ uaddl v3.4S, \r2\().4H, v2.4H // second row accumulates in v3/v4 because d1 is still busy
+ uaddl2 v4.4S, \r2\().8H, v2.8H
+ umlal v3.4S, v0.4H, v6.H[1]
+ umlal2 v4.4S, v0.8H, v6.H[1]
+ umlsl v3.4S, v1.4H, v6.H[0]
+ umlsl2 v4.4S, v1.8H, v6.H[0]
+ mvni v5.8h, #0xFC, lsl #8 // 1023 for clipping
+ sqrshrun \d1\().4H, v3.4S, #5
+ sqrshrun2 \d1\().8H, v4.4S, #5 // second row result in d1
+ umin \d0\().8H, \d0\().8H, v5.8h // unsigned clamp: sqrshrun lanes are unsigned, a signed min would let lanes >= 0x8000 through
+ umin \d1\().8H, \d1\().8H, v5.8h
+.endm
+
+function put_h264_qpel16_h_lowpass_neon_packed_10
+ mov x4, x30 // keep the return address, the bl below overwrites x30
+ mov x12, #32 // helper loop counter: it subtracts 4 per two rows, so 32 covers 16 rows
+ mov x3, #16 // destination stride in bytes for the helper stores
+ bl put_h264_qpel8_h_lowpass_neon_10
+ sub x1, x1, x2, lsl #4 // rewind the source pointer by 16 rows
+ add x1, x1, #16 // and move it 16 bytes to the right
+ mov x12, #32 // reload the counter for the second pass
+ mov x30, x4
+ b put_h264_qpel8_h_lowpass_neon_10 // tail call: the helper ret goes back to our caller
+endfunc
+
+.macro h264_qpel_h_lowpass_10 type
+function \type\()_h264_qpel16_h_lowpass_neon_10
+ mov x13, x30 // keep the return address across the bl
+ mov x12, #32 // loop counter for the 8-wide routine (4 per two rows, 16 rows)
+ bl \type\()_h264_qpel8_h_lowpass_neon_10
+ sub x0, x0, x3, lsl #4 // rewind destination by 16 rows
+ sub x1, x1, x2, lsl #4 // rewind source by 16 rows
+ add x0, x0, #16 // step both pointers 16 bytes to the right
+ add x1, x1, #16
+ mov x12, #32
+ mov x30, x13 // no ret or branch follows: presumably execution continues into the routine defined next - confirm endfunc emits no code
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_neon_10
+1: ld1 {v28.8H, v29.8H}, [x1], x2 // 16 samples of one source row, then advance by x2
+ ld1 {v16.8H, v17.8H}, [x1], x2 // 16 samples of the following row
+ subs x12, x12, #4 // sets the flags consumed by b.ne at the end of the loop
+ lowpass_8_10 v28, v29, v16, v17, v28, v20 // filtered rows land in v28 and v20
+ .ifc \type,avg
+ ld1 {v2.8H}, [x0], x3 // avg variant: fetch the current destination rows
+ urhadd v28.8H, v28.8H, v2.8H // rounding average with the destination
+ ld1 {v3.8H}, [x0]
+ urhadd v20.8H, v20.8H, v3.8H
+ sub x0, x0, x3 // back to the first of the two rows before storing
+ .endif
+ st1 {v28.8H}, [x0], x3
+ st1 {v20.8H}, [x0], x3
+ b.ne 1b // nothing after the subs above modifies the flags
+ ret
+endfunc
+.endm
+
+ h264_qpel_h_lowpass_10 put
+ h264_qpel_h_lowpass_10 avg
+
+.macro h264_qpel_h_lowpass_l2_10 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon_10
+ mov x13, x30 // keep the return address across the bl
+ mov x12, #32 // loop counter for the 8-wide routine (4 per two rows, 16 rows)
+ bl \type\()_h264_qpel8_h_lowpass_l2_neon_10
+ sub x0, x0, x2, lsl #4 // rewind destination, filter source and second source by 16 rows
+ sub x1, x1, x2, lsl #4
+ sub x3, x3, x2, lsl #4
+ add x0, x0, #16 // then step all three pointers 16 bytes to the right
+ add x1, x1, #16
+ add x3, x3, #16
+ mov x12, #32
+ mov x30, x13 // no ret or branch follows: presumably execution continues into the routine defined next - confirm endfunc emits no code
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_l2_neon_10
+1: ld1 {v26.8H, v27.8H}, [x1], x2 // 16 samples of one row to be filtered
+ ld1 {v16.8H, v17.8H}, [x1], x2 // 16 samples of the following row
+ ld1 {v28.8H}, [x3], x2 // two rows from the second source, read through x3
+ ld1 {v29.8H}, [x3], x2
+ subs x12, x12, #4 // sets the flags consumed by b.ne at the end of the loop
+ lowpass_8_10 v26, v27, v16, v17, v26, v27 // filtered rows land in v26 and v27
+ urhadd v26.8H, v26.8H, v28.8H // rounding average of the filter output with the second source
+ urhadd v27.8H, v27.8H, v29.8H
+ .ifc \type,avg
+ ld1 {v2.8H}, [x0], x2 // avg variant: additionally average with the current destination
+ urhadd v26.8H, v26.8H, v2.8H
+ ld1 {v3.8H}, [x0]
+ urhadd v27.8H, v27.8H, v3.8H
+ sub x0, x0, x2 // back to the first of the two rows before storing
+ .endif
+ st1 {v26.8H}, [x0], x2
+ st1 {v27.8H}, [x0], x2
+ b.ne 1b
+ ret
+endfunc
+.endm
+
+ h264_qpel_h_lowpass_l2_10 put
+ h264_qpel_h_lowpass_l2_10 avg
+
+function put_h264_qpel16_v_lowpass_neon_packed_10
+ mov x4, x30 // keep the return address across the bl calls
+ mov x2, #16 // packed destination stride: 8 samples of 2 bytes each
+ bl put_h264_qpel8_v_lowpass_neon_10 // must be the 10-bit helper, the 8-bit one reads byte samples
+ sub x1, x1, x3, lsl #2 // helper advanced the source 12 rows, step back 4 for the lower half
+ bl put_h264_qpel8_v_lowpass_neon_10
+ sub x1, x1, x3, lsl #4 // rewind the source by 20 rows in total
+ sub x1, x1, x3, lsl #2
+ add x1, x1, #16 // next column of 8 samples is 16 bytes to the right
+ bl put_h264_qpel8_v_lowpass_neon_10
+ sub x1, x1, x3, lsl #2
+ mov x30, x4
+ b put_h264_qpel8_v_lowpass_neon_10 // tail call: the helper ret goes back to our caller
+endfunc
+
+.macro h264_qpel_v_lowpass_10 type
+function \type\()_h264_qpel16_v_lowpass_neon_10
+ mov x4, x30 // keep the return address across the bl calls
+ bl \type\()_h264_qpel8_v_lowpass_neon_10
+ sub x1, x1, x3, lsl #2 // the 8x8 routine advanced the source 12 rows, step back 4 for the lower half
+ bl \type\()_h264_qpel8_v_lowpass_neon_10
+ sub x0, x0, x2, lsl #4 // rewind destination by 16 rows
+ add x0, x0, #16 // and move 16 bytes to the right
+ sub x1, x1, x3, lsl #4 // rewind source by 20 rows in total
+ sub x1, x1, x3, lsl #2
+ add x1, x1, #16
+ bl \type\()_h264_qpel8_v_lowpass_neon_10
+ sub x1, x1, x3, lsl #2
+ mov x30, x4 // no ret or branch follows: presumably execution continues into the routine defined next - confirm endfunc emits no code
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_neon_10
+ ld1 {v16.8H}, [x1], x3 // 13 source rows of 8 samples, stride x3
+ ld1 {v18.8H}, [x1], x3
+ ld1 {v20.8H}, [x1], x3
+ ld1 {v22.8H}, [x1], x3
+ ld1 {v24.8H}, [x1], x3
+ ld1 {v26.8H}, [x1], x3
+ ld1 {v28.8H}, [x1], x3
+ ld1 {v30.8H}, [x1], x3
+ ld1 {v17.8H}, [x1], x3
+ ld1 {v19.8H}, [x1], x3
+ ld1 {v21.8H}, [x1], x3
+ ld1 {v23.8H}, [x1], x3
+ ld1 {v25.8H}, [x1] // last row: no post-increment, x1 ends 12 rows past its start
+
+ transpose_8x8H v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
+ transpose_8x8H v17, v19, v21, v23, v25, v27, v29, v31, v0, v1 // NOTE(review): v27, v29, v31 are not loaded above - presumably their lanes are never consumed by the filter, confirm
+ lowpass_8_10 v16, v17, v18, v19, v16, v17 // each call filters two register pairs
+ lowpass_8_10 v20, v21, v22, v23, v18, v19
+ lowpass_8_10 v24, v25, v26, v27, v20, v21
+ lowpass_8_10 v28, v29, v30, v31, v22, v23
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 // transpose the results back before storing
+
+ .ifc \type,avg
+ ld1 {v24.8H}, [x0], x2 // avg variant: rounding average with the 8 current destination rows
+ urhadd v16.8H, v16.8H, v24.8H
+ ld1 {v25.8H}, [x0], x2
+ urhadd v17.8H, v17.8H, v25.8H
+ ld1 {v26.8H}, [x0], x2
+ urhadd v18.8H, v18.8H, v26.8H
+ ld1 {v27.8H}, [x0], x2
+ urhadd v19.8H, v19.8H, v27.8H
+ ld1 {v28.8H}, [x0], x2
+ urhadd v20.8H, v20.8H, v28.8H
+ ld1 {v29.8H}, [x0], x2
+ urhadd v21.8H, v21.8H, v29.8H
+ ld1 {v30.8H}, [x0], x2
+ urhadd v22.8H, v22.8H, v30.8H
+ ld1 {v31.8H}, [x0], x2
+ urhadd v23.8H, v23.8H, v31.8H
+ sub x0, x0, x2, lsl #3 // rewind destination by the 8 rows just read
+ .endif
+
+ st1 {v16.8H}, [x0], x2 // store 8 result rows, stride x2
+ st1 {v17.8H}, [x0], x2
+ st1 {v18.8H}, [x0], x2
+ st1 {v19.8H}, [x0], x2
+ st1 {v20.8H}, [x0], x2
+ st1 {v21.8H}, [x0], x2
+ st1 {v22.8H}, [x0], x2
+ st1 {v23.8H}, [x0], x2
+
+ ret
+endfunc
+.endm
+
+ h264_qpel_v_lowpass_10 put
+ h264_qpel_v_lowpass_10 avg
+
+.macro h264_qpel_v_lowpass_l2_10 type
+function \type\()_h264_qpel16_v_lowpass_l2_neon_10
+ mov x4, x30 // keep the return address across the bl calls
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
+ sub x1, x1, x3, lsl #2 // the 8x8 routine advanced the source 12 rows, step back 4 for the lower half
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
+ sub x0, x0, x3, lsl #4 // rewind destination (stride x3) by 16 rows
+ sub x12, x12, x2, lsl #4 // rewind second source (stride x2) by 16 rows
+ add x0, x0, #16 // move both 16 bytes to the right
+ add x12, x12, #16
+ sub x1, x1, x3, lsl #4 // rewind filter source by 20 rows in total
+ sub x1, x1, x3, lsl #2
+ add x1, x1, #16
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
+ sub x1, x1, x3, lsl #2
+ mov x30, x4 // no ret or branch follows: presumably execution continues into the routine defined next - confirm endfunc emits no code
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_l2_neon_10
+ ld1 {v16.8H}, [x1], x3 // 13 source rows of 8 samples, stride x3
+ ld1 {v18.8H}, [x1], x3
+ ld1 {v20.8H}, [x1], x3
+ ld1 {v22.8H}, [x1], x3
+ ld1 {v24.8H}, [x1], x3
+ ld1 {v26.8H}, [x1], x3
+ ld1 {v28.8H}, [x1], x3
+ ld1 {v30.8H}, [x1], x3
+ ld1 {v17.8H}, [x1], x3
+ ld1 {v19.8H}, [x1], x3
+ ld1 {v21.8H}, [x1], x3
+ ld1 {v23.8H}, [x1], x3
+ ld1 {v25.8H}, [x1] // last row: no post-increment, x1 ends 12 rows past its start
+
+ transpose_8x8H v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
+ transpose_8x8H v17, v19, v21, v23, v25, v27, v29, v31, v0, v1 // NOTE(review): v27, v29, v31 are not loaded above - presumably their lanes are never consumed by the filter, confirm
+ lowpass_8_10 v16, v17, v18, v19, v16, v17 // each call filters two register pairs
+ lowpass_8_10 v20, v21, v22, v23, v18, v19
+ lowpass_8_10 v24, v25, v26, v27, v20, v21
+ lowpass_8_10 v28, v29, v30, v31, v22, v23
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 // transpose the results back
+
+ ld1 {v24.8H}, [x12], x2 // 8 rows of the second source, read through x12 with stride x2
+ ld1 {v25.8H}, [x12], x2
+ ld1 {v26.8H}, [x12], x2
+ ld1 {v27.8H}, [x12], x2
+ ld1 {v28.8H}, [x12], x2
+ urhadd v16.8H, v24.8H, v16.8H // rounding average of the filter output with the second source
+ urhadd v17.8H, v25.8H, v17.8H
+ ld1 {v29.8H}, [x12], x2
+ urhadd v18.8H, v26.8H, v18.8H
+ urhadd v19.8H, v27.8H, v19.8H
+ ld1 {v30.8H}, [x12], x2
+ urhadd v20.8H, v28.8H, v20.8H
+ urhadd v21.8H, v29.8H, v21.8H
+ ld1 {v31.8H}, [x12], x2
+ urhadd v22.8H, v30.8H, v22.8H
+ urhadd v23.8H, v31.8H, v23.8H
+
+ .ifc \type,avg
+ ld1 {v24.8H}, [x0], x3 // avg variant: additionally average with the 8 current destination rows
+ urhadd v16.8H, v16.8H, v24.8H
+ ld1 {v25.8H}, [x0], x3
+ urhadd v17.8H, v17.8H, v25.8H
+ ld1 {v26.8H}, [x0], x3
+ urhadd v18.8H, v18.8H, v26.8H
+ ld1 {v27.8H}, [x0], x3
+ urhadd v19.8H, v19.8H, v27.8H
+ ld1 {v28.8H}, [x0], x3
+ urhadd v20.8H, v20.8H, v28.8H
+ ld1 {v29.8H}, [x0], x3
+ urhadd v21.8H, v21.8H, v29.8H
+ ld1 {v30.8H}, [x0], x3
+ urhadd v22.8H, v22.8H, v30.8H
+ ld1 {v31.8H}, [x0], x3
+ urhadd v23.8H, v23.8H, v31.8H
+ sub x0, x0, x3, lsl #3 // rewind destination by the 8 rows just read
+ .endif
+
+ st1 {v16.8H}, [x0], x3 // store 8 result rows, stride x3
+ st1 {v17.8H}, [x0], x3
+ st1 {v18.8H}, [x0], x3
+ st1 {v19.8H}, [x0], x3
+ st1 {v20.8H}, [x0], x3
+ st1 {v21.8H}, [x0], x3
+ st1 {v22.8H}, [x0], x3
+ st1 {v23.8H}, [x0], x3
+
+ ret
+endfunc
+.endm
+
+ h264_qpel_v_lowpass_l2_10 put
+ h264_qpel_v_lowpass_l2_10 avg
+
+.macro h264_qpel8_10 type
+function ff_\type\()_h264_qpel8_mc10_neon_10, export=1
+ lowpass_const w3 // presumably loads the filter taps into v6 using w3 as scratch - confirm in the macro definition
+ mov x3, x1 // second source for the averaging routine is the unshifted src
+ sub x1, x1, #4 // filter input starts 4 bytes (2 samples) to the left of src
+ mov x12, #16 // loop counter: 4 per two rows, so 8 rows
+ b \type\()_h264_qpel8_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc20_neon_10, export=1
+ lowpass_const w3
+ sub x1, x1, #4 // filter input starts 4 bytes (2 samples) to the left of src
+ mov x3, x2 // destination stride equals the source stride
+ mov x12, #16 // loop counter: 4 per two rows, so 8 rows
+ b \type\()_h264_qpel8_h_lowpass_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc30_neon_10, export=1
+ lowpass_const w3
+ add x3, x1, #2 // second source is src moved 2 bytes (1 sample) to the right
+ sub x1, x1, #4
+ mov x12, #16
+ b \type\()_h264_qpel8_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc01_neon_10, export=1
+ mov x14, x30 // return address is kept in x14 and used by ret x14 below
+ mov x12, x1 // second source for the averaging routine is src itself
+\type\()_h264_qpel8_mc01_10:
+ lowpass_const w3
+ mov x3, x2 // source and destination use the same stride
+ sub x1, x1, x2, lsl #1 // filter input starts two rows above src
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc11_neon_10, export=1
+ mov x14, x30 // return address is kept in x14 and used by ret x14 below
+ mov x8, x0 // remember dst
+ mov x9, x1 // remember the base used for the vertical pass
+\type\()_h264_qpel8_mc11_10:
+ lowpass_const w3
+ mov x11, sp // remember sp so it can be restored after the scratch buffer is released
+ sub sp, sp, #128 // scratch buffer: 8 rows of 16 bytes
+ mov x0, sp // horizontal pass writes into the scratch buffer
+ sub x1, x1, #4
+ mov x3, #16 // scratch buffer stride
+ mov x12, #16 // loop counter: 8 rows
+ bl put_h264_qpel8_h_lowpass_neon_10
+ mov x0, x8 // vertical pass writes to the real destination
+ mov x3, x2 // with the picture stride
+ mov x12, sp // second source is the horizontal result in the scratch buffer
+ sub x1, x9, x2, lsl #1 // vertical filter input starts two rows above the saved base
+ mov x2, #16 // stride of the second source (scratch buffer)
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon_10
+ mov sp, x11 // release the scratch buffer
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc31_neon_10, export=1
+ add x1, x1, #2 // vertical pass base is one sample to the right of src
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ sub x1, x1, #2 // horizontal pass uses the original src again
+ b \type\()_h264_qpel8_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc02_neon_10, export=1
+ mov x14, x30 // return address is kept in x14 and used by ret x14 below
+ lowpass_const w3
+ sub x1, x1, x2, lsl #1 // filter input starts two rows above src
+ mov x3, x2 // source stride equals the destination stride
+ bl \type\()_h264_qpel8_v_lowpass_neon_10
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc03_neon_10, export=1
+ mov x14, x30
+ add x12, x1, x2 // second source is src one row further down
+ b \type\()_h264_qpel8_mc01_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc13_neon_10, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1 // vertical pass base stays at src
+ add x1, x1, x2 // horizontal pass runs one row further down
+ b \type\()_h264_qpel8_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc33_neon_10, export=1
+ add x1, x1, #2 // vertical pass base is one sample to the right of src
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ add x1, x1, x2 // horizontal pass runs one row further down
+ sub x1, x1, #2 // and from the original column
+ b \type\()_h264_qpel8_mc11_10
+endfunc
+.endm
+
+ h264_qpel8_10 put
+ h264_qpel8_10 avg
+
+.macro h264_qpel16_10 type
+function ff_\type\()_h264_qpel16_mc10_neon_10, export=1
+ lowpass_const w3 // presumably loads the filter taps into v6 using w3 as scratch - confirm in the macro definition
+ mov x3, x1 // second source for the averaging routine is the unshifted src
+ sub x1, x1, #4 // filter input starts 4 bytes (2 samples) to the left of src
+ b \type\()_h264_qpel16_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc20_neon_10, export=1
+ lowpass_const w3
+ sub x1, x1, #4 // filter input starts 4 bytes (2 samples) to the left of src
+ mov x3, x2 // destination stride equals the source stride
+ b \type\()_h264_qpel16_h_lowpass_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc30_neon_10, export=1
+ lowpass_const w3
+ add x3, x1, #2 // second source is src moved 2 bytes (1 sample) to the right
+ sub x1, x1, #4
+ b \type\()_h264_qpel16_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc01_neon_10, export=1
+ mov x14, x30 // return address is kept in x14 and used by ret x14 below
+ mov x12, x1 // second source for the averaging routine is src itself
+\type\()_h264_qpel16_mc01_10:
+ lowpass_const w3
+ mov x3, x2 // source and destination use the same stride
+ sub x1, x1, x2, lsl #1 // filter input starts two rows above src
+ bl \type\()_h264_qpel16_v_lowpass_l2_neon_10
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc11_neon_10, export=1
+ mov x14, x30 // return address is kept in x14 and used by ret x14 below
+ mov x8, x0 // remember dst
+ mov x9, x1 // remember the base used for the vertical pass
+\type\()_h264_qpel16_mc11_10:
+ lowpass_const w3
+ mov x11, sp // remember sp so it can be restored after the scratch buffer is released
+ sub sp, sp, #512 // scratch buffer: 16 rows of 32 bytes
+ mov x0, sp // horizontal pass writes into the scratch buffer
+ sub x1, x1, #4
+ mov x3, #32 // scratch buffer stride
+ bl put_h264_qpel16_h_lowpass_neon_10
+ mov x0, x8 // vertical pass writes to the real destination
+ mov x3, x2 // with the picture stride
+ mov x12, sp // second source is the horizontal result in the scratch buffer
+ sub x1, x9, x2, lsl #1 // vertical filter input starts two rows above the saved base
+ mov x2, #32 // stride of the second source (scratch buffer)
+ bl \type\()_h264_qpel16_v_lowpass_l2_neon_10
+ mov sp, x11 // release the scratch buffer
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc31_neon_10, export=1
+ add x1, x1, #2 // vertical pass base is one sample to the right of src
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ sub x1, x1, #2 // horizontal pass uses the original src again
+ b \type\()_h264_qpel16_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc02_neon_10, export=1
+ mov x14, x30 // return address is kept in x14 and used by ret x14 below
+ lowpass_const w3
+ sub x1, x1, x2, lsl #1 // filter input starts two rows above src
+ mov x3, x2 // source stride equals the destination stride
+ bl \type\()_h264_qpel16_v_lowpass_neon_10
+ ret x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc03_neon_10, export=1
+ mov x14, x30
+ add x12, x1, x2 // second source is src one row further down
+ b \type\()_h264_qpel16_mc01_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc13_neon_10, export=1
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1 // vertical pass base stays at src
+ add x1, x1, x2 // horizontal pass runs one row further down
+ b \type\()_h264_qpel16_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc33_neon_10, export=1
+ add x1, x1, #2 // vertical pass base is one sample to the right of src
+ mov x14, x30
+ mov x8, x0
+ mov x9, x1
+ add x1, x1, x2 // horizontal pass runs one row further down
+ sub x1, x1, #2 // and from the original column
+ b \type\()_h264_qpel16_mc11_10
+endfunc
+.endm
+
+ h264_qpel16_10 put
+ h264_qpel16_10 avg
--
2.32.0
More information about the ffmpeg-devel
mailing list