[FFmpeg-devel] Add ARM64 NEON optimization for HEVC decoder
章军海 (Junhai Zhang)
243186085 at qq.com
Wed Jan 27 17:17:17 CET 2016
Add ARM64 NEON optimizations for the HEVC decoder, which improve decoding performance substantially.
From c96995ea3bbfbbc42b7af7b447c8ada35f4b8a32 Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:14:14 +0800
Subject: [PATCH 01/12] Create hevcdsp_idct_neon.S for aarch64
This file optimizes the HEVC IDCT transforms for aarch64.
Signed-off-by: zjh8890 <243186085 at qq.com>
---
libavcodec/aarch64/hevcdsp_idct_neon.S | 1151 ++++++++++++++++++++++++++++++++
1 file changed, 1151 insertions(+)
create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
new file mode 100644
index 0000000..2bc23c2
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -0,0 +1,1151 @@
+/*
+ * Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
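+// Transpose an 8x8 block of 16-bit elements held in \r0-\r7, with
+// \r8/\r9 as scratch: trn1/trn2 passes at 16-bit, 32-bit and then
+// 64-bit granularity.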
+.macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+ trn1 \r8\().8H, \r0\().8H, \r1\().8H
+ trn2 \r9\().8H, \r0\().8H, \r1\().8H
+ trn1 \r1\().8H, \r2\().8H, \r3\().8H
+ trn2 \r3\().8H, \r2\().8H, \r3\().8H
+ trn1 \r0\().8H, \r4\().8H, \r5\().8H
+ trn2 \r5\().8H, \r4\().8H, \r5\().8H
+ trn1 \r2\().8H, \r6\().8H, \r7\().8H
+ trn2 \r7\().8H, \r6\().8H, \r7\().8H
+ trn1 \r4\().4S, \r0\().4S, \r2\().4S
+ trn2 \r2\().4S, \r0\().4S, \r2\().4S
+ trn1 \r6\().4S, \r5\().4S, \r7\().4S
+ trn2 \r7\().4S, \r5\().4S, \r7\().4S
+ trn1 \r5\().4S, \r9\().4S, \r3\().4S
+ trn2 \r9\().4S, \r9\().4S, \r3\().4S
+ trn1 \r3\().4S, \r8\().4S, \r1\().4S
+ trn2 \r8\().4S, \r8\().4S, \r1\().4S
+ trn1 \r0\().2D, \r3\().2D, \r4\().2D
+ trn2 \r4\().2D, \r3\().2D, \r4\().2D
+ trn1 \r1\().2D, \r5\().2D, \r6\().2D
+ trn2 \r5\().2D, \r5\().2D, \r6\().2D
+ trn2 \r6\().2D, \r8\().2D, \r2\().2D
+ trn1 \r2\().2D, \r8\().2D, \r2\().2D
+ trn1 \r3\().2D, \r9\().2D, \r7\().2D
+ trn2 \r7\().2D, \r9\().2D, \r7\().2D
+.endm
+
+.macro transpose_16b_4x4 r0, r1, r2, r3, r4, r5, r6, r7
+ trn1 \r4\().4H, \r0\().4H, \r1\().4H
+ trn2 \r5\().4H, \r0\().4H, \r1\().4H
+ trn1 \r7\().4H, \r2\().4H, \r3\().4H
+ trn2 \r6\().4H, \r2\().4H, \r3\().4H
+ trn1 \r0\().2S, \r4\().2S, \r7\().2S
+ trn2 \r2\().2S, \r4\().2S, \r7\().2S
+ trn1 \r1\().2S, \r5\().2S, \r6\().2S
+ trn2 \r3\().2S, \r5\().2S, \r6\().2S
+.endm
+
+.macro tr4_luma_shift r0, r1, shift
+ saddl v5.4S, \r0\().4H, \r1\().4H
+ trn2 v1.2D, \r1\().2D, \r1\().2D
+ saddl v2.4S, \r1\().4H, v1.4H
+ ssubl v4.4S, \r0\().4H, v1.4H
+ smull2 v6.4S, \r0\().8H, v0.H[0]
+ saddl v7.4S, \r0\().4H, v1.4H
+ ssubw v7.4S, v7.4S, \r1\().4H
+ mul v7.4S, v7.4S, v0.S[0]
+ mul v8.4S, v5.4S, v0.S[1]
+ mul v9.4S, v2.4S, v0.S[2]
+ add v8.4S, v8.4S, v9.4S
+ add v8.4S, v8.4S, v6.4S
+ mul v2.4S, v2.4S, v0.S[1]
+ mul v9.4S, v4.4S, v0.S[2]
+ sub v9.4S, v9.4S, v2.4S
+ add v9.4S, v9.4S, v6.4S
+ mul v5.4S, v5.4S, v0.S[2]
+ mul v4.4S, v4.4S, v0.S[1]
+ add v5.4S, v5.4S, v4.4S
+ sub v5.4S, v5.4S, v6.4S
+ sqrshrn \r0\().4H, v8.4S, \shift
+ sqrshrn \r1\().4H, v9.4S, \shift
+ sqrshrn2 \r0\().8H, v7.4S, \shift
+ sqrshrn2 \r1\().8H, v5.4S, \shift
+.endm
+
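+// tr4/tr4_shift: 4-point even/odd butterfly of the HEVC inverse
+// core transform, with coefficients 64 (applied as << 6), 83 and 36
+// (preloaded in v0.H[0] and v0.H[1]):
+//   e0 = 64*(in0 + in2)   o0 = 83*in1 + 36*in3
+//   e1 = 64*(in0 - in2)   o1 = 36*in1 - 83*in3
+// outputs are e0+o0, e1+o1, e1-o1, e0-o0.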
+.macro tr4 r0, r1, r2, r3
+ smull v4.4S, \r1\().4H, v0.H[0]
+ smull v6.4S, \r1\().4H, v0.H[1]
+ sshll v2.4S, \r0\().4H, #6
+ sshll v3.4S, \r2\().4H, #6
+ add v5.4S, v2.4S, v3.4S
+ sub v2.4S, v2.4S, v3.4S
+ smlal v4.4S, \r3\().4H, v0.H[1]
+ smlsl v6.4S, \r3\().4H, v0.H[0]
+ sub v3.4S, v5.4S, v4.4S
+ add v4.4S, v5.4S, v4.4S
+ add v5.4S, v2.4S, v6.4S
+ sub v6.4S, v2.4S, v6.4S
+.endm
+
+.macro tr4_shift r0, r1, shift
+ smull2 v4.4S, \r0\().8H, v0.H[0]
+ smull2 v6.4S, \r0\().8H, v0.H[1]
+ sshll v2.4S, \r0\().4H, #6
+ sshll v3.4S, \r1\().4H, #6
+ add v5.4S, v2.4S, v3.4S
+ sub v2.4S, v2.4S, v3.4S
+ smlal2 v4.4S, \r1\().8H, v0.H[1]
+ smlsl2 v6.4S, \r1\().8H, v0.H[0]
+ sub v3.4S, v5.4S, v4.4S
+ add v4.4S, v5.4S, v4.4S
+ add v5.4S, v2.4S, v6.4S
+ sub v6.4S, v2.4S, v6.4S
+ sqrshrn \r0\().4H, v4.4S, \shift
+ sqrshrn \r1\().4H, v5.4S, \shift
+ sqrshrn2 \r0\().8H, v6.4S, \shift
+ sqrshrn2 \r1\().8H, v3.4S, \shift
+.endm
+
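+// Odd half of the 8-point inverse transform: dot products of the
+// odd-index inputs \in0-\in3 with the HEVC coefficients 89, 75, 50
+// and 18 (preloaded in v0.H[4]-v0.H[7]); results land in v7-v10.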
+.macro tr8_begin in0, in1, in2, in3
+ smull v7.4S, \in0\().4H, v0.H[5]
+ smull v8.4S, \in0\().4H, v0.H[4]
+ smull v9.4S, \in0\().4H, v0.H[7]
+ smull v10.4S, \in0\().4H, v0.H[6]
+ smlal v7.4S, \in1\().4H, v0.H[4]
+ smlsl v8.4S, \in1\().4H, v0.H[6]
+ smlsl v9.4S, \in1\().4H, v0.H[5]
+ smlsl v10.4S, \in1\().4H, v0.H[7]
+ smlal v7.4S, \in2\().4H, v0.H[7]
+ smlsl v8.4S, \in2\().4H, v0.H[5]
+ smlal v9.4S, \in2\().4H, v0.H[6]
+ smlal v10.4S, \in2\().4H, v0.H[4]
+ smlal v7.4S, \in3\().4H, v0.H[6]
+ smlsl v8.4S, \in3\().4H, v0.H[7]
+ smlal v9.4S, \in3\().4H, v0.H[4]
+ smlsl v10.4S, \in3\().4H, v0.H[5]
+.endm
+
+.macro tr8_end shift
+ add v1.4S, v4.4S, v7.4S
+ sub v4.4S, v4.4S, v7.4S
+ add v2.4S, v5.4S, v8.4S
+ sub v5.4S, v5.4S, v8.4S
+ add v11.4S, v6.4S, v9.4S
+ sub v6.4S, v6.4S, v9.4S
+ add v12.4S, v3.4S, v10.4S
+ sub v3.4S, v3.4S, v10.4S
+ sqrshrn v9.4H, v4.4S, \shift
+ sqrshrn v8.4H, v5.4S, \shift
+ sqrshrn v7.4H, v6.4S, \shift
+ sqrshrn v6.4H, v3.4S, \shift
+ sqrshrn v5.4H, v12.4S, \shift
+ sqrshrn v4.4H, v11.4S, \shift
+ sqrshrn v3.4H, v2.4S, \shift
+ sqrshrn v2.4H, v1.4S, \shift
+.endm
+
+.macro tr8_end_0
+ sub v15.4S, v4.4S, v7.4S
+ sub v14.4S, v5.4S, v8.4S
+ sub v13.4S, v6.4S, v9.4S
+ sub v12.4S, v3.4S, v10.4S
+ add v11.4S, v3.4S, v10.4S
+ add v10.4S, v6.4S, v9.4S
+ add v9.4S, v5.4S, v8.4S
+ add v8.4S, v4.4S, v7.4S
+.endm
+
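+// Odd half of the 16-point inverse transform: dot products of the
+// odd-index inputs \in0-\in7 with the HEVC coefficients 90, 87, 80,
+// 70, 57, 43, 25 and 9 (preloaded in v1); results land in v2-v9.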
+.macro tr16_begin in0, in1, in2, in3, in4, in5, in6, in7
+ smull v2.4S, \in0\().4H, v1.H[1]
+ smull v3.4S, \in0\().4H, v1.H[0]
+ smull v4.4S, \in0\().4H, v1.H[3]
+ smull v5.4S, \in0\().4H, v1.H[2]
+ smull v6.4S, \in0\().4H, v1.H[5]
+ smull v7.4S, \in0\().4H, v1.H[4]
+ smull v8.4S, \in0\().4H, v1.H[7]
+ smull v9.4S, \in0\().4H, v1.H[6]
+ smlal v2.4S, \in1\().4H, v1.H[0]
+ smlal v3.4S, \in1\().4H, v1.H[5]
+ smlal v4.4S, \in1\().4H, v1.H[6]
+ smlsl v5.4S, \in1\().4H, v1.H[4]
+ smlsl v6.4S, \in1\().4H, v1.H[3]
+ smlsl v7.4S, \in1\().4H, v1.H[1]
+ smlsl v8.4S, \in1\().4H, v1.H[2]
+ smlsl v9.4S, \in1\().4H, v1.H[7]
+ smlal v2.4S, \in2\().4H, v1.H[3]
+ smlal v3.4S, \in2\().4H, v1.H[6]
+ smlsl v4.4S, \in2\().4H, v1.H[2]
+ smlsl v5.4S, \in2\().4H, v1.H[0]
+ smlsl v6.4S, \in2\().4H, v1.H[7]
+ smlal v7.4S, \in2\().4H, v1.H[5]
+ smlal v8.4S, \in2\().4H, v1.H[1]
+ smlal v9.4S, \in2\().4H, v1.H[4]
+ smlal v2.4S, \in3\().4H, v1.H[2]
+ smlsl v3.4S, \in3\().4H, v1.H[4]
+ smlsl v4.4S, \in3\().4H, v1.H[0]
+ smlal v5.4S, \in3\().4H, v1.H[6]
+ smlal v6.4S, \in3\().4H, v1.H[1]
+ smlal v7.4S, \in3\().4H, v1.H[7]
+ smlsl v8.4S, \in3\().4H, v1.H[3]
+ smlsl v9.4S, \in3\().4H, v1.H[5]
+ smlal v2.4S, \in4\().4H, v1.H[5]
+ smlsl v3.4S, \in4\().4H, v1.H[3]
+ smlsl v4.4S, \in4\().4H, v1.H[7]
+ smlal v5.4S, \in4\().4H, v1.H[1]
+ smlsl v6.4S, \in4\().4H, v1.H[6]
+ smlsl v7.4S, \in4\().4H, v1.H[0]
+ smlal v8.4S, \in4\().4H, v1.H[4]
+ smlal v9.4S, \in4\().4H, v1.H[2]
+ smlal v2.4S, \in5\().4H, v1.H[4]
+ smlsl v3.4S, \in5\().4H, v1.H[1]
+ smlal v4.4S, \in5\().4H, v1.H[5]
+ smlal v5.4S, \in5\().4H, v1.H[7]
+ smlsl v6.4S, \in5\().4H, v1.H[0]
+ smlal v7.4S, \in5\().4H, v1.H[2]
+ smlal v8.4S, \in5\().4H, v1.H[6]
+ smlsl v9.4S, \in5\().4H, v1.H[3]
+ smlal v2.4S, \in6\().4H, v1.H[7]
+ smlsl v3.4S, \in6\().4H, v1.H[2]
+ smlal v4.4S, \in6\().4H, v1.H[1]
+ smlsl v5.4S, \in6\().4H, v1.H[3]
+ smlal v6.4S, \in6\().4H, v1.H[4]
+ smlal v7.4S, \in6\().4H, v1.H[6]
+ smlsl v8.4S, \in6\().4H, v1.H[5]
+ smlal v9.4S, \in6\().4H, v1.H[0]
+ smlal v2.4S, \in7\().4H, v1.H[6]
+ smlsl v3.4S, \in7\().4H, v1.H[7]
+ smlal v4.4S, \in7\().4H, v1.H[4]
+ smlsl v5.4S, \in7\().4H, v1.H[5]
+ smlal v6.4S, \in7\().4H, v1.H[2]
+ smlsl v7.4S, \in7\().4H, v1.H[3]
+ smlal v8.4S, \in7\().4H, v1.H[0]
+ smlsl v9.4S, \in7\().4H, v1.H[1]
+.endm
+
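+// Odd half of the 32-point inverse transform for one column of four
+// samples: dot products of v16-v31 with the HEVC coefficients 90,
+// 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13 and 4
+// (reloaded into v0/v1 from trans_coeff+32); the sixteen partial
+// sums are spilled to the buffer at x6.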
+.macro tr32_begin
+ smull v6.4S, v16.4H, v0.H[0]
+ smull v7.4S, v16.4H, v0.H[0]
+ smull v8.4S, v16.4H, v0.H[2]
+ smull v9.4S, v16.4H, v0.H[3]
+ smlal v6.4S, v17.4H, v0.H[0]
+ smlal v7.4S, v17.4H, v0.H[4]
+ smlal v8.4S, v17.4H, v0.H[7]
+ smlal v9.4S, v17.4H, v1.H[2]
+ smlal v6.4S, v18.4H, v0.H[2]
+ smlal v7.4S, v18.4H, v0.H[7]
+ smlal v8.4S, v18.4H, v1.H[4]
+ smlsl v9.4S, v18.4H, v1.H[6]
+ smlal v6.4S, v19.4H, v0.H[3]
+ smlal v7.4S, v19.4H, v1.H[2]
+ smlsl v8.4S, v19.4H, v1.H[6]
+ smlsl v9.4S, v19.4H, v0.H[7]
+ smlal v6.4S, v20.4H, v0.H[4]
+ smlal v7.4S, v20.4H, v1.H[5]
+ smlsl v8.4S, v20.4H, v1.H[1]
+ smlsl v9.4S, v20.4H, v0.H[0]
+ smlal v6.4S, v21.4H, v0.H[5]
+ smlsl v7.4S, v21.4H, v1.H[7]
+ smlsl v8.4S, v21.4H, v0.H[4]
+ smlsl v9.4S, v21.4H, v0.H[6]
+ smlal v6.4S, v22.4H, v0.H[6]
+ smlsl v7.4S, v22.4H, v1.H[4]
+ smlsl v8.4S, v22.4H, v0.H[0]
+ smlsl v9.4S, v22.4H, v1.H[5]
+ smlal v6.4S, v23.4H, v0.H[7]
+ smlsl v7.4S, v23.4H, v1.H[1]
+ smlsl v8.4S, v23.4H, v0.H[5]
+ smlal v9.4S, v23.4H, v1.H[3]
+ smlal v6.4S, v24.4H, v1.H[0]
+ smlsl v7.4S, v24.4H, v0.H[6]
+ smlsl v8.4S, v24.4H, v1.H[2]
+ smlal v9.4S, v24.4H, v0.H[4]
+ smlal v6.4S, v25.4H, v1.H[1]
+ smlsl v7.4S, v25.4H, v0.H[3]
+ smlsl v8.4S, v25.4H, v1.H[7]
+ smlal v9.4S, v25.4H, v0.H[2]
+ smlal v6.4S, v26.4H, v1.H[2]
+ smlsl v7.4S, v26.4H, v0.H[0]
+ smlal v8.4S, v26.4H, v1.H[3]
+ smlal v9.4S, v26.4H, v1.H[1]
+ smlal v6.4S, v27.4H, v1.H[3]
+ smlsl v7.4S, v27.4H, v0.H[2]
+ smlal v8.4S, v27.4H, v0.H[6]
+ smlsl v9.4S, v27.4H, v1.H[7]
+ smlal v6.4S, v28.4H, v1.H[4]
+ smlsl v7.4S, v28.4H, v0.H[5]
+ smlal v8.4S, v28.4H, v0.H[0]
+ smlsl v9.4S, v28.4H, v1.H[0]
+ smlal v6.4S, v29.4H, v1.H[5]
+ smlsl v7.4S, v29.4H, v1.H[0]
+ smlal v8.4S, v29.4H, v0.H[3]
+ smlsl v9.4S, v29.4H, v0.H[0]
+ smlal v6.4S, v30.4H, v1.H[6]
+ smlsl v7.4S, v30.4H, v1.H[3]
+ smlal v8.4S, v30.4H, v1.H[0]
+ smlsl v9.4S, v30.4H, v0.H[5]
+ smlal v6.4S, v31.4H, v1.H[7]
+ smlsl v7.4S, v31.4H, v1.H[6]
+ smlal v8.4S, v31.4H, v1.H[5]
+ smlsl v9.4S, v31.4H, v1.H[0]
+ smull v2.4S, v16.4H, v0.H[4]
+ smull v3.4S, v16.4H, v0.H[5]
+ smull v4.4S, v16.4H, v0.H[6]
+ smull v5.4S, v16.4H, v0.H[7]
+ smlal v2.4S, v17.4H, v1.H[5]
+ smlsl v3.4S, v17.4H, v1.H[7]
+ smlsl v4.4S, v17.4H, v1.H[4]
+ smlsl v5.4S, v17.4H, v1.H[1]
+ smlsl v2.4S, v18.4H, v1.H[1]
+ smlsl v3.4S, v18.4H, v0.H[4]
+ smlsl v4.4S, v18.4H, v0.H[0]
+ smlsl v5.4S, v18.4H, v0.H[5]
+ smlsl v2.4S, v19.4H, v0.H[0]
+ smlsl v3.4S, v19.4H, v0.H[6]
+ smlsl v4.4S, v19.4H, v1.H[5]
+ smlal v5.4S, v19.4H, v1.H[3]
+ smlsl v2.4S, v20.4H, v1.H[0]
+ smlal v3.4S, v20.4H, v1.H[6]
+ smlal v4.4S, v20.4H, v0.H[5]
+ smlal v5.4S, v20.4H, v0.H[3]
+ smlal v2.4S, v21.4H, v1.H[6]
+ smlal v3.4S, v21.4H, v0.H[3]
+ smlal v4.4S, v21.4H, v0.H[7]
+ smlsl v5.4S, v21.4H, v1.H[5]
+ smlal v2.4S, v22.4H, v0.H[5]
+ smlal v3.4S, v22.4H, v0.H[7]
+ smlsl v4.4S, v22.4H, v1.H[3]
+ smlsl v5.4S, v22.4H, v0.H[0]
+ smlal v2.4S, v23.4H, v0.H[3]
+ smlsl v3.4S, v23.4H, v1.H[5]
+ smlsl v4.4S, v23.4H, v0.H[0]
+ smlal v5.4S, v23.4H, v1.H[7]
+ smlal v2.4S, v24.4H, v1.H[4]
+ smlsl v3.4S, v24.4H, v0.H[2]
+ smlsl v4.4S, v24.4H, v1.H[6]
+ smlal v5.4S, v24.4H, v0.H[0]
+ smlsl v2.4S, v25.4H, v1.H[2]
+ smlsl v3.4S, v25.4H, v1.H[0]
+ smlal v4.4S, v25.4H, v0.H[4]
+ smlal v5.4S, v25.4H, v1.H[6]
+ smlsl v2.4S, v26.4H, v0.H[0]
+ smlal v3.4S, v26.4H, v1.H[4]
+ smlal v4.4S, v26.4H, v1.H[0]
+ smlsl v5.4S, v26.4H, v0.H[2]
+ smlsl v2.4S, v27.4H, v0.H[7]
+ smlal v3.4S, v27.4H, v0.H[0]
+ smlsl v4.4S, v27.4H, v1.H[2]
+ smlsl v5.4S, v27.4H, v1.H[4]
+ smlal v2.4S, v28.4H, v1.H[7]
+ smlal v3.4S, v28.4H, v1.H[1]
+ smlsl v4.4S, v28.4H, v0.H[2]
+ smlal v5.4S, v28.4H, v0.H[4]
+ smlal v2.4S, v29.4H, v0.H[6]
+ smlsl v3.4S, v29.4H, v1.H[3]
+ smlsl v4.4S, v29.4H, v1.H[7]
+ smlal v5.4S, v29.4H, v1.H[2]
+ smlal v2.4S, v30.4H, v0.H[2]
+ smlsl v3.4S, v30.4H, v0.H[0]
+ smlal v4.4S, v30.4H, v0.H[3]
+ smlsl v5.4S, v30.4H, v0.H[6]
+ smlal v2.4S, v31.4H, v1.H[3]
+ smlsl v3.4S, v31.4H, v1.H[2]
+ smlal v4.4S, v31.4H, v1.H[1]
+ smlsl v5.4S, v31.4H, v1.H[0]
+ st1 {v6.2D - v9.2D}, [x6], #64
+ st1 {v2.2D - v5.2D}, [x6], #64
+
+ smull v6.4S, v16.4H, v1.H[0]
+ smull v7.4S, v16.4H, v1.H[1]
+ smull v8.4S, v16.4H, v1.H[2]
+ smull v9.4S, v16.4H, v1.H[3]
+ smlsl v6.4S, v17.4H, v0.H[6]
+ smlsl v7.4S, v17.4H, v0.H[3]
+ smlsl v8.4S, v17.4H, v0.H[0]
+ smlsl v9.4S, v17.4H, v0.H[2]
+ smlsl v6.4S, v18.4H, v1.H[2]
+ smlsl v7.4S, v18.4H, v1.H[7]
+ smlal v8.4S, v18.4H, v1.H[3]
+ smlal v9.4S, v18.4H, v0.H[6]
+ smlal v6.4S, v19.4H, v0.H[4]
+ smlal v7.4S, v19.4H, v0.H[2]
+ smlal v8.4S, v19.4H, v1.H[1]
+ smlsl v9.4S, v19.4H, v1.H[7]
+ smlal v6.4S, v20.4H, v1.H[4]
+ smlsl v7.4S, v20.4H, v1.H[2]
+ smlsl v8.4S, v20.4H, v0.H[0]
+ smlsl v9.4S, v20.4H, v0.H[7]
+ smlsl v6.4S, v21.4H, v0.H[2]
+ smlsl v7.4S, v21.4H, v1.H[0]
+ smlal v8.4S, v21.4H, v1.H[4]
+ smlal v9.4S, v21.4H, v0.H[0]
+ smlsl v6.4S, v22.4H, v1.H[6]
+ smlal v7.4S, v22.4H, v0.H[4]
+ smlal v8.4S, v22.4H, v1.H[0]
+ smlsl v9.4S, v22.4H, v1.H[2]
+ smlal v6.4S, v23.4H, v0.H[0]
+ smlal v7.4S, v23.4H, v1.H[6]
+ smlsl v8.4S, v23.4H, v0.H[2]
+ smlal v9.4S, v23.4H, v1.H[4]
+ smlsl v6.4S, v24.4H, v1.H[7]
+ smlsl v7.4S, v24.4H, v0.H[0]
+ smlal v8.4S, v24.4H, v1.H[5]
+ smlal v9.4S, v24.4H, v0.H[3]
+ smlsl v6.4S, v25.4H, v0.H[0]
+ smlal v7.4S, v25.4H, v1.H[3]
+ smlal v8.4S, v25.4H, v0.H[7]
+ smlsl v9.4S, v25.4H, v0.H[5]
+ smlal v6.4S, v26.4H, v1.H[5]
+ smlal v7.4S, v26.4H, v0.H[7]
+ smlsl v8.4S, v26.4H, v0.H[3]
+ smlal v9.4S, v26.4H, v1.H[6]
+ smlal v6.4S, v27.4H, v0.H[3]
+ smlsl v7.4S, v27.4H, v0.H[5]
+ smlal v8.4S, v27.4H, v1.H[6]
+ smlal v9.4S, v27.4H, v1.H[0]
+ smlsl v6.4S, v28.4H, v1.H[3]
+ smlsl v7.4S, v28.4H, v1.H[5]
+ smlal v8.4S, v28.4H, v0.H[6]
+ smlsl v9.4S, v28.4H, v0.H[0]
+ smlsl v6.4S, v29.4H, v0.H[5]
+ smlal v7.4S, v29.4H, v0.H[0]
+ smlsl v8.4S, v29.4H, v0.H[4]
+ smlal v9.4S, v29.4H, v1.H[1]
+ smlal v6.4S, v30.4H, v1.H[1]
+ smlsl v7.4S, v30.4H, v1.H[4]
+ smlal v8.4S, v30.4H, v1.H[7]
+ smlal v9.4S, v30.4H, v1.H[5]
+ smlal v6.4S, v31.4H, v0.H[7]
+ smlsl v7.4S, v31.4H, v0.H[6]
+ smlal v8.4S, v31.4H, v0.H[5]
+ smlsl v9.4S, v31.4H, v0.H[4]
+ smull v2.4S, v16.4H, v1.H[4]
+ smull v3.4S, v16.4H, v1.H[5]
+ smull v4.4S, v16.4H, v1.H[6]
+ smull v5.4S, v16.4H, v1.H[7]
+ smlsl v2.4S, v17.4H, v0.H[5]
+ smlsl v3.4S, v17.4H, v1.H[0]
+ smlsl v4.4S, v17.4H, v1.H[3]
+ smlsl v5.4S, v17.4H, v1.H[6]
+ smlal v2.4S, v18.4H, v0.H[0]
+ smlal v3.4S, v18.4H, v0.H[3]
+ smlal v4.4S, v18.4H, v1.H[0]
+ smlal v5.4S, v18.4H, v1.H[5]
+ smlsl v2.4S, v19.4H, v1.H[0]
+ smlsl v3.4S, v19.4H, v0.H[0]
+ smlsl v4.4S, v19.4H, v0.H[5]
+ smlsl v5.4S, v19.4H, v1.H[4]
+ smlal v2.4S, v20.4H, v1.H[7]
+ smlal v3.4S, v20.4H, v0.H[6]
+ smlal v4.4S, v20.4H, v0.H[2]
+ smlal v5.4S, v20.4H, v1.H[3]
+ smlal v2.4S, v21.4H, v1.H[1]
+ smlsl v3.4S, v21.4H, v1.H[3]
+ smlsl v4.4S, v21.4H, v0.H[0]
+ smlsl v5.4S, v21.4H, v1.H[2]
+ smlsl v2.4S, v22.4H, v0.H[2]
+ smlsl v3.4S, v22.4H, v1.H[7]
+ smlal v4.4S, v22.4H, v0.H[3]
+ smlal v5.4S, v22.4H, v1.H[1]
+ smlal v2.4S, v23.4H, v0.H[4]
+ smlal v3.4S, v23.4H, v1.H[2]
+ smlsl v4.4S, v23.4H, v0.H[6]
+ smlsl v5.4S, v23.4H, v1.H[0]
+ smlsl v2.4S, v24.4H, v1.H[3]
+ smlsl v3.4S, v24.4H, v0.H[5]
+ smlal v4.4S, v24.4H, v1.H[1]
+ smlal v5.4S, v24.4H, v0.H[7]
+ smlsl v2.4S, v25.4H, v1.H[5]
+ smlal v3.4S, v25.4H, v0.H[0]
+ smlsl v4.4S, v25.4H, v1.H[4]
+ smlsl v5.4S, v25.4H, v0.H[6]
+ smlal v2.4S, v26.4H, v0.H[6]
+ smlsl v3.4S, v26.4H, v0.H[4]
+ smlal v4.4S, v26.4H, v1.H[7]
+ smlal v5.4S, v26.4H, v0.H[5]
+ smlsl v2.4S, v27.4H, v0.H[0]
+ smlal v3.4S, v27.4H, v1.H[1]
+ smlal v4.4S, v27.4H, v1.H[5]
+ smlsl v5.4S, v27.4H, v0.H[4]
+ smlal v2.4S, v28.4H, v0.H[7]
+ smlsl v3.4S, v28.4H, v1.H[6]
+ smlsl v4.4S, v28.4H, v1.H[2]
+ smlal v5.4S, v28.4H, v0.H[3]
+ smlsl v2.4S, v29.4H, v1.H[6]
+ smlsl v3.4S, v29.4H, v1.H[4]
+ smlal v4.4S, v29.4H, v0.H[7]
+ smlsl v5.4S, v29.4H, v0.H[2]
+ smlsl v2.4S, v30.4H, v1.H[2]
+ smlal v3.4S, v30.4H, v0.H[7]
+ smlsl v4.4S, v30.4H, v0.H[4]
+ smlal v5.4S, v30.4H, v0.H[0]
+ smlal v2.4S, v31.4H, v0.H[3]
+ smlsl v3.4S, v31.4H, v0.H[2]
+ smlal v4.4S, v31.4H, v0.H[0]
+ smlsl v5.4S, v31.4H, v0.H[0]
+ st1 {v6.2D - v9.2D}, [x6], #64
+ st1 {v2.2D - v5.2D}, [x6], #64
+.endm
+
+function ff_hevc_transform_8x8_neon_8, export=1
+ movrel x3, trans_coeff
+ mov x5, #16
+ mov x6, x0
+ ld1 {v0.2D}, [x3]
+ ld1 {v24.1D}, [x0], x5
+ ld1 {v25.1D}, [x0], x5
+ ld1 {v26.1D}, [x0], x5
+ ld1 {v27.1D}, [x0], x5
+ ld1 {v28.1D}, [x0], x5
+ ld1 {v29.1D}, [x0], x5
+ ld1 {v30.1D}, [x0], x5
+ ld1 {v31.1D}, [x0], x5
+ mov x0, x6
+ tr8_begin v25, v27, v29, v31
+ tr4 v24, v26, v28, v30
+ tr8_end #7
+ st1 {v2.1D}, [x0], x5
+ st1 {v3.1D}, [x0], x5
+ st1 {v4.1D}, [x0], x5
+ st1 {v5.1D}, [x0], x5
+ st1 {v6.1D}, [x0], x5
+ st1 {v7.1D}, [x0], x5
+ st1 {v8.1D}, [x0], x5
+ st1 {v9.1D}, [x0], x5
+ mov x0, x6
+ cmp x1, #4
+ b.lt 1f
+ add x0, x0, #8
+ ld1 {v24.1D}, [x0], x5
+ ld1 {v25.1D}, [x0], x5
+ ld1 {v26.1D}, [x0], x5
+ ld1 {v27.1D}, [x0], x5
+ ld1 {v28.1D}, [x0], x5
+ ld1 {v29.1D}, [x0], x5
+ ld1 {v30.1D}, [x0], x5
+ ld1 {v31.1D}, [x0], x5
+ sub x0, x0, #128
+ tr8_begin v25, v27, v29, v31
+ tr4 v24, v26, v28, v30
+ tr8_end #7
+ st1 {v2.1D}, [x0], x5
+ st1 {v3.1D}, [x0], x5
+ st1 {v4.1D}, [x0], x5
+ st1 {v5.1D}, [x0], x5
+ st1 {v6.1D}, [x0], x5
+ st1 {v7.1D}, [x0], x5
+ st1 {v8.1D}, [x0], x5
+ st1 {v9.1D}, [x0], x5
+ mov x0, x6
+1: ld1 {v24.1D - v27.1D}, [x0], #32
+ ld1 {v28.1D - v31.1D}, [x0]
+ mov x0, x6
+ transpose_16b_4x4 v24, v26, v28, v30, v11, v12, v13, v14
+ transpose_16b_4x4 v25, v27, v29, v31, v11, v12, v13, v14
+ tr8_begin v26, v30, v27, v31
+ tr4 v24, v28, v25, v29
+ tr8_end #12
+ transpose_16b_4x4 v2, v3, v4, v5, v11, v12, v13, v14
+ transpose_16b_4x4 v6, v7, v8, v9, v11, v12, v13, v14
+ zip1 v11.2D, v2.2D, v6.2D
+ zip1 v12.2D, v3.2D, v7.2D
+ zip1 v13.2D, v4.2D, v8.2D
+ zip1 v14.2D, v5.2D, v9.2D
+ st1 {v11.2D - v14.2D}, [x0], #64
+ ld1 {v24.4H - v27.4H}, [x0], #32
+ ld1 {v28.4H - v31.4H}, [x0]
+ sub x0, x0, #32
+ transpose_16b_4x4 v24, v26, v28, v30, v11, v12, v13, v14
+ transpose_16b_4x4 v25, v27, v29, v31, v11, v12, v13, v14
+ tr8_begin v26, v30, v27, v31
+ tr4 v24, v28, v25, v29
+ tr8_end #12
+ transpose_16b_4x4 v2, v3, v4, v5, v11, v12, v13, v14
+ transpose_16b_4x4 v6, v7, v8, v9, v11, v12, v13, v14
+ zip1 v11.2D, v2.2D, v6.2D
+ zip1 v12.2D, v3.2D, v7.2D
+ zip1 v13.2D, v4.2D, v8.2D
+ zip1 v14.2D, v5.2D, v9.2D
+ st1 {v11.2D - v12.2D}, [x0], #32
+ st1 {v13.2D - v14.2D}, [x0], #32
+ sub x0, x0, #64
+ ret
+endfunc
+
+function ff_hevc_transform_16x16_neon_8, export=1
+ mov x5, #32
+ lsr x6, x5, #1
+ add x7, x1, #4
+ cmp x1, #12
+ csel x1, x6, x7, gt
+ movrel x3, trans_coeff
+ mov x10, sp
+ mov x4, XZR
+0: ld1 {v0.2D - v1.2D}, [x3]
+ add x0, x0, x5
+ lsl x5, x5, #1
+ ld1 {v24.1D}, [x0], x5
+ ld1 {v25.1D}, [x0], x5
+ ld1 {v26.1D}, [x0], x5
+ ld1 {v27.1D}, [x0], x5
+ ld1 {v28.1D}, [x0], x5
+ ld1 {v29.1D}, [x0], x5
+ ld1 {v30.1D}, [x0], x5
+ ld1 {v31.1D}, [x0], x5
+ sub x0, x0, x5, lsl #3
+ sub x0, x0, x5, lsr #1
+ tr16_begin v24, v25, v26, v27, v28, v29, v30, v31
+ sub x10, x10, #128
+ st1 {v2.2D - v5.2D}, [x10], #64
+ st1 {v6.2D - v9.2D}, [x10], #64
+ ld1 {v24.1D}, [x0], x5
+ ld1 {v25.1D}, [x0], x5
+ ld1 {v26.1D}, [x0], x5
+ ld1 {v27.1D}, [x0], x5
+ ld1 {v28.1D}, [x0], x5
+ ld1 {v29.1D}, [x0], x5
+ ld1 {v30.1D}, [x0], x5
+ ld1 {v31.1D}, [x0], x5
+ sub x0, x0, x5, lsl #3
+ lsr x5, x5, #1
+ tr8_begin v25, v27, v29, v31
+ tr4 v24, v26, v28, v30
+ tr8_end_0
+ sub x10, x10, #128
+ ld1 {v0.2D - v3.2D}, [x10], #64
+ add v4.4S, v8.4S, v0.4S
+ sub v8.4S, v8.4S, v0.4S
+ sqrshrn v5.4H, v4.4S, #7
+ st1 {v5.1D}, [x0], x5
+ add v4.4S, v9.4S, v1.4S
+ sub v9.4S, v9.4S, v1.4S
+ sqrshrn v5.4H, v4.4S, #7
+ st1 {v5.1D}, [x0], x5
+ add v4.4S, v10.4S, v2.4S
+ sub v10.4S, v10.4S, v2.4S
+ sqrshrn v5.4H, v4.4S, #7
+ st1 {v5.1D}, [x0], x5
+ add v4.4S, v11.4S, v3.4S
+ sub v11.4S, v11.4S, v3.4S
+ sqrshrn v5.4H, v4.4S, #7
+ st1 {v5.1D}, [x0], x5
+ ld1 {v0.2D - v3.2D}, [x10], #64
+ add v4.4S, v12.4S, v0.4S
+ sub v12.4S, v12.4S, v0.4S
+ sqrshrn v5.4H, v4.4S, #7
+ st1 {v5.1D}, [x0], x5
+ add v4.4S, v13.4S, v1.4S
+ sub v13.4S, v13.4S, v1.4S
+ sqrshrn v5.4H, v4.4S, #7
+ st1 {v5.1D}, [x0], x5
+ add v4.4S, v14.4S, v2.4S
+ sub v14.4S, v14.4S, v2.4S
+ sqrshrn v5.4H, v4.4S, #7
+ st1 {v5.1D}, [x0], x5
+ add v4.4S, v15.4S, v3.4S
+ sub v15.4S, v15.4S, v3.4S
+ sqrshrn v5.4H, v4.4S, #7
+ st1 {v5.1D}, [x0], x5
+ sqrshrn v5.4H, v15.4S, #7
+ st1 {v5.1D}, [x0], x5
+ sqrshrn v5.4H, v14.4S, #7
+ st1 {v5.1D}, [x0], x5
+ sqrshrn v5.4H, v13.4S, #7
+ st1 {v5.1D}, [x0], x5
+ sqrshrn v5.4H, v12.4S, #7
+ st1 {v5.1D}, [x0], x5
+ sqrshrn v5.4H, v11.4S, #7
+ st1 {v5.1D}, [x0], x5
+ sqrshrn v5.4H, v10.4S, #7
+ st1 {v5.1D}, [x0], x5
+ sqrshrn v5.4H, v9.4S, #7
+ st1 {v5.1D}, [x0], x5
+ sqrshrn v5.4H, v8.4S, #7
+ st1 {v5.1D}, [x0], x5
+ sub x0, x0, x5, lsl #4
+ add x0, x0, #8
+ add x4, x4, #4
+ cmp x4, x1
+ b.lt 0b
+ sub x0, x0, x4, lsl #1
+ mov x4, #4
+1: ld1 {v0.2D - v1.2D}, [x3]
+ ld1 {v16.1D - v19.1D}, [x0], #32
+ ld1 {v20.1D - v23.1D}, [x0], #32
+ ld1 {v24.1D - v27.1D}, [x0], #32
+ ld1 {v28.1D - v31.1D}, [x0], #32
+ sub x0, x0, #128
+ transpose_16b_4x4 v16, v20, v24, v28, v11, v12, v13, v14
+ transpose_16b_4x4 v17, v21, v25, v29, v11, v12, v13, v14
+ transpose_16b_4x4 v18, v22, v26, v30, v11, v12, v13, v14
+ transpose_16b_4x4 v19, v23, v27, v31, v11, v12, v13, v14
+ tr16_begin v20, v28, v21, v29, v22, v30, v23, v31
+ sub x10, x10, #128
+ st1 {v2.2D - v5.2D}, [x10], #64
+ st1 {v6.2D - v9.2D}, [x10], #64
+ tr8_begin v24, v25, v26, v27
+ tr4 v16, v17, v18, v19
+ tr8_end_0
+ sub x10, x10, #128
+ ld1 {v0.2D - v3.2D}, [x10], #64
+ add v4.4S, v8.4S, v0.4S
+ sub v18.4S, v8.4S, v0.4S
+ sqrshrn v0.4H, v4.4S, #12
+ add v4.4S, v9.4S, v1.4S
+ sub v19.4S, v9.4S, v1.4S
+ sqrshrn v1.4H, v4.4S, #12
+ add v4.4S, v10.4S, v2.4S
+ sub v20.4S, v10.4S, v2.4S
+ sqrshrn v2.4H, v4.4S, #12
+ add v4.4S, v11.4S, v3.4S
+ sub v21.4S, v11.4S, v3.4S
+ sqrshrn v3.4H, v4.4S, #12
+ ld1 {v4.2D - v7.2D}, [x10], #64
+ add v10.4S, v12.4S, v4.4S
+ sub v22.4S, v12.4S, v4.4S
+ sqrshrn v4.4H, v10.4S, #12
+ add v10.4S, v13.4S, v5.4S
+ sub v23.4S, v13.4S, v5.4S
+ sqrshrn v5.4H, v10.4S, #12
+ add v10.4S, v14.4S, v6.4S
+ sub v24.4S, v14.4S, v6.4S
+ sqrshrn v6.4H, v10.4S, #12
+ add v10.4S, v15.4S, v7.4S
+ sub v25.4S, v15.4S, v7.4S
+ sqrshrn v7.4H, v10.4S, #12
+ sqrshrn v8.4H, v25.4S, #12
+ sqrshrn v9.4H, v24.4S, #12
+ sqrshrn v10.4H, v23.4S, #12
+ sqrshrn v11.4H, v22.4S, #12
+ sqrshrn v12.4H, v21.4S, #12
+ sqrshrn v13.4H, v20.4S, #12
+ sqrshrn v14.4H, v19.4S, #12
+ sqrshrn v15.4H, v18.4S, #12
+ transpose_16b_4x4 v0, v1, v2, v3, v20, v21, v22, v23
+ transpose_16b_4x4 v4, v5, v6, v7, v20, v21, v22, v23
+ transpose_16b_4x4 v8, v9, v10, v11, v20, v21, v22, v23
+ transpose_16b_4x4 v12, v13, v14, v15, v20, v21, v22, v23
+ trn1 v20.2D, v0.2D, v4.2D //vswp d9, d12
+ trn1 v21.2D, v8.2D, v12.2D //vswp d3, d6
+ trn1 v22.2D, v1.2D, v5.2D //vswp d3, d9
+ trn1 v23.2D, v9.2D, v13.2D //vswp d6, d12
+ trn1 v24.2D, v2.2D, v6.2D //vswp d1, d4
+ trn1 v25.2D, v10.2D, v14.2D //vswp d11, d14
+ trn1 v26.2D, v3.2D, v7.2D //vswp d2, d8
+ trn1 v27.2D, v11.2D, v15.2D //vswp d7, d13
+ st1 {v20.2D - v23.2D}, [x0], #64
+ st1 {v24.2D - v27.2D}, [x0], #64
+ subs x4, x4, #1
+ b.ne 1b
+ ret
+endfunc
+
+.macro write32_buffer
+ st1 {v0.1D - v3.1D}, [x8], #32
+ st1 {v4.1D - v7.1D}, [x8], #32
+ st1 {v8.1D - v11.1D}, [x8], #32
+ st1 {v12.1D - v15.1D}, [x8], #32
+ st1 {v16.1D - v19.1D}, [x8], #32
+ st1 {v20.1D - v23.1D}, [x8], #32
+ st1 {v24.1D - v27.1D}, [x8], #32
+ st1 {v28.1D - v31.1D}, [x8], #32
+.endm
+
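+// Final 32-point butterfly: combine one even-half partial sum (from
+// the buffer at x7) with one odd-half partial sum (from x6) into a
+// leading and a mirrored trailing output, with rounding shift.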
+.macro tr32_out tmp, dst_first, dst_last, shift
+ ld1 {v12.2D}, [x6], #16
+ ld1 {v13.2D}, [x7], #16
+ add \tmp\().4S, v12.4S, v13.4S
+ sub v13.4S, v13.4S, v12.4S
+ sqrshrn \dst_first\().4H, \tmp\().4S, \shift
+ sqrshrn \dst_last\().4H, v13.4S, \shift
+.endm
+
+.macro tr32_transform_func shift, action, limit
+ mov x4, XZR
+ mov x10, x8
+0: add x3, x3, #32
+ ld1 {v0.2D - v1.2D}, [x3]
+ sub x3, x3, #32
+ add x0, x0, x5
+ lsl x5, x5, #1
+ ld1 {v16.1D}, [x0], x5
+ ld1 {v17.1D}, [x0], x5
+ ld1 {v18.1D}, [x0], x5
+ ld1 {v19.1D}, [x0], x5
+ ld1 {v20.1D}, [x0], x5
+ ld1 {v21.1D}, [x0], x5
+ ld1 {v22.1D}, [x0], x5
+ ld1 {v23.1D}, [x0], x5
+ ld1 {v24.1D}, [x0], x5
+ ld1 {v25.1D}, [x0], x5
+ ld1 {v26.1D}, [x0], x5
+ ld1 {v27.1D}, [x0], x5
+ ld1 {v28.1D}, [x0], x5
+ ld1 {v29.1D}, [x0], x5
+ ld1 {v30.1D}, [x0], x5
+ ld1 {v31.1D}, [x0], x5
+ sub x0, x0, x5, lsl #4
+ tr32_begin
+ add x0, x0, x5, lsr #1
+ sub x6, x6, x5, lsl #1
+ ld1 {v0.2D - v1.2D}, [x3]
+ lsl x5, x5, #1
+ ld1 {v24.1D}, [x0], x5
+ ld1 {v25.1D}, [x0], x5
+ ld1 {v26.1D}, [x0], x5
+ ld1 {v27.1D}, [x0], x5
+ ld1 {v28.1D}, [x0], x5
+ ld1 {v29.1D}, [x0], x5
+ ld1 {v30.1D}, [x0], x5
+ ld1 {v31.1D}, [x0], x5
+ sub x0, x0, x5, lsl #3
+ sub x0, x0, x5, lsr #1
+ tr16_begin v24, v25, v26, v27, v28, v29, v30, v31
+ st1 {v2.2D - v5.2D}, [x9], #64
+ st1 {v6.2D - v9.2D}, [x9], #64
+ sub x9, x9, #128
+ ld1 {v24.1D}, [x0], x5
+ ld1 {v25.1D}, [x0], x5
+ ld1 {v26.1D}, [x0], x5
+ ld1 {v27.1D}, [x0], x5
+ ld1 {v28.1D}, [x0], x5
+ ld1 {v29.1D}, [x0], x5
+ ld1 {v30.1D}, [x0], x5
+ ld1 {v31.1D}, [x0], x5
+ sub x0, x0, x5, lsl #3
+ tr8_begin v25, v27, v29, v31
+ tr4 v24, v26, v28, v30
+ tr8_end_0
+ ld1 {v0.2D - v3.2D}, [x9], #64
+ add v4.4S, v8.4S, v0.4S
+ sub v8.4S, v8.4S, v0.4S
+ add v5.4S, v9.4S, v1.4S
+ sub v9.4S, v9.4S, v1.4S
+ add v6.4S, v10.4S, v2.4S
+ sub v10.4S, v10.4S, v2.4S
+ add v7.4S, v11.4S, v3.4S
+ sub v11.4S, v11.4S, v3.4S
+ st1 {v4.2D - v7.2D}, [x7], #64
+ ld1 {v0.2D - v3.2D}, [x9], #64
+ sub x9, x9, #128
+ add v4.4S, v12.4S, v0.4S
+ sub v12.4S, v12.4S, v0.4S
+ add v5.4S, v13.4S, v1.4S
+ sub v13.4S, v13.4S, v1.4S
+ add v6.4S, v14.4S, v2.4S
+ sub v14.4S, v14.4S, v2.4S
+ add v7.4S, v15.4S, v3.4S
+ sub v15.4S, v15.4S, v3.4S
+ st1 {v4.2D - v7.2D}, [x7], #64
+ st1 {v15.2D}, [x7], #16
+ st1 {v14.2D}, [x7], #16
+ st1 {v13.2D}, [x7], #16
+ st1 {v12.2D}, [x7], #16
+ st1 {v11.2D}, [x7], #16
+ st1 {v10.2D}, [x7], #16
+ st1 {v9.2D}, [x7], #16
+ st1 {v8.2D}, [x7], #16
+ sub x7, x7, x5
+ add x0, x0, #8
+ lsr x5, x5, #2
+ tr32_out v14, v0, v28, \shift
+ tr32_out v14, v8, v29, \shift
+ tr32_out v14, v16, v30, \shift
+ tr32_out v14, v24, v31, \shift
+ st1 {v28.1D - v31.1D}, [x9], #32
+ tr32_out v14, v1, v28, \shift
+ tr32_out v14, v9, v29, \shift
+ tr32_out v14, v17, v30, \shift
+ tr32_out v14, v25, v31, \shift
+ st1 {v28.1D - v31.1D}, [x9], #32
+ tr32_out v14, v2, v28, \shift
+ tr32_out v14, v10, v29, \shift
+ tr32_out v14, v18, v30, \shift
+ tr32_out v14, v26, v31, \shift
+ st1 {v28.1D - v31.1D}, [x9], #32
+ tr32_out v14, v3, v28, \shift
+ tr32_out v14, v11, v29, \shift
+ tr32_out v14, v19, v30, \shift
+ tr32_out v14, v27, v31, \shift
+ st1 {v28.1D - v31.1D}, [x9], #32
+ sub x9, x9, #128
+ transpose_16b_4x4 v0, v8, v16, v24, v28, v29, v30, v31
+ transpose_16b_4x4 v1, v9, v17, v25, v28, v29, v30, v31
+ transpose_16b_4x4 v2, v10, v18, v26, v28, v29, v30, v31
+ transpose_16b_4x4 v3, v11, v19, v27, v28, v29, v30, v31
+ ld1 {v31.1D}, [x9], #8
+ ld1 {v23.1D}, [x9], #8
+ ld1 {v15.1D}, [x9], #8
+ ld1 {v7.1D}, [x9], #8
+ ld1 {v30.1D}, [x9], #8
+ ld1 {v22.1D}, [x9], #8
+ ld1 {v14.1D}, [x9], #8
+ ld1 {v6.1D}, [x9], #8
+ ld1 {v29.1D}, [x9], #8
+ ld1 {v21.1D}, [x9], #8
+ ld1 {v13.1D}, [x9], #8
+ ld1 {v5.1D}, [x9], #8
+ ld1 {v28.1D}, [x9], #8
+ ld1 {v20.1D}, [x9], #8
+ ld1 {v12.1D}, [x9], #8
+ ld1 {v4.1D}, [x9], #8
+ sub x9, x9, #128
+ st1 {v24.1D - v27.1D}, [x9]
+ transpose_16b_4x4 v4, v12, v20, v28, v24, v25, v26, v27
+ transpose_16b_4x4 v5, v13, v21, v29, v24, v25, v26, v27
+ transpose_16b_4x4 v6, v14, v22, v30, v24, v25, v26, v27
+ transpose_16b_4x4 v7, v15, v23, v31, v24, v25, v26, v27
+ ld1 {v24.1D - v27.1D}, [x9]
+ \action
+ sub x6, x6, #256
+ sub x7, x7, #256
+ add x4, x4, #4
+ cmp x4, \limit
+ b.lt 0b
+ sub x0, x0, x4, lsl #1
+1: cmp x4, #32
+ b.ge 2f
+ \action
+ add x4, x4, #4
+ b 1b
+2: mov x8, x10
+.endm
+
+const trans_coeff, align=8
+.quad 0x0000000000240053 // 36, 83
+.quad 0x003200120059004b // 89, 75, 50, 18
+.quad 0x00500046005a0057 // 90, 87, 80, 70
+.quad 0x001900090039002b // 57, 43, 25, 9
+.quad 0x00550058005a005a // 88, 85, 90, 90
+.quad 0x00430049004e0052 // 73, 67, 82, 78
+.quad 0x0026002e0036003d // 46, 38, 61, 54
+.quad 0x0004000d0016001f // 13, 04, 31, 22
+endconst
+
+function ff_hevc_transform_32x32_neon_8, export=1
+ mov x6, #32
+ add x7, x1, #4
+ cmp x1, #28
+ csel x1, x6, x7, gt
+ mov x2, x1
+ movrel x3, trans_coeff
+ sub x6, sp, #256
+ mov x7, #63
+ bic x6, x6, x7
+ sub x7, x6, #256
+ sub x8, x7, #2048
+ sub x9, x8, #256
+ mov x5, #64
+ tr32_transform_func #7, write32_buffer, x2
+ mov x2, x0
+ mov x0, x8
+ mov x8, x2
+ tr32_transform_func #12, write32_buffer, #32
+ ret
+endfunc
+
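+// DC-only inverse transforms: every sample of the block becomes
+// (((dc + 1) >> 1) + 32) >> 6, computed as two rounding shifts.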
+function ff_hevc_idct_4x4_dc_neon_8, export=1
+ ldrsh w2, [x0]
+ dup v8.8H, w2
+ srshr v8.8H, v8.8H, #1
+ srshr v8.8H, v8.8H, #6
+ mov v9.16B, v8.16B
+ st1 {v8.2D, v9.2D}, [x0]
+ ret
+endfunc
+
+function ff_hevc_idct_8x8_dc_neon_8, export=1
+ ldrsh w2, [x0]
+ dup v8.8H, w2
+ srshr v8.8H, v8.8H, #1
+ srshr v8.8H, v8.8H, #6
+ mov v9.16B, v8.16B
+ mov v10.16B, v8.16B
+ mov v11.16B, v8.16B
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0]
+ ret
+endfunc
+
+function ff_hevc_idct_16x16_dc_neon_8, export=1
+ ldrsh w2, [x0]
+ dup v8.8H, w2
+ srshr v8.8H, v8.8H, #1
+ srshr v8.8H, v8.8H, #6
+ mov v9.16B, v8.16B
+ mov v10.16B, v8.16B
+ mov v11.16B, v8.16B
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0]
+ ret
+endfunc
+
+function ff_hevc_idct_32x32_dc_neon_8, export=1
+ ldrsh w2, [x0]
+ dup v8.8H, w2
+ srshr v8.8H, v8.8H, #1
+ srshr v8.8H, v8.8H, #6
+ mov v9.16B, v8.16B
+ mov v10.16B, v8.16B
+ mov v11.16B, v8.16B
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v8.2D - v11.2D}, [x0]
+ ret
+endfunc
+
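+// transform_add: widen the destination pixels to 16 bit, add the
+// residual with signed saturation, then narrow back with unsigned
+// saturation (clip to 0..255).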
+function ff_hevc_transform_add_4x4_neon_8, export=1
+ ld1 {v0.2D, v1.2D}, [x1]
+ ld1 {v2.S}[0], [x0], x2
+ ld1 {v2.S}[1], [x0], x2
+ ld1 {v3.S}[0], [x0], x2
+ ld1 {v3.S}[1], [x0], x2
+ sub x0, x0, x2, lsl #2
+ uxtl v8.8H, v2.8B
+ uxtl v9.8H, v3.8B
+ sqadd v0.8H, v0.8H, v8.8H
+ sqadd v1.8H, v1.8H, v9.8H
+ sqxtun v4.8B, v0.8H
+ sqxtun v5.8B, v1.8H
+ st1 {v4.S}[0], [x0], x2
+ st1 {v4.S}[1], [x0], x2
+ st1 {v5.S}[0], [x0], x2
+ st1 {v5.S}[1], [x0], x2
+ ret
+endfunc
+
+function ff_hevc_transform_add_8x8_neon_8, export=1
+ mov x3, #8
+1: subs x3, x3, #1
+ ld1 {v0.2D}, [x1], #16
+ ld1 {v8.1D}, [x0]
+ uxtl v8.8H, v8.8B
+ sqadd v0.8H, v0.8H, v8.8H
+ sqxtun v4.8B, v0.8H
+ st1 {v4.1D}, [x0], x2
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_transform_add_16x16_neon_8, export=1
+ mov x3, #16
+1: subs x3, x3, #1
+ ld1 {v0.2D - v1.2D}, [x1], #32
+ ld1 {v8.2D}, [x0]
+ uxtl v9.8H, v8.8B
+ uxtl2 v10.8H, v8.16B
+ sqadd v0.8H, v0.8H, v9.8H
+ sqadd v1.8H, v1.8H, v10.8H
+ sqxtun v4.8B, v0.8H
+ sqxtun2 v4.16B, v1.8H
+ st1 {v4.2D}, [x0], x2
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_transform_add_32x32_neon_8, export=1
+ mov x3, #32
+1: subs x3, x3, #1
+ ld1 {v0.2D - v3.2D}, [x1], #64
+ ld1 {v8.2D, v9.2D}, [x0]
+ uxtl v10.8H, v8.8B
+ uxtl2 v11.8H, v8.16B
+ uxtl v12.8H, v9.8B
+ uxtl2 v13.8H, v9.16B
+ sqadd v0.8H, v0.8H, v10.8H
+ sqadd v1.8H, v1.8H, v11.8H
+ sqadd v2.8H, v2.8H, v12.8H
+ sqadd v3.8H, v3.8H, v13.8H
+ sqxtun v4.8B, v0.8H
+ sqxtun2 v4.16B, v1.8H
+ sqxtun v5.8B, v2.8H
+ sqxtun2 v5.16B, v3.8H
+ st1 {v4.2D, v5.2D}, [x0], x2
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_transform_4x4_neon_8, export=1
+ ld1 {v28.2D - v29.2D}, [x0]
+ ldr w2, =0x00240053
+ mov v0.S[0], w2
+ tr4_shift v28, v29, #7 //3210->3120
+ zip1 v26.8H, v28.8H, v29.8H
+ zip2 v27.8H, v28.8H, v29.8H
+ zip1 v28.4S, v26.4S, v27.4S
+ zip2 v29.4S, v26.4S, v27.4S
+ tr4_shift v28, v29, #12 //3210->3120
+ zip1 v26.8H, v28.8H, v29.8H
+ zip2 v27.8H, v28.8H, v29.8H
+ zip1 v28.4S, v26.4S, v27.4S
+ zip2 v29.4S, v26.4S, v27.4S
+ st1 {v28.2D - v29.2D}, [x0]
+ ret
+endfunc
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+ ld1 {v28.2D - v29.2D}, [x0]
+ ldr x1, =0x0037001d004a
+ mov v1.D[0], x1
+ uxtl v0.4S, v1.4H
+ tr4_luma_shift v28, v29, #7
+ zip1 v26.8H, v28.8H, v29.8H
+ zip2 v27.8H, v28.8H, v29.8H
+ zip1 v28.4S, v26.4S, v27.4S
+ zip2 v29.4S, v26.4S, v27.4S
+ tr4_luma_shift v28, v29, #12
+ zip1 v26.8H, v28.8H, v29.8H
+ zip2 v27.8H, v28.8H, v29.8H
+ zip1 v28.4S, v26.4S, v27.4S
+ zip2 v29.4S, v26.4S, v27.4S
+ st1 {v28.2D - v29.2D}, [x0]
+ ret
+endfunc
--
2.3.2 (Apple Git-55)
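
For reference, the tr4/tr4_shift macros above implement the standard HEVC
4-point partial butterfly. A scalar sketch of one pass, for illustration
only (the NEON code narrows with saturating rounding shifts instead):

    #include <stdint.h>

    /* one pass of the 4-point inverse core transform */
    static void tr4_ref(int16_t *dst, const int16_t *src, int shift)
    {
        int add = 1 << (shift - 1);
        int e0 = 64 * (src[0] + src[2]);
        int e1 = 64 * (src[0] - src[2]);
        int o0 = 83 * src[1] + 36 * src[3];
        int o1 = 36 * src[1] - 83 * src[3];
        dst[0] = (int16_t)((e0 + o0 + add) >> shift);
        dst[1] = (int16_t)((e1 + o1 + add) >> shift);
        dst[2] = (int16_t)((e1 - o1 + add) >> shift);
        dst[3] = (int16_t)((e0 - o0 + add) >> shift);
    }

The larger sizes reuse this even/odd split recursively: tr8_begin,
tr16_begin and tr32_begin compute only the odd half, which is then added
to and subtracted from the smaller even-half transform.
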
From 7aa6c4482d67f715e7d799294aeb06a0e6cd786a Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:16:28 +0800
Subject: [PATCH 02/12] Create hevcdsp_init_aarch64.c
Adds the aarch64 entry point that dispatches to the NEON init when NEON is available.
Signed-off-by: zjh8890 <243186085 at qq.com>
---
libavcodec/hevcdsp_init_aarch64.c | 33 +++++++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
create mode 100644 libavcodec/hevcdsp_init_aarch64.c
diff --git a/libavcodec/hevcdsp_init_aarch64.c b/libavcodec/hevcdsp_init_aarch64.c
new file mode 100644
index 0000000..e8c2802
--- /dev/null
+++ b/libavcodec/hevcdsp_init_aarch64.c
@@ -0,0 +1,33 @@
+/*
+ * ARM NEON optimised HEVC decode for aarch64
+ * Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_aarch64.h"
+
+av_cold void ff_hevcdsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ ff_hevcdsp_init_neon(c, bit_depth);
+}
--
2.3.2 (Apple Git-55)
From 6eccea0d8600c64972af9f5b02e84a7499ca4133 Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:19:25 +0800
Subject: [PATCH 03/12] Create hevcdsp_init_neon.c
Adds the NEON init function that registers the optimized HEVC functions in the DSP context.
Signed-off-by: zjh8890 <243186085 at qq.com>
---
libavcodec/aarch64/hevcdsp_init_neon.c | 64 ++++++++++++++++++++++++++++++++++
1 file changed, 64 insertions(+)
create mode 100644 libavcodec/aarch64/hevcdsp_init_neon.c
diff --git a/libavcodec/aarch64/hevcdsp_init_neon.c b/libavcodec/aarch64/hevcdsp_init_neon.c
new file mode 100644
index 0000000..0a3b2e5
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_init_neon.c
@@ -0,0 +1,64 @@
+/*
+ * ARM NEON optimised HEVC decode for aarch64
+ * Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_aarch64.h"
+
+void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_16x16_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_32x32_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
+void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+
+
+av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+ if (bit_depth == 8) {
+ c->idct[0] = ff_hevc_transform_4x4_neon_8;
+ c->idct[1] = ff_hevc_transform_8x8_neon_8;
+ c->idct[2] = ff_hevc_transform_16x16_neon_8;
+ c->idct[3] = ff_hevc_transform_32x32_neon_8;
+ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8;
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8;
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8;
+ c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8;
+ c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8;
+ c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8;
+ c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8;
+ c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
+ }
+}
--
2.3.2 (Apple Git-55)
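
A note on the indexing above (implied by the function names): idct[i],
idct_dc[i] and transform_add[i] serve transform blocks of
(4 << i) x (4 << i) samples, i.e.

    /* i = 0..3 selects the 4x4, 8x8, 16x16 or 32x32 variant */
    int i = av_log2(size) - 2;
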
From 5ac978c2df7bdf891c0149ed83b7f9a6b5317a0c Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:22:17 +0800
Subject: [PATCH 04/12] Update Makefile for AArch64 NEON HEVC decoder optimizations
Adds the new HEVC NEON objects to the aarch64 Makefile.
Signed-off-by: zjh8890 <243186085 at qq.com>
---
libavcodec/aarch64/Makefile | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index d001b34..3ef75b8 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -3,6 +3,7 @@ OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
+OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_init_aarch64.o
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
@@ -28,3 +29,6 @@ NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
+
+NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_init_neon.o \
+ aarch64/hevcdsp_idct_neon.o
--
2.3.2 (Apple Git-55)
From 4bb10fc74c38dfcbca4354132189b1032d14a8cb Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:25:44 +0800
Subject: [PATCH 05/12] Add AArch64 NEON optimization for HEVC decoder
Hooks the AArch64 init call into hevcdsp.c.
Signed-off-by: zjh8890 <243186085 at qq.com>
---
libavcodec/hevcdsp.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 9d773d9..bfe37ef 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -259,6 +259,8 @@ int i = 0;
if (ARCH_X86)
ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
+ if (ARCH_AARCH64)
+ ff_hevcdsp_init_aarch64(hevcdsp, bit_depth);
if (ARCH_ARM)
ff_hevcdsp_init_arm(hevcdsp, bit_depth);
if (ARCH_MIPS)
--
2.3.2 (Apple Git-55)
From 5f139d949dd8d1a939e9b6f4836ecd81ad8298b5 Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:27:26 +0800
Subject: [PATCH 06/12] Add AArch64 NEON optimization for HEVC decoder
Declares ff_hevcdsp_init_aarch64() in hevcdsp.h.
Signed-off-by: zjh8890 <243186085 at qq.com>
---
libavcodec/hevcdsp.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 9f1f6dd..757a441 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -128,6 +128,7 @@ extern const int8_t ff_hevc_epel_filters[7][4];
extern const int8_t ff_hevc_qpel_filters[3][16];
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
+void ff_hevcdsp_init_aarch64(HEVCDSPContext *c, const int bit_depth);
void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth);
void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
#endif /* AVCODEC_HEVCDSP_H */
--
2.3.2 (Apple Git-55)
From 97d72592f61d267ceb57db0cf43f78cecb14efc1 Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Tue, 19 Jan 2016 23:25:35 +0800
Subject: [PATCH 07/12] Create hevcdsp_aarch64.h
Signed-off-by: zjh8890 <243186085 at qq.com>
---
libavcodec/aarch64/hevcdsp_aarch64.h | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)
create mode 100644 libavcodec/aarch64/hevcdsp_aarch64.h
diff --git a/libavcodec/aarch64/hevcdsp_aarch64.h b/libavcodec/aarch64/hevcdsp_aarch64.h
new file mode 100644
index 0000000..d44fdc1
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_aarch64.h
@@ -0,0 +1,29 @@
+/*
+ * ARM NEON optimised HEVC decode for aarch64
+ * Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_HEVCDSP_AARCH64_H
+#define AVCODEC_AARCH64_HEVCDSP_AARCH64_H
+
+#include "libavcodec/hevcdsp.h"
+
+void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth);
+
+#endif /* AVCODEC_AARCH64_HEVCDSP_AARCH64_H */
--
2.3.2 (Apple Git-55)
From fd5e4e9c00f3e66a99483aba4d4db371ef9d1923 Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Wed, 27 Jan 2016 22:41:03 +0800
Subject: [PATCH 08/12] Add qpel NEON optimization for HEVC decoder
Adds hevcdsp_qpel_neon.o to the aarch64 Makefile.
Signed-off-by: zjh8890 <243186085 at qq.com>
---
libavcodec/aarch64/Makefile | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 3ef75b8..2a6c95c 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -31,4 +31,5 @@ NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_init_neon.o \
- aarch64/hevcdsp_idct_neon.o
+ aarch64/hevcdsp_idct_neon.o \
+ aarch64/hevcdsp_qpel_neon.o
--
2.3.2 (Apple Git-55)
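
For reference, the qpel_filter_{1,2,3} macros in the next patch implement
the three HEVC 8-tap luma interpolation phases. A scalar sketch
(illustrative only; these names are not part of the patch):

    #include <stdint.h>

    static const int8_t qpel_taps[3][8] = {
        { -1, 4, -10, 58, 17,  -5, 1,  0 },  /* 1/4-pel phase */
        { -1, 4, -11, 40, 40, -11, 4, -1 },  /* 2/4-pel phase */
        {  0, 1,  -5, 17, 58, -10, 4, -1 },  /* 3/4-pel phase */
    };

    /* src points at x[-3]; the 8-bit-input macros keep the full
     * 16-bit sum, the 16-bit-input (_32b) macros narrow it with >> 6 */
    static int qpel_ref(const uint8_t *src, int phase)
    {
        int i, sum = 0;
        for (i = 0; i < 8; i++)
            sum += qpel_taps[phase - 1][i] * src[i];
        return sum;
    }
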
From f5ccce6494d6e1dee63560d4e927efa15f5facfe Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Wed, 27 Jan 2016 22:44:46 +0800
Subject: [PATCH 09/12] Create hevcdsp_qpel_neon.S
Signed-off-by: zjh8890 <243186085 at qq.com>
---
libavcodec/aarch64/hevcdsp_qpel_neon.S | 1418 ++++++++++++++++++++++++++++++++
1 file changed, 1418 insertions(+)
create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000..356aa55
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,1418 @@
+/*
+ * ARM NEON optimised HEVC decode for aarch64
+ * Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+#define MAX_PB_DOUBLESIZE #128
+
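+// put_pixels: copy a block of 8-bit source pixels into the 16-bit
+// intermediate buffer, scaled by << 6.  The buffer row stride is
+// MAX_PB_SIZE int16_t elements, i.e. MAX_PB_DOUBLESIZE bytes (x12).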
+.macro init_put_pixels
+ prfm PLDL1STRM, [x1]
+ prfm PLDL1STRM, [x1, x2]
+ mov x12, MAX_PB_DOUBLESIZE
+.endm
+
+function ff_hevc_put_pixels_w2_neon_8, export=1
+ init_put_pixels
+0: subs x3, x3, #2
+ ld1 {v0.H}[0], [x1], x2
+ ld1 {v0.H}[1], [x1], x2
+ ushll v0.8H, v0.8B, #6
+ st1 {v0.S}[0], [x0], x12
+ st1 {v0.S}[1], [x0], x12
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_pixels_w4_neon_8, export=1
+ init_put_pixels
+0: subs x3, x3, #2
+ ld1 {v0.S}[0], [x1], x2
+ ld1 {v0.S}[1], [x1], x2
+ ushll v0.8H, v0.8B, #6
+ st1 {v0.D}[0], [x0], x12
+ st1 {v0.D}[1], [x0], x12
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_pixels_w6_neon_8, export=1
+ init_put_pixels
+ sub x10, x2, #4
+ sub x11, x12, #8
+0: subs x3, x3, #2
+ ld1 {v0.S}[0], [x1], #4
+ ld1 {v1.H}[0], [x1], x10
+ ld1 {v0.S}[1], [x1], #4
+ ld1 {v1.H}[1], [x1], x10
+ ushll v0.8H, v0.8B, #6
+ ushll v1.8H, v1.8B, #6
+ st1 {v0.1D}, [x0], #8
+ st1 {v1.S}[0], [x0], x11
+ st1 {v0.D}[1], [x0], #8
+ st1 {v1.S}[1], [x0], x11
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_pixels_w8_neon_8, export=1
+ init_put_pixels
+0: subs x3, x3, #2
+ ld1 {v0.8B}, [x1], x2
+ ld1 {v1.8B}, [x1], x2
+ ushll v8.8H, v0.8B, #6
+ ushll v9.8H, v1.8B, #6
+ st1 {v8.8H}, [x0], x12
+ st1 {v9.8H}, [x0], x12
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_pixels_w12_neon_8, export=1
+ init_put_pixels
+ sub x10, x2, #8
+ sub x11, x12, #16
+0: subs x3, x3, #2
+ ld1 {v0.1D}, [x1], #8
+ ld1 {v2.S}[0], [x1], x10
+ ld1 {v1.1D}, [x1], #8
+ ld1 {v2.S}[1], [x1], x10
+ ushll v0.8H, v0.8B, #6
+ ushll v1.8H, v1.8B, #6
+ ushll v2.8H, v2.8B, #6
+ st1 {v0.2D}, [x0], #16
+ st1 {v2.1D}, [x0], x11
+ st1 {v1.2D}, [x0], #16
+ st1 {v2.D}[1], [x0], x11
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_pixels_w16_neon_8, export=1
+ init_put_pixels
+0: subs x3, x3, #2
+ ld1 {v0.2D}, [x1], x2
+ ld1 {v1.2D}, [x1], x2
+ ushll v8.8H, v0.8B, #6
+ ushll2 v9.8H, v0.16B, #6
+ ushll v10.8H, v1.8B, #6
+ ushll2 v11.8H, v1.16B, #6
+ st1 {v8.2D - v9.2D}, [x0], x12
+ st1 {v10.2D - v11.2D}, [x0], x12
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_pixels_w24_neon_8, export=1
+ init_put_pixels
+0: subs x3, x3, #1
+ ld1 {v0.1D - v2.1D}, [x1], x2
+ ushll v8.8H, v0.8B, #6
+ ushll v9.8H, v1.8B, #6
+ ushll v10.8H, v2.8B, #6
+ st1 {v8.2D - v10.2D}, [x0], x12
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_pixels_w32_neon_8, export=1
+ init_put_pixels
+0: subs x3, x3, #1
+ ld1 {v0.2D - v1.2D}, [x1], x2
+ ushll v8.8H, v0.8B, #6
+ ushll2 v9.8H, v0.16B, #6
+ ushll v10.8H, v1.8B, #6
+ ushll2 v11.8H, v1.16B, #6
+ st1 {v8.2D - v11.2D}, [x0], x12
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_pixels_w48_neon_8, export=1
+ init_put_pixels
+ sub x11, x12, #64
+0: subs x3, x3, #1
+ ld1 {v0.2D - v2.2D}, [x1], x2
+ ushll v8.8H, v0.8B, #6
+ ushll2 v9.8H, v0.16B, #6
+ ushll v10.8H, v1.8B, #6
+ ushll2 v11.8H, v1.16B, #6
+ ushll v12.8H, v2.8B, #6
+ ushll2 v13.8H, v2.16B, #6
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v12.2D - v13.2D}, [x0], x11
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_pixels_w64_neon_8, export=1
+ init_put_pixels
+ sub x11, x12, #64
+0: subs x3, x3, #1
+ ld1 {v0.2D - v3.2D}, [x1], x2
+ ushll v8.8H, v0.8B, #6
+ ushll2 v9.8H, v0.16B, #6
+ ushll v10.8H, v1.8B, #6
+ ushll2 v11.8H, v1.16B, #6
+ ushll v12.8H, v2.8B, #6
+ ushll2 v13.8H, v2.16B, #6
+ ushll v14.8H, v3.8B, #6
+ ushll2 v15.8H, v3.16B, #6
+ st1 {v8.2D - v11.2D}, [x0], #64
+ st1 {v12.2D - v15.2D}, [x0], x11
+ b.ne 0b
+ ret
+endfunc
+
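+// Slide the vertical 8-tap window down one row; the caller then
+// loads only the new bottom row (v23 for the 8-byte variant, v7 for
+// the 16-byte variant).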
+.macro regshuffle_d8
+ mov v16.8B, v17.8B
+ mov v17.8B, v18.8B
+ mov v18.8B, v19.8B
+ mov v19.8B, v20.8B
+ mov v20.8B, v21.8B
+ mov v21.8B, v22.8B
+ mov v22.8B, v23.8B
+.endm
+
+.macro regshuffle_v8
+ mov v0.16B, v1.16B
+ mov v1.16B, v2.16B
+ mov v2.16B, v3.16B
+ mov v3.16B, v4.16B
+ mov v4.16B, v5.16B
+ mov v5.16B, v6.16B
+ mov v6.16B, v7.16B
+.endm
+
+.macro vextin8
+ prfm PLDL1STRM, [x2]
+ ld1 {v22.1D - v23.1D}, [x2], x3
+ ext v16.8B, v22.8B, v23.8B, #1
+ ext v17.8B, v22.8B, v23.8B, #2
+ ext v18.8B, v22.8B, v23.8B, #3
+ ext v19.8B, v22.8B, v23.8B, #4
+ ext v20.8B, v22.8B, v23.8B, #5
+ ext v21.8B, v22.8B, v23.8B, #6
+ ext v22.8B, v22.8B, v23.8B, #7
+.endm
+
+.macro vextin8_4
+ prfm PLDL1STRM, [x2]
+ ld1 {v22.1D - v23.1D}, [x2], x3
+ ld1 {v24.1D - v25.1D}, [x2], x3
+ ext v16.8B, v22.8B, v23.8B, #1
+ ext v17.8B, v22.8B, v23.8B, #2
+ ext v18.8B, v22.8B, v23.8B, #3
+ ext v19.8B, v22.8B, v23.8B, #4
+ ext v20.8B, v22.8B, v23.8B, #5
+ ext v21.8B, v22.8B, v23.8B, #6
+ ext v22.8B, v22.8B, v23.8B, #7
+ ext v26.8B, v24.8B, v25.8B, #1
+ ext v27.8B, v24.8B, v25.8B, #2
+ ext v28.8B, v24.8B, v25.8B, #3
+ ext v29.8B, v24.8B, v25.8B, #4
+ ext v30.8B, v24.8B, v25.8B, #5
+ ext v31.8B, v24.8B, v25.8B, #6
+ ext v24.8B, v24.8B, v25.8B, #7
+ trn1 v16.4S, v16.4S, v26.4S
+ trn1 v17.4S, v17.4S, v27.4S
+ trn1 v18.4S, v18.4S, v28.4S
+ trn1 v19.4S, v19.4S, v29.4S
+ trn1 v20.4S, v20.4S, v30.4S
+ trn1 v21.4S, v21.4S, v31.4S
+ trn1 v22.4S, v22.4S, v24.4S
+ trn1 v23.4S, v23.4S, v25.4S
+.endm
+
+.macro loadin8
+ prfm PLDL1STRM, [x2]
+ ld1 {v16.1D}, [x2], x3
+ prfm PLDL1STRM, [x2]
+ ld1 {v17.1D}, [x2], x3
+ prfm PLDL1STRM, [x2]
+ ld1 {v18.1D}, [x2], x3
+ prfm PLDL1STRM, [x2]
+ ld1 {v19.1D}, [x2], x3
+ prfm PLDL1STRM, [x2]
+ ld1 {v20.1D}, [x2], x3
+ prfm PLDL1STRM, [x2]
+ ld1 {v21.1D}, [x2], x3
+ prfm PLDL1STRM, [x2]
+ ld1 {v22.1D}, [x2], x3
+ prfm PLDL1STRM, [x2]
+ ld1 {v23.1D}, [x2], x3
+.endm
+
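+// Three phases of the HEVC luma qpel filter (taps: phase 1 =
+// {-1,4,-10,58,17,-5,1,0}, phase 2 = {-1,4,-11,40,40,-11,4,-1},
+// phase 3 = {0,1,-5,17,58,-10,4,-1}).  The _32b variants below take
+// 16-bit intermediates in v0-v7 and narrow the sum with >> 6; the
+// 8-bit variants take bytes in v16-v23 and keep the full 16-bit sum.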
+.macro qpel_filter_1_32b
+ movi v16.8H, #58
+ movi v17.8H, #10
+ smull v9.4S, v3.4H, v16.4H
+ smull2 v10.4S, v3.8H, v16.8H
+ movi v16.8H, #17
+ smull v11.4S, v2.4H, v17.4H
+ smull2 v12.4S, v2.8H, v17.8H
+ movi v17.8H, #5
+ smull v13.4S, v4.4H, v16.4H
+ smull2 v14.4S, v4.8H, v16.8H
+ smull v15.4S, v5.4H, v17.4H
+ smull2 v8.4S, v5.8H, v17.8H
+ sub v9.4S, v9.4S, v11.4S
+ sub v10.4S, v10.4S, v12.4S
+ sshll v11.4S, v1.4H, #2
+ sshll2 v12.4S, v1.8H, #2
+ add v9.4S, v9.4S, v13.4S
+ add v10.4S, v10.4S, v14.4S
+ ssubl v13.4S, v6.4H, v0.4H
+ ssubl2 v14.4S, v6.8H, v0.8H
+ add v9.4S, v9.4S, v11.4S
+ add v10.4S, v10.4S, v12.4S
+ sub v13.4S, v13.4S, v15.4S
+ sub v14.4S, v14.4S, v8.4S
+ add v9.4S, v9.4S, v13.4S
+ add v10.4S, v10.4S, v14.4S
+ sqshrn v8.4H, v9.4S, #6
+ sqshrn2 v8.8H, v10.4S, #6
+.endm
+
+.macro qpel_filter_2_32b
+ movi v8.4S, #11
+ saddl v9.4S, v3.4H, v4.4H
+ saddl2 v10.4S, v3.8H, v4.8H
+ saddl v11.4S, v2.4H, v5.4H
+ saddl2 v12.4S, v2.8H, v5.8H
+ mul v11.4S, v11.4S, v8.4S
+ mul v12.4S, v12.4S, v8.4S
+ movi v8.4S, #40
+ saddl v15.4S, v1.4H, v6.4H
+ mul v9.4S, v9.4S, v8.4S
+ mul v10.4S, v10.4S, v8.4S
+ saddl2 v8.4S, v1.8H, v6.8H
+ saddl v13.4S, v0.4H, v7.4H
+ saddl2 v14.4S, v0.8H, v7.8H
+ shl v15.4S, v15.4S, #2
+ shl v8.4S, v8.4S, #2
+ add v11.4S, v11.4S, v13.4S
+ add v12.4S, v12.4S, v14.4S
+ add v9.4S, v9.4S, v15.4S
+ add v10.4S, v10.4S, v8.4S
+ sub v9.4S, v9.4S, v11.4S
+ sub v10.4S, v10.4S, v12.4S
+ sqshrn v8.4H, v9.4S, #6
+ sqshrn2 v8.8H, v10.4S, #6
+.endm
+
+.macro qpel_filter_3_32b
+ movi v16.8H, #58
+ movi v17.8H, #10
+ smull v9.4S, v4.4H, v16.4H
+ smull2 v10.4S, v4.8H, v16.8H
+ movi v16.8H, #17
+ smull v11.4S, v5.4H, v17.4H
+ smull2 v12.4S, v5.8H, v17.8H
+ movi v17.8H, #5
+ smull v13.4S, v3.4H, v16.4H
+ smull2 v14.4S, v3.8H, v16.8H
+ smull v15.4S, v2.4H, v17.4H
+ smull2 v8.4S, v2.8H, v17.8H
+ sub v9.4S, v9.4S, v11.4S
+ sub v10.4S, v10.4S, v12.4S
+ sshll v11.4S, v6.4H, #2
+ sshll2 v12.4S, v6.8H, #2
+ add v9.4S, v9.4S, v13.4S
+ add v10.4S, v10.4S, v14.4S
+ ssubl v13.4S, v1.4H, v7.4H
+ ssubl2 v14.4S, v1.8H, v7.8H
+ add v9.4S, v9.4S, v11.4S
+ add v10.4S, v10.4S, v12.4S
+ sub v13.4S, v13.4S, v15.4S
+ sub v14.4S, v14.4S, v8.4S
+ add v9.4S, v9.4S, v13.4S
+ add v10.4S, v10.4S, v14.4S
+ sqshrn v8.4H, v9.4S, #6
+ sqshrn2 v8.8H, v10.4S, #6
+.endm
+
+.macro qpel_filter_1 out=v7
+ movi v24.8B, #58
+ movi v25.8B, #10
+ ushll v13.8H, v20.8B, #4
+ ushll v14.8H, v21.8B, #2
+ umull \out\().8H, v19.8B, v24.8B
+ uaddw v13.8H, v13.8H, v20.8B
+ umull v15.8H, v18.8B, v25.8B
+ uaddw v14.8H, v14.8H, v21.8B
+ usubl v12.8H, v22.8B, v16.8B
+ add \out\().8H, \out\().8H, v13.8H
+ ushll v13.8H, v17.8B, #2
+ add v15.8H, v15.8H, v14.8H
+ add v13.8H, v13.8H, v12.8H
+ sub \out\().8H, \out\().8H, v15.8H
+ add \out\().8H, \out\().8H, v13.8H
+.endm
+
+.macro qpel_filter_2 out=v7
+ movi v12.8H, #10
+ movi v14.8H, #11
+ uaddl v13.8H, v19.8B, v20.8B
+ uaddl v15.8H, v18.8B, v21.8B
+ mul v13.8H, v13.8H, v12.8H
+ mul v15.8H, v15.8H, v14.8H
+ uaddl \out\().8H, v17.8B, v22.8B
+ uaddl v12.8H, v16.8B, v23.8B
+ add \out\().8H, \out\().8H, v13.8H
+ add v12.8H, v12.8H, v15.8H
+ shl \out\().8H, \out\().8H, #2
+ sub \out\().8H, \out\().8H, v12.8H
+.endm
+
+.macro qpel_filter_3 out=v7
+ movi v24.8B, #58
+ movi v25.8B, #10
+ ushll v13.8H, v19.8B, #4
+ ushll v14.8H, v18.8B, #2
+ umull \out\().8H, v20.8B, v24.8B
+ uaddw v13.8H, v13.8H, v19.8B
+ umull v15.8H, v21.8B, v25.8B
+ uaddw v14.8H, v14.8H, v18.8B
+ usubl v12.8H, v17.8B, v23.8B
+ add \out\().8H, \out\().8H, v13.8H
+ ushll v13.8H, v22.8B, #2
+ add v15.8H, v15.8H, v14.8H
+ add v13.8H, v13.8H, v12.8H
+ sub \out\().8H, \out\().8H, v15.8H
+ add \out\().8H, \out\().8H, v13.8H
+.endm
+
+.macro hevc_put_qpel_vX_neon_8 filter
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, x4
+ mov x6, x0
+ mov x7, x2
+ lsl x1, x1, #1
+0: loadin8
+ cmp x5, #4
+ b.eq 4f
+8: subs x4, x4, #1
+ \filter
+ st1 {v7.2D}, [x0], x1
+ regshuffle_d8
+ ld1 {v23.1D}, [x2], x3
+ b.ne 8b
+ subs x5, x5, #8
+ b.eq 99f
+ mov x4, x12
+ add x6, x6, #16
+ mov x0, x6
+ add x7, x7, #8
+ mov x2, x7
+ b 0b
+4: subs x4, x4, #1
+ \filter
+ st1 {v7.1D}, [x0], x1
+ regshuffle_d8
+ ld1 {v23.S}[0], [x2], x3
+ b.ne 4b
+99: ret
+.endm
+
+.macro hevc_put_qpel_uw_vX_neon_8 filter
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, x5
+ mov x13, x0
+ mov x14, x2
+ cmp x6, #0
+ b.ne .Lbi\@
+0: loadin8
+ cmp x4, #4
+ b.eq 4f
+8: subs x5, x5, #1
+ \filter
+ sqrshrun v0.8B, v7.8H, #6
+ st1 {v0.1D}, [x0], x1
+ regshuffle_d8
+ ld1 {v23.1D}, [x2], x3
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x13, x13, #8
+ add x14, x14, #8
+ mov x5, x12
+ mov x0, x13
+ mov x2, x14
+ b 0b
+4: subs x5, x5, #1
+ \filter
+ sqrshrun v0.8B, v7.8H, #6
+ st1 {v0.S}[0], [x0], x1
+ regshuffle_d8
+ ld1 {v23.S}[0], [x2], x3
+ b.ne 4b
+ ret
+.Lbi\@:
+ lsl x7, x7, #1
+ mov x15, x6
+0: loadin8
+ cmp x4, #4
+ b.eq 4f
+8: subs x5, x5, #1
+ \filter
+ ld1 {v0.2D}, [x6], x7
+ sqadd v0.8H, v0.8H, v7.8H
+ sqrshrun v0.8B, v0.8H, #7
+ st1 {v0.1D}, [x0], x1
+ regshuffle_d8
+ ld1 {v23.1D}, [x2], x3
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x13, x13, #8
+ add x15, x15, #16
+ add x14, x14, #8
+ mov x5, x12
+ mov x0, x13
+ mov x6, x15
+ mov x2, x14
+ b 0b
+4: subs x5, x5, #1
+ \filter
+ ld1 {v0.1D}, [x6], x7
+ sqadd v0.4H, v0.4H, v7.4H
+ sqrshrun v0.8B, v0.8H, #7
+ st1 {v0.S}[0], [x0], x1
+ regshuffle_d8
+ ld1 {v23.S}[0], [x2], x3
+ b.ne 4b
+99: ret
+.endm
+
+.macro hevc_put_qpel_uw_weight_vX_neon_8 filter
+ // AAPCS64 stack arguments occupy 8-byte slots:
+ // wx0 @ [sp], wx1 @ [sp, #8], ox0 @ [sp, #16], ox1 @ [sp, #24]
+ ldr w8, [sp]
+ ldr w9, [sp, #8]
+ ldr w10, [sp, #16]
+ ldr w11, [sp, #24]
+ mov w12, #7
+ sub w12, w12, w7
+ lsl w8, w8, w12
+ lsl w9, w9, w12
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ dup v0.8H, w10
+ dup v1.8H, w8
+ mov x12, x0
+ mov x13, x2
+ mov x14, x5
+ cmp x6, #0
+ b.ne .Lbi\@
+
+0: loadin8
+ cmp x4, #4
+ b.eq 4f
+8: subs x5, x5, #1
+ \filter
+ smull v14.4S, v1.4H, v7.4H
+ smull2 v15.4S, v1.8H, v7.8H
+ rshrn v8.4H, v14.4S, #13
+ rshrn2 v8.8H, v15.4S, #13
+ sqadd v8.8H, v8.8H, v0.8H
+ sqxtun v8.8B, v8.8H
+ st1 {v8.1D}, [x0], x1
+ prfm PLDL1STRM, [x2]
+ regshuffle_d8
+ ld1 {v23.1D}, [x2], x3
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x12, x12, #8
+ add x13, x13, #8
+ mov x0, x12
+ mov x2, x13
+ mov x5, x14
+ b 0b
+4: subs x5, x5, #1
+ \filter
+ smull v14.4S, v1.4H, v7.4H
+ rshrn v8.4H, v14.4S, #13
+ sqadd v8.8H, v8.8H, v0.8H
+ sqxtun v8.8B, v8.8H
+ st1 {v8.S}[0], [x0], x1
+ prfm PLDL1STRM, [x2]
+ regshuffle_d8
+ ld1 {v23.S}[0], [x2], x3
+ b.ne 4b
+ ret
+.Lbi\@:
+ add w10, w10, w11
+ add w10, w10, #1
+ lsl w10, w10, #13
+ dup v0.4S, w10
+ dup v2.8H, w9
+ mov x7, MAX_PB_DOUBLESIZE
+ mov x11, x6
+0: loadin8
+ cmp x4, #4
+ b.eq 4f
+8: subs x5, x5, #1
+ \filter
+ ld1 {v4.2D}, [x6], x7
+ smull v14.4S, v2.4H, v7.4H
+ smull2 v15.4S, v2.8H, v7.8H
+ smull v12.4S, v1.4H, v4.4H
+ smull2 v13.4S, v1.8H, v4.8H
+ add v14.4S, v14.4S, v12.4S
+ add v15.4S, v15.4S, v13.4S
+ add v14.4S, v14.4S, v0.4S
+ add v15.4S, v15.4S, v0.4S
+ shrn v8.4H, v14.4S, #14
+ shrn2 v8.8H, v15.4S, #14 // high half comes from v15, not v14
+ sqxtun v8.8B, v8.8H
+ st1 {v8.1D}, [x0], x1
+ prfm PLDL1STRM, [x2]
+ regshuffle_d8
+ ld1 {v23.1D}, [x2], x3
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x11, x11, #16
+ add x12, x12, #8
+ add x13, x13, #8
+ mov x6, x11
+ mov x0, x12
+ mov x2, x13
+ mov x5, x14
+ b 0b
+4: subs x5, x5, #1
+ \filter
+ ld1 {v4.1D}, [x6], x7
+ smull v14.4S, v2.4H, v7.4H
+ smull v12.4S, v1.4H, v4.4H
+ add v14.4S, v14.4S, v12.4S
+ add v14.4S, v14.4S, v0.4S
+ shrn v8.4H, v14.4S, #14
+ sqxtun v8.8B, v8.8H
+ st1 {v8.S}[0], [x0], x1
+ prfm PLDL1STRM, [x2]
+ regshuffle_d8
+ ld1 {v23.S}[0], [x2], x3
+ b.ne 4b
+99: ret
+.endm
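+
+/*
+ * Weighted prediction, 8-bit case, with the weights pre-scaled by
+ * (7 - denom). The unidirectional path computes
+ *   dst = clip8(((filt * wx0 + (1 << (denom + 5))) >> (denom + 6)) + ox0)
+ * via rshrn #13; the bidirectional path computes
+ *   dst = clip8((filt * wx1 + src2 * wx0
+ *                + ((ox0 + ox1 + 1) << (denom + 6))) >> (denom + 7))
+ * via shrn #14, the rounding term being already folded into the
+ * additive offset, hence the non-rounding narrow in the bi paths.
+ */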
+
+function ff_hevc_put_qpel_v1_neon_8, export=1
+ hevc_put_qpel_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_v2_neon_8, export=1
+ hevc_put_qpel_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_v3_neon_8, export=1
+ hevc_put_qpel_vX_neon_8 qpel_filter_3
+endfunc
+
+function ff_hevc_put_qpel_uw_v1_neon_8, export=1
+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_v2_neon_8, export=1
+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_v3_neon_8, export=1
+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_3
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_v1_neon_8, export=1
+ hevc_put_qpel_uw_weight_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_v2_neon_8, export=1
+ hevc_put_qpel_uw_weight_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_v3_neon_8, export=1
+ hevc_put_qpel_uw_weight_vX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hX_neon_8 filter
+ sub x2, x2, #4
+ lsl x1, x1, #1
+ mov x12, x4
+ mov x6, x0
+ mov x7, x2
+ cmp x5, #4
+ b.eq 4f
+8: subs x4, x4, #1
+ vextin8
+ \filter
+ st1 {v7.2D}, [x0], x1
+ b.ne 8b
+ subs x5, x5, #8
+ b.eq 99f
+ mov x4, x12
+ add x6, x6, #16
+ mov x0, x6
+ add x7, x7, #8
+ mov x2, x7
+ cmp x5, #4
+ b.ne 8b
+4: subs x4, x4, #2
+ vextin8_4
+ \filter
+ st1 {v7.D}[0], [x0], x1
+ st1 {v7.D}[1], [x0], x1
+ b.ne 4b
+99: ret
+.endm
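+
+/*
+ * Horizontal filtering: vextin8 (defined with the other helpers earlier
+ * in this file) loads a run of source bytes and presents the eight
+ * shifted 8-tap windows to \filter. The 4-wide tail path processes two
+ * rows per iteration so a full 8-lane vector is still filtered.
+ */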
+
+.macro hevc_put_qpel_uw_hX_neon_8 filter
+ sub x2, x2, #4
+ mov x12, x5
+ mov x13, x0
+ mov x14, x2
+ cmp x6, #0
+ b.ne .Lbi\@
+ cmp x4, #4
+ b.eq 4f
+8: subs x5, x5, #1
+ vextin8
+ \filter
+ sqrshrun v0.8B, v7.8H, #6
+ st1 {v0.1D}, [x0], x1
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x13, x13, #8
+ add x14, x14, #8
+ mov x5, x12
+ mov x0, x13
+ mov x2, x14
+ cmp x4, #4
+ b.ne 8b
+4: subs x5, x5, #2
+ vextin8_4
+ \filter
+ sqrshrun v0.8B, v7.8H, #6
+ st1 {v0.S}[0], [x0], x1
+ st1 {v0.S}[1], [x0], x1
+ b.ne 4b
+ ret
+.Lbi\@:
+ lsl x7, x7, #1
+ cmp x4, #4
+ b.eq 4f
+ mov x15, x6
+8: subs x5, x5, #1
+ vextin8
+ \filter
+ ld1 {v0.2D}, [x6], x7
+ sqadd v0.8H, v0.8H, v7.8H
+ sqrshrun v0.8B, v0.8H, #7
+ st1 {v0.1D}, [x0], x1
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x15, x15, #16
+ add x13, x13, #8
+ add x14, x14, #8
+ mov x5, x12
+ mov x6, x15
+ mov x2, x14
+ mov x0, x13
+ cmp x4, #4
+ b.ne 8b
+4: subs x5, x5, #2
+ vextin8_4
+ \filter
+ ld1 {v0.1D}, [x6], x7
+ ld1 {v0.D}[1], [x6], x7
+ sqadd v0.8H, v0.8H, v7.8H
+ sqrshrun v0.8B, v0.8H, #7
+ st1 {v0.S}[0], [x0], x1
+ st1 {v0.S}[1], [x0], x1
+ b.ne 4b
+99: ret
+.endm
+
+.macro hevc_put_qpel_uw_weight_hX_neon_8 filter
+ // AAPCS64 stack arguments occupy 8-byte slots:
+ // wx0 @ [sp], wx1 @ [sp, #8], ox0 @ [sp, #16], ox1 @ [sp, #24]
+ ldr w8, [sp]
+ ldr w9, [sp, #8]
+ ldr w10, [sp, #16]
+ ldr w11, [sp, #24]
+ mov w12, #7
+ sub w12, w12, w7
+ sub x2, x2, #4
+ lsl w8, w8, w12
+ lsl w9, w9, w12
+ dup v0.8H, w10
+ dup v1.8H, w8
+ mov x12, x0
+ mov x13, x2
+ mov x14, x5
+ cmp x6, #0
+ b.ne .Lbi\@
+ cmp x4, #4
+ b.eq 4f
+8: subs x5, x5, #1
+ vextin8
+ \filter
+ smull v14.4S, v1.4H, v7.4H
+ smull2 v15.4S, v1.8H, v7.8H
+ rshrn v8.4H, v14.4S, #13
+ rshrn2 v8.8H, v15.4S, #13
+ sqadd v8.8H, v8.8H, v0.8H
+ sqxtun v8.8B, v8.8H
+ st1 {v8.1D}, [x0], x1
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x12, x12, #8
+ add x13, x13, #8
+ mov x0, x12
+ mov x2, x13
+ mov x5, x14
+ cmp x4, #4
+ b.ne 8b
+4: subs x5, x5, #2
+ vextin8_4
+ \filter
+ smull v14.4S, v1.4H, v7.4H
+ smull2 v15.4S, v1.8H, v7.8H
+ rshrn v8.4H, v14.4S, #13
+ rshrn2 v8.8H, v15.4S, #13
+ sqadd v8.8H, v8.8H, v0.8H
+ sqxtun v8.8B, v8.8H
+ st1 {v8.S}[0], [x0], x1
+ st1 {v8.S}[1], [x0], x1
+ b.ne 4b
+ ret
+.Lbi\@:
+ add w10, w10, w11
+ add w10, w10, #1
+ lsl w10, w10, #13
+ dup v0.4S, w10
+ dup v1.8H, w9
+ dup v2.8H, w8
+ mov x7, MAX_PB_DOUBLESIZE
+ cmp x4, #4
+ b.eq 4f
+ mov x11, x6
+8: subs x5, x5, #1
+ vextin8
+ \filter
+ ld1 {v4.2D}, [x6], x7
+ smull v14.4S, v1.4H, v7.4H
+ smull2 v15.4S, v1.8H, v7.8H
+ smull v12.4S, v2.4H, v4.4H
+ smull2 v13.4S, v2.8H, v4.8H
+ add v14.4S, v14.4S, v12.4S
+ add v15.4S, v15.4S, v13.4S
+ add v14.4S, v14.4S, v0.4S
+ add v15.4S, v15.4S, v0.4S
+ shrn v8.4H, v14.4S, #14
+ shrn2 v8.8H, v15.4S, #14 // high half comes from v15, not v14
+ sqxtun v8.8B, v8.8H
+ st1 {v8.1D}, [x0], x1
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x11, x11, #16
+ add x12, x12, #8
+ add x13, x13, #8
+ mov x6, x11
+ mov x0, x12
+ mov x2, x13
+ mov x5, x14
+ cmp x4, #4
+ b.ne 8b
+4: subs x5, x5, #2
+ vextin8_4
+ \filter
+ ld1 {v4.D}[0], [x6], x7
+ ld1 {v4.D}[1], [x6], x7
+ smull v14.4S, v1.4H, v7.4H
+ smull2 v15.4S, v1.8H, v7.8H
+ smull v12.4S, v2.4H, v4.4H
+ smull2 v13.4S, v2.8H, v4.8H
+ add v14.4S, v14.4S, v12.4S
+ add v15.4S, v15.4S, v13.4S
+ add v14.4S, v14.4S, v0.4S // add the rounding offset, as in the 8-wide loop
+ add v15.4S, v15.4S, v0.4S
+ shrn v8.4H, v14.4S, #14
+ shrn2 v8.8H, v15.4S, #14
+ sqxtun v8.8B, v8.8H
+ st1 {v8.S}[0], [x0], x1
+ st1 {v8.S}[1], [x0], x1
+ b.ne 4b
+99: ret
+.endm
+
+function ff_hevc_put_qpel_h1_neon_8, export=1
+ hevc_put_qpel_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_h2_neon_8, export=1
+ hevc_put_qpel_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_h3_neon_8, export=1
+ hevc_put_qpel_hX_neon_8 qpel_filter_3
+endfunc
+
+function ff_hevc_put_qpel_uw_h1_neon_8, export=1
+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_h2_neon_8, export=1
+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_h3_neon_8, export=1
+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_3
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h1_neon_8, export=1
+ hevc_put_qpel_uw_weight_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h2_neon_8, export=1
+ hevc_put_qpel_uw_weight_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h3_neon_8, export=1
+ hevc_put_qpel_uw_weight_hX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hXvY_neon_8 filterh filterv
+ sub x2, x2, #4
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ lsl x1, x1, #1
+ mov x12, x4
+ mov x6, x0
+ mov x7, x2
+0: vextin8
+ \filterh v0
+ vextin8
+ \filterh v1
+ vextin8
+ \filterh v2
+ vextin8
+ \filterh v3
+ vextin8
+ \filterh v4
+ vextin8
+ \filterh v5
+ vextin8
+ \filterh v6
+ vextin8
+ \filterh v7
+ cmp x5, #4
+ b.eq 4f
+8: subs x4, x4, #1
+ \filterv
+ st1 {v8.2D}, [x0], x1
+ regshuffle_v8
+ vextin8
+ \filterh v7
+ b.ne 8b
+ subs x5, x5, #8
+ b.eq 99f
+ mov x4, x12
+ add x6, x6, #16
+ mov x0, x6
+ add x7, x7, #8
+ mov x2, x7
+ b 0b
+4: subs x4, x4, #1
+ \filterv
+ st1 {v8.1D}, [x0], x1
+ regshuffle_v8
+ vextin8
+ \filterh v7
+ b.ne 4b
+99: ret
+.endm
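+
+/*
+ * Combined h+v filtering is separable: eight source rows are first
+ * filtered horizontally into the 16-bit rows v0-v7, then \filterv (one
+ * of the *_32b macros) filters vertically across those registers. After
+ * each output row, regshuffle_v8 rotates the pipeline and one freshly
+ * filtered row replaces v7.
+ */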
+
+.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv
+ sub x2, x2, #4
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, x5
+ mov x13, x0
+ mov x14, x2
+ cmp x6, #0
+ b.ne .Lbi\@
+0: vextin8
+ \filterh v0
+ vextin8
+ \filterh v1
+ vextin8
+ \filterh v2
+ vextin8
+ \filterh v3
+ vextin8
+ \filterh v4
+ vextin8
+ \filterh v5
+ vextin8
+ \filterh v6
+ vextin8
+ \filterh v7
+ cmp x4, #4
+ b.eq 4f
+8: subs x5, x5, #1
+ \filterv
+ sqrshrun v0.8B, v8.8H, #6
+ st1 {v0.1D}, [x0], x1
+ regshuffle_v8
+ vextin8
+ \filterh v7
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x13, x13, #8
+ add x14, x14, #8
+ mov x5, x12
+ mov x0, x13
+ mov x2, x14
+ b 0b
+4: subs x5, x5, #1
+ \filterv
+ sqrshrun v0.8B, v8.8H, #6
+ st1 {v0.S}[0], [x0], x1
+ regshuffle_v8
+ vextin8
+ \filterh v7
+ b.ne 4b
+ ret
+.Lbi\@:
+ lsl x7, x7, #1
+ mov x15, x6
+0: vextin8
+ \filterh v0
+ vextin8
+ \filterh v1
+ vextin8
+ \filterh v2
+ vextin8
+ \filterh v3
+ vextin8
+ \filterh v4
+ vextin8
+ \filterh v5
+ vextin8
+ \filterh v6
+ vextin8
+ \filterh v7
+ cmp x4, #4
+ b.eq 4f
+8: subs x5, x5, #1
+ \filterv
+ ld1 {v0.2D}, [x6], x7
+ sqadd v0.8H, v0.8H, v8.8H
+ sqrshrun v0.8B, v0.8H, #7
+ st1 {v0.1D}, [x0], x1
+ regshuffle_v8
+ vextin8
+ \filterh v7
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x13, x13, #8
+ add x14, x14, #8
+ add x15, x15, #16
+ mov x5, x12
+ mov x0, x13
+ mov x2, x14
+ mov x6, x15
+ b 0b
+4: subs x5, x5, #1
+ \filterv
+ ld1 {v0.1D}, [x6], x7
+ sqadd v0.4H, v0.4H, v8.4H
+ sqrshrun v0.8B, v0.8H, #7
+ st1 {v0.S}[0], [x0], x1
+ regshuffle_v8
+ vextin8
+ \filterh v7
+ b.ne 4b
+99: ret
+.endm
+
+.macro hevc_put_qpel_uw_weight_hXvY_neon_8 filterh filterv
+ // AAPCS64 stack arguments occupy 8-byte slots:
+ // wx0 @ [sp], wx1 @ [sp, #8], ox0 @ [sp, #16], ox1 @ [sp, #24]
+ ldr w8, [sp]
+ ldr w9, [sp, #8]
+ ldr w10, [sp, #16]
+ ldr w11, [sp, #24]
+ mov w12, #7
+ sub w12, w12, w7
+ lsl w8, w8, w12
+ lsl w9, w9, w12
+ dup v28.8H, w10
+ dup v29.8H, w8
+ sub x2, x2, #4
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, x0
+ mov x13, x2
+ mov x14, x5
+ cmp x6, #0
+ b.ne .Lbi\@
+0: vextin8
+ \filterh v0
+ vextin8
+ \filterh v1
+ vextin8
+ \filterh v2
+ vextin8
+ \filterh v3
+ vextin8
+ \filterh v4
+ vextin8
+ \filterh v5
+ vextin8
+ \filterh v6
+ vextin8
+ \filterh v7
+ cmp x4, #4
+ b.eq 4f
+8: subs x5, x5, #1
+ \filterv
+ smull v14.4S, v29.4H, v8.4H
+ smull2 v15.4S, v29.8H, v8.8H
+ rshrn v8.4H, v14.4S, #13
+ rshrn2 v8.8H, v15.4S, #13
+ sqadd v8.8H, v8.8H, v28.8H
+ sqxtun v8.8B, v8.8H
+ st1 {v8.1D}, [x0], x1
+ regshuffle_v8
+ vextin8
+ \filterh v7
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x12, x12, #8
+ add x13, x13, #8
+ mov x0, x12
+ mov x2, x13
+ mov x5, x14
+ b 0b
+4: subs x5, x5, #1
+ \filterv
+ smull v14.4S, v29.4H, v8.4H
+ rshrn v8.4H, v14.4S, #13
+ sqadd v8.8H, v8.8H, v28.8H
+ sqxtun v8.8B, v8.8H
+ st1 {v8.S}[0], [x0], x1
+ regshuffle_v8
+ vextin8
+ \filterh v7
+ b.ne 4b
+ ret
+.Lbi\@:
+ add w10, w10, w11
+ add w10, w10, #1
+ lsl w10, w10, #13
+ dup v28.4S, w10
+ dup v30.8H, w9
+ mov x7, MAX_PB_DOUBLESIZE
+ mov x11, x6
+0: vextin8
+ \filterh v0
+ vextin8
+ \filterh v1
+ vextin8
+ \filterh v2
+ vextin8
+ \filterh v3
+ vextin8
+ \filterh v4
+ vextin8
+ \filterh v5
+ vextin8
+ \filterh v6
+ vextin8
+ \filterh v7
+ cmp x4, #4
+ b.eq 4f
+8: subs x5, x5, #1
+ \filterv
+ ld1 {v0.2D}, [x6], x7
+ smull v14.4S, v30.4H, v8.4H
+ smull2 v15.4S, v30.8H, v8.8H
+ smull v12.4S, v29.4H, v0.4H
+ smull2 v13.4S, v29.8H, v0.8H
+ add v14.4S, v14.4S, v12.4S
+ add v15.4S, v15.4S, v13.4S
+ add v14.4S, v14.4S, v28.4S
+ add v15.4S, v15.4S, v28.4S
+ shrn v8.4H, v14.4S, #14 // rounding is already included in v28
+ shrn2 v8.8H, v15.4S, #14
+ sqxtun v8.8B, v8.8H
+ st1 {v8.1D}, [x0], x1
+ regshuffle_v8
+ vextin8
+ \filterh v7
+ b.ne 8b
+ subs x4, x4, #8
+ b.eq 99f
+ add x11, x11, #16
+ add x12, x12, #8
+ add x13, x13, #8
+ mov x6, x11
+ mov x0, x12
+ mov x2, x13
+ mov x5, x14
+ b 0b
+4: subs x5, x5, #1
+ \filterv
+ ld1 {v0.1D}, [x6], x7
+ smull v14.4S, v30.4H, v8.4H
+ smull v12.4S, v29.4H, v0.4H
+ add v14.4S, v14.4S, v12.4S
+ add v14.4S, v14.4S, v28.4S
+ shrn v8.4H, v14.4S, #14 // rounding is already included in v28
+ sqxtun v8.8B, v8.8H
+ st1 {v8.S}[0], [x0], x1
+ regshuffle_v8
+ vextin8
+ \filterh v7
+ b.ne 4b
+99: ret
+.endm
+
+function ff_hevc_put_qpel_h1v1_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v1_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v1_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v2_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v2_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v2_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v3_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v3_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v3_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h1v1_neon_8, export=1
+ hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h2v1_neon_8, export=1
+ hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h3v1_neon_8, export=1
+ hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h1v2_neon_8, export=1
+ hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h2v2_neon_8, export=1
+ hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h3v2_neon_8, export=1
+ hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h1v3_neon_8, export=1
+ hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h2v3_neon_8, export=1
+ hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h3v3_neon_8, export=1
+ hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uni_w_neon_8, export=1
+ ldr w10, [sp, #16]
+ mov w11, #7
+ sub w5, w11, w5
+ lsl w6, w6, w5
+ dup v12.16B, w6
+ dup v14.8H, w7
+ mov x12, x4
+ mov x13, x0
+ mov x14, x2
+ cmp w10, #4
+ b.eq 4f
+8: subs x4, x4, #1
+ ld1 {v0.1D}, [x2], x3
+ umull v8.8H, v0.8B, v12.8B
+ urshr v8.8H, v8.8H, #7
+ usqadd v8.8H, v14.8H
+ uqxtn v0.8B, v8.8H
+ st1 {v0.1D}, [x0], x1
+ b.ne 8b
+ subs w10, w10, #8
+ b.eq 99f
+ add x13, x13, #8
+ add x14, x14, #8
+ mov x4, x12
+ mov x0, x13
+ mov x2, x14
+ cmp w10, #4
+ b.ne 8b
+4: subs x4, x4, #2
+ ld1 {v0.S}[0], [x2], x3
+ ld1 {v0.S}[1], [x2], x3
+ umull v8.8H, v0.8B, v12.8B
+ urshr v8.8H, v8.8H, #7
+ usqadd v8.8H, v14.8H
+ uqxtn v0.8B, v8.8H
+ st1 {v0.S}[0], [x0], x1
+ st1 {v0.S}[1], [x0], x1
+ b.ne 4b
+99: ret
+endfunc
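+
+/*
+ * Note: the function above pre-scales the weight to wx << (7 - denom)
+ * and duplicates it as unsigned bytes, so it assumes that value fits in
+ * 8 bits and that the weight is non-negative. In widened form it
+ * computes
+ *   dst = clip8(((src * wx + (1 << (denom - 1))) >> denom) + ox)
+ */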
+
+function ff_hevc_put_qpel_bi_w_neon_8, export=1
+ // AAPCS64 stack arguments occupy 8-byte slots:
+ // wx1 @ [sp], ox0 @ [sp, #8], ox1 @ [sp, #16], width @ [sp, #40]
+ ldr w8, [sp]
+ ldr w9, [sp, #8]
+ ldr w10, [sp, #16]
+ mov w11, #7
+ sub w11, w11, w6
+ lsl w7, w7, w11
+ lsl w8, w8, w11
+ ldr w6, [sp, #40]
+ add w11, w9, w10
+ add w11, w11, #1
+ lsl w11, w11, #13
+ dup v12.8H, w7
+ dup v13.8H, w8
+ dup v14.4S, w11
+ mov x7, MAX_PB_DOUBLESIZE
+ mov x10, x4
+ mov x11, x0
+ mov x12, x5
+ mov x13, x2
+ cmp w6, #4
+ b.eq 4f
+8: subs x5, x5, #1
+ ld1 {v0.1D}, [x2], x3
+ ld1 {v1.2D}, [x4], x7
+ ushll v0.8H, v0.8B, #6
+ smull v4.4S, v0.4H, v13.4H
+ smull2 v5.4S, v0.8H, v13.8H
+ smull v6.4S, v1.4H, v12.4H
+ smull2 v7.4S, v1.8H, v12.8H
+ add v4.4S, v4.4S, v6.4S
+ add v5.4S, v5.4S, v7.4S
+ add v4.4S, v4.4S, v14.4S
+ add v5.4S, v5.4S, v14.4S
+ shrn v0.4H, v4.4S, #14
+ shrn2 v0.8H, v5.4S, #14
+ sqxtun v0.8B, v0.8H
+ st1 {v0.1D}, [x0], x1
+ b.ne 8b
+ subs w6, w6, #8
+ b.eq 99f
+ add x11, x11, #8
+ add x10, x10, #16
+ add x13, x13, #8
+ mov x4, x10
+ mov x0, x11
+ mov x5, x12
+ mov x2, x13
+ cmp w6, #4
+ b.ne 8b
+4: subs x5, x5, #2
+ ld1 {v0.S}[0], [x2], x3
+ ld1 {v2.S}[0], [x2], x3
+ ld1 {v1.1D}, [x4], x7
+ ld1 {v3.1D}, [x4], x7
+ ushll v0.8H, v0.8B, #6
+ ushll v2.8H, v2.8B, #6
+ smull v4.4S, v0.4H, v13.4H
+ smull v6.4S, v1.4H, v12.4H
+ smull v5.4S, v2.4H, v13.4H
+ smull v7.4S, v3.4H, v12.4H
+ add v4.4S, v4.4S, v6.4S
+ add v4.4S, v4.4S, v14.4S
+ add v5.4S, v5.4S, v7.4S
+ add v5.4S, v5.4S, v14.4S
+ shrn v0.4H, v4.4S, #14
+ shrn v1.4H, v5.4S, #14
+ sqxtun v0.8B, v0.8H
+ sqxtun v1.8B, v1.8H
+ st1 {v0.S}[0], [x0], x1
+ st1 {v1.S}[0], [x0], x1
+ b.ne 4b
+99: ret
+endfunc
--
2.3.2 (Apple Git-55)
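For anyone cross-checking the assembly above, here is a minimal scalar C
sketch of what the put_qpel_v* entry points compute; the table and function
name are illustrative only, not part of the patch. One ABI caveat worth
flagging as well: AAPCS64 makes the low 64 bits of v8-v15 callee-saved, and
the loops above use v8-v15 as scratch without spilling them, so strict
conformance would need d8-d15 saved in a prologue.

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar reference for the vertical quarter-pel "put" filter
     * (8-bit source, 16-bit destination; no shift at 8-bit depth). */
    static const int8_t qpel_taps[3][8] = {
        { -1, 4, -10, 58, 17,  -5, 1,  0 },   /* phase 1 */
        { -1, 4, -11, 40, 40, -11, 4, -1 },   /* phase 2 */
        {  0, 1,  -5, 17, 58, -10, 4, -1 },   /* phase 3 */
    };

    static void put_qpel_v_ref(int16_t *dst, ptrdiff_t dststride,
                               const uint8_t *src, ptrdiff_t srcstride,
                               int height, int width, int phase)
    {
        const int8_t *f = qpel_taps[phase - 1];

        src -= 3 * srcstride;          /* mirrors the sub x2, ... prologue */
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += f[k] * src[x + k * srcstride];
                dst[x] = sum;
            }
            src += srcstride;
            dst += dststride;
        }
    }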
From 3f13286178f46ef629184322d1b7d112cbbaba8a Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Wed, 27 Jan 2016 22:45:46 +0800
Subject: [PATCH 10/12] Update hevcdsp_init_neon.c
Signed-off-by: zjh8890 <243186085 at qq.com>
---
libavcodec/aarch64/hevcdsp_init_neon.c | 202 +++++++++++++++++++++++++++++++++
1 file changed, 202 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_init_neon.c b/libavcodec/aarch64/hevcdsp_init_neon.c
index 0a3b2e5..050f69b 100644
--- a/libavcodec/aarch64/hevcdsp_init_neon.c
+++ b/libavcodec/aarch64/hevcdsp_init_neon.c
@@ -41,6 +41,133 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
+#define PUT_PIXELS(name) \
+    void name(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, int height, \
+              intptr_t mx, intptr_t my, int width)
+PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+#undef PUT_PIXELS
+
+static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, int width);
+static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, int width, int16_t *src2, ptrdiff_t src2stride);
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+#define QPEL_FUNC(name) \
+ void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
+ int height, int width)
+QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8);
+#undef QPEL_FUNC
+
+#define QPEL_FUNC_UW(name) \
+ void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
+ int width, int height, int16_t* src2, ptrdiff_t src2stride)
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
+#undef QPEL_FUNC_UW
+
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width){
+ put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width);
+}
+
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width){
+ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0);
+}
+
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2, int height, intptr_t mx, intptr_t my, int width){
+ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+}
+
+#define QPEL_FUNC_UW_WEIGHT(name) \
+    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+              int width, int height, int16_t* src2, \
+              int denom, int wx0, int wx1, int ox0, int ox1)
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_v1_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_v2_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_v3_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h1_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h2_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h3_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h1v1_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h1v2_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h1v3_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h2v1_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h2v2_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h2v3_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h3v1_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h3v2_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h3v3_neon_8);
+#undef QPEL_FUNC_UW_WEIGHT
+
+static void (*put_hevc_qpel_uw_weight_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int width, int height, int16_t* src2,
+ int denom, int wx0, int wx1, int ox0, int ox1);
+
+static void ff_hevc_put_qpel_uni_weight_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+ put_hevc_qpel_uw_weight_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, denom, wx, 0, ox, 0);
+}
+
+static void ff_hevc_put_qpel_bi_weight_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1,
+ intptr_t mx, intptr_t my, int width)
+{
+ put_hevc_qpel_uw_weight_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, denom, wx0, wx1, ox0, ox1);
+}
+
+void ff_hevc_put_qpel_bi_w_neon_8(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *src1, ptrdiff_t _srcstride,
+ int16_t *src2, int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_w_neon_8(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *src1, ptrdiff_t _srcstride,
+ int height, int denom, int wx0,
+ int ox0, intptr_t mx, intptr_t my, int width);
av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
@@ -60,5 +187,80 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8;
c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8;
c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
+
+ put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
+ put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
+ put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
+ put_hevc_qpel_neon[0][1] = ff_hevc_put_qpel_h1_neon_8;
+ put_hevc_qpel_neon[0][2] = ff_hevc_put_qpel_h2_neon_8;
+ put_hevc_qpel_neon[0][3] = ff_hevc_put_qpel_h3_neon_8;
+ put_hevc_qpel_neon[1][1] = ff_hevc_put_qpel_h1v1_neon_8;
+ put_hevc_qpel_neon[1][2] = ff_hevc_put_qpel_h2v1_neon_8;
+ put_hevc_qpel_neon[1][3] = ff_hevc_put_qpel_h3v1_neon_8;
+ put_hevc_qpel_neon[2][1] = ff_hevc_put_qpel_h1v2_neon_8;
+ put_hevc_qpel_neon[2][2] = ff_hevc_put_qpel_h2v2_neon_8;
+ put_hevc_qpel_neon[2][3] = ff_hevc_put_qpel_h3v2_neon_8;
+ put_hevc_qpel_neon[3][1] = ff_hevc_put_qpel_h1v3_neon_8;
+ put_hevc_qpel_neon[3][2] = ff_hevc_put_qpel_h2v3_neon_8;
+ put_hevc_qpel_neon[3][3] = ff_hevc_put_qpel_h3v3_neon_8;
+ put_hevc_qpel_uw_neon[1][0] = ff_hevc_put_qpel_uw_v1_neon_8;
+ put_hevc_qpel_uw_neon[2][0] = ff_hevc_put_qpel_uw_v2_neon_8;
+ put_hevc_qpel_uw_neon[3][0] = ff_hevc_put_qpel_uw_v3_neon_8;
+ put_hevc_qpel_uw_neon[0][1] = ff_hevc_put_qpel_uw_h1_neon_8;
+ put_hevc_qpel_uw_neon[0][2] = ff_hevc_put_qpel_uw_h2_neon_8;
+ put_hevc_qpel_uw_neon[0][3] = ff_hevc_put_qpel_uw_h3_neon_8;
+ put_hevc_qpel_uw_neon[1][1] = ff_hevc_put_qpel_uw_h1v1_neon_8;
+ put_hevc_qpel_uw_neon[1][2] = ff_hevc_put_qpel_uw_h2v1_neon_8;
+ put_hevc_qpel_uw_neon[1][3] = ff_hevc_put_qpel_uw_h3v1_neon_8;
+ put_hevc_qpel_uw_neon[2][1] = ff_hevc_put_qpel_uw_h1v2_neon_8;
+ put_hevc_qpel_uw_neon[2][2] = ff_hevc_put_qpel_uw_h2v2_neon_8;
+ put_hevc_qpel_uw_neon[2][3] = ff_hevc_put_qpel_uw_h3v2_neon_8;
+ put_hevc_qpel_uw_neon[3][1] = ff_hevc_put_qpel_uw_h1v3_neon_8;
+ put_hevc_qpel_uw_neon[3][2] = ff_hevc_put_qpel_uw_h2v3_neon_8;
+ put_hevc_qpel_uw_neon[3][3] = ff_hevc_put_qpel_uw_h3v3_neon_8;
+ put_hevc_qpel_uw_weight_neon[1][0] = ff_hevc_put_qpel_uw_weight_v1_neon_8;
+ put_hevc_qpel_uw_weight_neon[2][0] = ff_hevc_put_qpel_uw_weight_v2_neon_8;
+ put_hevc_qpel_uw_weight_neon[3][0] = ff_hevc_put_qpel_uw_weight_v3_neon_8;
+ put_hevc_qpel_uw_weight_neon[0][1] = ff_hevc_put_qpel_uw_weight_h1_neon_8;
+ put_hevc_qpel_uw_weight_neon[0][2] = ff_hevc_put_qpel_uw_weight_h2_neon_8;
+ put_hevc_qpel_uw_weight_neon[0][3] = ff_hevc_put_qpel_uw_weight_h3_neon_8;
+ put_hevc_qpel_uw_weight_neon[1][1] = ff_hevc_put_qpel_uw_weight_h1v1_neon_8;
+ put_hevc_qpel_uw_weight_neon[1][2] = ff_hevc_put_qpel_uw_weight_h2v1_neon_8;
+ put_hevc_qpel_uw_weight_neon[1][3] = ff_hevc_put_qpel_uw_weight_h3v1_neon_8;
+ put_hevc_qpel_uw_weight_neon[2][1] = ff_hevc_put_qpel_uw_weight_h1v2_neon_8;
+ put_hevc_qpel_uw_weight_neon[2][2] = ff_hevc_put_qpel_uw_weight_h2v2_neon_8;
+ put_hevc_qpel_uw_weight_neon[2][3] = ff_hevc_put_qpel_uw_weight_h3v2_neon_8;
+ put_hevc_qpel_uw_weight_neon[3][1] = ff_hevc_put_qpel_uw_weight_h1v3_neon_8;
+ put_hevc_qpel_uw_weight_neon[3][2] = ff_hevc_put_qpel_uw_weight_h2v3_neon_8;
+ put_hevc_qpel_uw_weight_neon[3][3] = ff_hevc_put_qpel_uw_weight_h3v3_neon_8;
+ for (x = 0; x < 10; x++) {
+ c->put_hevc_qpel[x][1][0] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel[x][0][1] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel[x][1][1] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel_uni[x][1][0] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_uni[x][0][1] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_uni[x][1][1] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_qpel_uni_w[x][0][0] = ff_hevc_put_qpel_uni_w_neon_8;
+ c->put_hevc_qpel_uni_w[x][1][0] = ff_hevc_put_qpel_uni_weight_neon_wrapper;
+ c->put_hevc_qpel_uni_w[x][0][1] = ff_hevc_put_qpel_uni_weight_neon_wrapper;
+ c->put_hevc_qpel_uni_w[x][1][1] = ff_hevc_put_qpel_uni_weight_neon_wrapper;
+ c->put_hevc_qpel_bi_w[x][0][0] = ff_hevc_put_qpel_bi_w_neon_8;
+ c->put_hevc_qpel_bi_w[x][1][0] = ff_hevc_put_qpel_bi_weight_neon_wrapper;
+ c->put_hevc_qpel_bi_w[x][0][1] = ff_hevc_put_qpel_bi_weight_neon_wrapper;
+ c->put_hevc_qpel_bi_w[x][1][1] = ff_hevc_put_qpel_bi_weight_neon_wrapper;
+ }
+ c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
+ c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
+ c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
+ c->put_hevc_qpel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
+ c->put_hevc_qpel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;
}
}
--
2.3.2 (Apple Git-55)
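The wrappers in this patch exist because the HEVCDSPContext tables are
indexed by a width class and only by whether mx/my are non-zero, while the
NEON kernels are specialised per exact quarter-pel phase. A sketch of the
assumed call shape follows; mc_luma_example and the idx parameter are
illustrative, not FFmpeg API:

    /* Hypothetical caller: [!!my][!!mx] only distinguishes copy/h/v/hv,
     * so the wrappers re-dispatch on the exact phase via the static
     * put_hevc_qpel_neon[my][mx] tables above. */
    static void mc_luma_example(HEVCDSPContext *c, int16_t *dst,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, intptr_t mx, intptr_t my,
                                int width, int idx)
    {
        c->put_hevc_qpel[idx][!!my][!!mx](dst, src, srcstride,
                                          height, mx, my, width);
    }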
From db9a1abfbbb5fa9075903eb8a119b64233121f0f Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Wed, 27 Jan 2016 22:56:50 +0800
Subject: [PATCH 12/12] Create hevcdsp_init_aarch64.c
Signed-off-by: zjh8890 <243186085 at qq.com>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 33 +++++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
new file mode 100644
index 0000000..e8c2802
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -0,0 +1,33 @@
+/*
+ * ARM NEON optimised HEVC decode for aarch64
+ * Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_aarch64.h"
+
+av_cold void ff_hevcdsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ ff_hevcdsp_init_neon(c, bit_depth);
+}
--
2.3.2 (Apple Git-55)
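As posted, nothing in the series shows the generic init reaching this new
file; the expected hookup in libavcodec/hevcdsp.c would mirror the existing
per-architecture calls, roughly as below (a sketch under that assumption,
using this series' function name and the ARCH_AARCH64 guard from config.h):

    if (ARCH_AARCH64)
        ff_hevcdsp_init_aarch64(hevcdsp, bit_depth);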