[FFmpeg-devel] [PATCH v2] avcodec/startcode: Avoid unaligned accesses

Andreas Rheinhardt andreas.rheinhardt at outlook.com
Wed Oct 12 01:20:23 EEST 2022


Up until now, ff_startcode_find_candidate_c() simply casts
an uint8_t* to uint64_t*/uint32_t* to read 64/32 bits at a time
in case HAVE_FAST_UNALIGNED is true. Yet this ignores the
alignment requirement of these types as well as effective type
rules of the C standard. This commit therefore replaces these
direct accesses with AV_RN64/32; this also improves
readability.

UBSan reported these unaligned accesses which happened in 233
FATE-tests involving H.264 and VC-1 (this has also been reported
in tickets #8138 and #8485); these tests are fixed by this commit.

The output of GCC with -O3 is unchanged for aarch64, alpha, arm,
loongarch, ppc and x64. There was only a slight difference for mips.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt at outlook.com>
---
This is v2 of https://patchwork.ffmpeg.org/project/ffmpeg/patch/20200122145210.6898-1-andreas.rheinhardt@gmail.com/

Here is the mips code before this change:

startcode_old_O3.o:     file format elf64-tradlittlemips


Disassembly of section .text:

0000000000000000 <ff_startcode_find_candidate_c>:
   0:	18a00029 	blez	a1,a8 <ff_startcode_find_candidate_c+0xa8>
   4:	3c08ff7f 	lui	a4,0xff7f
   8:	3c07ff01 	lui	a3,0xff01
   c:	65087f7f 	daddiu	a4,a4,32639
  10:	64e70101 	daddiu	a3,a3,257
  14:	00084438 	dsll	a4,a4,0x10
  18:	00073c38 	dsll	a3,a3,0x10
  1c:	65087f7f 	daddiu	a4,a4,32639
  20:	64e70101 	daddiu	a3,a3,257
  24:	00084478 	dsll	a4,a4,0x11
  28:	00073df8 	dsll	a3,a3,0x17
  2c:	00803025 	move	a2,a0
  30:	00001025 	move	v0,zero
  34:	3508feff 	ori	a4,a4,0xfeff
  38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
  3c:	34e78080 	ori	a3,a3,0x8080
  40:	24420008 	addiu	v0,v0,8
  44:	0045182a 	slt	v1,v0,a1
  48:	10600015 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
  4c:	00000000 	nop
  50:	dcc30000 	ld	v1,0(a2)
  54:	0068482d 	daddu	a5,v1,a4
  58:	44a30000 	dmtc1	v1,$f0
  5c:	44a90800 	dmtc1	a5,$f1
  60:	4be10002 	pandn	$f0,$f0,$f1
  64:	44230000 	dmfc1	v1,$f0
  68:	00671824 	and	v1,v1,a3
  6c:	1060fff4 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
  70:	64c60008 	daddiu	a2,a2,8
  74:	0045182a 	slt	v1,v0,a1
  78:	10600009 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
  7c:	0082182d 	daddu	v1,a0,v0
  80:	10000005 	b	98 <ff_startcode_find_candidate_c+0x98>
  84:	90640000 	lbu	a0,0(v1)
  88:	24420001 	addiu	v0,v0,1
  8c:	10a20008 	beq	a1,v0,b0 <ff_startcode_find_candidate_c+0xb0>
  90:	00000000 	nop
  94:	90640000 	lbu	a0,0(v1)
  98:	1480fffb 	bnez	a0,88 <ff_startcode_find_candidate_c+0x88>
  9c:	64630001 	daddiu	v1,v1,1
  a0:	03e00008 	jr	ra
  a4:	00000000 	nop
  a8:	03e00008 	jr	ra
  ac:	00001025 	move	v0,zero
  b0:	03e00008 	jr	ra
  b4:	00a01025 	move	v0,a1
	...

And here after this change:

startcode_new_O3.o:     file format elf64-tradlittlemips


Disassembly of section .text:

0000000000000000 <ff_startcode_find_candidate_c>:
   0:	18a0002b 	blez	a1,b0 <ff_startcode_find_candidate_c+0xb0>
   4:	3c08ff7f 	lui	a4,0xff7f
   8:	3c07ff01 	lui	a3,0xff01
   c:	65087f7f 	daddiu	a4,a4,32639
  10:	64e70101 	daddiu	a3,a3,257
  14:	00084438 	dsll	a4,a4,0x10
  18:	00073c38 	dsll	a3,a3,0x10
  1c:	65087f7f 	daddiu	a4,a4,32639
  20:	64e70101 	daddiu	a3,a3,257
  24:	00084478 	dsll	a4,a4,0x11
  28:	00073df8 	dsll	a3,a3,0x17
  2c:	00803025 	move	a2,a0
  30:	00001025 	move	v0,zero
  34:	3508feff 	ori	a4,a4,0xfeff
  38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
  3c:	34e78080 	ori	a3,a3,0x8080
  40:	24420008 	addiu	v0,v0,8
  44:	0045182a 	slt	v1,v0,a1
  48:	10600017 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
  4c:	00000000 	nop
  50:	68c30007 	ldl	v1,7(a2)
  54:	6cc30000 	ldr	v1,0(a2)
  58:	0068482d 	daddu	a5,v1,a4
  5c:	44a30000 	dmtc1	v1,$f0
  60:	44a90800 	dmtc1	a5,$f1
  64:	4be10002 	pandn	$f0,$f0,$f1
  68:	44230000 	dmfc1	v1,$f0
  6c:	00671824 	and	v1,v1,a3
  70:	1060fff3 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
  74:	64c60008 	daddiu	a2,a2,8
  78:	0045182a 	slt	v1,v0,a1
  7c:	1060000a 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
  80:	0082182d 	daddu	v1,a0,v0
  84:	10000006 	b	a0 <ff_startcode_find_candidate_c+0xa0>
  88:	90640000 	lbu	a0,0(v1)
  8c:	00000000 	nop
  90:	24420001 	addiu	v0,v0,1
  94:	10a20008 	beq	a1,v0,b8 <ff_startcode_find_candidate_c+0xb8>
  98:	00000000 	nop
  9c:	90640000 	lbu	a0,0(v1)
  a0:	1480fffb 	bnez	a0,90 <ff_startcode_find_candidate_c+0x90>
  a4:	64630001 	daddiu	v1,v1,1
  a8:	03e00008 	jr	ra
  ac:	00000000 	nop
  b0:	03e00008 	jr	ra
  b4:	00001025 	move	v0,zero
  b8:	03e00008 	jr	ra
  bc:	00a01025 	move	v0,a1

As one can see, the difference is that an ld has been replaced
by a pair of ldl and ldr. I don't know the performance implications
of this.

 libavcodec/startcode.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
index 9efdffe8c6..d84f326521 100644
--- a/libavcodec/startcode.c
+++ b/libavcodec/startcode.c
@@ -25,6 +25,7 @@
  * @author Michael Niedermayer <michaelni at gmx.at>
  */
 
+#include "libavutil/intreadwrite.h"
 #include "startcode.h"
 #include "config.h"
 
@@ -38,14 +39,14 @@ int ff_startcode_find_candidate_c(const uint8_t *buf, int size)
      */
 #if HAVE_FAST_64BIT
     while (i < size &&
-            !((~*(const uint64_t *)(buf + i) &
-                    (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
+            !((~AV_RN64(buf + i) &
+                    (AV_RN64(buf + i) - 0x0101010101010101ULL)) &
                     0x8080808080808080ULL))
         i += 8;
 #else
     while (i < size &&
-            !((~*(const uint32_t *)(buf + i) &
-                    (*(const uint32_t *)(buf + i) - 0x01010101U)) &
+            !((~AV_RN32(buf + i) &
+                    (AV_RN32(buf + i) - 0x01010101U)) &
                     0x80808080U))
         i += 4;
 #endif
-- 
2.34.1



More information about the ffmpeg-devel mailing list