[FFmpeg-devel] [PATCH 5/9] x86: simple_idct10_template: fix overflow in pass

Christophe Gisquet christophe.gisquet at gmail.com
Mon Oct 12 19:37:46 CEST 2015


When the input of a pass has 15 or 16 bits of precision (in particular
the column pass), the addition of a bias to W4 may lead to overflows
in the input to pmaddwd.

This requires postponing the addition of the bias until after the first
butterfly. To do so, m15, which was zeroed but otherwise unused, is
exploited. When the pass is safe, the bias can be added directly from
its memory address, and the number of xmm regs can be decreased.
Otherwise, the 32-bit bias is loaded into m15.
---
 libavcodec/x86/proresdsp.asm              |  8 ++++----
 libavcodec/x86/simple_idct10_template.asm | 13 ++++++++++++-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index 18cf15b..3fb71ba 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -37,17 +37,17 @@ cextern pw_1019
 
 section .text align=16
 
-%macro idct_put_fn 1
-cglobal prores_idct_put_10, 4, 4, %1
+%macro idct_put_fn 0
+cglobal prores_idct_put_10, 4, 4, 15
     IDCT_PUT_FN    pw_1, 15, pw_88, 18, pw_4, pw_1019, r3
     RET
 %endmacro
 
 INIT_XMM sse2
-idct_put_fn 16
+idct_put_fn
 %if HAVE_AVX_EXTERNAL
 INIT_XMM avx
-idct_put_fn 16
+idct_put_fn
 %endif
 
 %endif
diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm
index 968d280..e46c83f 100644
--- a/libavcodec/x86/simple_idct10_template.asm
+++ b/libavcodec/x86/simple_idct10_template.asm
@@ -75,6 +75,7 @@ cextern w7_min_w5
     ; a2 -= W6 * row[2];
     ; a3 -= W2 * row[2];
 %ifstr %1
+    mova        m15, [pd_round_ %+ %2]
 %else
     paddw       m10, [%1]
 %endif
@@ -87,6 +88,17 @@ cextern w7_min_w5
     pmaddwd     m7,  m1, [w4_min_w2]
     pmaddwd     m0, [w4_plus_w2]
     pmaddwd     m1, [w4_plus_w2]
+%ifstr %1
+    ; Adding 1<<(%2-1) for >=15 bits values
+    paddd       m2, m15
+    paddd       m3, m15
+    paddd       m4, m15
+    paddd       m5, m15
+    paddd       m6, m15
+    paddd       m7, m15
+    paddd       m0, m15
+    paddd       m1, m15
+%endif
 
     ; a0: -1*row[0]-1*row[2]
     ; a1: -1*row[0]
@@ -225,7 +237,6 @@ cextern w7_min_w5
 
 %macro IDCT_PUT_FN 6-7
     movsxd      r1,  r1d
-    pxor        m15, m15           ; zero
 
     ; for (i = 0; i < 8; i++)
     ;     idctRowCondDC(block + i*8);
-- 
2.6.0



More information about the ffmpeg-devel mailing list