FFmpeg
mlpdsp_init.c
Go to the documentation of this file.
1 /*
2  * MLP DSP functions x86-optimized
3  * Copyright (c) 2009 Ramiro Polla
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "libavutil/attributes.h"
23 #include "libavutil/cpu.h"
24 #include "libavutil/x86/asm.h"
25 #include "libavutil/x86/cpu.h"
26 #include "libavcodec/mlpdsp.h"
27 #include "libavcodec/mlp.h"
28 
/*
 * Declare the prototype of one rematrix routine named
 * ff_mlp_rematrix_channel_<opt>. The routines themselves are not defined in
 * this file; only their prototypes are needed here so that the init function
 * can store their addresses.
 */
#define REMATRIX_CHANNEL_FUNC(opt) \
void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
                                   const int32_t *coeffs, \
                                   const uint8_t *bypassed_lsbs, \
                                   const int8_t *noise_buffer, \
                                   int index, \
                                   unsigned int dest_ch, \
                                   uint16_t blockpos, \
                                   unsigned int maxchan, \
                                   int matrix_noise_shift, \
                                   int access_unit_size_pow2, \
                                   int32_t mask);

/* Both variants are referenced by the init function at the end of the file. */
REMATRIX_CHANNEL_FUNC(sse4)
REMATRIX_CHANNEL_FUNC(avx2_bmi2)
44 
45 #if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
46 
/*
 * Labels placed inside the inline assembly of mlp_filter_channel_x86().
 * Only their addresses are used, so they are declared as plain char objects.
 */
extern char ff_mlp_firorder_0, ff_mlp_firorder_1, ff_mlp_firorder_2,
            ff_mlp_firorder_3, ff_mlp_firorder_4, ff_mlp_firorder_5,
            ff_mlp_firorder_6, ff_mlp_firorder_7, ff_mlp_firorder_8;

extern char ff_mlp_iirorder_0, ff_mlp_iirorder_1, ff_mlp_iirorder_2,
            ff_mlp_iirorder_3, ff_mlp_iirorder_4;

/* firtable[n] is the jump target used when the FIR order is n (0..8). */
static const void * const firtable[9] = {
    &ff_mlp_firorder_0,
    &ff_mlp_firorder_1,
    &ff_mlp_firorder_2,
    &ff_mlp_firorder_3,
    &ff_mlp_firorder_4,
    &ff_mlp_firorder_5,
    &ff_mlp_firorder_6,
    &ff_mlp_firorder_7,
    &ff_mlp_firorder_8,
};

/* iirtable[n] is the jump target used when the IIR order is n (0..4). */
static const void * const iirtable[5] = {
    &ff_mlp_iirorder_0,
    &ff_mlp_iirorder_1,
    &ff_mlp_iirorder_2,
    &ff_mlp_iirorder_3,
    &ff_mlp_iirorder_4,
};
71 
#if ARCH_X86_64

/*
 * One multiply-accumulate step, entered through the global label `label`:
 * sign-extend the 32-bit values at offset+offs(%0) and offset+offc(%1) to
 * 64 bits, multiply them and add the product to the accumulator in %rsi.
 */
#define MLPMUL(label, offset, offs, offc)   \
    LABEL_MANGLE(label)": \n\t"             \
    "movslq "offset"+"offs"(%0), %%rax\n\t" \
    "movslq "offset"+"offc"(%1), %%rdx\n\t" \
    "imul %%rdx, %%rax\n\t"                 \
    "add %%rax, %%rsi\n\t"

/*
 * Same as MLPMUL, but the coefficient comes from asm operand number `firc`
 * instead of being loaded from memory.
 */
#define FIRMULREG(label, offset, firc)      \
    LABEL_MANGLE(label)": \n\t"             \
    "movslq "#offset"(%0), %%rax\n\t"       \
    "imul %"#firc", %%rax\n\t"              \
    "add %%rax, %%rsi\n\t"

/* Zero the 64-bit accumulator. */
#define CLEAR_ACCUM                         \
    "xor %%rsi, %%rsi\n\t"

/* Shift the accumulator right by the count held in %cl. */
#define SHIFT_ACCUM                         \
    "shr %%cl, %%rsi\n\t"

/* Scratch register that keeps a copy of the shifted accumulator. */
#define ACCUM    "%%rdx"
/* Register holding the value after SHIFT_ACCUM, and its 32-bit alias. */
#define RESULT   "%%rsi"
#define RESULT32 "%%esi"

#else /* if ARCH_X86_32 */

/*
 * One multiply-accumulate step, entered through the global label `label`:
 * load the 32-bit value at offset+offs(%0), multiply it (signed, 64-bit
 * result in %edx:%eax) by the value at offset+offc(%1) and add the product
 * to the 64-bit accumulator kept in %ecx:%esi.
 */
#define MLPMUL(label, offset, offs, offc)   \
    LABEL_MANGLE(label)": \n\t"             \
    "mov "offset"+"offs"(%0), %%eax\n\t"    \
    "imull "offset"+"offc"(%1) \n\t"        \
    "add %%eax , %%esi\n\t"                 \
    "adc %%edx , %%ecx\n\t"

/* No spare register on 32-bit: the coefficient is read from memory; `firc` is unused. */
#define FIRMULREG(label, offset, firc)      \
    MLPMUL(label, #offset, "0", "0")

/* Zero both halves of the 64-bit accumulator. */
#define CLEAR_ACCUM                         \
    "xor %%esi, %%esi\n\t"                  \
    "xor %%ecx, %%ecx\n\t"

/*
 * Move the accumulator to %edx:%eax, load the shift count from the low byte
 * of asm operand 7 into %ecx, then shift %eax right, filling from %edx.
 * Only %eax holds a meaningful result afterwards.
 *
 * The last line deliberately has no trailing backslash, so the definition
 * does not depend on the following line being blank.
 */
#define SHIFT_ACCUM                         \
    "mov %%ecx, %%edx\n\t"                  \
    "mov %%esi, %%eax\n\t"                  \
    "movzbl %7 , %%ecx\n\t"                 \
    "shrd %%cl, %%edx, %%eax\n\t"

/* Scratch register that keeps a copy of the shifted accumulator. */
#define ACCUM    "%%edx"
/* Register holding the value after SHIFT_ACCUM (already 32 bits wide here). */
#define RESULT   "%%eax"
#define RESULT32 "%%eax"

#endif /* !ARCH_X86_64 */
124 
/*
 * Multiply-accumulate steps used in the asm template of
 * mlp_filter_channel_x86(). A FIR tap reads state and coefficient at the same
 * byte offset; an IIR tap reads them IOFFS / IOFFC bytes further on.
 */
#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")

/*
 * Byte quantities, stringified so they can be pasted into the asm template:
 *   IOFFC - offset added to the coefficient pointer for IIR taps
 *   IOFFS - offset added to the state pointer for IIR taps
 *   BINC  - amount the sample buffer pointer advances per iteration
 */
#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)
#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
#define BINC AV_STRINGIFY(4* MAX_CHANNELS)
131 
132 static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
133  int firorder, int iirorder,
134  unsigned int filter_shift, int32_t mask,
135  int blocksize, int32_t *sample_buffer)
136 {
137  const void *firjump = firtable[firorder];
138  const void *iirjump = iirtable[iirorder];
139 
140  blocksize = -blocksize;
141 
142  __asm__ volatile(
143  "1: \n\t"
144  CLEAR_ACCUM
145  "jmp *%5 \n\t"
146  FIRMUL (ff_mlp_firorder_8, 0x1c )
147  FIRMUL (ff_mlp_firorder_7, 0x18 )
148  FIRMUL (ff_mlp_firorder_6, 0x14 )
149  FIRMUL (ff_mlp_firorder_5, 0x10 )
150  FIRMUL (ff_mlp_firorder_4, 0x0c )
151  FIRMUL (ff_mlp_firorder_3, 0x08 )
152  FIRMUL (ff_mlp_firorder_2, 0x04 )
153  FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
154  LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
155  "jmp *%6 \n\t"
156  IIRMUL (ff_mlp_iirorder_4, 0x0c )
157  IIRMUL (ff_mlp_iirorder_3, 0x08 )
158  IIRMUL (ff_mlp_iirorder_2, 0x04 )
159  IIRMUL (ff_mlp_iirorder_1, 0x00 )
160  LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t"
161  SHIFT_ACCUM
162  "mov "RESULT" ,"ACCUM" \n\t"
163  "add (%2) ,"RESULT" \n\t"
164  "and %4 ,"RESULT" \n\t"
165  "sub $4 , %0 \n\t"
166  "mov "RESULT32", (%0) \n\t"
167  "mov "RESULT32", (%2) \n\t"
168  "add $"BINC" , %2 \n\t"
169  "sub "ACCUM" ,"RESULT" \n\t"
170  "mov "RESULT32","IOFFS"(%0) \n\t"
171  "incl %3 \n\t"
172  "js 1b \n\t"
173  : /* 0*/"+r"(state),
174  /* 1*/"+r"(coeff),
175  /* 2*/"+r"(sample_buffer),
176 #if ARCH_X86_64
177  /* 3*/"+r"(blocksize)
178  : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
179  /* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift)
180  , /* 8*/"r"((int64_t)coeff[0])
181  : "rax", "rdx", "rsi"
182 #else /* ARCH_X86_32 */
183  /* 3*/"+m"(blocksize)
184  : /* 4*/"m"( mask), /* 5*/"m"(firjump),
185  /* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift)
186  : "eax", "edx", "esi", "ecx"
187 #endif /* !ARCH_X86_64 */
188  );
189 }
190 
191 #endif /* HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS */
192 
194 {
195  int cpu_flags = av_get_cpu_flags();
196 #if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
197  if (INLINE_MMX(cpu_flags))
198  c->mlp_filter_channel = mlp_filter_channel_x86;
199 #endif
200  if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
201  c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
202  if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
203  c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
204 }
INLINE_MMX
#define INLINE_MMX(flags)
Definition: cpu.h:86
cpu.h
r
const char * r
Definition: vf_curves.c:116
sub
static float sub(float src0, float src1)
Definition: dnn_backend_native_layer_mathbinary.c:32
LABEL_MANGLE
#define LABEL_MANGLE(a)
Definition: asm.h:103
state
static struct @321 state
EXTERNAL_AVX2_FAST
#define EXTERNAL_AVX2_FAST(flags)
Definition: cpu.h:79
b
#define b
Definition: input.c:41
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:95
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:50
REMATRIX_CHANNEL_FUNC
#define REMATRIX_CHANNEL_FUNC(opt)
Definition: mlpdsp_init.c:29
av_cold
#define av_cold
Definition: attributes.h:90
mask
static const uint16_t mask[17]
Definition: lzw.c:38
int32_t
int32_t
Definition: audio_convert.c:194
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
cpu.h
mlpdsp.h
asm.h
attributes.h
ff_mlpdsp_init_x86
av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
Definition: mlpdsp_init.c:193
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
ACCUM
#define ACCUM(k, x, d)
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:68
add
static float add(float src0, float src1)
Definition: dnn_backend_native_layer_mathbinary.c:36
AV_CPU_FLAG_BMI2
#define AV_CPU_FLAG_BMI2
Bit Manipulation Instruction Set 2.
Definition: cpu.h:57
mlp.h
x86_reg
int x86_reg
Definition: asm.h:72
coeff
static const double coeff[2][5]
Definition: vf_owdenoise.c:73
MLPDSPContext
Definition: mlpdsp.h:49