/* Instruction-string helpers selected when COMPILE_TEMPLATE_SSE2 is defined.
 * Each macro expands to string-literal pieces meant to be pasted into an
 * inline-asm template; MM and MOV are defined outside this excerpt.
 * NOTE(review): the number leading each line looks like residue of an
 * original line listing; the gaps in it (21 -> 25 -> 27) suggest that some
 * definitions are missing here -- confirm against the full file. */
21 #ifdef COMPILE_TEMPLATE_SSE2
/* Mnemonic used wherever an unaligned 128-bit move is needed. */
25 #define MOVQU "movdqu"
/* LOAD(mem,dst): move mem into dst with MOV, then interleave the low bytes of
 * dst with the bytes of register 7.  When register 7 is zero this widens each
 * loaded byte to a 16-bit word. */
27 #define LOAD(mem,dst) \
28 MOV" "mem", "dst" \n\t"\
29 "punpcklbw "MM"7, "dst" \n\t"
/* PSRL1 / PSRL2: shift the whole register right by one / two bytes. */
30 #define PSRL1(reg) "psrldq $1, "reg" \n\t"
31 #define PSRL2(reg) "psrldq $2, "reg" \n\t"
/* PSHUF(src,dst): operands are in AT&T order (source first), so this copies
 * the register passed as `dst` into the register passed as `src` and then
 * shifts `src` right by two bytes; `dst` itself is left unchanged. */
32 #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
33 "psrldq $2, "src" \n\t"
/* Second set of the same helper macros, built from 64-bit instructions
 * (psrlq, pshufw) instead of the 128-bit ones.
 * NOTE(review): the preprocessor directive that separates this set from the
 * preceding definitions (presumably an #else) is not visible in this
 * excerpt -- confirm against the full file. */
/* LOAD(mem,dst): move mem into dst with MOV, then interleave the low bytes of
 * dst with the bytes of register 7 (byte -> word widening when it is zero). */
40 #define LOAD(mem,dst) \
41 MOV" "mem", "dst" \n\t"\
42 "punpcklbw "MM"7, "dst" \n\t"
/* PSRL1 / PSRL2: shift the 64-bit register right by 8 / 16 bits, i.e. by one
 * / two bytes. */
43 #define PSRL1(reg) "psrlq $8, "reg" \n\t"
44 #define PSRL2(reg) "psrlq $16, "reg" \n\t"
/* PSHUF(src,dst): AT&T operand order, so the register passed as `dst` is the
 * shuffle source and the register passed as `src` receives the result.
 * Selector 9 (binary 00 00 10 01) picks source words 1, 2, 0, 0, so the low
 * two result words equal the source moved down by two bytes. */
45 #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
/* PABS(tmp,dst): replace every signed 16-bit word in dst with its absolute
 * value.  Two implementations are provided.
 * NOTE(review): the #else / #endif that should separate and close the two
 * definitions are not visible in this excerpt (embedded numbering jumps
 * 50 -> 52 and stops at 55) -- confirm against the full file. */
48 #ifdef COMPILE_TEMPLATE_SSSE3
/* SSSE3 variant: a single pabsw; tmp is not touched. */
49 #define PABS(tmp,dst) \
50 "pabsw "dst", "dst" \n\t"
/* Fallback variant: tmp = 0 - dst, then dst = max(dst, tmp) on signed words.
 * The previous contents of tmp are destroyed. */
52 #define PABS(tmp,dst) \
53 "pxor "tmp", "tmp" \n\t"\
54 "psubw "dst", "tmp" \n\t"\
55 "pmaxsw "tmp", "dst" \n\t"
/* CHECK(pj,mj): compare the bytes at displacement pj from cur+mrefs with the
 * bytes at displacement mj from cur+prefs (pj and mj are stringized into the
 * addressing mode).
 * On exit, as far as the visible instructions show:
 *   register 5 = rounded-down byte average of the two loads, low bytes
 *                interleaved with register 7;
 *   register 2 = word-wise sum built from the bytewise absolute difference;
 *   registers 3 and 4 are scratch.
 * Register 7 is only read and is presumably expected to be zero.
 * pb_1 is defined elsewhere; presumably a vector of bytes equal to 1.
 * NOTE(review): the embedded line numbering skips 67, 75 and 76, so
 * instructions may be missing at those points (after the average, and after
 * the two copies of the difference) -- confirm against the full file. */
58 #define CHECK(pj,mj) \
/* Unaligned loads of the two byte rows being compared. */\
59 MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" \
60 MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" \
61 MOVQ" "MM"2, "MM"4 \n\t"\
62 MOVQ" "MM"2, "MM"5 \n\t"\
/* pavgb rounds up; subtracting (a ^ b) & pb_1 turns it into (a + b) >> 1. */\
63 "pxor "MM"3, "MM"4 \n\t"\
64 "pavgb "MM"3, "MM"5 \n\t"\
65 "pand "MANGLE(pb_1)", "MM"4 \n\t"\
66 "psubusb "MM"4, "MM"5 \n\t"\
68 "punpcklbw "MM"7, "MM"5 \n\t" \
/* Bytewise |a - b|: saturating subtraction both ways, then unsigned max. */\
69 MOVQ" "MM"2, "MM"4 \n\t"\
70 "psubusb "MM"3, "MM"2 \n\t"\
71 "psubusb "MM"4, "MM"3 \n\t"\
72 "pmaxub "MM"3, "MM"2 \n\t"\
/* Two more copies of the difference, then interleave all three with */\
/* register 7 and add them up as words into register 2. */\
73 MOVQ" "MM"2, "MM"3 \n\t"\
74 MOVQ" "MM"2, "MM"4 \n\t" \
77 "punpcklbw "MM"7, "MM"2 \n\t"\
78 "punpcklbw "MM"7, "MM"3 \n\t"\
79 "punpcklbw "MM"7, "MM"4 \n\t"\
80 "paddw "MM"3, "MM"2 \n\t"\
81 "paddw "MM"4, "MM"2 \n\t"
/* Compare-and-select step working on signed 16-bit words.
 * NOTE(review): the line that should open this macro (embedded number 83) is
 * not visible in this excerpt; the macro's name cannot be confirmed here.
 * Inputs:  register 0 = running minimum, register 1 = value currently
 *          selected, register 2 = candidate score, register 5 = candidate
 *          value.
 * Effect:  in every word where register 0 > register 2, register 1 takes the
 *          word from register 5; register 0 becomes min(register 0,
 *          register 2); register 6 keeps the compare mask (all ones where the
 *          candidate won).  Registers 3 and 5 are clobbered. */
84 MOVQ" "MM"0, "MM"3 \n\t"\
/* Register 3 = mask of words where the old minimum exceeds the candidate. */\
85 "pcmpgtw "MM"2, "MM"3 \n\t" \
86 "pminsw "MM"2, "MM"0 \n\t" \
87 MOVQ" "MM"3, "MM"6 \n\t"\
/* Blend: (mask & register 5) | (~mask & register 1) -> register 1. */\
88 "pand "MM"3, "MM"5 \n\t"\
89 "pandn "MM"1, "MM"3 \n\t"\
90 "por "MM"5, "MM"3 \n\t"\
91 MOVQ" "MM"3, "MM"1 \n\t"
/* Compare-and-select step that first penalises the candidate score.
 * NOTE(review): the line that should open this macro (embedded number 94) is
 * not visible in this excerpt; the macro's name cannot be confirmed here.
 * pw_1 is defined elsewhere; presumably a vector of words equal to 1.
 * If register 6 still holds an all-ones / all-zero word mask from an earlier
 * compare, adding pw_1 and shifting left by 14 yields 0 where the mask was
 * set and 0x4000 where it was clear; that amount is added (with signed
 * saturation) to the candidate score in register 2.
 * Afterwards, in every word where register 0 > register 2, register 1 takes
 * the word from register 5, and register 0 becomes min(register 0,
 * register 2).  Registers 3, 5 and 6 are clobbered; the mask is not saved
 * this time. */
95 "paddw "MANGLE(pw_1)", "MM"6 \n\t"\
96 "psllw $14, "MM"6 \n\t"\
97 "paddsw "MM"6, "MM"2 \n\t"\
/* Register 3 = mask of words where the old minimum exceeds the candidate. */\
98 MOVQ" "MM"0, "MM"3 \n\t"\
99 "pcmpgtw "MM"2, "MM"3 \n\t"\
100 "pminsw "MM"2, "MM"0 \n\t"\
/* Blend: (mask & register 5) | (~mask & register 1) -> register 1. */\
101 "pand "MM"3, "MM"5 \n\t"\
102 "pandn "MM"1, "MM"3 \n\t"\
103 "por "MM"5, "MM"3 \n\t"\
104 MOVQ" "MM"3, "MM"1 \n\t"
/* Fragment of a function signature followed by the body of a loop that
 * processes STEP pixels per iteration with inline assembly.
 * NOTE(review): the start of the signature, the line that opens the
 * backslash-continued region, the start of the asm statement, part of its
 * operand list and the end of the loop are not visible in this excerpt, and
 * the embedded numbering has further gaps inside the body (134-135, 144-145,
 * 169-179, 182, 210-211, 219-222, ...).  The comments below describe only the
 * instructions that are present -- confirm against the full file.
 * Shorthand used below (all values are 16-bit words after widening):
 *   c = cur[mrefs], e = cur[prefs], d = (prev2[0] + next2[0]) >> 1.
 * tmp is a scratch buffer addressed at byte offsets 0, 16, 32 and 48. */
107 uint8_t *next,
int w,
int prefs,
115 for(x=0; x<w; x+=STEP){\
/* Zero register 7 so that LOAD widens bytes to words. */\
117 "pxor "MM"7, "MM"7 \n\t"\
/* Register 0 = c, register 1 = e, register 2 = prev2[0], register 3 = next2[0]. */\
118 LOAD("(%[cur],%[mrefs])", MM"0") \
119 LOAD("(%[cur],%[prefs])", MM"1") \
120 LOAD("(%["prev2"])", MM"2") \
121 LOAD("(%["next2"])", MM"3") \
/* Keep next2[0] in register 4; register 3 becomes d. */\
122 MOVQ" "MM"3, "MM"4 \n\t"\
123 "paddw "MM"2, "MM"3 \n\t"\
124 "psraw $1, "MM"3 \n\t" \
/* Spill c, d and e to tmp+0, tmp+16 and tmp+32. */\
125 MOVQ" "MM"0, (%[tmp]) \n\t" \
126 MOVQ" "MM"3, 16(%[tmp]) \n\t" \
127 MOVQ" "MM"1, 32(%[tmp]) \n\t" \
/* Register 2 = |prev2[0] - next2[0]|. */\
128 "psubw "MM"4, "MM"2 \n\t"\
129 PABS( MM"4", MM"2") \
/* Differences of prev[mrefs] and prev[prefs] against c and e, summed, then */\
/* both that sum and register 2 are halved and the larger kept in register 2. */\
130 LOAD("(%[prev],%[mrefs])", MM"3") \
131 LOAD("(%[prev],%[prefs])", MM"4") \
132 "psubw "MM"0, "MM"3 \n\t"\
133 "psubw "MM"1, "MM"4 \n\t"\
136 "paddw "MM"4, "MM"3 \n\t" \
137 "psrlw $1, "MM"2 \n\t"\
138 "psrlw $1, "MM"3 \n\t"\
139 "pmaxsw "MM"3, "MM"2 \n\t"\
/* Same comparison using next[mrefs] and next[prefs]. */\
140 LOAD("(%[next],%[mrefs])", MM"3") \
141 LOAD("(%[next],%[prefs])", MM"4") \
142 "psubw "MM"0, "MM"3 \n\t"\
143 "psubw "MM"1, "MM"4 \n\t"\
146 "paddw "MM"4, "MM"3 \n\t" \
147 "psrlw $1, "MM"3 \n\t"\
148 "pmaxsw "MM"3, "MM"2 \n\t"\
/* Spill the resulting maximum to tmp+48. */\
149 MOVQ" "MM"2, 48(%[tmp]) \n\t" \
/* Register 1 = (c + e) >> 1; register 0 = 2c - (c + e) = c - e, then |c - e|. */\
151 "paddw "MM"0, "MM"1 \n\t"\
152 "paddw "MM"0, "MM"0 \n\t"\
153 "psubw "MM"1, "MM"0 \n\t"\
154 "psrlw $1, "MM"1 \n\t" \
155 PABS( MM"2", MM"0") \
/* Bytewise absolute difference of the two rows of cur, read one byte to the */\
/* left; register 3 gets the same difference moved down by two bytes. */\
157 MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" \
158 MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" \
159 MOVQ" "MM"2, "MM"4 \n\t"\
160 "psubusb "MM"3, "MM"2 \n\t"\
161 "psubusb "MM"4, "MM"3 \n\t"\
162 "pmaxub "MM"3, "MM"2 \n\t"\
163 PSHUF(MM"3", MM"2") \
/* Widen both and add them to register 0, then subtract pw_1 (defined */\
/* elsewhere; presumably words equal to 1). */\
164 "punpcklbw "MM"7, "MM"2 \n\t" \
165 "punpcklbw "MM"7, "MM"3 \n\t" \
166 "paddw "MM"2, "MM"0 \n\t"\
167 "paddw "MM"3, "MM"0 \n\t"\
168 "psubw "MANGLE(pw_1)", "MM"0 \n\t" \
/* Reload the maximum saved at tmp+48 into register 6. */\
180 MOVQ" 48(%[tmp]), "MM"6 \n\t" \
/* Compare mode with 2; the instruction that consumes the flags is not */\
/* visible in this excerpt. */\
181 "cmpl $2, %[mode] \n\t"\
/* Register 2 = (prev2[2*mrefs] + next2[2*mrefs]) >> 1, */\
/* register 3 = (prev2[2*prefs] + next2[2*prefs]) >> 1. */\
183 LOAD("(%["prev2"],%[mrefs],2)", MM"2") \
184 LOAD("(%["next2"],%[mrefs],2)", MM"4") \
185 LOAD("(%["prev2"],%[prefs],2)", MM"3") \
186 LOAD("(%["next2"],%[prefs],2)", MM"5") \
187 "paddw "MM"4, "MM"2 \n\t"\
188 "paddw "MM"5, "MM"3 \n\t"\
189 "psrlw $1, "MM"2 \n\t" \
190 "psrlw $1, "MM"3 \n\t" \
/* Reload c, d and e; register 7 is reused here and no longer holds zero. */\
191 MOVQ" (%[tmp]), "MM"4 \n\t" \
192 MOVQ" 16(%[tmp]), "MM"5 \n\t" \
193 MOVQ" 32(%[tmp]), "MM"7 \n\t" \
/* Register 2 -= c, register 3 -= e, register 5 = d - c, register 0 = d - e. */\
194 "psubw "MM"4, "MM"2 \n\t" \
195 "psubw "MM"7, "MM"3 \n\t" \
196 MOVQ" "MM"5, "MM"0 \n\t"\
197 "psubw "MM"4, "MM"5 \n\t" \
198 "psubw "MM"7, "MM"0 \n\t" \
/* Register 2 = max(d - c, d - e, min(register 2, register 3)), */\
/* register 3 = min(d - c, d - e, max(register 2, register 3)). */\
199 MOVQ" "MM"2, "MM"4 \n\t"\
200 "pminsw "MM"3, "MM"2 \n\t"\
201 "pmaxsw "MM"4, "MM"3 \n\t"\
202 "pmaxsw "MM"5, "MM"2 \n\t"\
203 "pminsw "MM"5, "MM"3 \n\t"\
204 "pmaxsw "MM"0, "MM"2 \n\t" \
205 "pminsw "MM"0, "MM"3 \n\t" \
/* Register 6 = max(register 6, register 3, -register 2). */\
206 "pxor "MM"4, "MM"4 \n\t"\
207 "pmaxsw "MM"3, "MM"6 \n\t"\
208 "psubw "MM"2, "MM"4 \n\t" \
209 "pmaxsw "MM"4, "MM"6 \n\t" \
/* Clamp register 1 to the range [d - register 6, d + register 6]. */\
212 MOVQ" 16(%[tmp]), "MM"2 \n\t" \
213 MOVQ" "MM"2, "MM"3 \n\t"\
214 "psubw "MM"6, "MM"2 \n\t" \
215 "paddw "MM"6, "MM"3 \n\t" \
216 "pmaxsw "MM"2, "MM"1 \n\t"\
217 "pminsw "MM"3, "MM"1 \n\t" \
/* Narrow the words back to bytes with unsigned saturation. */\
218 "packuswb "MM"1, "MM"1 \n\t"\
/* Named input operands: prefs and mrefs are passed in registers, widened */\
/* to x86_reg. */\
223 [prefs]"r"((x86_reg)prefs),\
224 [mrefs]"r"((x86_reg)mrefs),\
/* Separate asm statement that stores register 1 to *dst with MOV. */\
228 __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\