25 #define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ 27 out0 = __msa_subs_u_h(out0, in0); \ 28 out1 = __msa_subs_u_h(out1, in1); \ 39 for (row = 16; row--;) {
52 src2 =
LD_UB(src + 16);
54 for (row = 32; row--;) {
55 ST_UB2(src1, src2, dst, 16);
67 for (row = 4; row--;) {
71 src0 = (v16u8) __msa_fill_b(inp >> 24);
72 src1 = (v16u8) __msa_fill_b(inp >> 16);
73 src2 = (v16u8) __msa_fill_b(inp >> 8);
74 src3 = (v16u8) __msa_fill_b(inp);
76 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
77 dst += (4 * dst_stride);
88 for (row = 8; row--;) {
92 src0 = (v16u8) __msa_fill_b(inp >> 24);
93 src1 = (v16u8) __msa_fill_b(inp >> 16);
94 src2 = (v16u8) __msa_fill_b(inp >> 8);
95 src3 = (v16u8) __msa_fill_b(inp);
97 ST_UB2(src0, src0, dst, 16);
99 ST_UB2(src1, src1, dst, 16);
101 ST_UB2(src2, src2, dst, 16);
103 ST_UB2(src3, src3, dst, 16);
112 v16i8 store,
src = { 0 };
120 sum_h = __msa_hadd_u_h((v16u8) src, (v16u8) src);
121 sum_w = __msa_hadd_u_w(sum_h, sum_h);
122 sum_d = __msa_hadd_u_d(sum_w, sum_w);
123 sum_w = (v4u32) __msa_srari_w((v4i32)
sum_d, 3);
124 store = __msa_splati_b((v16i8) sum_w, 0);
125 val0 = __msa_copy_u_w((v4i32) store, 0);
127 SW4(val0, val0, val0, val0, dst, dst_stride);
130 #define INTRA_DC_TL_4x4(dir) \ 131 void ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 132 const uint8_t *left, \ 133 const uint8_t *top) \ 136 v16i8 store, data = { 0 }; \ 141 data = (v16i8) __msa_insert_w((v4i32) data, 0, val0); \ 142 sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data); \ 143 sum_w = __msa_hadd_u_w(sum_h, sum_h); \ 144 sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2); \ 145 store = __msa_splati_b((v16i8) sum_w, 0); \ 146 val0 = __msa_copy_u_w((v4i32) store, 0); \ 148 SW4(val0, val0, val0, val0, dst, dst_stride); \ 166 sum_h = __msa_hadd_u_h(src, src);
167 sum_w = __msa_hadd_u_w(sum_h, sum_h);
168 sum_d = __msa_hadd_u_d(sum_w, sum_w);
169 sum_w = (v4u32) __msa_pckev_w((v4i32)
sum_d, (v4i32) sum_d);
170 sum_d = __msa_hadd_u_d(sum_w, sum_w);
171 sum_w = (v4u32) __msa_srari_w((v4i32)
sum_d, 4);
172 store = __msa_splati_b((v16i8) sum_w, 0);
173 val0 = __msa_copy_u_d((v2i64) store, 0);
175 SD4(val0, val0, val0, val0, dst, dst_stride);
176 dst += (4 * dst_stride);
177 SD4(val0, val0, val0, val0, dst, dst_stride);
180 #define INTRA_DC_TL_8x8(dir) \ 181 void ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 182 const uint8_t *left, \ 183 const uint8_t *top) \ 187 v16u8 data = { 0 }; \ 193 data = (v16u8) __msa_insert_d((v2i64) data, 0, val0); \ 194 sum_h = __msa_hadd_u_h(data, data); \ 195 sum_w = __msa_hadd_u_w(sum_h, sum_h); \ 196 sum_d = __msa_hadd_u_d(sum_w, sum_w); \ 197 sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3); \ 198 store = __msa_splati_b((v16i8) sum_w, 0); \ 199 val0 = __msa_copy_u_d((v2i64) store, 0); \ 201 SD4(val0, val0, val0, val0, dst, dst_stride); \ 202 dst += (4 * dst_stride); \ 203 SD4(val0, val0, val0, val0, dst, dst_stride); \ 213 v8u16 sum_h, sum_top, sum_left;
217 top =
LD_UB(src_top);
218 left =
LD_UB(src_left);
220 sum_h = sum_top + sum_left;
221 sum_w = __msa_hadd_u_w(sum_h, sum_h);
222 sum_d = __msa_hadd_u_d(sum_w, sum_w);
223 sum_w = (v4u32) __msa_pckev_w((v4i32)
sum_d, (v4i32) sum_d);
224 sum_d = __msa_hadd_u_d(sum_w, sum_w);
225 sum_w = (v4u32) __msa_srari_w((v4i32)
sum_d, 5);
226 out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
228 ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
229 dst += (8 * dst_stride);
230 ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
233 #define INTRA_DC_TL_16x16(dir) \ 234 void ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 235 const uint8_t *left, \ 236 const uint8_t *top) \ 244 sum_h = __msa_hadd_u_h(data, data); \ 245 sum_w = __msa_hadd_u_w(sum_h, sum_h); \ 246 sum_d = __msa_hadd_u_d(sum_w, sum_w); \ 247 sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); \ 248 sum_d = __msa_hadd_u_d(sum_w, sum_w); \ 249 sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4); \ 250 out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); \ 252 ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ 253 dst += (8 * dst_stride); \ 254 ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ 263 v16u8 top0, top1, left0, left1,
out;
264 v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
268 LD_UB2(src_top, 16, top0, top1);
269 LD_UB2(src_left, 16, left0, left1);
272 sum_h = sum_top0 + sum_top1;
273 sum_h += sum_left0 + sum_left1;
274 sum_w = __msa_hadd_u_w(sum_h, sum_h);
275 sum_d = __msa_hadd_u_d(sum_w, sum_w);
276 sum_w = (v4u32) __msa_pckev_w((v4i32)
sum_d, (v4i32) sum_d);
277 sum_d = __msa_hadd_u_d(sum_w, sum_w);
278 sum_w = (v4u32) __msa_srari_w((v4i32)
sum_d, 6);
279 out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
281 for (row = 16; row--;)
283 ST_UB2(out, out, dst, 16);
285 ST_UB2(out, out, dst, 16);
290 #define INTRA_DC_TL_32x32(dir) \ 291 void ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 292 const uint8_t *left, \ 293 const uint8_t *top) \ 296 v16u8 data0, data1, out; \ 297 v8u16 sum_h, sum_data0, sum_data1; \ 301 LD_UB2(dir, 16, data0, data1); \ 302 HADD_UB2_UH(data0, data1, sum_data0, sum_data1); \ 303 sum_h = sum_data0 + sum_data1; \ 304 sum_w = __msa_hadd_u_w(sum_h, sum_h); \ 305 sum_d = __msa_hadd_u_d(sum_w, sum_w); \ 306 sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); \ 307 sum_d = __msa_hadd_u_d(sum_w, sum_w); \ 308 sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5); \ 309 out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); \ 311 for (row = 16; row--;) \ 313 ST_UB2(out, out, dst, 16); \ 315 ST_UB2(out, out, dst, 16); \ 322 #define INTRA_PREDICT_VALDC_16X16_MSA(val) \ 323 void ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 324 const uint8_t *left, const uint8_t *top) \ 326 v16u8 out = (v16u8) __msa_ldi_b(val); \ 328 ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ 329 dst += (8 * dst_stride); \ 330 ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ 337 #define INTRA_PREDICT_VALDC_32X32_MSA(val) \ 338 void ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 339 const uint8_t *left, const uint8_t *top) \ 342 v16u8 out = (v16u8) __msa_ldi_b(val); \ 344 for (row = 16; row--;) \ 346 ST_UB2(out, out, dst, 16); \ 348 ST_UB2(out, out, dst, 16); \ 361 uint8_t top_left = src_top_ptr[-1];
362 v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
364 v8u16 src_top_left, vec0, vec1, vec2, vec3;
366 src_top_left = (v8u16) __msa_fill_h(top_left);
367 src_top =
LD_SB(src_top_ptr);
369 src_left0 = __msa_fill_b(left >> 24);
370 src_left1 = __msa_fill_b(left >> 16);
371 src_left2 = __msa_fill_b(left >> 8);
372 src_left3 = __msa_fill_b(left);
374 ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
375 src_left3, src_top, src0, src1, src2, src3);
376 HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
381 ST_W2(tmp0, 0, 2, dst, dst_stride);
382 ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
388 uint8_t top_left = src_top_ptr[-1];
389 uint32_t loop_cnt,
left;
390 v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
391 v8u16 src_top_left, vec0, vec1, vec2, vec3;
394 src_top =
LD_SB(src_top_ptr);
395 src_top_left = (v8u16) __msa_fill_h(top_left);
398 for (loop_cnt = 2; loop_cnt--;) {
400 src_left0 = __msa_fill_b(left >> 24);
401 src_left1 = __msa_fill_b(left >> 16);
402 src_left2 = __msa_fill_b(left >> 8);
403 src_left3 = __msa_fill_b(left);
406 ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
407 src_left3, src_top, src0, src1, src2, src3);
408 HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
413 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
414 dst += (4 * dst_stride);
421 uint8_t top_left = src_top_ptr[-1];
422 uint32_t loop_cnt,
left;
423 v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
424 v8u16 src_top_left, res_r, res_l;
426 src_top =
LD_SB(src_top_ptr);
427 src_top_left = (v8u16) __msa_fill_h(top_left);
430 for (loop_cnt = 4; loop_cnt--;) {
432 src_left0 = __msa_fill_b(left >> 24);
433 src_left1 = __msa_fill_b(left >> 16);
434 src_left2 = __msa_fill_b(left >> 8);
435 src_left3 = __msa_fill_b(left);
472 uint8_t top_left = src_top_ptr[-1];
473 uint32_t loop_cnt,
left;
474 v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
475 v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
477 src_top0 =
LD_SB(src_top_ptr);
478 src_top1 =
LD_SB(src_top_ptr + 16);
479 src_top_left = (v8u16) __msa_fill_h(top_left);
482 for (loop_cnt = 8; loop_cnt--;) {
484 src_left0 = __msa_fill_b(left >> 24);
485 src_left1 = __msa_fill_b(left >> 16);
486 src_left2 = __msa_fill_b(left >> 8);
487 src_left3 = __msa_fill_b(left);
490 ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
491 ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
492 HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
496 SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
501 ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
502 ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
503 HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
507 SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
512 ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
513 ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
514 HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
518 SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
523 ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
524 ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
525 HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
529 SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, const uint8_t *src)
void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)
#define INTRA_PREDICT_VALDC_16X16_MSA(val)
void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, const uint8_t *top)
static void sum_d(const int *input, int *output, int len)
#define INTRA_DC_TL_32x32(dir)
void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, const uint8_t *src)
#define PCKEV_ST_SB(in0, in1, pdst)
void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)
#define INTRA_DC_TL_16x16(dir)
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define INTRA_DC_TL_8x8(dir)
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)
#define SW4(in0, in1, in2, in3, pdst, stride)
void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, const uint8_t *top)
void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2]...the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so...,+,-,+,-,+,+,-,+,-,+,...hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32-hcoeff[1]-hcoeff[2]-...a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2}an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||.........intra?||||:Block01:yes no||||:Block02:.................||||:Block03::y DC::ref index:||||:Block04::cb DC::motion x:||||.........:cr DC::motion y:||||.................|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------------------------------|||Y subbands||Cb subbands||Cr subbands||||------||------||------|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||------||------||------||||------||------||------|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||------||------||------||||------||------||------|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||------||------||------||||------||------||------|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------------------------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction------------|\Dequantization-------------------\||Reference frames|\IDWT|--------------|Motion\|||Frame 0||Frame 1||Compensation.OBMC v-------|--------------|--------------.\------> Frame n output Frame Frame<----------------------------------/|...|-------------------Range Coder:============Binary Range Coder:-------------------The implemented range coder is an adapted version based upon"Range encoding: an algorithm for removing redundancy from a digitised message."by G.N.N.Martin.The symbols encoded by the Snow range coder are bits(0|1).The associated probabilities are not fix but change depending on the symbol mix seen so far.bit seen|new state---------+-----------------------------------------------0|256-state_transition_table[256-old_state];1|state_transition_table[old_state];state_transition_table={0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:-------------------------FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector Prediction:=========================1.the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)
#define INSERT_W2_SB(...)
#define INTRA_DC_TL_4x4(dir)
#define SD4(in0, in1, in2, in3, pdst, stride)
void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
#define INSERT_D2_UB(...)
#define ST_W2(in, idx0, idx1, pdst, stride)
#define INTRA_PREDICT_VALDC_32X32_MSA(val)
void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)