FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
vp9.c
Go to the documentation of this file.
1 /*
2  * VP9 compatible video decoder
3  *
4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "avcodec.h"
25 #include "get_bits.h"
26 #include "internal.h"
27 #include "thread.h"
28 #include "videodsp.h"
29 #include "vp56.h"
30 #include "vp9.h"
31 #include "vp9data.h"
32 #include "vp9dsp.h"
33 #include "libavutil/avassert.h"
34 
35 #define VP9_SYNCCODE 0x498342
36 
41 };
42 
43 enum BlockLevel {
48 };
49 
50 enum BlockSize {
65 };
66 
67 struct VP9mvrefPair {
68  VP56mv mv[2];
69  int8_t ref[2];
70 };
71 
72 typedef struct VP9Frame {
76  struct VP9mvrefPair *mv;
77 } VP9Frame;
78 
79 struct VP9Filter {
80  uint8_t level[8 * 8];
81  uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
82  [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
83 };
84 
85 typedef struct VP9Block {
88  VP56mv mv[4 /* b_idx */][2 /* ref */];
89  enum BlockSize bs;
90  enum TxfmMode tx, uvtx;
91  enum BlockLevel bl;
93 } VP9Block;
94 
95 typedef struct VP9Context {
101  unsigned c_b_size;
104  int row, row7, col, col7;
106  ptrdiff_t y_stride, uv_stride;
107 
108  // bitstream header
130 #define CUR_FRAME 0
131 #define LAST_FRAME 1
133 
134  struct {
136  int8_t sharpness;
139  } filter;
140  struct {
142  int8_t mode[2];
143  int8_t ref[4];
144  } lf_delta;
148 #define MAX_SEGMENT 8
149  struct {
154  struct {
160  int16_t q_val;
161  int8_t lf_val;
162  int16_t qmul[2][2];
163  uint8_t lflvl[4][2];
164  } feat[MAX_SEGMENT];
165  } segmentation;
166  struct {
168  unsigned tile_cols, tile_rows;
170  } tiling;
171  unsigned sb_cols, sb_rows, rows, cols;
172  struct {
174  uint8_t coef[4][2][2][6][6][3];
175  } prob_ctx[4];
176  struct {
177  prob_context p;
178  uint8_t coef[4][2][2][6][6][11];
181  } prob;
182  struct {
183  unsigned y_mode[4][10];
184  unsigned uv_mode[10][10];
185  unsigned filter[4][3];
186  unsigned mv_mode[7][4];
187  unsigned intra[4][2];
188  unsigned comp[5][2];
189  unsigned single_ref[5][2][2];
190  unsigned comp_ref[5][2];
191  unsigned tx32p[2][4];
192  unsigned tx16p[2][3];
193  unsigned tx8p[2][2];
194  unsigned skip[3][2];
195  unsigned mv_joint[4];
196  struct {
197  unsigned sign[2];
198  unsigned classes[11];
199  unsigned class0[2];
200  unsigned bits[10][2];
201  unsigned class0_fp[2][4];
202  unsigned fp[4];
203  unsigned class0_hp[2];
204  unsigned hp[2];
205  } mv_comp[2];
206  unsigned partition[4][4][4];
207  unsigned coef[4][2][2][6][6][3];
208  unsigned eob[4][2][2][6][6][2];
209  } counts;
212 
213  // contextual (left/above) cache
228  // FIXME maybe merge some of the below in a flags field?
239 
240  // whole-frame cache
242  struct VP9Filter *lflvl;
244 
245  // block reconstruction intermediates
247  int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
249  struct { int x, y; } min_mv, max_mv;
251  DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
252 } VP9Context;
253 
254 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
255  {
256  { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
257  { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
258  }, {
259  { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
260  { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
261  }
262 };
263 
265 {
266  VP9Context *s = ctx->priv_data;
267  int ret, sz;
268 
269  if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
270  return ret;
271  sz = 64 * s->sb_cols * s->sb_rows;
272  if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
273  ff_thread_release_buffer(ctx, &f->tf);
274  return AVERROR(ENOMEM);
275  }
276 
278  f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
279 
280  // retain segmentation map if it doesn't update
282  !s->intraonly && !s->keyframe && !s->errorres) {
284  }
285 
286  return 0;
287 }
288 
290 {
291  ff_thread_release_buffer(ctx, &f->tf);
293 }
294 
296 {
297  int res;
298 
299  if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
300  return res;
301  } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
302  vp9_unref_frame(ctx, dst);
303  return AVERROR(ENOMEM);
304  }
305 
307  dst->mv = src->mv;
308 
309  return 0;
310 }
311 
312 static int update_size(AVCodecContext *ctx, int w, int h)
313 {
314  VP9Context *s = ctx->priv_data;
315  uint8_t *p;
316 
317  av_assert0(w > 0 && h > 0);
318 
319  if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
320  return 0;
321 
322  ctx->width = w;
323  ctx->height = h;
324  s->sb_cols = (w + 63) >> 6;
325  s->sb_rows = (h + 63) >> 6;
326  s->cols = (w + 7) >> 3;
327  s->rows = (h + 7) >> 3;
328 
329 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
330  av_freep(&s->intra_pred_data[0]);
331  p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
332  if (!p)
333  return AVERROR(ENOMEM);
334  assign(s->intra_pred_data[0], uint8_t *, 64);
335  assign(s->intra_pred_data[1], uint8_t *, 32);
336  assign(s->intra_pred_data[2], uint8_t *, 32);
337  assign(s->above_y_nnz_ctx, uint8_t *, 16);
338  assign(s->above_mode_ctx, uint8_t *, 16);
339  assign(s->above_mv_ctx, VP56mv(*)[2], 16);
341  assign(s->above_skip_ctx, uint8_t *, 8);
342  assign(s->above_txfm_ctx, uint8_t *, 8);
343  assign(s->above_uv_nnz_ctx[0], uint8_t *, 8);
344  assign(s->above_uv_nnz_ctx[1], uint8_t *, 8);
345  assign(s->above_segpred_ctx, uint8_t *, 8);
346  assign(s->above_intra_ctx, uint8_t *, 8);
347  assign(s->above_comp_ctx, uint8_t *, 8);
348  assign(s->above_ref_ctx, uint8_t *, 8);
349  assign(s->above_filter_ctx, uint8_t *, 8);
350  assign(s->lflvl, struct VP9Filter *, 1);
351 #undef assign
352 
353  // these will be re-allocated a little later
354  av_freep(&s->b_base);
355  av_freep(&s->block_base);
356 
357  return 0;
358 }
359 
361 {
362  VP9Context *s = ctx->priv_data;
363 
364  if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
365  return 0;
366 
367  av_free(s->b_base);
368  av_free(s->block_base);
369  if (s->uses_2pass) {
370  int sbs = s->sb_cols * s->sb_rows;
371 
372  s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
373  s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
374  if (!s->b_base || !s->block_base)
375  return AVERROR(ENOMEM);
376  s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
377  s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
378  s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
379  s->uveob_base[0] = s->eob_base + 256 * sbs;
380  s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
381  } else {
382  s->b_base = av_malloc(sizeof(VP9Block));
383  s->block_base = av_mallocz((64 * 64 + 128) * 3);
384  if (!s->b_base || !s->block_base)
385  return AVERROR(ENOMEM);
386  s->uvblock_base[0] = s->block_base + 64 * 64;
387  s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
388  s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
389  s->uveob_base[0] = s->eob_base + 256;
390  s->uveob_base[1] = s->uveob_base[0] + 64;
391  }
393 
394  return 0;
395 }
396 
397 // for some reason the sign bit is at the end, not the start, of a bit sequence
399 {
400  int v = get_bits(gb, n);
401  return get_bits1(gb) ? -v : v;
402 }
403 
405 {
406  return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
407 }
408 
409 // differential forward probability updates
410 static int update_prob(VP56RangeCoder *c, int p)
411 {
412  static const int inv_map_table[254] = {
413  7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
414  189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
415  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
416  25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
417  40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
418  55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
419  70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
420  86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
421  101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
422  116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
423  131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
424  146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
425  161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
426  177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
427  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
428  207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
429  222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
430  237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
431  252, 253,
432  };
433  int d;
434 
435  /* This code is trying to do a differential probability update. For a
436  * current probability A in the range [1, 255], the difference to a new
437  * probability of any value can be expressed differentially as 1-A,255-A
438  * where some part of this (absolute range) exists both in positive as
439  * well as the negative part, whereas another part only exists in one
440  * half. We're trying to code this shared part differentially, i.e.
441  * times two where the value of the lowest bit specifies the sign, and
442  * the single part is then coded on top of this. This absolute difference
443  * then again has a value of [0,254], but a bigger value in this range
444  * indicates that we're further away from the original value A, so we
445  * can code this as a VLC code, since higher values are increasingly
446  * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
447  * updates vs. the 'fine, exact' updates further down the range, which
448  * adds one extra dimension to this differential update model. */
449 
450  if (!vp8_rac_get(c)) {
451  d = vp8_rac_get_uint(c, 4) + 0;
452  } else if (!vp8_rac_get(c)) {
453  d = vp8_rac_get_uint(c, 4) + 16;
454  } else if (!vp8_rac_get(c)) {
455  d = vp8_rac_get_uint(c, 5) + 32;
456  } else {
457  d = vp8_rac_get_uint(c, 7);
458  if (d >= 65)
459  d = (d << 1) - 65 + vp8_rac_get(c);
460  d += 64;
461  }
462 
463  return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
464  255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
465 }
466 
468  const uint8_t *data, int size, int *ref)
469 {
470  VP9Context *s = ctx->priv_data;
471  int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
472  int last_invisible;
473  const uint8_t *data2;
474 
475  /* general header */
476  if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
477  av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
478  return res;
479  }
480  if (get_bits(&s->gb, 2) != 0x2) { // frame marker
481  av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
482  return AVERROR_INVALIDDATA;
483  }
484  s->profile = get_bits1(&s->gb);
485  if (get_bits1(&s->gb)) { // reserved bit
486  av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
487  return AVERROR_INVALIDDATA;
488  }
489  if (get_bits1(&s->gb)) {
490  *ref = get_bits(&s->gb, 3);
491  return 0;
492  }
493  s->last_uses_2pass = s->uses_2pass;
494  s->last_keyframe = s->keyframe;
495  s->keyframe = !get_bits1(&s->gb);
496  last_invisible = s->invisible;
497  s->invisible = !get_bits1(&s->gb);
498  s->errorres = get_bits1(&s->gb);
499  s->use_last_frame_mvs = !s->errorres && !last_invisible;
500  if (s->keyframe) {
501  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
502  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
503  return AVERROR_INVALIDDATA;
504  }
505  s->colorspace = get_bits(&s->gb, 3);
506  if (s->colorspace == 7) { // RGB = profile 1
507  av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
508  return AVERROR_INVALIDDATA;
509  }
510  s->fullrange = get_bits1(&s->gb);
511  // for profile 1, here follows the subsampling bits
512  s->refreshrefmask = 0xff;
513  w = get_bits(&s->gb, 16) + 1;
514  h = get_bits(&s->gb, 16) + 1;
515  if (get_bits1(&s->gb)) // display size
516  skip_bits(&s->gb, 32);
517  } else {
518  s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
519  s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
520  if (s->intraonly) {
521  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
522  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
523  return AVERROR_INVALIDDATA;
524  }
525  s->refreshrefmask = get_bits(&s->gb, 8);
526  w = get_bits(&s->gb, 16) + 1;
527  h = get_bits(&s->gb, 16) + 1;
528  if (get_bits1(&s->gb)) // display size
529  skip_bits(&s->gb, 32);
530  } else {
531  s->refreshrefmask = get_bits(&s->gb, 8);
532  s->refidx[0] = get_bits(&s->gb, 3);
533  s->signbias[0] = get_bits1(&s->gb);
534  s->refidx[1] = get_bits(&s->gb, 3);
535  s->signbias[1] = get_bits1(&s->gb);
536  s->refidx[2] = get_bits(&s->gb, 3);
537  s->signbias[2] = get_bits1(&s->gb);
538  if (!s->refs[s->refidx[0]].f->data[0] ||
539  !s->refs[s->refidx[1]].f->data[0] ||
540  !s->refs[s->refidx[2]].f->data[0]) {
541  av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
542  return AVERROR_INVALIDDATA;
543  }
544  if (get_bits1(&s->gb)) {
545  w = s->refs[s->refidx[0]].f->width;
546  h = s->refs[s->refidx[0]].f->height;
547  } else if (get_bits1(&s->gb)) {
548  w = s->refs[s->refidx[1]].f->width;
549  h = s->refs[s->refidx[1]].f->height;
550  } else if (get_bits1(&s->gb)) {
551  w = s->refs[s->refidx[2]].f->width;
552  h = s->refs[s->refidx[2]].f->height;
553  } else {
554  w = get_bits(&s->gb, 16) + 1;
555  h = get_bits(&s->gb, 16) + 1;
556  }
557  // Note that in this code, "CUR_FRAME" is actually before we
558  // have formally allocated a frame, and thus actually represents
559  // the _last_ frame
560  s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
561  s->frames[CUR_FRAME].tf.f->height == h;
562  if (get_bits1(&s->gb)) // display size
563  skip_bits(&s->gb, 32);
564  s->highprecisionmvs = get_bits1(&s->gb);
566  get_bits(&s->gb, 2);
567  s->allowcompinter = s->signbias[0] != s->signbias[1] ||
568  s->signbias[0] != s->signbias[2];
569  if (s->allowcompinter) {
570  if (s->signbias[0] == s->signbias[1]) {
571  s->fixcompref = 2;
572  s->varcompref[0] = 0;
573  s->varcompref[1] = 1;
574  } else if (s->signbias[0] == s->signbias[2]) {
575  s->fixcompref = 1;
576  s->varcompref[0] = 0;
577  s->varcompref[1] = 2;
578  } else {
579  s->fixcompref = 0;
580  s->varcompref[0] = 1;
581  s->varcompref[1] = 2;
582  }
583  }
584  }
585  }
586  s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
587  s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
588  s->framectxid = c = get_bits(&s->gb, 2);
589 
590  /* loopfilter header data */
591  s->filter.level = get_bits(&s->gb, 6);
592  sharp = get_bits(&s->gb, 3);
593  // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
594  // the old cache values since they are still valid
595  if (s->filter.sharpness != sharp)
596  memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
597  s->filter.sharpness = sharp;
598  if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
599  if (get_bits1(&s->gb)) {
600  for (i = 0; i < 4; i++)
601  if (get_bits1(&s->gb))
602  s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
603  for (i = 0; i < 2; i++)
604  if (get_bits1(&s->gb))
605  s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
606  }
607  } else {
608  memset(&s->lf_delta, 0, sizeof(s->lf_delta));
609  }
610 
611  /* quantization header data */
612  s->yac_qi = get_bits(&s->gb, 8);
613  s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
614  s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
615  s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
616  s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
617  s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
618 
619  /* segmentation header info */
620  if ((s->segmentation.enabled = get_bits1(&s->gb))) {
621  if ((s->segmentation.update_map = get_bits1(&s->gb))) {
622  for (i = 0; i < 7; i++)
623  s->prob.seg[i] = get_bits1(&s->gb) ?
624  get_bits(&s->gb, 8) : 255;
625  if ((s->segmentation.temporal = get_bits1(&s->gb))) {
626  for (i = 0; i < 3; i++)
627  s->prob.segpred[i] = get_bits1(&s->gb) ?
628  get_bits(&s->gb, 8) : 255;
629  }
630  }
631  if ((!s->segmentation.update_map || s->segmentation.temporal) &&
632  (w != s->frames[CUR_FRAME].tf.f->width ||
633  h != s->frames[CUR_FRAME].tf.f->height)) {
634  av_log(ctx, AV_LOG_ERROR,
635  "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
637  return AVERROR_INVALIDDATA;
638  }
639 
640  if (get_bits1(&s->gb)) {
642  for (i = 0; i < 8; i++) {
643  if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
644  s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
645  if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
646  s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
647  if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
648  s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
649  s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
650  }
651  }
652  } else {
653  s->segmentation.feat[0].q_enabled = 0;
654  s->segmentation.feat[0].lf_enabled = 0;
655  s->segmentation.feat[0].skip_enabled = 0;
656  s->segmentation.feat[0].ref_enabled = 0;
657  }
658 
659  // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
660  for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
661  int qyac, qydc, quvac, quvdc, lflvl, sh;
662 
663  if (s->segmentation.feat[i].q_enabled) {
665  qyac = s->segmentation.feat[i].q_val;
666  else
667  qyac = s->yac_qi + s->segmentation.feat[i].q_val;
668  } else {
669  qyac = s->yac_qi;
670  }
671  qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
672  quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
673  quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
674  qyac = av_clip_uintp2(qyac, 8);
675 
676  s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
677  s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
678  s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
679  s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
680 
681  sh = s->filter.level >= 32;
682  if (s->segmentation.feat[i].lf_enabled) {
684  lflvl = s->segmentation.feat[i].lf_val;
685  else
686  lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
687  } else {
688  lflvl = s->filter.level;
689  }
690  s->segmentation.feat[i].lflvl[0][0] =
691  s->segmentation.feat[i].lflvl[0][1] =
692  av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
693  for (j = 1; j < 4; j++) {
694  s->segmentation.feat[i].lflvl[j][0] =
695  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
696  s->lf_delta.mode[0]) << sh), 6);
697  s->segmentation.feat[i].lflvl[j][1] =
698  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
699  s->lf_delta.mode[1]) << sh), 6);
700  }
701  }
702 
703  /* tiling info */
704  if ((res = update_size(ctx, w, h)) < 0) {
705  av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
706  return res;
707  }
708  for (s->tiling.log2_tile_cols = 0;
709  (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
710  s->tiling.log2_tile_cols++) ;
711  for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
712  max = FFMAX(0, max - 1);
713  while (max > s->tiling.log2_tile_cols) {
714  if (get_bits1(&s->gb))
715  s->tiling.log2_tile_cols++;
716  else
717  break;
718  }
719  s->tiling.log2_tile_rows = decode012(&s->gb);
720  s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
721  if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
722  s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
723  s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
724  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
725  if (!s->c_b) {
726  av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
727  return AVERROR(ENOMEM);
728  }
729  }
730 
731  if (s->keyframe || s->errorres || s->intraonly) {
732  s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
733  s->prob_ctx[3].p = vp9_default_probs;
734  memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
735  sizeof(vp9_default_coef_probs));
736  memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
737  sizeof(vp9_default_coef_probs));
738  memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
739  sizeof(vp9_default_coef_probs));
740  memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
741  sizeof(vp9_default_coef_probs));
742  }
743 
744  // next 16 bits is size of the rest of the header (arith-coded)
745  size2 = get_bits(&s->gb, 16);
746  data2 = align_get_bits(&s->gb);
747  if (size2 > size - (data2 - data)) {
748  av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
749  return AVERROR_INVALIDDATA;
750  }
751  ff_vp56_init_range_decoder(&s->c, data2, size2);
752  if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
753  av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
754  return AVERROR_INVALIDDATA;
755  }
756 
757  if (s->keyframe || s->intraonly) {
758  memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
759  } else {
760  memset(&s->counts, 0, sizeof(s->counts));
761  }
762  // FIXME is it faster to not copy here, but do it down in the fw updates
763  // as explicit copies if the fw update is missing (and skip the copy upon
764  // fw update)?
765  s->prob.p = s->prob_ctx[c].p;
766 
767  // txfm updates
768  if (s->lossless) {
769  s->txfmmode = TX_4X4;
770  } else {
771  s->txfmmode = vp8_rac_get_uint(&s->c, 2);
772  if (s->txfmmode == 3)
773  s->txfmmode += vp8_rac_get(&s->c);
774 
775  if (s->txfmmode == TX_SWITCHABLE) {
776  for (i = 0; i < 2; i++)
777  if (vp56_rac_get_prob_branchy(&s->c, 252))
778  s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
779  for (i = 0; i < 2; i++)
780  for (j = 0; j < 2; j++)
781  if (vp56_rac_get_prob_branchy(&s->c, 252))
782  s->prob.p.tx16p[i][j] =
783  update_prob(&s->c, s->prob.p.tx16p[i][j]);
784  for (i = 0; i < 2; i++)
785  for (j = 0; j < 3; j++)
786  if (vp56_rac_get_prob_branchy(&s->c, 252))
787  s->prob.p.tx32p[i][j] =
788  update_prob(&s->c, s->prob.p.tx32p[i][j]);
789  }
790  }
791 
792  // coef updates
793  for (i = 0; i < 4; i++) {
794  uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
795  if (vp8_rac_get(&s->c)) {
796  for (j = 0; j < 2; j++)
797  for (k = 0; k < 2; k++)
798  for (l = 0; l < 6; l++)
799  for (m = 0; m < 6; m++) {
800  uint8_t *p = s->prob.coef[i][j][k][l][m];
801  uint8_t *r = ref[j][k][l][m];
802  if (m >= 3 && l == 0) // dc only has 3 pt
803  break;
804  for (n = 0; n < 3; n++) {
805  if (vp56_rac_get_prob_branchy(&s->c, 252)) {
806  p[n] = update_prob(&s->c, r[n]);
807  } else {
808  p[n] = r[n];
809  }
810  }
811  p[3] = 0;
812  }
813  } else {
814  for (j = 0; j < 2; j++)
815  for (k = 0; k < 2; k++)
816  for (l = 0; l < 6; l++)
817  for (m = 0; m < 6; m++) {
818  uint8_t *p = s->prob.coef[i][j][k][l][m];
819  uint8_t *r = ref[j][k][l][m];
820  if (m > 3 && l == 0) // dc only has 3 pt
821  break;
822  memcpy(p, r, 3);
823  p[3] = 0;
824  }
825  }
826  if (s->txfmmode == i)
827  break;
828  }
829 
830  // mode updates
831  for (i = 0; i < 3; i++)
832  if (vp56_rac_get_prob_branchy(&s->c, 252))
833  s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
834  if (!s->keyframe && !s->intraonly) {
835  for (i = 0; i < 7; i++)
836  for (j = 0; j < 3; j++)
837  if (vp56_rac_get_prob_branchy(&s->c, 252))
838  s->prob.p.mv_mode[i][j] =
839  update_prob(&s->c, s->prob.p.mv_mode[i][j]);
840 
841  if (s->filtermode == FILTER_SWITCHABLE)
842  for (i = 0; i < 4; i++)
843  for (j = 0; j < 2; j++)
844  if (vp56_rac_get_prob_branchy(&s->c, 252))
845  s->prob.p.filter[i][j] =
846  update_prob(&s->c, s->prob.p.filter[i][j]);
847 
848  for (i = 0; i < 4; i++)
849  if (vp56_rac_get_prob_branchy(&s->c, 252))
850  s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
851 
852  if (s->allowcompinter) {
853  s->comppredmode = vp8_rac_get(&s->c);
854  if (s->comppredmode)
855  s->comppredmode += vp8_rac_get(&s->c);
856  if (s->comppredmode == PRED_SWITCHABLE)
857  for (i = 0; i < 5; i++)
858  if (vp56_rac_get_prob_branchy(&s->c, 252))
859  s->prob.p.comp[i] =
860  update_prob(&s->c, s->prob.p.comp[i]);
861  } else {
863  }
864 
865  if (s->comppredmode != PRED_COMPREF) {
866  for (i = 0; i < 5; i++) {
867  if (vp56_rac_get_prob_branchy(&s->c, 252))
868  s->prob.p.single_ref[i][0] =
869  update_prob(&s->c, s->prob.p.single_ref[i][0]);
870  if (vp56_rac_get_prob_branchy(&s->c, 252))
871  s->prob.p.single_ref[i][1] =
872  update_prob(&s->c, s->prob.p.single_ref[i][1]);
873  }
874  }
875 
876  if (s->comppredmode != PRED_SINGLEREF) {
877  for (i = 0; i < 5; i++)
878  if (vp56_rac_get_prob_branchy(&s->c, 252))
879  s->prob.p.comp_ref[i] =
880  update_prob(&s->c, s->prob.p.comp_ref[i]);
881  }
882 
883  for (i = 0; i < 4; i++)
884  for (j = 0; j < 9; j++)
885  if (vp56_rac_get_prob_branchy(&s->c, 252))
886  s->prob.p.y_mode[i][j] =
887  update_prob(&s->c, s->prob.p.y_mode[i][j]);
888 
889  for (i = 0; i < 4; i++)
890  for (j = 0; j < 4; j++)
891  for (k = 0; k < 3; k++)
892  if (vp56_rac_get_prob_branchy(&s->c, 252))
893  s->prob.p.partition[3 - i][j][k] =
894  update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
895 
896  // mv fields don't use the update_prob subexp model for some reason
897  for (i = 0; i < 3; i++)
898  if (vp56_rac_get_prob_branchy(&s->c, 252))
899  s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
900 
901  for (i = 0; i < 2; i++) {
902  if (vp56_rac_get_prob_branchy(&s->c, 252))
903  s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
904 
905  for (j = 0; j < 10; j++)
906  if (vp56_rac_get_prob_branchy(&s->c, 252))
907  s->prob.p.mv_comp[i].classes[j] =
908  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
909 
910  if (vp56_rac_get_prob_branchy(&s->c, 252))
911  s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
912 
913  for (j = 0; j < 10; j++)
914  if (vp56_rac_get_prob_branchy(&s->c, 252))
915  s->prob.p.mv_comp[i].bits[j] =
916  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
917  }
918 
919  for (i = 0; i < 2; i++) {
920  for (j = 0; j < 2; j++)
921  for (k = 0; k < 3; k++)
922  if (vp56_rac_get_prob_branchy(&s->c, 252))
923  s->prob.p.mv_comp[i].class0_fp[j][k] =
924  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
925 
926  for (j = 0; j < 3; j++)
927  if (vp56_rac_get_prob_branchy(&s->c, 252))
928  s->prob.p.mv_comp[i].fp[j] =
929  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
930  }
931 
932  if (s->highprecisionmvs) {
933  for (i = 0; i < 2; i++) {
934  if (vp56_rac_get_prob_branchy(&s->c, 252))
935  s->prob.p.mv_comp[i].class0_hp =
936  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
937 
938  if (vp56_rac_get_prob_branchy(&s->c, 252))
939  s->prob.p.mv_comp[i].hp =
940  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
941  }
942  }
943  }
944 
945  return (data2 - data) + size2;
946 }
947 
948 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
949  VP9Context *s)
950 {
951  dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
952  dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
953 }
954 
955 static void find_ref_mvs(VP9Context *s,
956  VP56mv *pmv, int ref, int z, int idx, int sb)
957 {
958  static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
959  [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
960  { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
961  [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
962  { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
963  [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
964  { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
965  [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
966  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
967  [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
968  { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
969  [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
970  { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
971  [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
972  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
973  [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
974  { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
975  [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
976  { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
977  [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
978  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
979  [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
980  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
981  [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
982  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
983  [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
984  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
985  };
986  VP9Block *b = s->b;
987  int row = s->row, col = s->col, row7 = s->row7;
988  const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
989 #define INVALID_MV 0x80008000U
990  uint32_t mem = INVALID_MV;
991  int i;
992 
993 #define RETURN_DIRECT_MV(mv) \
994  do { \
995  uint32_t m = AV_RN32A(&mv); \
996  if (!idx) { \
997  AV_WN32A(pmv, m); \
998  return; \
999  } else if (mem == INVALID_MV) { \
1000  mem = m; \
1001  } else if (m != mem) { \
1002  AV_WN32A(pmv, m); \
1003  return; \
1004  } \
1005  } while (0)
1006 
1007  if (sb >= 0) {
1008  if (sb == 2 || sb == 1) {
1009  RETURN_DIRECT_MV(b->mv[0][z]);
1010  } else if (sb == 3) {
1011  RETURN_DIRECT_MV(b->mv[2][z]);
1012  RETURN_DIRECT_MV(b->mv[1][z]);
1013  RETURN_DIRECT_MV(b->mv[0][z]);
1014  }
1015 
1016 #define RETURN_MV(mv) \
1017  do { \
1018  if (sb > 0) { \
1019  VP56mv tmp; \
1020  uint32_t m; \
1021  clamp_mv(&tmp, &mv, s); \
1022  m = AV_RN32A(&tmp); \
1023  if (!idx) { \
1024  AV_WN32A(pmv, m); \
1025  return; \
1026  } else if (mem == INVALID_MV) { \
1027  mem = m; \
1028  } else if (m != mem) { \
1029  AV_WN32A(pmv, m); \
1030  return; \
1031  } \
1032  } else { \
1033  uint32_t m = AV_RN32A(&mv); \
1034  if (!idx) { \
1035  clamp_mv(pmv, &mv, s); \
1036  return; \
1037  } else if (mem == INVALID_MV) { \
1038  mem = m; \
1039  } else if (m != mem) { \
1040  clamp_mv(pmv, &mv, s); \
1041  return; \
1042  } \
1043  } \
1044  } while (0)
1045 
1046  if (row > 0) {
1047  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1048  if (mv->ref[0] == ref) {
1049  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1050  } else if (mv->ref[1] == ref) {
1051  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1052  }
1053  }
1054  if (col > s->tiling.tile_col_start) {
1055  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1056  if (mv->ref[0] == ref) {
1057  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1058  } else if (mv->ref[1] == ref) {
1059  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1060  }
1061  }
1062  i = 2;
1063  } else {
1064  i = 0;
1065  }
1066 
1067  // previously coded MVs in this neighbourhood, using same reference frame
1068  for (; i < 8; i++) {
1069  int c = p[i][0] + col, r = p[i][1] + row;
1070 
1071  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1072  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1073 
1074  if (mv->ref[0] == ref) {
1075  RETURN_MV(mv->mv[0]);
1076  } else if (mv->ref[1] == ref) {
1077  RETURN_MV(mv->mv[1]);
1078  }
1079  }
1080  }
1081 
1082  // MV at this position in previous frame, using same reference frame
1083  if (s->use_last_frame_mvs) {
1084  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1085 
1086  if (!s->last_uses_2pass)
1087  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1088  if (mv->ref[0] == ref) {
1089  RETURN_MV(mv->mv[0]);
1090  } else if (mv->ref[1] == ref) {
1091  RETURN_MV(mv->mv[1]);
1092  }
1093  }
1094 
1095 #define RETURN_SCALE_MV(mv, scale) \
1096  do { \
1097  if (scale) { \
1098  VP56mv mv_temp = { -mv.x, -mv.y }; \
1099  RETURN_MV(mv_temp); \
1100  } else { \
1101  RETURN_MV(mv); \
1102  } \
1103  } while (0)
1104 
1105  // previously coded MVs in this neighbourhood, using different reference frame
1106  for (i = 0; i < 8; i++) {
1107  int c = p[i][0] + col, r = p[i][1] + row;
1108 
1109  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1110  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1111 
1112  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1113  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1114  }
1115  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1116  // BUG - libvpx has this condition regardless of whether
1117  // we used the first ref MV and pre-scaling
1118  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1119  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1120  }
1121  }
1122  }
1123 
1124  // MV at this position in previous frame, using different reference frame
1125  if (s->use_last_frame_mvs) {
1126  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1127 
1128  // no need to await_progress, because we already did that above
1129  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1130  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1131  }
1132  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1133  // BUG - libvpx has this condition regardless of whether
1134  // we used the first ref MV and pre-scaling
1135  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1136  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1137  }
1138  }
1139 
1140  AV_ZERO32(pmv);
1141 #undef INVALID_MV
1142 #undef RETURN_MV
1143 #undef RETURN_SCALE_MV
1144 }
1145 
1146 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1147 {
1148  int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1149  int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1150  s->prob.p.mv_comp[idx].classes);
1151 
1152  s->counts.mv_comp[idx].sign[sign]++;
1153  s->counts.mv_comp[idx].classes[c]++;
1154  if (c) {
1155  int m;
1156 
1157  for (n = 0, m = 0; m < c; m++) {
1158  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1159  n |= bit << m;
1160  s->counts.mv_comp[idx].bits[m][bit]++;
1161  }
1162  n <<= 3;
1163  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1164  n |= bit << 1;
1165  s->counts.mv_comp[idx].fp[bit]++;
1166  if (hp) {
1167  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1168  s->counts.mv_comp[idx].hp[bit]++;
1169  n |= bit;
1170  } else {
1171  n |= 1;
1172  // bug in libvpx - we count for bw entropy purposes even if the
1173  // bit wasn't coded
1174  s->counts.mv_comp[idx].hp[1]++;
1175  }
1176  n += 8 << c;
1177  } else {
1178  n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1179  s->counts.mv_comp[idx].class0[n]++;
1180  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1181  s->prob.p.mv_comp[idx].class0_fp[n]);
1182  s->counts.mv_comp[idx].class0_fp[n][bit]++;
1183  n = (n << 3) | (bit << 1);
1184  if (hp) {
1185  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1186  s->counts.mv_comp[idx].class0_hp[bit]++;
1187  n |= bit;
1188  } else {
1189  n |= 1;
1190  // bug in libvpx - we count for bw entropy purposes even if the
1191  // bit wasn't coded
1192  s->counts.mv_comp[idx].class0_hp[1]++;
1193  }
1194  }
1195 
1196  return sign ? -(n + 1) : (n + 1);
1197 }
1198 
1199 static void fill_mv(VP9Context *s,
1200  VP56mv *mv, int mode, int sb)
1201 {
1202  VP9Block *b = s->b;
1203 
1204  if (mode == ZEROMV) {
1205  AV_ZERO64(mv);
1206  } else {
1207  int hp;
1208 
1209  // FIXME cache this value and reuse for other subblocks
1210  find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1211  mode == NEWMV ? -1 : sb);
1212  // FIXME maybe move this code into find_ref_mvs()
1213  if ((mode == NEWMV || sb == -1) &&
1214  !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1215  if (mv[0].y & 1) {
1216  if (mv[0].y < 0)
1217  mv[0].y++;
1218  else
1219  mv[0].y--;
1220  }
1221  if (mv[0].x & 1) {
1222  if (mv[0].x < 0)
1223  mv[0].x++;
1224  else
1225  mv[0].x--;
1226  }
1227  }
1228  if (mode == NEWMV) {
1230  s->prob.p.mv_joint);
1231 
1232  s->counts.mv_joint[j]++;
1233  if (j >= MV_JOINT_V)
1234  mv[0].y += read_mv_component(s, 0, hp);
1235  if (j & 1)
1236  mv[0].x += read_mv_component(s, 1, hp);
1237  }
1238 
1239  if (b->comp) {
1240  // FIXME cache this value and reuse for other subblocks
1241  find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1242  mode == NEWMV ? -1 : sb);
1243  if ((mode == NEWMV || sb == -1) &&
1244  !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1245  if (mv[1].y & 1) {
1246  if (mv[1].y < 0)
1247  mv[1].y++;
1248  else
1249  mv[1].y--;
1250  }
1251  if (mv[1].x & 1) {
1252  if (mv[1].x < 0)
1253  mv[1].x++;
1254  else
1255  mv[1].x--;
1256  }
1257  }
1258  if (mode == NEWMV) {
1260  s->prob.p.mv_joint);
1261 
1262  s->counts.mv_joint[j]++;
1263  if (j >= MV_JOINT_V)
1264  mv[1].y += read_mv_component(s, 0, hp);
1265  if (j & 1)
1266  mv[1].x += read_mv_component(s, 1, hp);
1267  }
1268  }
1269  }
1270 }
1271 
1272 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1273  ptrdiff_t stride, int v)
1274 {
1275  switch (w) {
1276  case 1:
1277  do {
1278  *ptr = v;
1279  ptr += stride;
1280  } while (--h);
1281  break;
1282  case 2: {
1283  int v16 = v * 0x0101;
1284  do {
1285  AV_WN16A(ptr, v16);
1286  ptr += stride;
1287  } while (--h);
1288  break;
1289  }
1290  case 4: {
1291  uint32_t v32 = v * 0x01010101;
1292  do {
1293  AV_WN32A(ptr, v32);
1294  ptr += stride;
1295  } while (--h);
1296  break;
1297  }
1298  case 8: {
1299 #if HAVE_FAST_64BIT
1300  uint64_t v64 = v * 0x0101010101010101ULL;
1301  do {
1302  AV_WN64A(ptr, v64);
1303  ptr += stride;
1304  } while (--h);
1305 #else
1306  uint32_t v32 = v * 0x01010101;
1307  do {
1308  AV_WN32A(ptr, v32);
1309  AV_WN32A(ptr + 4, v32);
1310  ptr += stride;
1311  } while (--h);
1312 #endif
1313  break;
1314  }
1315  }
1316 }
1317 
1318 static void decode_mode(AVCodecContext *ctx)
1319 {
1320  static const uint8_t left_ctx[N_BS_SIZES] = {
1321  0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1322  };
1323  static const uint8_t above_ctx[N_BS_SIZES] = {
1324  0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1325  };
1326  static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1328  TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1329  };
1330  VP9Context *s = ctx->priv_data;
1331  VP9Block *b = s->b;
1332  int row = s->row, col = s->col, row7 = s->row7;
1333  enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1334  int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1335  int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1336  int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1337  int vref, filter_id;
1338 
1339  if (!s->segmentation.enabled) {
1340  b->seg_id = 0;
1341  } else if (s->keyframe || s->intraonly) {
1343  } else if (!s->segmentation.update_map ||
1344  (s->segmentation.temporal &&
1346  s->prob.segpred[s->above_segpred_ctx[col] +
1347  s->left_segpred_ctx[row7]]))) {
1348  if (!s->errorres) {
1349  int pred = 8, x;
1350  uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1351 
1352  if (!s->last_uses_2pass)
1353  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1354  for (y = 0; y < h4; y++)
1355  for (x = 0; x < w4; x++)
1356  pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
1357  av_assert1(pred < 8);
1358  b->seg_id = pred;
1359  } else {
1360  b->seg_id = 0;
1361  }
1362 
1363  memset(&s->above_segpred_ctx[col], 1, w4);
1364  memset(&s->left_segpred_ctx[row7], 1, h4);
1365  } else {
1367  s->prob.seg);
1368 
1369  memset(&s->above_segpred_ctx[col], 0, w4);
1370  memset(&s->left_segpred_ctx[row7], 0, h4);
1371  }
1372  if (s->segmentation.enabled &&
1373  (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1374  setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1375  w4, h4, 8 * s->sb_cols, b->seg_id);
1376  }
1377 
1378  b->skip = s->segmentation.enabled &&
1379  s->segmentation.feat[b->seg_id].skip_enabled;
1380  if (!b->skip) {
1381  int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1382  b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1383  s->counts.skip[c][b->skip]++;
1384  }
1385 
1386  if (s->keyframe || s->intraonly) {
1387  b->intra = 1;
1388  } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1389  b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1390  } else {
1391  int c, bit;
1392 
1393  if (have_a && have_l) {
1394  c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1395  c += (c == 2);
1396  } else {
1397  c = have_a ? 2 * s->above_intra_ctx[col] :
1398  have_l ? 2 * s->left_intra_ctx[row7] : 0;
1399  }
1400  bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1401  s->counts.intra[c][bit]++;
1402  b->intra = !bit;
1403  }
1404 
1405  if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1406  int c;
1407  if (have_a) {
1408  if (have_l) {
1409  c = (s->above_skip_ctx[col] ? max_tx :
1410  s->above_txfm_ctx[col]) +
1411  (s->left_skip_ctx[row7] ? max_tx :
1412  s->left_txfm_ctx[row7]) > max_tx;
1413  } else {
1414  c = s->above_skip_ctx[col] ? 1 :
1415  (s->above_txfm_ctx[col] * 2 > max_tx);
1416  }
1417  } else if (have_l) {
1418  c = s->left_skip_ctx[row7] ? 1 :
1419  (s->left_txfm_ctx[row7] * 2 > max_tx);
1420  } else {
1421  c = 1;
1422  }
1423  switch (max_tx) {
1424  case TX_32X32:
1425  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1426  if (b->tx) {
1427  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1428  if (b->tx == 2)
1429  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1430  }
1431  s->counts.tx32p[c][b->tx]++;
1432  break;
1433  case TX_16X16:
1434  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1435  if (b->tx)
1436  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1437  s->counts.tx16p[c][b->tx]++;
1438  break;
1439  case TX_8X8:
1440  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1441  s->counts.tx8p[c][b->tx]++;
1442  break;
1443  case TX_4X4:
1444  b->tx = TX_4X4;
1445  break;
1446  }
1447  } else {
1448  b->tx = FFMIN(max_tx, s->txfmmode);
1449  }
1450 
1451  if (s->keyframe || s->intraonly) {
1452  uint8_t *a = &s->above_mode_ctx[col * 2];
1453  uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1454 
1455  b->comp = 0;
1456  if (b->bs > BS_8x8) {
1457  // FIXME the memory storage intermediates here aren't really
1458  // necessary, they're just there to make the code slightly
1459  // simpler for now
1460  b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1461  vp9_default_kf_ymode_probs[a[0]][l[0]]);
1462  if (b->bs != BS_8x4) {
1464  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1465  l[0] = a[1] = b->mode[1];
1466  } else {
1467  l[0] = a[1] = b->mode[1] = b->mode[0];
1468  }
1469  if (b->bs != BS_4x8) {
1470  b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1471  vp9_default_kf_ymode_probs[a[0]][l[1]]);
1472  if (b->bs != BS_8x4) {
1474  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1475  l[1] = a[1] = b->mode[3];
1476  } else {
1477  l[1] = a[1] = b->mode[3] = b->mode[2];
1478  }
1479  } else {
1480  b->mode[2] = b->mode[0];
1481  l[1] = a[1] = b->mode[3] = b->mode[1];
1482  }
1483  } else {
1485  vp9_default_kf_ymode_probs[*a][*l]);
1486  b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1487  // FIXME this can probably be optimized
1488  memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1489  memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1490  }
1493  } else if (b->intra) {
1494  b->comp = 0;
1495  if (b->bs > BS_8x8) {
1497  s->prob.p.y_mode[0]);
1498  s->counts.y_mode[0][b->mode[0]]++;
1499  if (b->bs != BS_8x4) {
1501  s->prob.p.y_mode[0]);
1502  s->counts.y_mode[0][b->mode[1]]++;
1503  } else {
1504  b->mode[1] = b->mode[0];
1505  }
1506  if (b->bs != BS_4x8) {
1508  s->prob.p.y_mode[0]);
1509  s->counts.y_mode[0][b->mode[2]]++;
1510  if (b->bs != BS_8x4) {
1512  s->prob.p.y_mode[0]);
1513  s->counts.y_mode[0][b->mode[3]]++;
1514  } else {
1515  b->mode[3] = b->mode[2];
1516  }
1517  } else {
1518  b->mode[2] = b->mode[0];
1519  b->mode[3] = b->mode[1];
1520  }
1521  } else {
1522  static const uint8_t size_group[10] = {
1523  3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1524  };
1525  int sz = size_group[b->bs];
1526 
1528  s->prob.p.y_mode[sz]);
1529  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1530  s->counts.y_mode[sz][b->mode[3]]++;
1531  }
1533  s->prob.p.uv_mode[b->mode[3]]);
1534  s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1535  } else {
1536  static const uint8_t inter_mode_ctx_lut[14][14] = {
1537  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1538  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1539  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1540  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1541  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1542  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1543  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1544  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1545  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1546  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1547  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1548  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1549  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1550  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1551  };
1552 
1553  if (s->segmentation.feat[b->seg_id].ref_enabled) {
1554  av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1555  b->comp = 0;
1556  b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1557  } else {
1558  // read comp_pred flag
1559  if (s->comppredmode != PRED_SWITCHABLE) {
1560  b->comp = s->comppredmode == PRED_COMPREF;
1561  } else {
1562  int c;
1563 
1564  // FIXME add intra as ref=0xff (or -1) to make these easier?
1565  if (have_a) {
1566  if (have_l) {
1567  if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1568  c = 4;
1569  } else if (s->above_comp_ctx[col]) {
1570  c = 2 + (s->left_intra_ctx[row7] ||
1571  s->left_ref_ctx[row7] == s->fixcompref);
1572  } else if (s->left_comp_ctx[row7]) {
1573  c = 2 + (s->above_intra_ctx[col] ||
1574  s->above_ref_ctx[col] == s->fixcompref);
1575  } else {
1576  c = (!s->above_intra_ctx[col] &&
1577  s->above_ref_ctx[col] == s->fixcompref) ^
1578  (!s->left_intra_ctx[row7] &&
1579  s->left_ref_ctx[row & 7] == s->fixcompref);
1580  }
1581  } else {
1582  c = s->above_comp_ctx[col] ? 3 :
1583  (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1584  }
1585  } else if (have_l) {
1586  c = s->left_comp_ctx[row7] ? 3 :
1587  (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1588  } else {
1589  c = 1;
1590  }
1591  b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1592  s->counts.comp[c][b->comp]++;
1593  }
1594 
1595  // read actual references
1596  // FIXME probably cache a few variables here to prevent repetitive
1597  // memory accesses below
1598  if (b->comp) /* two references */ {
1599  int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1600 
1601  b->ref[fix_idx] = s->fixcompref;
1602  // FIXME can this codeblob be replaced by some sort of LUT?
1603  if (have_a) {
1604  if (have_l) {
1605  if (s->above_intra_ctx[col]) {
1606  if (s->left_intra_ctx[row7]) {
1607  c = 2;
1608  } else {
1609  c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1610  }
1611  } else if (s->left_intra_ctx[row7]) {
1612  c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1613  } else {
1614  int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1615 
1616  if (refl == refa && refa == s->varcompref[1]) {
1617  c = 0;
1618  } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1619  if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1620  (refl == s->fixcompref && refa == s->varcompref[0])) {
1621  c = 4;
1622  } else {
1623  c = (refa == refl) ? 3 : 1;
1624  }
1625  } else if (!s->left_comp_ctx[row7]) {
1626  if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1627  c = 1;
1628  } else {
1629  c = (refl == s->varcompref[1] &&
1630  refa != s->varcompref[1]) ? 2 : 4;
1631  }
1632  } else if (!s->above_comp_ctx[col]) {
1633  if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1634  c = 1;
1635  } else {
1636  c = (refa == s->varcompref[1] &&
1637  refl != s->varcompref[1]) ? 2 : 4;
1638  }
1639  } else {
1640  c = (refl == refa) ? 4 : 2;
1641  }
1642  }
1643  } else {
1644  if (s->above_intra_ctx[col]) {
1645  c = 2;
1646  } else if (s->above_comp_ctx[col]) {
1647  c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1648  } else {
1649  c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1650  }
1651  }
1652  } else if (have_l) {
1653  if (s->left_intra_ctx[row7]) {
1654  c = 2;
1655  } else if (s->left_comp_ctx[row7]) {
1656  c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1657  } else {
1658  c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1659  }
1660  } else {
1661  c = 2;
1662  }
1663  bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1664  b->ref[var_idx] = s->varcompref[bit];
1665  s->counts.comp_ref[c][bit]++;
1666  } else /* single reference */ {
1667  int bit, c;
1668 
1669  if (have_a && !s->above_intra_ctx[col]) {
1670  if (have_l && !s->left_intra_ctx[row7]) {
1671  if (s->left_comp_ctx[row7]) {
1672  if (s->above_comp_ctx[col]) {
1673  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1674  !s->above_ref_ctx[col]);
1675  } else {
1676  c = (3 * !s->above_ref_ctx[col]) +
1677  (!s->fixcompref || !s->left_ref_ctx[row7]);
1678  }
1679  } else if (s->above_comp_ctx[col]) {
1680  c = (3 * !s->left_ref_ctx[row7]) +
1681  (!s->fixcompref || !s->above_ref_ctx[col]);
1682  } else {
1683  c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1684  }
1685  } else if (s->above_intra_ctx[col]) {
1686  c = 2;
1687  } else if (s->above_comp_ctx[col]) {
1688  c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1689  } else {
1690  c = 4 * (!s->above_ref_ctx[col]);
1691  }
1692  } else if (have_l && !s->left_intra_ctx[row7]) {
1693  if (s->left_intra_ctx[row7]) {
1694  c = 2;
1695  } else if (s->left_comp_ctx[row7]) {
1696  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1697  } else {
1698  c = 4 * (!s->left_ref_ctx[row7]);
1699  }
1700  } else {
1701  c = 2;
1702  }
1703  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1704  s->counts.single_ref[c][0][bit]++;
1705  if (!bit) {
1706  b->ref[0] = 0;
1707  } else {
1708  // FIXME can this codeblob be replaced by some sort of LUT?
1709  if (have_a) {
1710  if (have_l) {
1711  if (s->left_intra_ctx[row7]) {
1712  if (s->above_intra_ctx[col]) {
1713  c = 2;
1714  } else if (s->above_comp_ctx[col]) {
1715  c = 1 + 2 * (s->fixcompref == 1 ||
1716  s->above_ref_ctx[col] == 1);
1717  } else if (!s->above_ref_ctx[col]) {
1718  c = 3;
1719  } else {
1720  c = 4 * (s->above_ref_ctx[col] == 1);
1721  }
1722  } else if (s->above_intra_ctx[col]) {
1723  if (s->left_intra_ctx[row7]) {
1724  c = 2;
1725  } else if (s->left_comp_ctx[row7]) {
1726  c = 1 + 2 * (s->fixcompref == 1 ||
1727  s->left_ref_ctx[row7] == 1);
1728  } else if (!s->left_ref_ctx[row7]) {
1729  c = 3;
1730  } else {
1731  c = 4 * (s->left_ref_ctx[row7] == 1);
1732  }
1733  } else if (s->above_comp_ctx[col]) {
1734  if (s->left_comp_ctx[row7]) {
1735  if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1736  c = 3 * (s->fixcompref == 1 ||
1737  s->left_ref_ctx[row7] == 1);
1738  } else {
1739  c = 2;
1740  }
1741  } else if (!s->left_ref_ctx[row7]) {
1742  c = 1 + 2 * (s->fixcompref == 1 ||
1743  s->above_ref_ctx[col] == 1);
1744  } else {
1745  c = 3 * (s->left_ref_ctx[row7] == 1) +
1746  (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1747  }
1748  } else if (s->left_comp_ctx[row7]) {
1749  if (!s->above_ref_ctx[col]) {
1750  c = 1 + 2 * (s->fixcompref == 1 ||
1751  s->left_ref_ctx[row7] == 1);
1752  } else {
1753  c = 3 * (s->above_ref_ctx[col] == 1) +
1754  (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1755  }
1756  } else if (!s->above_ref_ctx[col]) {
1757  if (!s->left_ref_ctx[row7]) {
1758  c = 3;
1759  } else {
1760  c = 4 * (s->left_ref_ctx[row7] == 1);
1761  }
1762  } else if (!s->left_ref_ctx[row7]) {
1763  c = 4 * (s->above_ref_ctx[col] == 1);
1764  } else {
1765  c = 2 * (s->left_ref_ctx[row7] == 1) +
1766  2 * (s->above_ref_ctx[col] == 1);
1767  }
1768  } else {
1769  if (s->above_intra_ctx[col] ||
1770  (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1771  c = 2;
1772  } else if (s->above_comp_ctx[col]) {
1773  c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1774  } else {
1775  c = 4 * (s->above_ref_ctx[col] == 1);
1776  }
1777  }
1778  } else if (have_l) {
1779  if (s->left_intra_ctx[row7] ||
1780  (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1781  c = 2;
1782  } else if (s->left_comp_ctx[row7]) {
1783  c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1784  } else {
1785  c = 4 * (s->left_ref_ctx[row7] == 1);
1786  }
1787  } else {
1788  c = 2;
1789  }
1790  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1791  s->counts.single_ref[c][1][bit]++;
1792  b->ref[0] = 1 + bit;
1793  }
1794  }
1795  }
1796 
1797  if (b->bs <= BS_8x8) {
1798  if (s->segmentation.feat[b->seg_id].skip_enabled) {
1799  b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1800  } else {
1801  static const uint8_t off[10] = {
1802  3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1803  };
1804 
1805  // FIXME this needs to use the LUT tables from find_ref_mvs
1806  // because not all are -1,0/0,-1
1807  int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1808  [s->left_mode_ctx[row7 + off[b->bs]]];
1809 
1811  s->prob.p.mv_mode[c]);
1812  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1813  s->counts.mv_mode[c][b->mode[0] - 10]++;
1814  }
1815  }
1816 
1817  if (s->filtermode == FILTER_SWITCHABLE) {
1818  int c;
1819 
1820  if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1821  if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1822  c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1823  s->left_filter_ctx[row7] : 3;
1824  } else {
1825  c = s->above_filter_ctx[col];
1826  }
1827  } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1828  c = s->left_filter_ctx[row7];
1829  } else {
1830  c = 3;
1831  }
1832 
1833  filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1834  s->prob.p.filter[c]);
1835  s->counts.filter[c][filter_id]++;
1836  b->filter = vp9_filter_lut[filter_id];
1837  } else {
1838  b->filter = s->filtermode;
1839  }
1840 
1841  if (b->bs > BS_8x8) {
1842  int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1843 
1845  s->prob.p.mv_mode[c]);
1846  s->counts.mv_mode[c][b->mode[0] - 10]++;
1847  fill_mv(s, b->mv[0], b->mode[0], 0);
1848 
1849  if (b->bs != BS_8x4) {
1851  s->prob.p.mv_mode[c]);
1852  s->counts.mv_mode[c][b->mode[1] - 10]++;
1853  fill_mv(s, b->mv[1], b->mode[1], 1);
1854  } else {
1855  b->mode[1] = b->mode[0];
1856  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1857  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1858  }
1859 
1860  if (b->bs != BS_4x8) {
1862  s->prob.p.mv_mode[c]);
1863  s->counts.mv_mode[c][b->mode[2] - 10]++;
1864  fill_mv(s, b->mv[2], b->mode[2], 2);
1865 
1866  if (b->bs != BS_8x4) {
1868  s->prob.p.mv_mode[c]);
1869  s->counts.mv_mode[c][b->mode[3] - 10]++;
1870  fill_mv(s, b->mv[3], b->mode[3], 3);
1871  } else {
1872  b->mode[3] = b->mode[2];
1873  AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1874  AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1875  }
1876  } else {
1877  b->mode[2] = b->mode[0];
1878  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1879  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1880  b->mode[3] = b->mode[1];
1881  AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1882  AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1883  }
1884  } else {
1885  fill_mv(s, b->mv[0], b->mode[0], -1);
1886  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1887  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1888  AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1889  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1890  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1891  AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1892  }
1893 
1894  vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1895  }
1896 
1897 #if HAVE_FAST_64BIT
1898 #define SPLAT_CTX(var, val, n) \
1899  switch (n) { \
1900  case 1: var = val; break; \
1901  case 2: AV_WN16A(&var, val * 0x0101); break; \
1902  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1903  case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
1904  case 16: { \
1905  uint64_t v64 = val * 0x0101010101010101ULL; \
1906  AV_WN64A( &var, v64); \
1907  AV_WN64A(&((uint8_t *) &var)[8], v64); \
1908  break; \
1909  } \
1910  }
1911 #else
1912 #define SPLAT_CTX(var, val, n) \
1913  switch (n) { \
1914  case 1: var = val; break; \
1915  case 2: AV_WN16A(&var, val * 0x0101); break; \
1916  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1917  case 8: { \
1918  uint32_t v32 = val * 0x01010101; \
1919  AV_WN32A( &var, v32); \
1920  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1921  break; \
1922  } \
1923  case 16: { \
1924  uint32_t v32 = val * 0x01010101; \
1925  AV_WN32A( &var, v32); \
1926  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1927  AV_WN32A(&((uint8_t *) &var)[8], v32); \
1928  AV_WN32A(&((uint8_t *) &var)[12], v32); \
1929  break; \
1930  } \
1931  }
1932 #endif
1933 
1934  switch (bwh_tab[1][b->bs][0]) {
1935 #define SET_CTXS(dir, off, n) \
1936  do { \
1937  SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
1938  SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
1939  SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
1940  if (!s->keyframe && !s->intraonly) { \
1941  SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
1942  SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
1943  SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
1944  if (!b->intra) { \
1945  SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
1946  if (s->filtermode == FILTER_SWITCHABLE) { \
1947  SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
1948  } \
1949  } \
1950  } \
1951  } while (0)
1952  case 1: SET_CTXS(above, col, 1); break;
1953  case 2: SET_CTXS(above, col, 2); break;
1954  case 4: SET_CTXS(above, col, 4); break;
1955  case 8: SET_CTXS(above, col, 8); break;
1956  }
1957  switch (bwh_tab[1][b->bs][1]) {
1958  case 1: SET_CTXS(left, row7, 1); break;
1959  case 2: SET_CTXS(left, row7, 2); break;
1960  case 4: SET_CTXS(left, row7, 4); break;
1961  case 8: SET_CTXS(left, row7, 8); break;
1962  }
1963 #undef SPLAT_CTX
1964 #undef SET_CTXS
1965 
1966  if (!s->keyframe && !s->intraonly) {
1967  if (b->bs > BS_8x8) {
1968  int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1969 
1970  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1971  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1972  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1973  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1974  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1975  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1976  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1977  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1978  } else {
1979  int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1980 
1981  for (n = 0; n < w4 * 2; n++) {
1982  AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1983  AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1984  }
1985  for (n = 0; n < h4 * 2; n++) {
1986  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1987  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1988  }
1989  }
1990  }
1991 
1992  // FIXME kinda ugly
1993  for (y = 0; y < h4; y++) {
1994  int x, o = (row + y) * s->sb_cols * 8 + col;
1995  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
1996 
1997  if (b->intra) {
1998  for (x = 0; x < w4; x++) {
1999  mv[x].ref[0] =
2000  mv[x].ref[1] = -1;
2001  }
2002  } else if (b->comp) {
2003  for (x = 0; x < w4; x++) {
2004  mv[x].ref[0] = b->ref[0];
2005  mv[x].ref[1] = b->ref[1];
2006  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2007  AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2008  }
2009  } else {
2010  for (x = 0; x < w4; x++) {
2011  mv[x].ref[0] = b->ref[0];
2012  mv[x].ref[1] = -1;
2013  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2014  }
2015  }
2016  }
2017 }
2018 
2019 // FIXME merge cnt/eob arguments?
2020 static av_always_inline int
2021 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2022  int is_tx32x32, unsigned (*cnt)[6][3],
2023  unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2024  int nnz, const int16_t *scan, const int16_t (*nb)[2],
2025  const int16_t *band_counts, const int16_t *qmul)
2026 {
2027  int i = 0, band = 0, band_left = band_counts[band];
2028  uint8_t *tp = p[0][nnz];
2029  uint8_t cache[1024];
2030 
2031  do {
2032  int val, rc;
2033 
2034  val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2035  eob[band][nnz][val]++;
2036  if (!val)
2037  break;
2038 
2039  skip_eob:
2040  if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2041  cnt[band][nnz][0]++;
2042  if (!--band_left)
2043  band_left = band_counts[++band];
2044  cache[scan[i]] = 0;
2045  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2046  tp = p[band][nnz];
2047  if (++i == n_coeffs)
2048  break; //invalid input; blocks should end with EOB
2049  goto skip_eob;
2050  }
2051 
2052  rc = scan[i];
2053  if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2054  cnt[band][nnz][1]++;
2055  val = 1;
2056  cache[rc] = 1;
2057  } else {
2058  // fill in p[3-10] (model fill) - only once per frame for each pos
2059  if (!tp[3])
2060  memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2061 
2062  cnt[band][nnz][2]++;
2063  if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2064  if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2065  cache[rc] = val = 2;
2066  } else {
2067  val = 3 + vp56_rac_get_prob(c, tp[5]);
2068  cache[rc] = 3;
2069  }
2070  } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2071  cache[rc] = 4;
2072  if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2073  val = 5 + vp56_rac_get_prob(c, 159);
2074  } else {
2075  val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2076  val += vp56_rac_get_prob(c, 145);
2077  }
2078  } else { // cat 3-6
2079  cache[rc] = 5;
2080  if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2081  if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2082  val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2083  val += (vp56_rac_get_prob(c, 148) << 1);
2084  val += vp56_rac_get_prob(c, 140);
2085  } else {
2086  val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2087  val += (vp56_rac_get_prob(c, 155) << 2);
2088  val += (vp56_rac_get_prob(c, 140) << 1);
2089  val += vp56_rac_get_prob(c, 135);
2090  }
2091  } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2092  val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2093  val += (vp56_rac_get_prob(c, 157) << 3);
2094  val += (vp56_rac_get_prob(c, 141) << 2);
2095  val += (vp56_rac_get_prob(c, 134) << 1);
2096  val += vp56_rac_get_prob(c, 130);
2097  } else {
2098  val = 67 + (vp56_rac_get_prob(c, 254) << 13);
2099  val += (vp56_rac_get_prob(c, 254) << 12);
2100  val += (vp56_rac_get_prob(c, 254) << 11);
2101  val += (vp56_rac_get_prob(c, 252) << 10);
2102  val += (vp56_rac_get_prob(c, 249) << 9);
2103  val += (vp56_rac_get_prob(c, 243) << 8);
2104  val += (vp56_rac_get_prob(c, 230) << 7);
2105  val += (vp56_rac_get_prob(c, 196) << 6);
2106  val += (vp56_rac_get_prob(c, 177) << 5);
2107  val += (vp56_rac_get_prob(c, 153) << 4);
2108  val += (vp56_rac_get_prob(c, 140) << 3);
2109  val += (vp56_rac_get_prob(c, 133) << 2);
2110  val += (vp56_rac_get_prob(c, 130) << 1);
2111  val += vp56_rac_get_prob(c, 129);
2112  }
2113  }
2114  }
2115  if (!--band_left)
2116  band_left = band_counts[++band];
2117  if (is_tx32x32)
2118  coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
2119  else
2120  coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
2121  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2122  tp = p[band][nnz];
2123  } while (++i < n_coeffs);
2124 
2125  return i;
2126 }
2127 
2128 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2129  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2130  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2131  const int16_t (*nb)[2], const int16_t *band_counts,
2132  const int16_t *qmul)
2133 {
2134  return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
2135  nnz, scan, nb, band_counts, qmul);
2136 }
2137 
2138 static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2139  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2140  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2141  const int16_t (*nb)[2], const int16_t *band_counts,
2142  const int16_t *qmul)
2143 {
2144  return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
2145  nnz, scan, nb, band_counts, qmul);
2146 }
2147 
2149 {
2150  VP9Context *s = ctx->priv_data;
2151  VP9Block *b = s->b;
2152  int row = s->row, col = s->col;
2153  uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2154  unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2155  unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2156  int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2157  int end_x = FFMIN(2 * (s->cols - col), w4);
2158  int end_y = FFMIN(2 * (s->rows - row), h4);
2159  int n, pl, x, y, res;
2160  int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
2161  int tx = 4 * s->lossless + b->tx;
2162  const int16_t * const *yscans = vp9_scans[tx];
2163  const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2164  const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2165  const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2166  uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2167  uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2168  static const int16_t band_counts[4][8] = {
2169  { 1, 2, 3, 4, 3, 16 - 13 },
2170  { 1, 2, 3, 4, 11, 64 - 21 },
2171  { 1, 2, 3, 4, 11, 256 - 21 },
2172  { 1, 2, 3, 4, 11, 1024 - 21 },
2173  };
2174  const int16_t *y_band_counts = band_counts[b->tx];
2175  const int16_t *uv_band_counts = band_counts[b->uvtx];
2176 
2177 #define MERGE(la, end, step, rd) \
2178  for (n = 0; n < end; n += step) \
2179  la[n] = !!rd(&la[n])
2180 #define MERGE_CTX(step, rd) \
2181  do { \
2182  MERGE(l, end_y, step, rd); \
2183  MERGE(a, end_x, step, rd); \
2184  } while (0)
2185 
2186 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2187  for (n = 0, y = 0; y < end_y; y += step) { \
2188  for (x = 0; x < end_x; x += step, n += step * step) { \
2189  enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2190  res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
2191  c, e, p, a[x] + l[y], yscans[txtp], \
2192  ynbs[txtp], y_band_counts, qmul[0]); \
2193  a[x] = l[y] = !!res; \
2194  if (step >= 4) { \
2195  AV_WN16A(&s->eob[n], res); \
2196  } else { \
2197  s->eob[n] = res; \
2198  } \
2199  } \
2200  }
2201 
2202 #define SPLAT(la, end, step, cond) \
2203  if (step == 2) { \
2204  for (n = 1; n < end; n += step) \
2205  la[n] = la[n - 1]; \
2206  } else if (step == 4) { \
2207  if (cond) { \
2208  for (n = 0; n < end; n += step) \
2209  AV_WN32A(&la[n], la[n] * 0x01010101); \
2210  } else { \
2211  for (n = 0; n < end; n += step) \
2212  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2213  } \
2214  } else /* step == 8 */ { \
2215  if (cond) { \
2216  if (HAVE_FAST_64BIT) { \
2217  for (n = 0; n < end; n += step) \
2218  AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2219  } else { \
2220  for (n = 0; n < end; n += step) { \
2221  uint32_t v32 = la[n] * 0x01010101; \
2222  AV_WN32A(&la[n], v32); \
2223  AV_WN32A(&la[n + 4], v32); \
2224  } \
2225  } \
2226  } else { \
2227  for (n = 0; n < end; n += step) \
2228  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2229  } \
2230  }
2231 #define SPLAT_CTX(step) \
2232  do { \
2233  SPLAT(a, end_x, step, end_x == w4); \
2234  SPLAT(l, end_y, step, end_y == h4); \
2235  } while (0)
2236 
2237  /* y tokens */
2238  switch (b->tx) {
2239  case TX_4X4:
2240  DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2241  break;
2242  case TX_8X8:
2243  MERGE_CTX(2, AV_RN16A);
2244  DECODE_Y_COEF_LOOP(2, 0,);
2245  SPLAT_CTX(2);
2246  break;
2247  case TX_16X16:
2248  MERGE_CTX(4, AV_RN32A);
2249  DECODE_Y_COEF_LOOP(4, 0,);
2250  SPLAT_CTX(4);
2251  break;
2252  case TX_32X32:
2253  MERGE_CTX(8, AV_RN64A);
2254  DECODE_Y_COEF_LOOP(8, 0, 32);
2255  SPLAT_CTX(8);
2256  break;
2257  }
2258 
2259 #define DECODE_UV_COEF_LOOP(step) \
2260  for (n = 0, y = 0; y < end_y; y += step) { \
2261  for (x = 0; x < end_x; x += step, n += step * step) { \
2262  res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
2263  16 * step * step, c, e, p, a[x] + l[y], \
2264  uvscan, uvnb, uv_band_counts, qmul[1]); \
2265  a[x] = l[y] = !!res; \
2266  if (step >= 4) { \
2267  AV_WN16A(&s->uveob[pl][n], res); \
2268  } else { \
2269  s->uveob[pl][n] = res; \
2270  } \
2271  } \
2272  }
2273 
2274  p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2275  c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2276  e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2277  w4 >>= 1;
2278  h4 >>= 1;
2279  end_x >>= 1;
2280  end_y >>= 1;
2281  for (pl = 0; pl < 2; pl++) {
2282  a = &s->above_uv_nnz_ctx[pl][col];
2283  l = &s->left_uv_nnz_ctx[pl][row & 7];
2284  switch (b->uvtx) {
2285  case TX_4X4:
2287  break;
2288  case TX_8X8:
2289  MERGE_CTX(2, AV_RN16A);
2291  SPLAT_CTX(2);
2292  break;
2293  case TX_16X16:
2294  MERGE_CTX(4, AV_RN32A);
2296  SPLAT_CTX(4);
2297  break;
2298  case TX_32X32:
2299  MERGE_CTX(8, AV_RN64A);
2300  // a 64x64 (max) uv block can ever only contain 1 tx32x32 block
2301  // so there is no need to loop
2302  res = decode_coeffs_b32(&s->c, s->uvblock[pl],
2303  1024, c, e, p, a[0] + l[0],
2304  uvscan, uvnb, uv_band_counts, qmul[1]);
2305  a[0] = l[0] = !!res;
2306  AV_WN16A(&s->uveob[pl][0], res);
2307  SPLAT_CTX(8);
2308  break;
2309  }
2310  }
2311 }
2312 
2314  uint8_t *dst_edge, ptrdiff_t stride_edge,
2315  uint8_t *dst_inner, ptrdiff_t stride_inner,
2316  uint8_t *l, int col, int x, int w,
2317  int row, int y, enum TxfmMode tx,
2318  int p)
2319 {
2320  int have_top = row > 0 || y > 0;
2321  int have_left = col > s->tiling.tile_col_start || x > 0;
2322  int have_right = x < w - 1;
2323  static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2324  [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2325  { DC_127_PRED, VERT_PRED } },
2326  [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2327  { HOR_PRED, HOR_PRED } },
2328  [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2329  { LEFT_DC_PRED, DC_PRED } },
2339  { DC_127_PRED, VERT_LEFT_PRED } },
2340  [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2341  { HOR_UP_PRED, HOR_UP_PRED } },
2342  [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2343  { HOR_PRED, TM_VP8_PRED } },
2344  };
2345  static const struct {
2346  uint8_t needs_left:1;
2347  uint8_t needs_top:1;
2348  uint8_t needs_topleft:1;
2349  uint8_t needs_topright:1;
2350  uint8_t invert_left:1;
2351  } edges[N_INTRA_PRED_MODES] = {
2352  [VERT_PRED] = { .needs_top = 1 },
2353  [HOR_PRED] = { .needs_left = 1 },
2354  [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2355  [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2356  [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2357  [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2358  [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2359  [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2360  [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2361  [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2362  [LEFT_DC_PRED] = { .needs_left = 1 },
2363  [TOP_DC_PRED] = { .needs_top = 1 },
2364  [DC_128_PRED] = { 0 },
2365  [DC_127_PRED] = { 0 },
2366  [DC_129_PRED] = { 0 }
2367  };
2368 
2369  av_assert2(mode >= 0 && mode < 10);
2370  mode = mode_conv[mode][have_left][have_top];
2371  if (edges[mode].needs_top) {
2372  uint8_t *top, *topleft;
2373  int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2374  int n_px_need_tr = 0;
2375 
2376  if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2377  n_px_need_tr = 4;
2378 
2379  // if top of sb64-row, use s->intra_pred_data[] instead of
2380  // dst[-stride] for intra prediction (it contains pre- instead of
2381  // post-loopfilter data)
2382  if (have_top) {
2383  top = !(row & 7) && !y ?
2384  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2385  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2386  if (have_left)
2387  topleft = !(row & 7) && !y ?
2388  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2389  y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2390  &dst_inner[-stride_inner];
2391  }
2392 
2393  if (have_top &&
2394  (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2395  (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2396  n_px_need + n_px_need_tr <= n_px_have) {
2397  *a = top;
2398  } else {
2399  if (have_top) {
2400  if (n_px_need <= n_px_have) {
2401  memcpy(*a, top, n_px_need);
2402  } else {
2403  memcpy(*a, top, n_px_have);
2404  memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2405  n_px_need - n_px_have);
2406  }
2407  } else {
2408  memset(*a, 127, n_px_need);
2409  }
2410  if (edges[mode].needs_topleft) {
2411  if (have_left && have_top) {
2412  (*a)[-1] = topleft[-1];
2413  } else {
2414  (*a)[-1] = have_top ? 129 : 127;
2415  }
2416  }
2417  if (tx == TX_4X4 && edges[mode].needs_topright) {
2418  if (have_top && have_right &&
2419  n_px_need + n_px_need_tr <= n_px_have) {
2420  memcpy(&(*a)[4], &top[4], 4);
2421  } else {
2422  memset(&(*a)[4], (*a)[3], 4);
2423  }
2424  }
2425  }
2426  }
2427  if (edges[mode].needs_left) {
2428  if (have_left) {
2429  int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2430  uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2431  ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2432 
2433  if (edges[mode].invert_left) {
2434  if (n_px_need <= n_px_have) {
2435  for (i = 0; i < n_px_need; i++)
2436  l[i] = dst[i * stride - 1];
2437  } else {
2438  for (i = 0; i < n_px_have; i++)
2439  l[i] = dst[i * stride - 1];
2440  memset(&l[n_px_have], l[n_px_have - 1], n_px_need - n_px_have);
2441  }
2442  } else {
2443  if (n_px_need <= n_px_have) {
2444  for (i = 0; i < n_px_need; i++)
2445  l[n_px_need - 1 - i] = dst[i * stride - 1];
2446  } else {
2447  for (i = 0; i < n_px_have; i++)
2448  l[n_px_need - 1 - i] = dst[i * stride - 1];
2449  memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
2450  }
2451  }
2452  } else {
2453  memset(l, 129, 4 << tx);
2454  }
2455  }
2456 
2457  return mode;
2458 }
2459 
2460 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2461 {
2462  VP9Context *s = ctx->priv_data;
2463  VP9Block *b = s->b;
2464  int row = s->row, col = s->col;
2465  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2466  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2467  int end_x = FFMIN(2 * (s->cols - col), w4);
2468  int end_y = FFMIN(2 * (s->rows - row), h4);
2469  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2470  int uvstep1d = 1 << b->uvtx, p;
2471  uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2472  LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
2473  LOCAL_ALIGNED_32(uint8_t, l, [32]);
2474 
2475  for (n = 0, y = 0; y < end_y; y += step1d) {
2476  uint8_t *ptr = dst, *ptr_r = dst_r;
2477  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2478  ptr_r += 4 * step1d, n += step) {
2479  int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2480  y * 2 + x : 0];
2481  uint8_t *a = &a_buf[32];
2482  enum TxfmType txtp = vp9_intra_txfm_type[mode];
2483  int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2484 
2485  mode = check_intra_mode(s, mode, &a, ptr_r,
2486  s->frames[CUR_FRAME].tf.f->linesize[0],
2487  ptr, s->y_stride, l,
2488  col, x, w4, row, y, b->tx, 0);
2489  s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2490  if (eob)
2491  s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2492  s->block + 16 * n, eob);
2493  }
2494  dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2495  dst += 4 * step1d * s->y_stride;
2496  }
2497 
2498  // U/V
2499  w4 >>= 1;
2500  end_x >>= 1;
2501  end_y >>= 1;
2502  step = 1 << (b->uvtx * 2);
2503  for (p = 0; p < 2; p++) {
2504  dst = s->dst[1 + p];
2505  dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2506  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2507  uint8_t *ptr = dst, *ptr_r = dst_r;
2508  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2509  ptr_r += 4 * uvstep1d, n += step) {
2510  int mode = b->uvmode;
2511  uint8_t *a = &a_buf[16];
2512  int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2513 
2514  mode = check_intra_mode(s, mode, &a, ptr_r,
2515  s->frames[CUR_FRAME].tf.f->linesize[1],
2516  ptr, s->uv_stride, l,
2517  col, x, w4, row, y, b->uvtx, p + 1);
2518  s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2519  if (eob)
2520  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2521  s->uvblock[p] + 16 * n, eob);
2522  }
2523  dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2524  dst += 4 * uvstep1d * s->uv_stride;
2525  }
2526  }
2527 }
2528 
2530  uint8_t *dst, ptrdiff_t dst_stride,
2531  const uint8_t *ref, ptrdiff_t ref_stride,
2533  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2534  int bw, int bh, int w, int h)
2535 {
2536  int mx = mv->x, my = mv->y, th;
2537 
2538  y += my >> 3;
2539  x += mx >> 3;
2540  ref += y * ref_stride + x;
2541  mx &= 7;
2542  my &= 7;
2543  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2544  // we use +7 because the last 7 pixels of each sbrow can be changed in
2545  // the longest loopfilter of the next sbrow
2546  th = (y + bh + 4 * !!my + 7) >> 6;
2547  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2548  if (x < !!mx * 3 || y < !!my * 3 ||
2549  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2551  ref - !!my * 3 * ref_stride - !!mx * 3,
2552  80, ref_stride,
2553  bw + !!mx * 7, bh + !!my * 7,
2554  x - !!mx * 3, y - !!my * 3, w, h);
2555  ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2556  ref_stride = 80;
2557  }
2558  mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2559 }
2560 
2562  uint8_t *dst_u, uint8_t *dst_v,
2563  ptrdiff_t dst_stride,
2564  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2565  const uint8_t *ref_v, ptrdiff_t src_stride_v,
2567  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2568  int bw, int bh, int w, int h)
2569 {
2570  int mx = mv->x, my = mv->y, th;
2571 
2572  y += my >> 4;
2573  x += mx >> 4;
2574  ref_u += y * src_stride_u + x;
2575  ref_v += y * src_stride_v + x;
2576  mx &= 15;
2577  my &= 15;
2578  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2579  // we use +7 because the last 7 pixels of each sbrow can be changed in
2580  // the longest loopfilter of the next sbrow
2581  th = (y + bh + 4 * !!my + 7) >> 5;
2582  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2583  if (x < !!mx * 3 || y < !!my * 3 ||
2584  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2586  ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2587  80, src_stride_u,
2588  bw + !!mx * 7, bh + !!my * 7,
2589  x - !!mx * 3, y - !!my * 3, w, h);
2590  ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2591  mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2592 
2594  ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2595  80, src_stride_v,
2596  bw + !!mx * 7, bh + !!my * 7,
2597  x - !!mx * 3, y - !!my * 3, w, h);
2598  ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2599  mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2600  } else {
2601  mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2602  mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2603  }
2604 }
2605 
2606 static void inter_recon(AVCodecContext *ctx)
2607 {
2608  static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2609  { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2610  { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2611  };
2612  VP9Context *s = ctx->priv_data;
2613  VP9Block *b = s->b;
2614  int row = s->row, col = s->col;
2615  ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
2616  AVFrame *ref1 = tref1->f, *ref2;
2617  int w1 = ref1->width, h1 = ref1->height, w2, h2;
2618  ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2619 
2620  if (b->comp) {
2621  tref2 = &s->refs[s->refidx[b->ref[1]]];
2622  ref2 = tref2->f;
2623  w2 = ref2->width;
2624  h2 = ref2->height;
2625  }
2626 
2627  // y inter pred
2628  if (b->bs > BS_8x8) {
2629  if (b->bs == BS_8x4) {
2630  mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2631  ref1->data[0], ref1->linesize[0], tref1,
2632  row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
2633  mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2634  s->dst[0] + 4 * ls_y, ls_y,
2635  ref1->data[0], ref1->linesize[0], tref1,
2636  (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
2637 
2638  if (b->comp) {
2639  mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2640  ref2->data[0], ref2->linesize[0], tref2,
2641  row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
2642  mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2643  s->dst[0] + 4 * ls_y, ls_y,
2644  ref2->data[0], ref2->linesize[0], tref2,
2645  (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
2646  }
2647  } else if (b->bs == BS_4x8) {
2648  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2649  ref1->data[0], ref1->linesize[0], tref1,
2650  row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
2651  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2652  ref1->data[0], ref1->linesize[0], tref1,
2653  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
2654 
2655  if (b->comp) {
2656  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2657  ref2->data[0], ref2->linesize[0], tref2,
2658  row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
2659  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2660  ref2->data[0], ref2->linesize[0], tref2,
2661  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
2662  }
2663  } else {
2664  av_assert2(b->bs == BS_4x4);
2665 
2666  // FIXME if two horizontally adjacent blocks have the same MV,
2667  // do a w8 instead of a w4 call
2668  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2669  ref1->data[0], ref1->linesize[0], tref1,
2670  row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
2671  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2672  ref1->data[0], ref1->linesize[0], tref1,
2673  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
2674  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2675  s->dst[0] + 4 * ls_y, ls_y,
2676  ref1->data[0], ref1->linesize[0], tref1,
2677  (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
2678  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2679  s->dst[0] + 4 * ls_y + 4, ls_y,
2680  ref1->data[0], ref1->linesize[0], tref1,
2681  (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
2682 
2683  if (b->comp) {
2684  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2685  ref2->data[0], ref2->linesize[0], tref2,
2686  row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
2687  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2688  ref2->data[0], ref2->linesize[0], tref2,
2689  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
2690  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2691  s->dst[0] + 4 * ls_y, ls_y,
2692  ref2->data[0], ref2->linesize[0], tref2,
2693  (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
2694  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2695  s->dst[0] + 4 * ls_y + 4, ls_y,
2696  ref2->data[0], ref2->linesize[0], tref2,
2697  (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
2698  }
2699  }
2700  } else {
2701  int bwl = bwlog_tab[0][b->bs];
2702  int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2703 
2704  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2705  ref1->data[0], ref1->linesize[0], tref1,
2706  row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);
2707 
2708  if (b->comp)
2709  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2710  ref2->data[0], ref2->linesize[0], tref2,
2711  row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
2712  }
2713 
2714  // uv inter pred
2715  {
2716  int bwl = bwlog_tab[1][b->bs];
2717  int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2718  VP56mv mvuv;
2719 
2720  w1 = (w1 + 1) >> 1;
2721  h1 = (h1 + 1) >> 1;
2722  if (b->comp) {
2723  w2 = (w2 + 1) >> 1;
2724  h2 = (h2 + 1) >> 1;
2725  }
2726  if (b->bs > BS_8x8) {
2727  mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2728  mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2729  } else {
2730  mvuv = b->mv[0][0];
2731  }
2732 
2733  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2734  s->dst[1], s->dst[2], ls_uv,
2735  ref1->data[1], ref1->linesize[1],
2736  ref1->data[2], ref1->linesize[2], tref1,
2737  row << 2, col << 2, &mvuv, bw, bh, w1, h1);
2738 
2739  if (b->comp) {
2740  if (b->bs > BS_8x8) {
2741  mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2742  mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2743  } else {
2744  mvuv = b->mv[0][1];
2745  }
2746  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2747  s->dst[1], s->dst[2], ls_uv,
2748  ref2->data[1], ref2->linesize[1],
2749  ref2->data[2], ref2->linesize[2], tref2,
2750  row << 2, col << 2, &mvuv, bw, bh, w2, h2);
2751  }
2752  }
2753 
2754  if (!b->skip) {
2755  /* mostly copied intra_reconn() */
2756 
2757  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2758  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2759  int end_x = FFMIN(2 * (s->cols - col), w4);
2760  int end_y = FFMIN(2 * (s->rows - row), h4);
2761  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2762  int uvstep1d = 1 << b->uvtx, p;
2763  uint8_t *dst = s->dst[0];
2764 
2765  // y itxfm add
2766  for (n = 0, y = 0; y < end_y; y += step1d) {
2767  uint8_t *ptr = dst;
2768  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2769  int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2770 
2771  if (eob)
2772  s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2773  s->block + 16 * n, eob);
2774  }
2775  dst += 4 * s->y_stride * step1d;
2776  }
2777 
2778  // uv itxfm add
2779  end_x >>= 1;
2780  end_y >>= 1;
2781  step = 1 << (b->uvtx * 2);
2782  for (p = 0; p < 2; p++) {
2783  dst = s->dst[p + 1];
2784  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2785  uint8_t *ptr = dst;
2786  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2787  int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2788 
2789  if (eob)
2790  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2791  s->uvblock[p] + 16 * n, eob);
2792  }
2793  dst += 4 * uvstep1d * s->uv_stride;
2794  }
2795  }
2796  }
2797 }
2798 
2799 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2800  int row_and_7, int col_and_7,
2801  int w, int h, int col_end, int row_end,
2802  enum TxfmMode tx, int skip_inter)
2803 {
2804  // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2805  // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2806  // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2807  // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2808 
2809  // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2810  // edges. This means that for UV, we work on two subsampled blocks at
2811  // a time, and we only use the topleft block's mode information to set
2812  // things like block strength. Thus, for any block size smaller than
2813  // 16x16, ignore the odd portion of the block.
2814  if (tx == TX_4X4 && is_uv) {
2815  if (h == 1) {
2816  if (row_and_7 & 1)
2817  return;
2818  if (!row_end)
2819  h += 1;
2820  }
2821  if (w == 1) {
2822  if (col_and_7 & 1)
2823  return;
2824  if (!col_end)
2825  w += 1;
2826  }
2827  }
2828 
2829  if (tx == TX_4X4 && !skip_inter) {
2830  int t = 1 << col_and_7, m_col = (t << w) - t, y;
2831  int m_col_odd = (t << (w - 1)) - t;
2832 
2833  // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2834  if (is_uv) {
2835  int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2836 
2837  for (y = row_and_7; y < h + row_and_7; y++) {
2838  int col_mask_id = 2 - !(y & 7);
2839 
2840  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2841  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2842  // for odd lines, if the odd col is not being filtered,
2843  // skip odd row also:
2844  // .---. <-- a
2845  // | |
2846  // |___| <-- b
2847  // ^ ^
2848  // c d
2849  //
2850  // if a/c are even row/col and b/d are odd, and d is skipped,
2851  // e.g. right edge of size-66x66.webm, then skip b also (bug)
2852  if ((col_end & 1) && (y & 1)) {
2853  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2854  } else {
2855  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2856  }
2857  }
2858  } else {
2859  int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2860 
2861  for (y = row_and_7; y < h + row_and_7; y++) {
2862  int col_mask_id = 2 - !(y & 3);
2863 
2864  lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2865  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2866  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2867  lflvl->mask[is_uv][0][y][3] |= m_col;
2868  lflvl->mask[is_uv][1][y][3] |= m_col;
2869  }
2870  }
2871  } else {
2872  int y, t = 1 << col_and_7, m_col = (t << w) - t;
2873 
2874  if (!skip_inter) {
2875  int mask_id = (tx == TX_8X8);
2876  int l2 = tx + is_uv - 1, step1d = 1 << l2;
2877  static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2878  int m_row = m_col & masks[l2];
2879 
2880  // at odd UV col/row edges tx16/tx32 loopfilter edges, force
2881  // 8wd loopfilter to prevent going off the visible edge.
2882  if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2883  int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2884  int m_row_8 = m_row - m_row_16;
2885 
2886  for (y = row_and_7; y < h + row_and_7; y++) {
2887  lflvl->mask[is_uv][0][y][0] |= m_row_16;
2888  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2889  }
2890  } else {
2891  for (y = row_and_7; y < h + row_and_7; y++)
2892  lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2893  }
2894 
2895  if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2896  for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2897  lflvl->mask[is_uv][1][y][0] |= m_col;
2898  if (y - row_and_7 == h - 1)
2899  lflvl->mask[is_uv][1][y][1] |= m_col;
2900  } else {
2901  for (y = row_and_7; y < h + row_and_7; y += step1d)
2902  lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2903  }
2904  } else if (tx != TX_4X4) {
2905  int mask_id;
2906 
2907  mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2908  lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2909  mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2910  for (y = row_and_7; y < h + row_and_7; y++)
2911  lflvl->mask[is_uv][0][y][mask_id] |= t;
2912  } else if (is_uv) {
2913  int t8 = t & 0x01, t4 = t - t8;
2914 
2915  for (y = row_and_7; y < h + row_and_7; y++) {
2916  lflvl->mask[is_uv][0][y][2] |= t4;
2917  lflvl->mask[is_uv][0][y][1] |= t8;
2918  }
2919  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2920  } else {
2921  int t8 = t & 0x11, t4 = t - t8;
2922 
2923  for (y = row_and_7; y < h + row_and_7; y++) {
2924  lflvl->mask[is_uv][0][y][2] |= t4;
2925  lflvl->mask[is_uv][0][y][1] |= t8;
2926  }
2927  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
2928  }
2929  }
2930 }
2931 
2932 static void decode_b(AVCodecContext *ctx, int row, int col,
2933  struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2934  enum BlockLevel bl, enum BlockPartition bp)
2935 {
2936  VP9Context *s = ctx->priv_data;
2937  VP9Block *b = s->b;
2938  enum BlockSize bs = bl * 3 + bp;
2939  int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2940  int emu[2];
2941  AVFrame *f = s->frames[CUR_FRAME].tf.f;
2942 
2943  s->row = row;
2944  s->row7 = row & 7;
2945  s->col = col;
2946  s->col7 = col & 7;
2947  s->min_mv.x = -(128 + col * 64);
2948  s->min_mv.y = -(128 + row * 64);
2949  s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2950  s->max_mv.y = 128 + (s->rows - row - h4) * 64;
2951  if (s->pass < 2) {
2952  b->bs = bs;
2953  b->bl = bl;
2954  b->bp = bp;
2955  decode_mode(ctx);
2956  b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
2957 
2958  if (!b->skip) {
2959  decode_coeffs(ctx);
2960  } else {
2961  int row7 = s->row7;
2962 
2963 #define SPLAT_ZERO_CTX(v, n) \
2964  switch (n) { \
2965  case 1: v = 0; break; \
2966  case 2: AV_ZERO16(&v); break; \
2967  case 4: AV_ZERO32(&v); break; \
2968  case 8: AV_ZERO64(&v); break; \
2969  case 16: AV_ZERO128(&v); break; \
2970  }
2971 #define SPLAT_ZERO_YUV(dir, var, off, n) \
2972  do { \
2973  SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
2974  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
2975  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
2976  } while (0)
2977 
2978  switch (w4) {
2979  case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
2980  case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
2981  case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
2982  case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
2983  }
2984  switch (h4) {
2985  case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
2986  case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
2987  case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
2988  case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
2989  }
2990  }
2991  if (s->pass == 1) {
2992  s->b++;
2993  s->block += w4 * h4 * 64;
2994  s->uvblock[0] += w4 * h4 * 16;
2995  s->uvblock[1] += w4 * h4 * 16;
2996  s->eob += 4 * w4 * h4;
2997  s->uveob[0] += w4 * h4;
2998  s->uveob[1] += w4 * h4;
2999 
3000  return;
3001  }
3002  }
3003 
3004  // emulated overhangs if the stride of the target buffer can't hold. This
3005  // allows to support emu-edge and so on even if we have large block
3006  // overhangs
3007  emu[0] = (col + w4) * 8 > f->linesize[0] ||
3008  (row + h4) > s->rows;
3009  emu[1] = (col + w4) * 4 > f->linesize[1] ||
3010  (row + h4) > s->rows;
3011  if (emu[0]) {
3012  s->dst[0] = s->tmp_y;
3013  s->y_stride = 64;
3014  } else {
3015  s->dst[0] = f->data[0] + yoff;
3016  s->y_stride = f->linesize[0];
3017  }
3018  if (emu[1]) {
3019  s->dst[1] = s->tmp_uv[0];
3020  s->dst[2] = s->tmp_uv[1];
3021  s->uv_stride = 32;
3022  } else {
3023  s->dst[1] = f->data[1] + uvoff;
3024  s->dst[2] = f->data[2] + uvoff;
3025  s->uv_stride = f->linesize[1];
3026  }
3027  if (b->intra) {
3028  intra_recon(ctx, yoff, uvoff);
3029  } else {
3030  inter_recon(ctx);
3031  }
3032  if (emu[0]) {
3033  int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3034 
3035  for (n = 0; o < w; n++) {
3036  int bw = 64 >> n;
3037 
3038  av_assert2(n <= 4);
3039  if (w & bw) {
3040  s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3041  s->tmp_y + o, 64, h, 0, 0);
3042  o += bw;
3043  }
3044  }
3045  }
3046  if (emu[1]) {
3047  int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
3048 
3049  for (n = 1; o < w; n++) {
3050  int bw = 64 >> n;
3051 
3052  av_assert2(n <= 4);
3053  if (w & bw) {
3054  s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3055  s->tmp_uv[0] + o, 32, h, 0, 0);
3056  s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3057  s->tmp_uv[1] + o, 32, h, 0, 0);
3058  o += bw;
3059  }
3060  }
3061  }
3062 
3063  // pick filter level and find edges to apply filter to
3064  if (s->filter.level &&
3065  (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3066  [b->mode[3] != ZEROMV]) > 0) {
3067  int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3068  int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3069 
3070  setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3071  mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3072  mask_edges(lflvl, 1, row7, col7, x_end, y_end,
3073  s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3074  s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3075  b->uvtx, skip_inter);
3076 
3077  if (!s->filter.lim_lut[lvl]) {
3078  int sharp = s->filter.sharpness;
3079  int limit = lvl;
3080 
3081  if (sharp > 0) {
3082  limit >>= (sharp + 3) >> 2;
3083  limit = FFMIN(limit, 9 - sharp);
3084  }
3085  limit = FFMAX(limit, 1);
3086 
3087  s->filter.lim_lut[lvl] = limit;
3088  s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3089  }
3090  }
3091 
3092  if (s->pass == 2) {
3093  s->b++;
3094  s->block += w4 * h4 * 64;
3095  s->uvblock[0] += w4 * h4 * 16;
3096  s->uvblock[1] += w4 * h4 * 16;
3097  s->eob += 4 * w4 * h4;
3098  s->uveob[0] += w4 * h4;
3099  s->uveob[1] += w4 * h4;
3100  }
3101 }
3102 
3103 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3104  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3105 {
3106  VP9Context *s = ctx->priv_data;
3107  int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3108  (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3109  const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
3110  s->prob.p.partition[bl][c];
3111  enum BlockPartition bp;
3112  ptrdiff_t hbs = 4 >> bl;
3113  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3114  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3115 
3116  if (bl == BL_8X8) {
3117  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3118  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3119  } else if (col + hbs < s->cols) { // FIXME why not <=?
3120  if (row + hbs < s->rows) { // FIXME why not <=?
3121  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3122  switch (bp) {
3123  case PARTITION_NONE:
3124  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3125  break;
3126  case PARTITION_H:
3127  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3128  yoff += hbs * 8 * y_stride;
3129  uvoff += hbs * 4 * uv_stride;
3130  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3131  break;
3132  case PARTITION_V:
3133  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3134  yoff += hbs * 8;
3135  uvoff += hbs * 4;
3136  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3137  break;
3138  case PARTITION_SPLIT:
3139  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3140  decode_sb(ctx, row, col + hbs, lflvl,
3141  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3142  yoff += hbs * 8 * y_stride;
3143  uvoff += hbs * 4 * uv_stride;
3144  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3145  decode_sb(ctx, row + hbs, col + hbs, lflvl,
3146  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3147  break;
3148  default:
3149  av_assert0(0);
3150  }
3151  } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3152  bp = PARTITION_SPLIT;
3153  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3154  decode_sb(ctx, row, col + hbs, lflvl,
3155  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3156  } else {
3157  bp = PARTITION_H;
3158  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3159  }
3160  } else if (row + hbs < s->rows) { // FIXME why not <=?
3161  if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3162  bp = PARTITION_SPLIT;
3163  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3164  yoff += hbs * 8 * y_stride;
3165  uvoff += hbs * 4 * uv_stride;
3166  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3167  } else {
3168  bp = PARTITION_V;
3169  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3170  }
3171  } else {
3172  bp = PARTITION_SPLIT;
3173  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3174  }
3175  s->counts.partition[bl][c][bp]++;
3176 }
3177 
3178 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3179  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3180 {
3181  VP9Context *s = ctx->priv_data;
3182  VP9Block *b = s->b;
3183  ptrdiff_t hbs = 4 >> bl;
3184  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3185  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3186 
3187  if (bl == BL_8X8) {
3188  av_assert2(b->bl == BL_8X8);
3189  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3190  } else if (s->b->bl == bl) {
3191  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3192  if (b->bp == PARTITION_H && row + hbs < s->rows) {
3193  yoff += hbs * 8 * y_stride;
3194  uvoff += hbs * 4 * uv_stride;
3195  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3196  } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3197  yoff += hbs * 8;
3198  uvoff += hbs * 4;
3199  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3200  }
3201  } else {
3202  decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3203  if (col + hbs < s->cols) { // FIXME why not <=?
3204  if (row + hbs < s->rows) {
3205  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
3206  uvoff + 4 * hbs, bl + 1);
3207  yoff += hbs * 8 * y_stride;
3208  uvoff += hbs * 4 * uv_stride;
3209  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3210  decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3211  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3212  } else {
3213  yoff += hbs * 8;
3214  uvoff += hbs * 4;
3215  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3216  }
3217  } else if (row + hbs < s->rows) {
3218  yoff += hbs * 8 * y_stride;
3219  uvoff += hbs * 4 * uv_stride;
3220  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3221  }
3222  }
3223 }
3224 
3225 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3226  int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3227 {
3228  VP9Context *s = ctx->priv_data;
3229  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3230  uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
3231  ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3232  int y, x, p;
3233 
3234  // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3235  // if you think of them as acting on a 8x8 block max, we can interleave
3236  // each v/h within the single x loop, but that only works if we work on
3237  // 8 pixel blocks, and we won't always do that (we want at least 16px
3238  // to use SSE2 optimizations, perhaps 32 for AVX2)
3239 
3240  // filter edges between columns, Y plane (e.g. block1 | block2)
3241  for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
3242  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
3243  uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
3244  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3245  unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3246  unsigned hm = hm1 | hm2 | hm13 | hm23;
3247 
3248  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
3249  if (hm1 & x) {
3250  int L = *l, H = L >> 4;
3251  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3252 
3253  if (col || x > 1) {
3254  if (hmask1[0] & x) {
3255  if (hmask2[0] & x) {
3256  av_assert2(l[8] == L);
3257  s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
3258  } else {
3259  s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
3260  }
3261  } else if (hm2 & x) {
3262  L = l[8];
3263  H |= (L >> 4) << 8;
3264  E |= s->filter.mblim_lut[L] << 8;
3265  I |= s->filter.lim_lut[L] << 8;
3266  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3267  [!!(hmask2[1] & x)]
3268  [0](ptr, ls_y, E, I, H);
3269  } else {
3270  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3271  [0](ptr, ls_y, E, I, H);
3272  }
3273  }
3274  } else if (hm2 & x) {
3275  int L = l[8], H = L >> 4;
3276  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3277 
3278  if (col || x > 1) {
3279  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3280  [0](ptr + 8 * ls_y, ls_y, E, I, H);
3281  }
3282  }
3283  if (hm13 & x) {
3284  int L = *l, H = L >> 4;
3285  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3286 
3287  if (hm23 & x) {
3288  L = l[8];
3289  H |= (L >> 4) << 8;
3290  E |= s->filter.mblim_lut[L] << 8;
3291  I |= s->filter.lim_lut[L] << 8;
3292  s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
3293  } else {
3294  s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
3295  }
3296  } else if (hm23 & x) {
3297  int L = l[8], H = L >> 4;
3298  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3299 
3300  s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
3301  }
3302  }
3303  }
3304 
3305  // block1
3306  // filter edges between rows, Y plane (e.g. ------)
3307  // block2
3308  dst = f->data[0] + yoff;
3309  lvl = lflvl->level;
3310  for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
3311  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
3312  unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3313 
3314  for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
3315  if (row || y) {
3316  if (vm & x) {
3317  int L = *l, H = L >> 4;
3318  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3319 
3320  if (vmask[0] & x) {
3321  if (vmask[0] & (x << 1)) {
3322  av_assert2(l[1] == L);
3323  s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
3324  } else {
3325  s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
3326  }
3327  } else if (vm & (x << 1)) {
3328  L = l[1];
3329  H |= (L >> 4) << 8;
3330  E |= s->filter.mblim_lut[L] << 8;
3331  I |= s->filter.lim_lut[L] << 8;
3332  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3333  [!!(vmask[1] & (x << 1))]
3334  [1](ptr, ls_y, E, I, H);
3335  } else {
3336  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3337  [1](ptr, ls_y, E, I, H);
3338  }
3339  } else if (vm & (x << 1)) {
3340  int L = l[1], H = L >> 4;
3341  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3342 
3343  s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
3344  [1](ptr + 8, ls_y, E, I, H);
3345  }
3346  }
3347  if (vm3 & x) {
3348  int L = *l, H = L >> 4;
3349  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3350 
3351  if (vm3 & (x << 1)) {
3352  L = l[1];
3353  H |= (L >> 4) << 8;
3354  E |= s->filter.mblim_lut[L] << 8;
3355  I |= s->filter.lim_lut[L] << 8;
3356  s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
3357  } else {
3358  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
3359  }
3360  } else if (vm3 & (x << 1)) {
3361  int L = l[1], H = L >> 4;
3362  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3363 
3364  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
3365  }
3366  }
3367  }
3368 
3369  // same principle but for U/V planes
3370  for (p = 0; p < 2; p++) {
3371  lvl = lflvl->level;
3372  dst = f->data[1 + p] + uvoff;
3373  for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
3374  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
3375  uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
3376  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
3377  unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
3378 
3379  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
3380  if (col || x > 1) {
3381  if (hm1 & x) {
3382  int L = *l, H = L >> 4;
3383  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3384 
3385  if (hmask1[0] & x) {
3386  if (hmask2[0] & x) {
3387  av_assert2(l[16] == L);
3388  s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
3389  } else {
3390  s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
3391  }
3392  } else if (hm2 & x) {
3393  L = l[16];
3394  H |= (L >> 4) << 8;
3395  E |= s->filter.mblim_lut[L] << 8;
3396  I |= s->filter.lim_lut[L] << 8;
3397  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3398  [!!(hmask2[1] & x)]
3399  [0](ptr, ls_uv, E, I, H);
3400  } else {
3401  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3402  [0](ptr, ls_uv, E, I, H);
3403  }
3404  } else if (hm2 & x) {
3405  int L = l[16], H = L >> 4;
3406  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3407 
3408  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3409  [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
3410  }
3411  }
3412  if (x & 0xAA)
3413  l += 2;
3414  }
3415  }
3416  lvl = lflvl->level;
3417  dst = f->data[1 + p] + uvoff;
3418  for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
3419  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
3420  unsigned vm = vmask[0] | vmask[1] | vmask[2];
3421 
3422  for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
3423  if (row || y) {
3424  if (vm & x) {
3425  int L = *l, H = L >> 4;
3426  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3427 
3428  if (vmask[0] & x) {
3429  if (vmask[0] & (x << 2)) {
3430  av_assert2(l[2] == L);
3431  s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3432  } else {
3433  s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3434  }
3435  } else if (vm & (x << 2)) {
3436  L = l[2];
3437  H |= (L >> 4) << 8;
3438  E |= s->filter.mblim_lut[L] << 8;
3439  I |= s->filter.lim_lut[L] << 8;
3440  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3441  [!!(vmask[1] & (x << 2))]
3442  [1](ptr, ls_uv, E, I, H);
3443  } else {
3444  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3445  [1](ptr, ls_uv, E, I, H);
3446  }
3447  } else if (vm & (x << 2)) {
3448  int L = l[2], H = L >> 4;
3449  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3450 
3451  s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3452  [1](ptr + 8, ls_uv, E, I, H);
3453  }
3454  }
3455  }
3456  if (y & 1)
3457  lvl += 16;
3458  }
3459  }
3460 }
3461 
3462 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3463 {
3464  int sb_start = ( idx * n) >> log2_n;
3465  int sb_end = ((idx + 1) * n) >> log2_n;
3466  *start = FFMIN(sb_start, n) << 3;
3467  *end = FFMIN(sb_end, n) << 3;
3468 }
3469 
3470 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3471  int max_count, int update_factor)
3472 {
3473  unsigned ct = ct0 + ct1, p2, p1;
3474 
3475  if (!ct)
3476  return;
3477 
3478  p1 = *p;
3479  p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3480  p2 = av_clip(p2, 1, 255);
3481  ct = FFMIN(ct, max_count);
3482  update_factor = FASTDIV(update_factor * ct, max_count);
3483 
3484  // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3485  *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3486 }
3487 
3488 static void adapt_probs(VP9Context *s)
3489 {
3490  int i, j, k, l, m;
3491  prob_context *p = &s->prob_ctx[s->framectxid].p;
3492  int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3493 
3494  // coefficients
3495  for (i = 0; i < 4; i++)
3496  for (j = 0; j < 2; j++)
3497  for (k = 0; k < 2; k++)
3498  for (l = 0; l < 6; l++)
3499  for (m = 0; m < 6; m++) {
3500  uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3501  unsigned *e = s->counts.eob[i][j][k][l][m];
3502  unsigned *c = s->counts.coef[i][j][k][l][m];
3503 
3504  if (l == 0 && m >= 3) // dc only has 3 pt
3505  break;
3506 
3507  adapt_prob(&pp[0], e[0], e[1], 24, uf);
3508  adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3509  adapt_prob(&pp[2], c[1], c[2], 24, uf);
3510  }
3511 
3512  if (s->keyframe || s->intraonly) {
3513  memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3514  memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3515  memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3516  memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3517  return;
3518  }
3519 
3520  // skip flag
3521  for (i = 0; i < 3; i++)
3522  adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3523 
3524  // intra/inter flag
3525  for (i = 0; i < 4; i++)
3526  adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3527 
3528  // comppred flag
3529  if (s->comppredmode == PRED_SWITCHABLE) {
3530  for (i = 0; i < 5; i++)
3531  adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3532  }
3533 
3534  // reference frames
3535  if (s->comppredmode != PRED_SINGLEREF) {
3536  for (i = 0; i < 5; i++)
3537  adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3538  s->counts.comp_ref[i][1], 20, 128);
3539  }
3540 
3541  if (s->comppredmode != PRED_COMPREF) {
3542  for (i = 0; i < 5; i++) {
3543  uint8_t *pp = p->single_ref[i];
3544  unsigned (*c)[2] = s->counts.single_ref[i];
3545 
3546  adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3547  adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3548  }
3549  }
3550 
3551  // block partitioning
3552  for (i = 0; i < 4; i++)
3553  for (j = 0; j < 4; j++) {
3554  uint8_t *pp = p->partition[i][j];
3555  unsigned *c = s->counts.partition[i][j];
3556 
3557  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3558  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3559  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3560  }
3561 
3562  // tx size
3563  if (s->txfmmode == TX_SWITCHABLE) {
3564  for (i = 0; i < 2; i++) {
3565  unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3566 
3567  adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3568  adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3569  adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3570  adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3571  adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3572  adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3573  }
3574  }
3575 
3576  // interpolation filter
3577  if (s->filtermode == FILTER_SWITCHABLE) {
3578  for (i = 0; i < 4; i++) {
3579  uint8_t *pp = p->filter[i];
3580  unsigned *c = s->counts.filter[i];
3581 
3582  adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3583  adapt_prob(&pp[1], c[1], c[2], 20, 128);
3584  }
3585  }
3586 
3587  // inter modes
3588  for (i = 0; i < 7; i++) {
3589  uint8_t *pp = p->mv_mode[i];
3590  unsigned *c = s->counts.mv_mode[i];
3591 
3592  adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3593  adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3594  adapt_prob(&pp[2], c[1], c[3], 20, 128);
3595  }
3596 
3597  // mv joints
3598  {
3599  uint8_t *pp = p->mv_joint;
3600  unsigned *c = s->counts.mv_joint;
3601 
3602  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3603  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3604  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3605  }
3606 
3607  // mv components
3608  for (i = 0; i < 2; i++) {
3609  uint8_t *pp;
3610  unsigned *c, (*c2)[2], sum;
3611 
3612  adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3613  s->counts.mv_comp[i].sign[1], 20, 128);
3614 
3615  pp = p->mv_comp[i].classes;
3616  c = s->counts.mv_comp[i].classes;
3617  sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3618  adapt_prob(&pp[0], c[0], sum, 20, 128);
3619  sum -= c[1];
3620  adapt_prob(&pp[1], c[1], sum, 20, 128);
3621  sum -= c[2] + c[3];
3622  adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3623  adapt_prob(&pp[3], c[2], c[3], 20, 128);
3624  sum -= c[4] + c[5];
3625  adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3626  adapt_prob(&pp[5], c[4], c[5], 20, 128);
3627  sum -= c[6];
3628  adapt_prob(&pp[6], c[6], sum, 20, 128);
3629  adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3630  adapt_prob(&pp[8], c[7], c[8], 20, 128);
3631  adapt_prob(&pp[9], c[9], c[10], 20, 128);
3632 
3633  adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3634  s->counts.mv_comp[i].class0[1], 20, 128);
3635  pp = p->mv_comp[i].bits;
3636  c2 = s->counts.mv_comp[i].bits;
3637  for (j = 0; j < 10; j++)
3638  adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3639 
3640  for (j = 0; j < 2; j++) {
3641  pp = p->mv_comp[i].class0_fp[j];
3642  c = s->counts.mv_comp[i].class0_fp[j];
3643  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3644  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3645  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3646  }
3647  pp = p->mv_comp[i].fp;
3648  c = s->counts.mv_comp[i].fp;
3649  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3650  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3651  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3652 
3653  if (s->highprecisionmvs) {
3654  adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3655  s->counts.mv_comp[i].class0_hp[1], 20, 128);
3656  adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3657  s->counts.mv_comp[i].hp[1], 20, 128);
3658  }
3659  }
3660 
3661  // y intra modes
3662  for (i = 0; i < 4; i++) {
3663  uint8_t *pp = p->y_mode[i];
3664  unsigned *c = s->counts.y_mode[i], sum, s2;
3665 
3666  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3667  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3668  sum -= c[TM_VP8_PRED];
3669  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3670  sum -= c[VERT_PRED];
3671  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3672  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3673  sum -= s2;
3674  adapt_prob(&pp[3], s2, sum, 20, 128);
3675  s2 -= c[HOR_PRED];
3676  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3677  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3678  sum -= c[DIAG_DOWN_LEFT_PRED];
3679  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3680  sum -= c[VERT_LEFT_PRED];
3681  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3682  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3683  }
3684 
3685  // uv intra modes
3686  for (i = 0; i < 10; i++) {
3687  uint8_t *pp = p->uv_mode[i];
3688  unsigned *c = s->counts.uv_mode[i], sum, s2;
3689 
3690  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3691  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3692  sum -= c[TM_VP8_PRED];
3693  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3694  sum -= c[VERT_PRED];
3695  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3696  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3697  sum -= s2;
3698  adapt_prob(&pp[3], s2, sum, 20, 128);
3699  s2 -= c[HOR_PRED];
3700  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3701  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3702  sum -= c[DIAG_DOWN_LEFT_PRED];
3703  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3704  sum -= c[VERT_LEFT_PRED];
3705  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3706  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3707  }
3708 }
3709 
3710 static void free_buffers(VP9Context *s)
3711 {
3712  av_freep(&s->intra_pred_data[0]);
3713  av_freep(&s->b_base);
3714  av_freep(&s->block_base);
3715 }
3716 
3718 {
3719  VP9Context *s = ctx->priv_data;
3720  int i;
3721 
3722  for (i = 0; i < 2; i++) {
3723  if (s->frames[i].tf.f->data[0])
3724  vp9_unref_frame(ctx, &s->frames[i]);
3725  av_frame_free(&s->frames[i].tf.f);
3726  }
3727  for (i = 0; i < 8; i++) {
3728  if (s->refs[i].f->data[0])
3729  ff_thread_release_buffer(ctx, &s->refs[i]);
3730  av_frame_free(&s->refs[i].f);
3731  if (s->next_refs[i].f->data[0])
3732  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3733  av_frame_free(&s->next_refs[i].f);
3734  }
3735  free_buffers(s);
3736  av_freep(&s->c_b);
3737  s->c_b_size = 0;
3738 
3739  return 0;
3740 }
3741 
3742 
3743 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3744  int *got_frame, AVPacket *pkt)
3745 {
3746  const uint8_t *data = pkt->data;
3747  int size = pkt->size;
3748  VP9Context *s = ctx->priv_data;
3749  int res, tile_row, tile_col, i, ref, row, col;
3750  ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3751  AVFrame *f;
3752 
3753  if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3754  return res;
3755  } else if (res == 0) {
3756  if (!s->refs[ref].f->data[0]) {
3757  av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3758  return AVERROR_INVALIDDATA;
3759  }
3760  if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3761  return res;
3762  *got_frame = 1;
3763  return 0;
3764  }
3765  data += res;
3766  size -= res;
3767 
3768  if (s->frames[LAST_FRAME].tf.f->data[0])
3769  vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3770  if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3771  (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3772  return res;
3773  if (s->frames[CUR_FRAME].tf.f->data[0])
3774  vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3775  if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3776  return res;
3777  f = s->frames[CUR_FRAME].tf.f;
3778  f->key_frame = s->keyframe;
3780  ls_y = f->linesize[0];
3781  ls_uv =f->linesize[1];
3782 
3783  // ref frame setup
3784  for (i = 0; i < 8; i++) {
3785  if (s->next_refs[i].f->data[0])
3786  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3787  if (s->refreshrefmask & (1 << i)) {
3788  res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3789  } else {
3790  res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
3791  }
3792  if (res < 0)
3793  return res;
3794  }
3795 
3796  if (s->fullrange)
3798  else
3800 
3801  switch (s->colorspace) {
3802  case 1: ctx->colorspace = AVCOL_SPC_BT470BG; break;
3803  case 2: ctx->colorspace = AVCOL_SPC_BT709; break;
3804  case 3: ctx->colorspace = AVCOL_SPC_SMPTE170M; break;
3805  case 4: ctx->colorspace = AVCOL_SPC_SMPTE240M; break;
3806  }
3807 
3808  // main tile decode loop
3809  memset(s->above_partition_ctx, 0, s->cols);
3810  memset(s->above_skip_ctx, 0, s->cols);
3811  if (s->keyframe || s->intraonly) {
3812  memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3813  } else {
3814  memset(s->above_mode_ctx, NEARESTMV, s->cols);
3815  }
3816  memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3817  memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3818  memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3819  memset(s->above_segpred_ctx, 0, s->cols);
3820  s->pass = s->uses_2pass =
3822  if ((res = update_block_buffers(ctx)) < 0) {
3823  av_log(ctx, AV_LOG_ERROR,
3824  "Failed to allocate block buffers\n");
3825  return res;
3826  }
3827  if (s->refreshctx && s->parallelmode) {
3828  int j, k, l, m;
3829 
3830  for (i = 0; i < 4; i++) {
3831  for (j = 0; j < 2; j++)
3832  for (k = 0; k < 2; k++)
3833  for (l = 0; l < 6; l++)
3834  for (m = 0; m < 6; m++)
3835  memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3836  s->prob.coef[i][j][k][l][m], 3);
3837  if (s->txfmmode == i)
3838  break;
3839  }
3840  s->prob_ctx[s->framectxid].p = s->prob.p;
3842  } else if (!s->refreshctx) {
3844  }
3845 
3846  do {
3847  yoff = uvoff = 0;
3848  s->b = s->b_base;
3849  s->block = s->block_base;
3850  s->uvblock[0] = s->uvblock_base[0];
3851  s->uvblock[1] = s->uvblock_base[1];
3852  s->eob = s->eob_base;
3853  s->uveob[0] = s->uveob_base[0];
3854  s->uveob[1] = s->uveob_base[1];
3855 
3856  for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3858  tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3859  if (s->pass != 2) {
3860  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3861  unsigned tile_size;
3862 
3863  if (tile_col == s->tiling.tile_cols - 1 &&
3864  tile_row == s->tiling.tile_rows - 1) {
3865  tile_size = size;
3866  } else {
3867  tile_size = AV_RB32(data);
3868  data += 4;
3869  size -= 4;
3870  }
3871  if (tile_size > size) {
3872  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3873  return AVERROR_INVALIDDATA;
3874  }
3875  ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3876  if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
3877  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3878  return AVERROR_INVALIDDATA;
3879  }
3880  data += tile_size;
3881  size -= tile_size;
3882  }
3883  }
3884 
3885  for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3886  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3887  struct VP9Filter *lflvl_ptr = s->lflvl;
3888  ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3889 
3890  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3892  tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3893 
3894  if (s->pass != 2) {
3895  memset(s->left_partition_ctx, 0, 8);
3896  memset(s->left_skip_ctx, 0, 8);
3897  if (s->keyframe || s->intraonly) {
3898  memset(s->left_mode_ctx, DC_PRED, 16);
3899  } else {
3900  memset(s->left_mode_ctx, NEARESTMV, 8);
3901  }
3902  memset(s->left_y_nnz_ctx, 0, 16);
3903  memset(s->left_uv_nnz_ctx, 0, 16);
3904  memset(s->left_segpred_ctx, 0, 8);
3905 
3906  memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3907  }
3908 
3909  for (col = s->tiling.tile_col_start;
3910  col < s->tiling.tile_col_end;
3911  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3912  // FIXME integrate with lf code (i.e. zero after each
3913  // use, similar to invtxfm coefficients, or similar)
3914  if (s->pass != 1) {
3915  memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3916  }
3917 
3918  if (s->pass == 2) {
3919  decode_sb_mem(ctx, row, col, lflvl_ptr,
3920  yoff2, uvoff2, BL_64X64);
3921  } else {
3922  decode_sb(ctx, row, col, lflvl_ptr,
3923  yoff2, uvoff2, BL_64X64);
3924  }
3925  }
3926  if (s->pass != 2) {
3927  memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3928  }
3929  }
3930 
3931  if (s->pass == 1) {
3932  continue;
3933  }
3934 
3935  // backup pre-loopfilter reconstruction data for intra
3936  // prediction of next row of sb64s
3937  if (row + 8 < s->rows) {
3938  memcpy(s->intra_pred_data[0],
3939  f->data[0] + yoff + 63 * ls_y,
3940  8 * s->cols);
3941  memcpy(s->intra_pred_data[1],
3942  f->data[1] + uvoff + 31 * ls_uv,
3943  4 * s->cols);
3944  memcpy(s->intra_pred_data[2],
3945  f->data[2] + uvoff + 31 * ls_uv,
3946  4 * s->cols);
3947  }
3948 
3949  // loopfilter one row
3950  if (s->filter.level) {
3951  yoff2 = yoff;
3952  uvoff2 = uvoff;
3953  lflvl_ptr = s->lflvl;
3954  for (col = 0; col < s->cols;
3955  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3956  loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3957  }
3958  }
3959 
3960  // FIXME maybe we can make this more finegrained by running the
3961  // loopfilter per-block instead of after each sbrow
3962  // In fact that would also make intra pred left preparation easier?
3963  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
3964  }
3965  }
3966 
3967  if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3968  adapt_probs(s);
3970  }
3971  } while (s->pass++ == 1);
3972  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3973 
3974  // ref frame setup
3975  for (i = 0; i < 8; i++) {
3976  if (s->refs[i].f->data[0])
3977  ff_thread_release_buffer(ctx, &s->refs[i]);
3978  ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
3979  }
3980 
3981  if (!s->invisible) {
3982  if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3983  return res;
3984  *got_frame = 1;
3985  }
3986 
3987  return 0;
3988 }
3989 
3991 {
3992  VP9Context *s = ctx->priv_data;
3993  int i;
3994 
3995  for (i = 0; i < 2; i++)
3996  vp9_unref_frame(ctx, &s->frames[i]);
3997  for (i = 0; i < 8; i++)
3998  ff_thread_release_buffer(ctx, &s->refs[i]);
3999 }
4000 
4001 static int init_frames(AVCodecContext *ctx)
4002 {
4003  VP9Context *s = ctx->priv_data;
4004  int i;
4005 
4006  for (i = 0; i < 2; i++) {
4007  s->frames[i].tf.f = av_frame_alloc();
4008  if (!s->frames[i].tf.f) {
4009  vp9_decode_free(ctx);
4010  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4011  return AVERROR(ENOMEM);
4012  }
4013  }
4014  for (i = 0; i < 8; i++) {
4015  s->refs[i].f = av_frame_alloc();
4016  s->next_refs[i].f = av_frame_alloc();
4017  if (!s->refs[i].f || !s->next_refs[i].f) {
4018  vp9_decode_free(ctx);
4019  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4020  return AVERROR(ENOMEM);
4021  }
4022  }
4023 
4024  return 0;
4025 }
4026 
4028 {
4029  VP9Context *s = ctx->priv_data;
4030 
4031  ctx->internal->allocate_progress = 1;
4032  ctx->pix_fmt = AV_PIX_FMT_YUV420P;
4033  ff_vp9dsp_init(&s->dsp);
4034  ff_videodsp_init(&s->vdsp, 8);
4035  s->filter.sharpness = -1;
4036 
4037  return init_frames(ctx);
4038 }
4039 
4041 {
4042  return init_frames(avctx);
4043 }
4044 
4046 {
4047  int i, res;
4048  VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4049 
4050  // detect size changes in other threads
4051  if (s->intra_pred_data[0] &&
4052  (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
4053  free_buffers(s);
4054  }
4055 
4056  for (i = 0; i < 2; i++) {
4057  if (s->frames[i].tf.f->data[0])
4058  vp9_unref_frame(dst, &s->frames[i]);
4059  if (ssrc->frames[i].tf.f->data[0]) {
4060  if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
4061  return res;
4062  }
4063  }
4064  for (i = 0; i < 8; i++) {
4065  if (s->refs[i].f->data[0])
4066  ff_thread_release_buffer(dst, &s->refs[i]);
4067  if (ssrc->next_refs[i].f->data[0]) {
4068  if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
4069  return res;
4070  }
4071  }
4072 
4073  s->invisible = ssrc->invisible;
4074  s->keyframe = ssrc->keyframe;
4075  s->uses_2pass = ssrc->uses_2pass;
4076  memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4077  memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4078  if (ssrc->segmentation.enabled) {
4079  memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4080  sizeof(s->segmentation.feat));
4081  }
4082 
4083  return 0;
4084 }
4085 
4087  .name = "vp9",
4088  .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4089  .type = AVMEDIA_TYPE_VIDEO,
4090  .id = AV_CODEC_ID_VP9,
4091  .priv_data_size = sizeof(VP9Context),
4092  .init = vp9_decode_init,
4095  .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4099 };