vp9.c
1 /*
2  * VP9 compatible video decoder
3  *
4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "avcodec.h"
25 #include "get_bits.h"
26 #include "internal.h"
27 #include "thread.h"
28 #include "videodsp.h"
29 #include "vp56.h"
30 #include "vp9.h"
31 #include "vp9data.h"
32 #include "vp9dsp.h"
33 #include "libavutil/avassert.h"
34 
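// 24-bit sync code expected at the start of keyframe and intra-only frame headers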
35 #define VP9_SYNCCODE 0x498342
36 
37 enum CompPredMode {
38  PRED_SINGLEREF,
39  PRED_COMPREF,
40  PRED_SWITCHABLE,
41 };
42 
43 enum BlockLevel {
44  BL_64X64,
45  BL_32X32,
46  BL_16X16,
47  BL_8X8,
48 };
49 
50 enum BlockSize {
51  BS_64x64,
52  BS_64x32,
53  BS_32x64,
54  BS_32x32,
55  BS_32x16,
56  BS_16x32,
57  BS_16x16,
58  BS_16x8,
59  BS_8x16,
60  BS_8x8,
61  BS_8x4,
62  BS_4x8,
63  BS_4x4,
64  N_BS_SIZES,
65 };
66 
67 struct VP9mvrefPair {
68  VP56mv mv[2];
69  int8_t ref[2];
70 };
71 
72 typedef struct VP9Frame {
73  ThreadFrame tf;
74  AVBufferRef *extradata;
75  uint8_t *segmentation_map;
76  struct VP9mvrefPair *mv;
77 } VP9Frame;
78 
79 struct VP9Filter {
80  uint8_t level[8 * 8];
81  uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
82  [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
83 };
84 
85 typedef struct VP9Block {
86  uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
87  enum FilterMode filter;
88  VP56mv mv[4 /* b_idx */][2 /* ref */];
89  enum BlockSize bs;
90  enum TxfmMode tx, uvtx;
91  enum BlockLevel bl;
92  enum BlockPartition bp;
93 } VP9Block;
94 
95 typedef struct VP9Context {
101  unsigned c_b_size;
104  int row, row7, col, col7;
106  ptrdiff_t y_stride, uv_stride;
107 
108  // bitstream header
130 #define CUR_FRAME 0
131 #define LAST_FRAME 1
132  VP9Frame frames[2];
133 
134  struct {
136  int8_t sharpness;
139  } filter;
140  struct {
142  int8_t mode[2];
143  int8_t ref[4];
144  } lf_delta;
148  struct {
153  struct {
159  int16_t q_val;
160  int8_t lf_val;
161  int16_t qmul[2][2];
162  uint8_t lflvl[4][2];
163  } feat[8];
164  } segmentation;
165  struct {
167  unsigned tile_cols, tile_rows;
169  } tiling;
170  unsigned sb_cols, sb_rows, rows, cols;
171  struct {
173  uint8_t coef[4][2][2][6][6][3];
174  } prob_ctx[4];
175  struct {
176  prob_context p;
177  uint8_t coef[4][2][2][6][6][11];
180  } prob;
181  struct {
182  unsigned y_mode[4][10];
183  unsigned uv_mode[10][10];
184  unsigned filter[4][3];
185  unsigned mv_mode[7][4];
186  unsigned intra[4][2];
187  unsigned comp[5][2];
188  unsigned single_ref[5][2][2];
189  unsigned comp_ref[5][2];
190  unsigned tx32p[2][4];
191  unsigned tx16p[2][3];
192  unsigned tx8p[2][2];
193  unsigned skip[3][2];
194  unsigned mv_joint[4];
195  struct {
196  unsigned sign[2];
197  unsigned classes[11];
198  unsigned class0[2];
199  unsigned bits[10][2];
200  unsigned class0_fp[2][4];
201  unsigned fp[4];
202  unsigned class0_hp[2];
203  unsigned hp[2];
204  } mv_comp[2];
205  unsigned partition[4][4][4];
206  unsigned coef[4][2][2][6][6][3];
207  unsigned eob[4][2][2][6][6][2];
208  } counts;
209  enum TxfmMode txfmmode;
210  enum CompPredMode comppredmode;
211 
212  // contextual (left/above) cache
227  // FIXME maybe merge some of the below in a flags field?
238 
239  // whole-frame cache
241  struct VP9Filter *lflvl;
243 
244  // block reconstruction intermediates
246  int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
248  struct { int x, y; } min_mv, max_mv;
250  DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
251 } VP9Context;
252 
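// block width/height per BlockSize, in units of 4px (row 0) and 8px (row 1);
// sub-8x8 dimensions are clamped to 1 in the 8px row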
253 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
254  {
255  { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
256  { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
257  }, {
258  { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
259  { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
260  }
261 };
262 
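// allocate the frame's pixel buffers plus extradata holding the segmentation
// map and the mv/reference pairs, one entry per 8x8 block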
263 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
264 {
265  VP9Context *s = ctx->priv_data;
266  int ret, sz;
267 
268  if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
269  return ret;
270  sz = 64 * s->sb_cols * s->sb_rows;
271  if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
272  ff_thread_release_buffer(ctx, &f->tf);
273  return AVERROR(ENOMEM);
274  }
275 
276  f->segmentation_map = f->extradata->data;
277  f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
278 
279  // retain segmentation map if it doesn't update
280  if (s->segmentation.enabled && !s->segmentation.update_map &&
281  !s->intraonly && !s->keyframe) {
282  memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
283  }
284 
285  return 0;
286 }
287 
288 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
289 {
290  ff_thread_release_buffer(ctx, &f->tf);
291  av_buffer_unref(&f->extradata);
292 }
293 
294 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
295 {
296  int res;
297 
298  if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
299  return res;
300  } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
301  vp9_unref_frame(ctx, dst);
302  return AVERROR(ENOMEM);
303  }
304 
305  dst->segmentation_map = src->segmentation_map;
306  dst->mv = src->mv;
307 
308  return 0;
309 }
310 
311 static int update_size(AVCodecContext *ctx, int w, int h)
312 {
313  VP9Context *s = ctx->priv_data;
314  uint8_t *p;
315 
316  av_assert0(w > 0 && h > 0);
317 
318  if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
319  return 0;
320 
321  ctx->width = w;
322  ctx->height = h;
323  s->sb_cols = (w + 63) >> 6;
324  s->sb_rows = (h + 63) >> 6;
325  s->cols = (w + 7) >> 3;
326  s->rows = (h + 7) >> 3;
327 
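// a single allocation is carved up into the per-column "above" context
// arrays and the loopfilter-level buffer using the assign() macro below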
328 #define assign(var, type, n) var = (type) p; p += s->sb_cols * n * sizeof(*var)
329  av_freep(&s->intra_pred_data[0]);
330  p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
331  if (!p)
332  return AVERROR(ENOMEM);
333  assign(s->intra_pred_data[0], uint8_t *, 64);
334  assign(s->intra_pred_data[1], uint8_t *, 32);
335  assign(s->intra_pred_data[2], uint8_t *, 32);
336  assign(s->above_y_nnz_ctx, uint8_t *, 16);
337  assign(s->above_mode_ctx, uint8_t *, 16);
338  assign(s->above_mv_ctx, VP56mv(*)[2], 16);
339  assign(s->above_partition_ctx, uint8_t *, 8);
340  assign(s->above_skip_ctx, uint8_t *, 8);
341  assign(s->above_txfm_ctx, uint8_t *, 8);
342  assign(s->above_uv_nnz_ctx[0], uint8_t *, 8);
343  assign(s->above_uv_nnz_ctx[1], uint8_t *, 8);
344  assign(s->above_segpred_ctx, uint8_t *, 8);
345  assign(s->above_intra_ctx, uint8_t *, 8);
346  assign(s->above_comp_ctx, uint8_t *, 8);
347  assign(s->above_ref_ctx, uint8_t *, 8);
348  assign(s->above_filter_ctx, uint8_t *, 8);
349  assign(s->lflvl, struct VP9Filter *, 1);
350 #undef assign
351 
352  // these will be re-allocated a little later
353  av_freep(&s->b_base);
354  av_freep(&s->block_base);
355 
356  return 0;
357 }
358 
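// (re)allocate block reconstruction buffers; in 2-pass mode enough for a
// whole frame of superblocks is kept, otherwise only a single block's worth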
359 static int update_block_buffers(AVCodecContext *ctx)
360 {
361  VP9Context *s = ctx->priv_data;
362 
363  if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
364  return 0;
365 
366  av_free(s->b_base);
367  av_free(s->block_base);
368  if (s->uses_2pass) {
369  int sbs = s->sb_cols * s->sb_rows;
370 
371  s->b_base = av_malloc(sizeof(VP9Block) * s->cols * s->rows);
372  s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
373  if (!s->b_base || !s->block_base)
374  return AVERROR(ENOMEM);
375  s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
376  s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
377  s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
378  s->uveob_base[0] = s->eob_base + 256 * sbs;
379  s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
380  } else {
381  s->b_base = av_malloc(sizeof(VP9Block));
382  s->block_base = av_mallocz((64 * 64 + 128) * 3);
383  if (!s->b_base || !s->block_base)
384  return AVERROR(ENOMEM);
385  s->uvblock_base[0] = s->block_base + 64 * 64;
386  s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
387  s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
388  s->uveob_base[0] = s->eob_base + 256;
389  s->uveob_base[1] = s->uveob_base[0] + 64;
390  }
391  s->block_alloc_using_2pass = s->uses_2pass;
392 
393  return 0;
394 }
395 
396 // for some reason the sign bit is at the end, not the start, of a bit sequence
397 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
398 {
399  int v = get_bits(gb, n);
400  return get_bits1(gb) ? -v : v;
401 }
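// e.g. for n = 4, the bits 0101 followed by a set sign bit decode to -5;
// a cleared sign bit would give +5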
402 
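// undo the recentering of a value around m: v == 0 maps back to m, odd v to
// m - (v + 1) / 2, even v to m + v / 2, and v > 2 * m is passed through unchanged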
403 static av_always_inline int inv_recenter_nonneg(int v, int m)
404 {
405  return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
406 }
407 
408 // differential forward probability updates
409 static int update_prob(VP56RangeCoder *c, int p)
410 {
411  static const int inv_map_table[254] = {
412  7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
413  189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
414  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
415  25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
416  40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
417  55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
418  70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
419  86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
420  101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
421  116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
422  131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
423  146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
424  161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
425  177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
426  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
427  207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
428  222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
429  237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
430  252, 253,
431  };
432  int d;
433 
434  /* This code is trying to do a differential probability update. For a
435  * current probability A in the range [1, 255], the difference to a new
436  * probability of any value can be expressed differentially as 1-A,255-A
437  * where some part of this (absolute range) exists both in positive as
438  * well as the negative part, whereas another part only exists in one
439  * half. We're trying to code this shared part differentially, i.e.
440  * times two where the value of the lowest bit specifies the sign, and
441  * the single part is then coded on top of this. This absolute difference
442  * then again has a value of [0,254], but a bigger value in this range
443  * indicates that we're further away from the original value A, so we
444  * can code this as a VLC code, since higher values are increasingly
445  * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
446  * updates vs. the 'fine, exact' updates further down the range, which
447  * adds one extra dimension to this differential update model. */
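 /* Decoding tiers for the delta index d, matching the branches below:
  * 4 bits for d < 16, 4 more bits offset by 16 for d < 32, 5 bits offset
  * by 32 for d < 64, and a 7-bit code (extended by one extra bit for
  * values >= 65) offset by 64 for the rest. inv_map_table[d] then gives
  * the absolute delta, which inv_recenter_nonneg() folds back around the
  * current probability p. */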
448 
449  if (!vp8_rac_get(c)) {
450  d = vp8_rac_get_uint(c, 4) + 0;
451  } else if (!vp8_rac_get(c)) {
452  d = vp8_rac_get_uint(c, 4) + 16;
453  } else if (!vp8_rac_get(c)) {
454  d = vp8_rac_get_uint(c, 5) + 32;
455  } else {
456  d = vp8_rac_get_uint(c, 7);
457  if (d >= 65)
458  d = (d << 1) - 65 + vp8_rac_get(c);
459  d += 64;
460  }
461 
462  return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
463  255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
464 }
465 
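// parse the uncompressed frame header (frame type, dimensions, loopfilter,
// quantizer, segmentation and tiling) plus the arithmetic-coded probability
// updates; returns the total header size in bytes (0 for a show-existing-frame
// packet) or a negative error code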
466 static int decode_frame_header(AVCodecContext *ctx,
467  const uint8_t *data, int size, int *ref)
468 {
469  VP9Context *s = ctx->priv_data;
470  int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
471  int last_invisible;
472  const uint8_t *data2;
473 
474  /* general header */
475  if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
476  av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
477  return res;
478  }
479  if (get_bits(&s->gb, 2) != 0x2) { // frame marker
480  av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
481  return AVERROR_INVALIDDATA;
482  }
483  s->profile = get_bits1(&s->gb);
484  if (get_bits1(&s->gb)) { // reserved bit
485  av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
486  return AVERROR_INVALIDDATA;
487  }
488  if (get_bits1(&s->gb)) {
489  *ref = get_bits(&s->gb, 3);
490  return 0;
491  }
492  s->last_uses_2pass = s->uses_2pass;
493  s->last_keyframe = s->keyframe;
494  s->keyframe = !get_bits1(&s->gb);
495  last_invisible = s->invisible;
496  s->invisible = !get_bits1(&s->gb);
497  s->errorres = get_bits1(&s->gb);
498  s->use_last_frame_mvs = !s->errorres && !last_invisible;
499  if (s->keyframe) {
500  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
501  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
502  return AVERROR_INVALIDDATA;
503  }
504  s->colorspace = get_bits(&s->gb, 3);
505  if (s->colorspace == 7) { // RGB = profile 1
506  av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
507  return AVERROR_INVALIDDATA;
508  }
509  s->fullrange = get_bits1(&s->gb);
510  // for profile 1, here follows the subsampling bits
511  s->refreshrefmask = 0xff;
512  w = get_bits(&s->gb, 16) + 1;
513  h = get_bits(&s->gb, 16) + 1;
514  if (get_bits1(&s->gb)) // display size
515  skip_bits(&s->gb, 32);
516  } else {
517  s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
518  s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
519  if (s->intraonly) {
520  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
521  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
522  return AVERROR_INVALIDDATA;
523  }
524  s->refreshrefmask = get_bits(&s->gb, 8);
525  w = get_bits(&s->gb, 16) + 1;
526  h = get_bits(&s->gb, 16) + 1;
527  if (get_bits1(&s->gb)) // display size
528  skip_bits(&s->gb, 32);
529  } else {
530  s->refreshrefmask = get_bits(&s->gb, 8);
531  s->refidx[0] = get_bits(&s->gb, 3);
532  s->signbias[0] = get_bits1(&s->gb);
533  s->refidx[1] = get_bits(&s->gb, 3);
534  s->signbias[1] = get_bits1(&s->gb);
535  s->refidx[2] = get_bits(&s->gb, 3);
536  s->signbias[2] = get_bits1(&s->gb);
537  if (!s->refs[s->refidx[0]].f->data[0] ||
538  !s->refs[s->refidx[1]].f->data[0] ||
539  !s->refs[s->refidx[2]].f->data[0]) {
540  av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
541  return AVERROR_INVALIDDATA;
542  }
543  if (get_bits1(&s->gb)) {
544  w = s->refs[s->refidx[0]].f->width;
545  h = s->refs[s->refidx[0]].f->height;
546  } else if (get_bits1(&s->gb)) {
547  w = s->refs[s->refidx[1]].f->width;
548  h = s->refs[s->refidx[1]].f->height;
549  } else if (get_bits1(&s->gb)) {
550  w = s->refs[s->refidx[2]].f->width;
551  h = s->refs[s->refidx[2]].f->height;
552  } else {
553  w = get_bits(&s->gb, 16) + 1;
554  h = get_bits(&s->gb, 16) + 1;
555  }
556  // Note that in this code, "CUR_FRAME" is actually before we
557  // have formally allocated a frame, and thus actually represents
558  // the _last_ frame
559  s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
560  s->frames[CUR_FRAME].tf.f->height == h;
561  if (get_bits1(&s->gb)) // display size
562  skip_bits(&s->gb, 32);
563  s->highprecisionmvs = get_bits1(&s->gb);
564  s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
565  get_bits(&s->gb, 2);
566  s->allowcompinter = s->signbias[0] != s->signbias[1] ||
567  s->signbias[0] != s->signbias[2];
568  if (s->allowcompinter) {
569  if (s->signbias[0] == s->signbias[1]) {
570  s->fixcompref = 2;
571  s->varcompref[0] = 0;
572  s->varcompref[1] = 1;
573  } else if (s->signbias[0] == s->signbias[2]) {
574  s->fixcompref = 1;
575  s->varcompref[0] = 0;
576  s->varcompref[1] = 2;
577  } else {
578  s->fixcompref = 0;
579  s->varcompref[0] = 1;
580  s->varcompref[1] = 2;
581  }
582  }
583  }
584  }
585  s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
586  s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
587  s->framectxid = c = get_bits(&s->gb, 2);
588 
589  /* loopfilter header data */
590  s->filter.level = get_bits(&s->gb, 6);
591  sharp = get_bits(&s->gb, 3);
592  // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
593  // the old cache values since they are still valid
594  if (s->filter.sharpness != sharp)
595  memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
596  s->filter.sharpness = sharp;
597  if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
598  if (get_bits1(&s->gb)) {
599  for (i = 0; i < 4; i++)
600  if (get_bits1(&s->gb))
601  s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
602  for (i = 0; i < 2; i++)
603  if (get_bits1(&s->gb))
604  s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
605  }
606  } else {
607  memset(&s->lf_delta, 0, sizeof(s->lf_delta));
608  }
609 
610  /* quantization header data */
611  s->yac_qi = get_bits(&s->gb, 8);
612  s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
613  s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
614  s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
615  s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
616  s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
617 
618  /* segmentation header info */
619  if ((s->segmentation.enabled = get_bits1(&s->gb))) {
620  if ((s->segmentation.update_map = get_bits1(&s->gb))) {
621  for (i = 0; i < 7; i++)
622  s->prob.seg[i] = get_bits1(&s->gb) ?
623  get_bits(&s->gb, 8) : 255;
624  if ((s->segmentation.temporal = get_bits1(&s->gb))) {
625  for (i = 0; i < 3; i++)
626  s->prob.segpred[i] = get_bits1(&s->gb) ?
627  get_bits(&s->gb, 8) : 255;
628  }
629  }
630  if ((!s->segmentation.update_map || s->segmentation.temporal) &&
631  (w != s->frames[CUR_FRAME].tf.f->width ||
632  h != s->frames[CUR_FRAME].tf.f->height)) {
633  av_log(ctx, AV_LOG_ERROR,
634  "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
635  s->segmentation.temporal, s->segmentation.update_map);
636  return AVERROR_INVALIDDATA;
637  }
638 
639  if (get_bits1(&s->gb)) {
640  s->segmentation.absolute_vals = get_bits1(&s->gb);
641  for (i = 0; i < 8; i++) {
642  if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
643  s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
644  if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
645  s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
646  if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
647  s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
648  s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
649  }
650  }
651  } else {
652  s->segmentation.feat[0].q_enabled = 0;
653  s->segmentation.feat[0].lf_enabled = 0;
654  s->segmentation.feat[0].skip_enabled = 0;
655  s->segmentation.feat[0].ref_enabled = 0;
656  }
657 
658  // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
659  for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
660  int qyac, qydc, quvac, quvdc, lflvl, sh;
661 
662  if (s->segmentation.feat[i].q_enabled) {
663  if (s->segmentation.absolute_vals)
664  qyac = s->segmentation.feat[i].q_val;
665  else
666  qyac = s->yac_qi + s->segmentation.feat[i].q_val;
667  } else {
668  qyac = s->yac_qi;
669  }
670  qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
671  quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
672  quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
673  qyac = av_clip_uintp2(qyac, 8);
674 
675  s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
676  s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
677  s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
678  s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
679 
680  sh = s->filter.level >= 32;
681  if (s->segmentation.feat[i].lf_enabled) {
682  if (s->segmentation.absolute_vals)
683  lflvl = s->segmentation.feat[i].lf_val;
684  else
685  lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
686  } else {
687  lflvl = s->filter.level;
688  }
689  s->segmentation.feat[i].lflvl[0][0] =
690  s->segmentation.feat[i].lflvl[0][1] =
691  av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
692  for (j = 1; j < 4; j++) {
693  s->segmentation.feat[i].lflvl[j][0] =
694  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
695  s->lf_delta.mode[0]) << sh), 6);
696  s->segmentation.feat[i].lflvl[j][1] =
697  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
698  s->lf_delta.mode[1]) << sh), 6);
699  }
700  }
701 
702  /* tiling info */
703  if ((res = update_size(ctx, w, h)) < 0) {
704  av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
705  return res;
706  }
707  for (s->tiling.log2_tile_cols = 0;
708  (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
709  s->tiling.log2_tile_cols++) ;
710  for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
711  max = FFMAX(0, max - 1);
712  while (max > s->tiling.log2_tile_cols) {
713  if (get_bits1(&s->gb))
714  s->tiling.log2_tile_cols++;
715  else
716  break;
717  }
718  s->tiling.log2_tile_rows = decode012(&s->gb);
719  s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
720  if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
721  s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
722  s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
723  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
724  if (!s->c_b) {
725  av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
726  return AVERROR(ENOMEM);
727  }
728  }
729 
730  if (s->keyframe || s->errorres || s->intraonly) {
731  s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
732  s->prob_ctx[3].p = vp9_default_probs;
733  memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
734  sizeof(vp9_default_coef_probs));
735  memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
736  sizeof(vp9_default_coef_probs));
737  memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
738  sizeof(vp9_default_coef_probs));
739  memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
740  sizeof(vp9_default_coef_probs));
741  }
742 
743  // next 16 bits is size of the rest of the header (arith-coded)
744  size2 = get_bits(&s->gb, 16);
745  data2 = align_get_bits(&s->gb);
746  if (size2 > size - (data2 - data)) {
747  av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
748  return AVERROR_INVALIDDATA;
749  }
750  ff_vp56_init_range_decoder(&s->c, data2, size2);
751  if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
752  av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
753  return AVERROR_INVALIDDATA;
754  }
755 
756  if (s->keyframe || s->intraonly) {
757  memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
758  } else {
759  memset(&s->counts, 0, sizeof(s->counts));
760  }
761  // FIXME is it faster to not copy here, but do it down in the fw updates
762  // as explicit copies if the fw update is missing (and skip the copy upon
763  // fw update)?
764  s->prob.p = s->prob_ctx[c].p;
765 
766  // txfm updates
767  if (s->lossless) {
768  s->txfmmode = TX_4X4;
769  } else {
770  s->txfmmode = vp8_rac_get_uint(&s->c, 2);
771  if (s->txfmmode == 3)
772  s->txfmmode += vp8_rac_get(&s->c);
773 
774  if (s->txfmmode == TX_SWITCHABLE) {
775  for (i = 0; i < 2; i++)
776  if (vp56_rac_get_prob_branchy(&s->c, 252))
777  s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
778  for (i = 0; i < 2; i++)
779  for (j = 0; j < 2; j++)
780  if (vp56_rac_get_prob_branchy(&s->c, 252))
781  s->prob.p.tx16p[i][j] =
782  update_prob(&s->c, s->prob.p.tx16p[i][j]);
783  for (i = 0; i < 2; i++)
784  for (j = 0; j < 3; j++)
785  if (vp56_rac_get_prob_branchy(&s->c, 252))
786  s->prob.p.tx32p[i][j] =
787  update_prob(&s->c, s->prob.p.tx32p[i][j]);
788  }
789  }
790 
791  // coef updates
792  for (i = 0; i < 4; i++) {
793  uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
794  if (vp8_rac_get(&s->c)) {
795  for (j = 0; j < 2; j++)
796  for (k = 0; k < 2; k++)
797  for (l = 0; l < 6; l++)
798  for (m = 0; m < 6; m++) {
799  uint8_t *p = s->prob.coef[i][j][k][l][m];
800  uint8_t *r = ref[j][k][l][m];
801  if (m >= 3 && l == 0) // dc only has 3 pt
802  break;
803  for (n = 0; n < 3; n++) {
804  if (vp56_rac_get_prob_branchy(&s->c, 252)) {
805  p[n] = update_prob(&s->c, r[n]);
806  } else {
807  p[n] = r[n];
808  }
809  }
810  p[3] = 0;
811  }
812  } else {
813  for (j = 0; j < 2; j++)
814  for (k = 0; k < 2; k++)
815  for (l = 0; l < 6; l++)
816  for (m = 0; m < 6; m++) {
817  uint8_t *p = s->prob.coef[i][j][k][l][m];
818  uint8_t *r = ref[j][k][l][m];
819  if (m > 3 && l == 0) // dc only has 3 pt
820  break;
821  memcpy(p, r, 3);
822  p[3] = 0;
823  }
824  }
825  if (s->txfmmode == i)
826  break;
827  }
828 
829  // mode updates
830  for (i = 0; i < 3; i++)
831  if (vp56_rac_get_prob_branchy(&s->c, 252))
832  s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
833  if (!s->keyframe && !s->intraonly) {
834  for (i = 0; i < 7; i++)
835  for (j = 0; j < 3; j++)
836  if (vp56_rac_get_prob_branchy(&s->c, 252))
837  s->prob.p.mv_mode[i][j] =
838  update_prob(&s->c, s->prob.p.mv_mode[i][j]);
839 
840  if (s->filtermode == FILTER_SWITCHABLE)
841  for (i = 0; i < 4; i++)
842  for (j = 0; j < 2; j++)
843  if (vp56_rac_get_prob_branchy(&s->c, 252))
844  s->prob.p.filter[i][j] =
845  update_prob(&s->c, s->prob.p.filter[i][j]);
846 
847  for (i = 0; i < 4; i++)
848  if (vp56_rac_get_prob_branchy(&s->c, 252))
849  s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
850 
851  if (s->allowcompinter) {
852  s->comppredmode = vp8_rac_get(&s->c);
853  if (s->comppredmode)
854  s->comppredmode += vp8_rac_get(&s->c);
855  if (s->comppredmode == PRED_SWITCHABLE)
856  for (i = 0; i < 5; i++)
857  if (vp56_rac_get_prob_branchy(&s->c, 252))
858  s->prob.p.comp[i] =
859  update_prob(&s->c, s->prob.p.comp[i]);
860  } else {
861  s->comppredmode = PRED_SINGLEREF;
862  }
863 
864  if (s->comppredmode != PRED_COMPREF) {
865  for (i = 0; i < 5; i++) {
866  if (vp56_rac_get_prob_branchy(&s->c, 252))
867  s->prob.p.single_ref[i][0] =
868  update_prob(&s->c, s->prob.p.single_ref[i][0]);
869  if (vp56_rac_get_prob_branchy(&s->c, 252))
870  s->prob.p.single_ref[i][1] =
871  update_prob(&s->c, s->prob.p.single_ref[i][1]);
872  }
873  }
874 
875  if (s->comppredmode != PRED_SINGLEREF) {
876  for (i = 0; i < 5; i++)
877  if (vp56_rac_get_prob_branchy(&s->c, 252))
878  s->prob.p.comp_ref[i] =
879  update_prob(&s->c, s->prob.p.comp_ref[i]);
880  }
881 
882  for (i = 0; i < 4; i++)
883  for (j = 0; j < 9; j++)
884  if (vp56_rac_get_prob_branchy(&s->c, 252))
885  s->prob.p.y_mode[i][j] =
886  update_prob(&s->c, s->prob.p.y_mode[i][j]);
887 
888  for (i = 0; i < 4; i++)
889  for (j = 0; j < 4; j++)
890  for (k = 0; k < 3; k++)
891  if (vp56_rac_get_prob_branchy(&s->c, 252))
892  s->prob.p.partition[3 - i][j][k] =
893  update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
894 
895  // mv fields don't use the update_prob subexp model for some reason
896  for (i = 0; i < 3; i++)
897  if (vp56_rac_get_prob_branchy(&s->c, 252))
898  s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
899 
900  for (i = 0; i < 2; i++) {
901  if (vp56_rac_get_prob_branchy(&s->c, 252))
902  s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
903 
904  for (j = 0; j < 10; j++)
905  if (vp56_rac_get_prob_branchy(&s->c, 252))
906  s->prob.p.mv_comp[i].classes[j] =
907  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
908 
909  if (vp56_rac_get_prob_branchy(&s->c, 252))
910  s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
911 
912  for (j = 0; j < 10; j++)
913  if (vp56_rac_get_prob_branchy(&s->c, 252))
914  s->prob.p.mv_comp[i].bits[j] =
915  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
916  }
917 
918  for (i = 0; i < 2; i++) {
919  for (j = 0; j < 2; j++)
920  for (k = 0; k < 3; k++)
921  if (vp56_rac_get_prob_branchy(&s->c, 252))
922  s->prob.p.mv_comp[i].class0_fp[j][k] =
923  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
924 
925  for (j = 0; j < 3; j++)
926  if (vp56_rac_get_prob_branchy(&s->c, 252))
927  s->prob.p.mv_comp[i].fp[j] =
928  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
929  }
930 
931  if (s->highprecisionmvs) {
932  for (i = 0; i < 2; i++) {
933  if (vp56_rac_get_prob_branchy(&s->c, 252))
934  s->prob.p.mv_comp[i].class0_hp =
935  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
936 
937  if (vp56_rac_get_prob_branchy(&s->c, 252))
938  s->prob.p.mv_comp[i].hp =
939  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
940  }
941  }
942  }
943 
944  return (data2 - data) + size2;
945 }
946 
947 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
948  VP9Context *s)
949 {
950  dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
951  dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
952 }
953 
954 static void find_ref_mvs(VP9Context *s,
955  VP56mv *pmv, int ref, int z, int idx, int sb)
956 {
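 // per-block-size list of neighbouring positions (col/row offsets in 8x8
 // block units) scanned below when collecting candidate motion vectors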
957  static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
958  [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
959  { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
960  [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
961  { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
962  [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
963  { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
964  [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
965  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
966  [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
967  { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
968  [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
969  { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
970  [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
971  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
972  [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
973  { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
974  [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
975  { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
976  [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
977  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
978  [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
979  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
980  [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
981  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
982  [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
983  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
984  };
985  VP9Block *b = s->b;
986  int row = s->row, col = s->col, row7 = s->row7;
987  const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
988 #define INVALID_MV 0x80008000U
989  uint32_t mem = INVALID_MV;
990  int i;
991 
992 #define RETURN_DIRECT_MV(mv) \
993  do { \
994  uint32_t m = AV_RN32A(&mv); \
995  if (!idx) { \
996  AV_WN32A(pmv, m); \
997  return; \
998  } else if (mem == INVALID_MV) { \
999  mem = m; \
1000  } else if (m != mem) { \
1001  AV_WN32A(pmv, m); \
1002  return; \
1003  } \
1004  } while (0)
1005 
1006  if (sb >= 0) {
1007  if (sb == 2 || sb == 1) {
1008  RETURN_DIRECT_MV(b->mv[0][z]);
1009  } else if (sb == 3) {
1010  RETURN_DIRECT_MV(b->mv[2][z]);
1011  RETURN_DIRECT_MV(b->mv[1][z]);
1012  RETURN_DIRECT_MV(b->mv[0][z]);
1013  }
1014 
1015 #define RETURN_MV(mv) \
1016  do { \
1017  if (sb > 0) { \
1018  VP56mv tmp; \
1019  uint32_t m; \
1020  clamp_mv(&tmp, &mv, s); \
1021  m = AV_RN32A(&tmp); \
1022  if (!idx) { \
1023  AV_WN32A(pmv, m); \
1024  return; \
1025  } else if (mem == INVALID_MV) { \
1026  mem = m; \
1027  } else if (m != mem) { \
1028  AV_WN32A(pmv, m); \
1029  return; \
1030  } \
1031  } else { \
1032  uint32_t m = AV_RN32A(&mv); \
1033  if (!idx) { \
1034  clamp_mv(pmv, &mv, s); \
1035  return; \
1036  } else if (mem == INVALID_MV) { \
1037  mem = m; \
1038  } else if (m != mem) { \
1039  clamp_mv(pmv, &mv, s); \
1040  return; \
1041  } \
1042  } \
1043  } while (0)
1044 
1045  if (row > 0) {
1046  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1047  if (mv->ref[0] == ref) {
1048  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1049  } else if (mv->ref[1] == ref) {
1050  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1051  }
1052  }
1053  if (col > s->tiling.tile_col_start) {
1054  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1055  if (mv->ref[0] == ref) {
1056  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1057  } else if (mv->ref[1] == ref) {
1058  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1059  }
1060  }
1061  i = 2;
1062  } else {
1063  i = 0;
1064  }
1065 
1066  // previously coded MVs in this neighbourhood, using same reference frame
1067  for (; i < 8; i++) {
1068  int c = p[i][0] + col, r = p[i][1] + row;
1069 
1070  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1071  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1072 
1073  if (mv->ref[0] == ref) {
1074  RETURN_MV(mv->mv[0]);
1075  } else if (mv->ref[1] == ref) {
1076  RETURN_MV(mv->mv[1]);
1077  }
1078  }
1079  }
1080 
1081  // MV at this position in previous frame, using same reference frame
1082  if (s->use_last_frame_mvs) {
1083  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1084 
1085  if (!s->last_uses_2pass)
1086  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1087  if (mv->ref[0] == ref) {
1088  RETURN_MV(mv->mv[0]);
1089  } else if (mv->ref[1] == ref) {
1090  RETURN_MV(mv->mv[1]);
1091  }
1092  }
1093 
1094 #define RETURN_SCALE_MV(mv, scale) \
1095  do { \
1096  if (scale) { \
1097  VP56mv mv_temp = { -mv.x, -mv.y }; \
1098  RETURN_MV(mv_temp); \
1099  } else { \
1100  RETURN_MV(mv); \
1101  } \
1102  } while (0)
1103 
1104  // previously coded MVs in this neighbourhood, using different reference frame
1105  for (i = 0; i < 8; i++) {
1106  int c = p[i][0] + col, r = p[i][1] + row;
1107 
1108  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1109  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1110 
1111  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1112  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1113  }
1114  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1115  // BUG - libvpx has this condition regardless of whether
1116  // we used the first ref MV and pre-scaling
1117  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1118  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1119  }
1120  }
1121  }
1122 
1123  // MV at this position in previous frame, using different reference frame
1124  if (s->use_last_frame_mvs) {
1125  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1126 
1127  // no need to await_progress, because we already did that above
1128  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1129  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1130  }
1131  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1132  // BUG - libvpx has this condition regardless of whether
1133  // we used the first ref MV and pre-scaling
1134  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1135  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1136  }
1137  }
1138 
1139  AV_ZERO32(pmv);
1140 #undef INVALID_MV
1141 #undef RETURN_MV
1142 #undef RETURN_SCALE_MV
1143 }
1144 
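// decode one motion vector component: a sign bit, a magnitude class, then
// either the class-0 path (one integer bit, two fractional bits, optional
// high-precision bit) or, for larger classes, 'class' integer bits plus the
// fractional and high-precision bits; the sign is applied last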
1145 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1146 {
1147  int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1148  int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1149  s->prob.p.mv_comp[idx].classes);
1150 
1151  s->counts.mv_comp[idx].sign[sign]++;
1152  s->counts.mv_comp[idx].classes[c]++;
1153  if (c) {
1154  int m;
1155 
1156  for (n = 0, m = 0; m < c; m++) {
1157  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1158  n |= bit << m;
1159  s->counts.mv_comp[idx].bits[m][bit]++;
1160  }
1161  n <<= 3;
1162  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1163  n |= bit << 1;
1164  s->counts.mv_comp[idx].fp[bit]++;
1165  if (hp) {
1166  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1167  s->counts.mv_comp[idx].hp[bit]++;
1168  n |= bit;
1169  } else {
1170  n |= 1;
1171  // bug in libvpx - we count for bw entropy purposes even if the
1172  // bit wasn't coded
1173  s->counts.mv_comp[idx].hp[1]++;
1174  }
1175  n += 8 << c;
1176  } else {
1177  n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1178  s->counts.mv_comp[idx].class0[n]++;
1179  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1180  s->prob.p.mv_comp[idx].class0_fp[n]);
1181  s->counts.mv_comp[idx].class0_fp[n][bit]++;
1182  n = (n << 3) | (bit << 1);
1183  if (hp) {
1184  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1185  s->counts.mv_comp[idx].class0_hp[bit]++;
1186  n |= bit;
1187  } else {
1188  n |= 1;
1189  // bug in libvpx - we count for bw entropy purposes even if the
1190  // bit wasn't coded
1191  s->counts.mv_comp[idx].class0_hp[1]++;
1192  }
1193  }
1194 
1195  return sign ? -(n + 1) : (n + 1);
1196 }
1197 
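// derive the (up to two) motion vectors of a (sub)block: ZEROMV clears them,
// otherwise predict via find_ref_mvs(), force the components to even values
// when high-precision MVs cannot be used, and for NEWMV add a coded
// joint/component delta on top of the prediction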
1198 static void fill_mv(VP9Context *s,
1199  VP56mv *mv, int mode, int sb)
1200 {
1201  VP9Block *b = s->b;
1202 
1203  if (mode == ZEROMV) {
1204  AV_ZERO64(mv);
1205  } else {
1206  int hp;
1207 
1208  // FIXME cache this value and reuse for other subblocks
1209  find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1210  mode == NEWMV ? -1 : sb);
1211  // FIXME maybe move this code into find_ref_mvs()
1212  if ((mode == NEWMV || sb == -1) &&
1213  !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1214  if (mv[0].y & 1) {
1215  if (mv[0].y < 0)
1216  mv[0].y++;
1217  else
1218  mv[0].y--;
1219  }
1220  if (mv[0].x & 1) {
1221  if (mv[0].x < 0)
1222  mv[0].x++;
1223  else
1224  mv[0].x--;
1225  }
1226  }
1227  if (mode == NEWMV) {
1228  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1229  s->prob.p.mv_joint);
1230 
1231  s->counts.mv_joint[j]++;
1232  if (j >= MV_JOINT_V)
1233  mv[0].y += read_mv_component(s, 0, hp);
1234  if (j & 1)
1235  mv[0].x += read_mv_component(s, 1, hp);
1236  }
1237 
1238  if (b->comp) {
1239  // FIXME cache this value and reuse for other subblocks
1240  find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1241  mode == NEWMV ? -1 : sb);
1242  if ((mode == NEWMV || sb == -1) &&
1243  !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1244  if (mv[1].y & 1) {
1245  if (mv[1].y < 0)
1246  mv[1].y++;
1247  else
1248  mv[1].y--;
1249  }
1250  if (mv[1].x & 1) {
1251  if (mv[1].x < 0)
1252  mv[1].x++;
1253  else
1254  mv[1].x--;
1255  }
1256  }
1257  if (mode == NEWMV) {
1258  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1259  s->prob.p.mv_joint);
1260 
1261  s->counts.mv_joint[j]++;
1262  if (j >= MV_JOINT_V)
1263  mv[1].y += read_mv_component(s, 0, hp);
1264  if (j & 1)
1265  mv[1].x += read_mv_component(s, 1, hp);
1266  }
1267  }
1268  }
1269 }
1270 
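// fill a w x h rectangle of a byte context plane with the value v, using the
// widest aligned store available for the given width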
1271 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1272  ptrdiff_t stride, int v)
1273 {
1274  switch (w) {
1275  case 1:
1276  do {
1277  *ptr = v;
1278  ptr += stride;
1279  } while (--h);
1280  break;
1281  case 2: {
1282  int v16 = v * 0x0101;
1283  do {
1284  AV_WN16A(ptr, v16);
1285  ptr += stride;
1286  } while (--h);
1287  break;
1288  }
1289  case 4: {
1290  uint32_t v32 = v * 0x01010101;
1291  do {
1292  AV_WN32A(ptr, v32);
1293  ptr += stride;
1294  } while (--h);
1295  break;
1296  }
1297  case 8: {
1298 #if HAVE_FAST_64BIT
1299  uint64_t v64 = v * 0x0101010101010101ULL;
1300  do {
1301  AV_WN64A(ptr, v64);
1302  ptr += stride;
1303  } while (--h);
1304 #else
1305  uint32_t v32 = v * 0x01010101;
1306  do {
1307  AV_WN32A(ptr, v32);
1308  AV_WN32A(ptr + 4, v32);
1309  ptr += stride;
1310  } while (--h);
1311 #endif
1312  break;
1313  }
1314  }
1315 }
1316 
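// decode all per-block mode information: segment id, skip flag, transform
// size, intra modes or reference frames, inter modes and motion vectors, then
// propagate the results into the left/above context caches used by later blocks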
1317 static void decode_mode(AVCodecContext *ctx)
1318 {
1319  static const uint8_t left_ctx[N_BS_SIZES] = {
1320  0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1321  };
1322  static const uint8_t above_ctx[N_BS_SIZES] = {
1323  0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1324  };
1325  static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1326  TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1327  TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1328  };
1329  VP9Context *s = ctx->priv_data;
1330  VP9Block *b = s->b;
1331  int row = s->row, col = s->col, row7 = s->row7;
1332  enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1333  int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1334  int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1335  int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1336  int vref, filter_id;
1337 
1338  if (!s->segmentation.enabled) {
1339  b->seg_id = 0;
1340  } else if (s->keyframe || s->intraonly) {
1341  b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
1342  } else if (!s->segmentation.update_map ||
1343  (s->segmentation.temporal &&
1344  vp56_rac_get_prob_branchy(&s->c,
1345  s->prob.segpred[s->above_segpred_ctx[col] +
1346  s->left_segpred_ctx[row7]]))) {
1347  int pred = 8, x;
1348  uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1349 
1350  if (!s->last_uses_2pass)
1351  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1352  for (y = 0; y < h4; y++)
1353  for (x = 0; x < w4; x++)
1354  pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
1355  av_assert1(pred < 8);
1356  b->seg_id = pred;
1357 
1358  memset(&s->above_segpred_ctx[col], 1, w4);
1359  memset(&s->left_segpred_ctx[row7], 1, h4);
1360  } else {
1361  b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1362  s->prob.seg);
1363 
1364  memset(&s->above_segpred_ctx[col], 0, w4);
1365  memset(&s->left_segpred_ctx[row7], 0, h4);
1366  }
1367  if (s->segmentation.enabled &&
1368  (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1369  setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1370  w4, h4, 8 * s->sb_cols, b->seg_id);
1371  }
1372 
1373  b->skip = s->segmentation.enabled &&
1374  s->segmentation.feat[b->seg_id].skip_enabled;
1375  if (!b->skip) {
1376  int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1377  b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1378  s->counts.skip[c][b->skip]++;
1379  }
1380 
1381  if (s->keyframe || s->intraonly) {
1382  b->intra = 1;
1383  } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1384  b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1385  } else {
1386  int c, bit;
1387 
1388  if (have_a && have_l) {
1389  c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1390  c += (c == 2);
1391  } else {
1392  c = have_a ? 2 * s->above_intra_ctx[col] :
1393  have_l ? 2 * s->left_intra_ctx[row7] : 0;
1394  }
1395  bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1396  s->counts.intra[c][bit]++;
1397  b->intra = !bit;
1398  }
1399 
1400  if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1401  int c;
1402  if (have_a) {
1403  if (have_l) {
1404  c = (s->above_skip_ctx[col] ? max_tx :
1405  s->above_txfm_ctx[col]) +
1406  (s->left_skip_ctx[row7] ? max_tx :
1407  s->left_txfm_ctx[row7]) > max_tx;
1408  } else {
1409  c = s->above_skip_ctx[col] ? 1 :
1410  (s->above_txfm_ctx[col] * 2 > max_tx);
1411  }
1412  } else if (have_l) {
1413  c = s->left_skip_ctx[row7] ? 1 :
1414  (s->left_txfm_ctx[row7] * 2 > max_tx);
1415  } else {
1416  c = 1;
1417  }
1418  switch (max_tx) {
1419  case TX_32X32:
1420  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1421  if (b->tx) {
1422  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1423  if (b->tx == 2)
1424  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1425  }
1426  s->counts.tx32p[c][b->tx]++;
1427  break;
1428  case TX_16X16:
1429  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1430  if (b->tx)
1431  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1432  s->counts.tx16p[c][b->tx]++;
1433  break;
1434  case TX_8X8:
1435  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1436  s->counts.tx8p[c][b->tx]++;
1437  break;
1438  case TX_4X4:
1439  b->tx = TX_4X4;
1440  break;
1441  }
1442  } else {
1443  b->tx = FFMIN(max_tx, s->txfmmode);
1444  }
1445 
1446  if (s->keyframe || s->intraonly) {
1447  uint8_t *a = &s->above_mode_ctx[col * 2];
1448  uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1449 
1450  b->comp = 0;
1451  if (b->bs > BS_8x8) {
1452  // FIXME the memory storage intermediates here aren't really
1453  // necessary, they're just there to make the code slightly
1454  // simpler for now
1455  b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1456  vp9_default_kf_ymode_probs[a[0]][l[0]]);
1457  if (b->bs != BS_8x4) {
1458  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1459  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1460  l[0] = a[1] = b->mode[1];
1461  } else {
1462  l[0] = a[1] = b->mode[1] = b->mode[0];
1463  }
1464  if (b->bs != BS_4x8) {
1465  b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1466  vp9_default_kf_ymode_probs[a[0]][l[1]]);
1467  if (b->bs != BS_8x4) {
1468  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1469  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1470  l[1] = a[1] = b->mode[3];
1471  } else {
1472  l[1] = a[1] = b->mode[3] = b->mode[2];
1473  }
1474  } else {
1475  b->mode[2] = b->mode[0];
1476  l[1] = a[1] = b->mode[3] = b->mode[1];
1477  }
1478  } else {
1479  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1480  vp9_default_kf_ymode_probs[*a][*l]);
1481  b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1482  // FIXME this can probably be optimized
1483  memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1484  memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1485  }
1486  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1487  vp9_default_kf_uvmode_probs[b->mode[3]]);
1488  } else if (b->intra) {
1489  b->comp = 0;
1490  if (b->bs > BS_8x8) {
1491  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1492  s->prob.p.y_mode[0]);
1493  s->counts.y_mode[0][b->mode[0]]++;
1494  if (b->bs != BS_8x4) {
1495  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1496  s->prob.p.y_mode[0]);
1497  s->counts.y_mode[0][b->mode[1]]++;
1498  } else {
1499  b->mode[1] = b->mode[0];
1500  }
1501  if (b->bs != BS_4x8) {
1502  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1503  s->prob.p.y_mode[0]);
1504  s->counts.y_mode[0][b->mode[2]]++;
1505  if (b->bs != BS_8x4) {
1506  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1507  s->prob.p.y_mode[0]);
1508  s->counts.y_mode[0][b->mode[3]]++;
1509  } else {
1510  b->mode[3] = b->mode[2];
1511  }
1512  } else {
1513  b->mode[2] = b->mode[0];
1514  b->mode[3] = b->mode[1];
1515  }
1516  } else {
1517  static const uint8_t size_group[10] = {
1518  3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1519  };
1520  int sz = size_group[b->bs];
1521 
1522  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1523  s->prob.p.y_mode[sz]);
1524  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1525  s->counts.y_mode[sz][b->mode[3]]++;
1526  }
1527  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1528  s->prob.p.uv_mode[b->mode[3]]);
1529  s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1530  } else {
1531  static const uint8_t inter_mode_ctx_lut[14][14] = {
1532  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1533  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1534  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1535  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1536  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1537  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1538  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1539  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1540  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1541  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1542  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1543  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1544  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1545  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1546  };
1547 
1548  if (s->segmentation.feat[b->seg_id].ref_enabled) {
1549  av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1550  b->comp = 0;
1551  b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1552  } else {
1553  // read comp_pred flag
1554  if (s->comppredmode != PRED_SWITCHABLE) {
1555  b->comp = s->comppredmode == PRED_COMPREF;
1556  } else {
1557  int c;
1558 
1559  // FIXME add intra as ref=0xff (or -1) to make these easier?
1560  if (have_a) {
1561  if (have_l) {
1562  if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1563  c = 4;
1564  } else if (s->above_comp_ctx[col]) {
1565  c = 2 + (s->left_intra_ctx[row7] ||
1566  s->left_ref_ctx[row7] == s->fixcompref);
1567  } else if (s->left_comp_ctx[row7]) {
1568  c = 2 + (s->above_intra_ctx[col] ||
1569  s->above_ref_ctx[col] == s->fixcompref);
1570  } else {
1571  c = (!s->above_intra_ctx[col] &&
1572  s->above_ref_ctx[col] == s->fixcompref) ^
1573  (!s->left_intra_ctx[row7] &&
1574  s->left_ref_ctx[row & 7] == s->fixcompref);
1575  }
1576  } else {
1577  c = s->above_comp_ctx[col] ? 3 :
1578  (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1579  }
1580  } else if (have_l) {
1581  c = s->left_comp_ctx[row7] ? 3 :
1582  (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1583  } else {
1584  c = 1;
1585  }
1586  b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1587  s->counts.comp[c][b->comp]++;
1588  }
1589 
1590  // read actual references
1591  // FIXME probably cache a few variables here to prevent repetitive
1592  // memory accesses below
1593  if (b->comp) /* two references */ {
1594  int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1595 
1596  b->ref[fix_idx] = s->fixcompref;
1597  // FIXME can this codeblob be replaced by some sort of LUT?
1598  if (have_a) {
1599  if (have_l) {
1600  if (s->above_intra_ctx[col]) {
1601  if (s->left_intra_ctx[row7]) {
1602  c = 2;
1603  } else {
1604  c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1605  }
1606  } else if (s->left_intra_ctx[row7]) {
1607  c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1608  } else {
1609  int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1610 
1611  if (refl == refa && refa == s->varcompref[1]) {
1612  c = 0;
1613  } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1614  if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1615  (refl == s->fixcompref && refa == s->varcompref[0])) {
1616  c = 4;
1617  } else {
1618  c = (refa == refl) ? 3 : 1;
1619  }
1620  } else if (!s->left_comp_ctx[row7]) {
1621  if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1622  c = 1;
1623  } else {
1624  c = (refl == s->varcompref[1] &&
1625  refa != s->varcompref[1]) ? 2 : 4;
1626  }
1627  } else if (!s->above_comp_ctx[col]) {
1628  if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1629  c = 1;
1630  } else {
1631  c = (refa == s->varcompref[1] &&
1632  refl != s->varcompref[1]) ? 2 : 4;
1633  }
1634  } else {
1635  c = (refl == refa) ? 4 : 2;
1636  }
1637  }
1638  } else {
1639  if (s->above_intra_ctx[col]) {
1640  c = 2;
1641  } else if (s->above_comp_ctx[col]) {
1642  c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1643  } else {
1644  c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1645  }
1646  }
1647  } else if (have_l) {
1648  if (s->left_intra_ctx[row7]) {
1649  c = 2;
1650  } else if (s->left_comp_ctx[row7]) {
1651  c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1652  } else {
1653  c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1654  }
1655  } else {
1656  c = 2;
1657  }
1658  bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1659  b->ref[var_idx] = s->varcompref[bit];
1660  s->counts.comp_ref[c][bit]++;
1661  } else /* single reference */ {
1662  int bit, c;
1663 
1664  if (have_a && !s->above_intra_ctx[col]) {
1665  if (have_l && !s->left_intra_ctx[row7]) {
1666  if (s->left_comp_ctx[row7]) {
1667  if (s->above_comp_ctx[col]) {
1668  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1669  !s->above_ref_ctx[col]);
1670  } else {
1671  c = (3 * !s->above_ref_ctx[col]) +
1672  (!s->fixcompref || !s->left_ref_ctx[row7]);
1673  }
1674  } else if (s->above_comp_ctx[col]) {
1675  c = (3 * !s->left_ref_ctx[row7]) +
1676  (!s->fixcompref || !s->above_ref_ctx[col]);
1677  } else {
1678  c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1679  }
1680  } else if (s->above_intra_ctx[col]) {
1681  c = 2;
1682  } else if (s->above_comp_ctx[col]) {
1683  c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1684  } else {
1685  c = 4 * (!s->above_ref_ctx[col]);
1686  }
1687  } else if (have_l && !s->left_intra_ctx[row7]) {
1688  if (s->left_intra_ctx[row7]) {
1689  c = 2;
1690  } else if (s->left_comp_ctx[row7]) {
1691  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1692  } else {
1693  c = 4 * (!s->left_ref_ctx[row7]);
1694  }
1695  } else {
1696  c = 2;
1697  }
1698  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1699  s->counts.single_ref[c][0][bit]++;
1700  if (!bit) {
1701  b->ref[0] = 0;
1702  } else {
1703  // FIXME can this codeblob be replaced by some sort of LUT?
1704  if (have_a) {
1705  if (have_l) {
1706  if (s->left_intra_ctx[row7]) {
1707  if (s->above_intra_ctx[col]) {
1708  c = 2;
1709  } else if (s->above_comp_ctx[col]) {
1710  c = 1 + 2 * (s->fixcompref == 1 ||
1711  s->above_ref_ctx[col] == 1);
1712  } else if (!s->above_ref_ctx[col]) {
1713  c = 3;
1714  } else {
1715  c = 4 * (s->above_ref_ctx[col] == 1);
1716  }
1717  } else if (s->above_intra_ctx[col]) {
1718  if (s->left_intra_ctx[row7]) {
1719  c = 2;
1720  } else if (s->left_comp_ctx[row7]) {
1721  c = 1 + 2 * (s->fixcompref == 1 ||
1722  s->left_ref_ctx[row7] == 1);
1723  } else if (!s->left_ref_ctx[row7]) {
1724  c = 3;
1725  } else {
1726  c = 4 * (s->left_ref_ctx[row7] == 1);
1727  }
1728  } else if (s->above_comp_ctx[col]) {
1729  if (s->left_comp_ctx[row7]) {
1730  if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1731  c = 3 * (s->fixcompref == 1 ||
1732  s->left_ref_ctx[row7] == 1);
1733  } else {
1734  c = 2;
1735  }
1736  } else if (!s->left_ref_ctx[row7]) {
1737  c = 1 + 2 * (s->fixcompref == 1 ||
1738  s->above_ref_ctx[col] == 1);
1739  } else {
1740  c = 3 * (s->left_ref_ctx[row7] == 1) +
1741  (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1742  }
1743  } else if (s->left_comp_ctx[row7]) {
1744  if (!s->above_ref_ctx[col]) {
1745  c = 1 + 2 * (s->fixcompref == 1 ||
1746  s->left_ref_ctx[row7] == 1);
1747  } else {
1748  c = 3 * (s->above_ref_ctx[col] == 1) +
1749  (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1750  }
1751  } else if (!s->above_ref_ctx[col]) {
1752  if (!s->left_ref_ctx[row7]) {
1753  c = 3;
1754  } else {
1755  c = 4 * (s->left_ref_ctx[row7] == 1);
1756  }
1757  } else if (!s->left_ref_ctx[row7]) {
1758  c = 4 * (s->above_ref_ctx[col] == 1);
1759  } else {
1760  c = 2 * (s->left_ref_ctx[row7] == 1) +
1761  2 * (s->above_ref_ctx[col] == 1);
1762  }
1763  } else {
1764  if (s->above_intra_ctx[col] ||
1765  (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1766  c = 2;
1767  } else if (s->above_comp_ctx[col]) {
1768  c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1769  } else {
1770  c = 4 * (s->above_ref_ctx[col] == 1);
1771  }
1772  }
1773  } else if (have_l) {
1774  if (s->left_intra_ctx[row7] ||
1775  (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1776  c = 2;
1777  } else if (s->left_comp_ctx[row7]) {
1778  c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1779  } else {
1780  c = 4 * (s->left_ref_ctx[row7] == 1);
1781  }
1782  } else {
1783  c = 2;
1784  }
1785  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1786  s->counts.single_ref[c][1][bit]++;
1787  b->ref[0] = 1 + bit;
1788  }
1789  }
1790  }
1791 
1792  if (b->bs <= BS_8x8) {
1793  if (s->segmentation.feat[b->seg_id].skip_enabled) {
1794  b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1795  } else {
1796  static const uint8_t off[10] = {
1797  3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1798  };
1799 
1800  // FIXME this needs to use the LUT tables from find_ref_mvs
1801  // because not all are -1,0/0,-1
1802  int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1803  [s->left_mode_ctx[row7 + off[b->bs]]];
1804 
1805  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1806  s->prob.p.mv_mode[c]);
1807  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1808  s->counts.mv_mode[c][b->mode[0] - 10]++;
1809  }
1810  }
1811 
1812  if (s->filtermode == FILTER_SWITCHABLE) {
1813  int c;
1814 
1815  if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1816  if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1817  c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1818  s->left_filter_ctx[row7] : 3;
1819  } else {
1820  c = s->above_filter_ctx[col];
1821  }
1822  } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1823  c = s->left_filter_ctx[row7];
1824  } else {
1825  c = 3;
1826  }
1827 
1828  filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1829  s->prob.p.filter[c]);
1830  s->counts.filter[c][filter_id]++;
1831  b->filter = vp9_filter_lut[filter_id];
1832  } else {
1833  b->filter = s->filtermode;
1834  }
1835 
1836  if (b->bs > BS_8x8) {
1837  int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1838 
1839  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1840  s->prob.p.mv_mode[c]);
1841  s->counts.mv_mode[c][b->mode[0] - 10]++;
1842  fill_mv(s, b->mv[0], b->mode[0], 0);
1843 
1844  if (b->bs != BS_8x4) {
1845  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1846  s->prob.p.mv_mode[c]);
1847  s->counts.mv_mode[c][b->mode[1] - 10]++;
1848  fill_mv(s, b->mv[1], b->mode[1], 1);
1849  } else {
1850  b->mode[1] = b->mode[0];
1851  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1852  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1853  }
1854 
1855  if (b->bs != BS_4x8) {
1856  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1857  s->prob.p.mv_mode[c]);
1858  s->counts.mv_mode[c][b->mode[2] - 10]++;
1859  fill_mv(s, b->mv[2], b->mode[2], 2);
1860 
1861  if (b->bs != BS_8x4) {
1862  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1863  s->prob.p.mv_mode[c]);
1864  s->counts.mv_mode[c][b->mode[3] - 10]++;
1865  fill_mv(s, b->mv[3], b->mode[3], 3);
1866  } else {
1867  b->mode[3] = b->mode[2];
1868  AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1869  AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1870  }
1871  } else {
1872  b->mode[2] = b->mode[0];
1873  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1874  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1875  b->mode[3] = b->mode[1];
1876  AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1877  AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1878  }
1879  } else {
1880  fill_mv(s, b->mv[0], b->mode[0], -1);
1881  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1882  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1883  AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1884  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1885  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1886  AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1887  }
1888 
1889  vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1890  }
1891 
1892 #if HAVE_FAST_64BIT
1893 #define SPLAT_CTX(var, val, n) \
1894  switch (n) { \
1895  case 1: var = val; break; \
1896  case 2: AV_WN16A(&var, val * 0x0101); break; \
1897  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1898  case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
1899  case 16: { \
1900  uint64_t v64 = val * 0x0101010101010101ULL; \
1901  AV_WN64A( &var, v64); \
1902  AV_WN64A(&((uint8_t *) &var)[8], v64); \
1903  break; \
1904  } \
1905  }
1906 #else
1907 #define SPLAT_CTX(var, val, n) \
1908  switch (n) { \
1909  case 1: var = val; break; \
1910  case 2: AV_WN16A(&var, val * 0x0101); break; \
1911  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1912  case 8: { \
1913  uint32_t v32 = val * 0x01010101; \
1914  AV_WN32A( &var, v32); \
1915  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1916  break; \
1917  } \
1918  case 16: { \
1919  uint32_t v32 = val * 0x01010101; \
1920  AV_WN32A( &var, v32); \
1921  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1922  AV_WN32A(&((uint8_t *) &var)[8], v32); \
1923  AV_WN32A(&((uint8_t *) &var)[12], v32); \
1924  break; \
1925  } \
1926  }
1927 #endif
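/* Note on the splat technique used by SPLAT_CTX: multiplying a byte value by
 * a 0x01..01 mask replicates it across every byte of the result, e.g. with
 * val = 3 and n = 4, 3 * 0x01010101 = 0x03030303 is stored with a single
 * aligned 32-bit write. The two variants above only differ in how many
 * native-word stores are needed to cover n context entries. */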
1928 
1929  switch (bwh_tab[1][b->bs][0]) {
1930 #define SET_CTXS(dir, off, n) \
1931  do { \
1932  SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
1933  SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
1934  SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
1935  if (!s->keyframe && !s->intraonly) { \
1936  SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
1937  SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
1938  SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
1939  if (!b->intra) { \
1940  SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
1941  if (s->filtermode == FILTER_SWITCHABLE) { \
1942  SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
1943  } \
1944  } \
1945  } \
1946  } while (0)
1947  case 1: SET_CTXS(above, col, 1); break;
1948  case 2: SET_CTXS(above, col, 2); break;
1949  case 4: SET_CTXS(above, col, 4); break;
1950  case 8: SET_CTXS(above, col, 8); break;
1951  }
1952  switch (bwh_tab[1][b->bs][1]) {
1953  case 1: SET_CTXS(left, row7, 1); break;
1954  case 2: SET_CTXS(left, row7, 2); break;
1955  case 4: SET_CTXS(left, row7, 4); break;
1956  case 8: SET_CTXS(left, row7, 8); break;
1957  }
1958 #undef SPLAT_CTX
1959 #undef SET_CTXS
1960 
1961  if (!s->keyframe && !s->intraonly) {
1962  if (b->bs > BS_8x8) {
1963  int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1964 
1965  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1966  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1967  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1968  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1969  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1970  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1971  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1972  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1973  } else {
1974  int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1975 
1976  for (n = 0; n < w4 * 2; n++) {
1977  AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1978  AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1979  }
1980  for (n = 0; n < h4 * 2; n++) {
1981  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1982  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1983  }
1984  }
1985  }
1986 
1987  // FIXME kinda ugly
1988  for (y = 0; y < h4; y++) {
1989  int x, o = (row + y) * s->sb_cols * 8 + col;
1990  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
1991 
1992  if (b->intra) {
1993  for (x = 0; x < w4; x++) {
1994  mv[x].ref[0] =
1995  mv[x].ref[1] = -1;
1996  }
1997  } else if (b->comp) {
1998  for (x = 0; x < w4; x++) {
1999  mv[x].ref[0] = b->ref[0];
2000  mv[x].ref[1] = b->ref[1];
2001  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2002  AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2003  }
2004  } else {
2005  for (x = 0; x < w4; x++) {
2006  mv[x].ref[0] = b->ref[0];
2007  mv[x].ref[1] = -1;
2008  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2009  }
2010  }
2011  }
2012 }
2013 
2014 // FIXME merge cnt/eob arguments?
2015 static av_always_inline int
2016 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2017  int is_tx32x32, unsigned (*cnt)[6][3],
2018  unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2019  int nnz, const int16_t *scan, const int16_t (*nb)[2],
2020  const int16_t *band_counts, const int16_t *qmul)
2021 {
2022  int i = 0, band = 0, band_left = band_counts[band];
2023  uint8_t *tp = p[0][nnz];
2024  uint8_t cache[1024];
2025 
2026  do {
2027  int val, rc;
2028 
2029  val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2030  eob[band][nnz][val]++;
2031  if (!val)
2032  break;
2033 
2034  skip_eob:
2035  if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2036  cnt[band][nnz][0]++;
2037  if (!--band_left)
2038  band_left = band_counts[++band];
2039  cache[scan[i]] = 0;
2040  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2041  tp = p[band][nnz];
2042  if (++i == n_coeffs)
2043  break; // invalid input; blocks should end with EOB
2044  goto skip_eob;
2045  }
2046 
2047  rc = scan[i];
2048  if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2049  cnt[band][nnz][1]++;
2050  val = 1;
2051  cache[rc] = 1;
2052  } else {
2053  // fill in p[3-10] (model fill) - only once per frame for each pos
2054  if (!tp[3])
2055  memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2056 
2057  cnt[band][nnz][2]++;
2058  if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2059  if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2060  cache[rc] = val = 2;
2061  } else {
2062  val = 3 + vp56_rac_get_prob(c, tp[5]);
2063  cache[rc] = 3;
2064  }
2065  } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2066  cache[rc] = 4;
2067  if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2068  val = 5 + vp56_rac_get_prob(c, 159);
2069  } else {
2070  val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2071  val += vp56_rac_get_prob(c, 145);
2072  }
2073  } else { // cat 3-6
2074  cache[rc] = 5;
2075  if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2076  if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2077  val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2078  val += (vp56_rac_get_prob(c, 148) << 1);
2079  val += vp56_rac_get_prob(c, 140);
2080  } else {
2081  val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2082  val += (vp56_rac_get_prob(c, 155) << 2);
2083  val += (vp56_rac_get_prob(c, 140) << 1);
2084  val += vp56_rac_get_prob(c, 135);
2085  }
2086  } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2087  val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2088  val += (vp56_rac_get_prob(c, 157) << 3);
2089  val += (vp56_rac_get_prob(c, 141) << 2);
2090  val += (vp56_rac_get_prob(c, 134) << 1);
2091  val += vp56_rac_get_prob(c, 130);
2092  } else {
2093  val = 67 + (vp56_rac_get_prob(c, 254) << 13);
2094  val += (vp56_rac_get_prob(c, 254) << 12);
2095  val += (vp56_rac_get_prob(c, 254) << 11);
2096  val += (vp56_rac_get_prob(c, 252) << 10);
2097  val += (vp56_rac_get_prob(c, 249) << 9);
2098  val += (vp56_rac_get_prob(c, 243) << 8);
2099  val += (vp56_rac_get_prob(c, 230) << 7);
2100  val += (vp56_rac_get_prob(c, 196) << 6);
2101  val += (vp56_rac_get_prob(c, 177) << 5);
2102  val += (vp56_rac_get_prob(c, 153) << 4);
2103  val += (vp56_rac_get_prob(c, 140) << 3);
2104  val += (vp56_rac_get_prob(c, 133) << 2);
2105  val += (vp56_rac_get_prob(c, 130) << 1);
2106  val += vp56_rac_get_prob(c, 129);
2107  }
2108  }
2109  }
2110  if (!--band_left)
2111  band_left = band_counts[++band];
2112  if (is_tx32x32)
2113  coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
2114  else
2115  coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
2116  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2117  tp = p[band][nnz];
2118  } while (++i < n_coeffs);
2119 
2120  return i;
2121 }
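/* In decode_coeffs_b_generic() above, the context for the next coefficient is
 * derived from the token classes (0..5) cached for the two already-decoded
 * neighbours listed in nb[]: nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1,
 * i.e. the rounded average of the neighbour classes, which then selects the
 * probability set p[band][nnz]. */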
2122 
2123 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2124  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2125  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2126  const int16_t (*nb)[2], const int16_t *band_counts,
2127  const int16_t *qmul)
2128 {
2129  return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
2130  nnz, scan, nb, band_counts, qmul);
2131 }
2132 
2133 static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2134  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2135  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2136  const int16_t (*nb)[2], const int16_t *band_counts,
2137  const int16_t *qmul)
2138 {
2139  return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
2140  nnz, scan, nb, band_counts, qmul);
2141 }
2142 
2143 static void decode_coeffs(AVCodecContext *ctx)
2144 {
2145  VP9Context *s = ctx->priv_data;
2146  VP9Block *b = s->b;
2147  int row = s->row, col = s->col;
2148  uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2149  unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2150  unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2151  int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2152  int end_x = FFMIN(2 * (s->cols - col), w4);
2153  int end_y = FFMIN(2 * (s->rows - row), h4);
2154  int n, pl, x, y, res;
2155  int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
2156  int tx = 4 * s->lossless + b->tx;
2157  const int16_t * const *yscans = vp9_scans[tx];
2158  const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2159  const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2160  const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2161  uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2162  uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2163  static const int16_t band_counts[4][8] = {
2164  { 1, 2, 3, 4, 3, 16 - 13 },
2165  { 1, 2, 3, 4, 11, 64 - 21 },
2166  { 1, 2, 3, 4, 11, 256 - 21 },
2167  { 1, 2, 3, 4, 11, 1024 - 21 },
2168  };
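/* Each band_counts[] row lists how many coefficient positions of a transform
 * fall into the six probability bands; the last entry is written as
 * total - 13 (or total - 21) so the rows visibly sum to the 16/64/256/1024
 * coefficients of the corresponding tx size. */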
2169  const int16_t *y_band_counts = band_counts[b->tx];
2170  const int16_t *uv_band_counts = band_counts[b->uvtx];
2171 
2172 #define MERGE(la, end, step, rd) \
2173  for (n = 0; n < end; n += step) \
2174  la[n] = !!rd(&la[n])
2175 #define MERGE_CTX(step, rd) \
2176  do { \
2177  MERGE(l, end_y, step, rd); \
2178  MERGE(a, end_x, step, rd); \
2179  } while (0)
2180 
2181 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2182  for (n = 0, y = 0; y < end_y; y += step) { \
2183  for (x = 0; x < end_x; x += step, n += step * step) { \
2184  enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2185  res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
2186  c, e, p, a[x] + l[y], yscans[txtp], \
2187  ynbs[txtp], y_band_counts, qmul[0]); \
2188  a[x] = l[y] = !!res; \
2189  if (step >= 4) { \
2190  AV_WN16A(&s->eob[n], res); \
2191  } else { \
2192  s->eob[n] = res; \
2193  } \
2194  } \
2195  }
2196 
2197 #define SPLAT(la, end, step, cond) \
2198  if (step == 2) { \
2199  for (n = 1; n < end; n += step) \
2200  la[n] = la[n - 1]; \
2201  } else if (step == 4) { \
2202  if (cond) { \
2203  for (n = 0; n < end; n += step) \
2204  AV_WN32A(&la[n], la[n] * 0x01010101); \
2205  } else { \
2206  for (n = 0; n < end; n += step) \
2207  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2208  } \
2209  } else /* step == 8 */ { \
2210  if (cond) { \
2211  if (HAVE_FAST_64BIT) { \
2212  for (n = 0; n < end; n += step) \
2213  AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2214  } else { \
2215  for (n = 0; n < end; n += step) { \
2216  uint32_t v32 = la[n] * 0x01010101; \
2217  AV_WN32A(&la[n], v32); \
2218  AV_WN32A(&la[n + 4], v32); \
2219  } \
2220  } \
2221  } else { \
2222  for (n = 0; n < end; n += step) \
2223  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2224  } \
2225  }
2226 #define SPLAT_CTX(step) \
2227  do { \
2228  SPLAT(a, end_x, step, end_x == w4); \
2229  SPLAT(l, end_y, step, end_y == h4); \
2230  } while (0)
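/* For tx sizes above 4x4, the per-4x4 nnz flags are first collapsed to a
 * single flag per transform block (MERGE_CTX), used as the a[x] + l[y]
 * context while decoding, and afterwards splatted back over all covered 4x4
 * positions (SPLAT_CTX) so that neighbouring blocks keep seeing per-4x4
 * context entries. */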
2231 
2232  /* y tokens */
2233  switch (b->tx) {
2234  case TX_4X4:
2235  DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2236  break;
2237  case TX_8X8:
2238  MERGE_CTX(2, AV_RN16A);
2239  DECODE_Y_COEF_LOOP(2, 0,);
2240  SPLAT_CTX(2);
2241  break;
2242  case TX_16X16:
2243  MERGE_CTX(4, AV_RN32A);
2244  DECODE_Y_COEF_LOOP(4, 0,);
2245  SPLAT_CTX(4);
2246  break;
2247  case TX_32X32:
2248  MERGE_CTX(8, AV_RN64A);
2249  DECODE_Y_COEF_LOOP(8, 0, 32);
2250  SPLAT_CTX(8);
2251  break;
2252  }
2253 
2254 #define DECODE_UV_COEF_LOOP(step) \
2255  for (n = 0, y = 0; y < end_y; y += step) { \
2256  for (x = 0; x < end_x; x += step, n += step * step) { \
2257  res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
2258  16 * step * step, c, e, p, a[x] + l[y], \
2259  uvscan, uvnb, uv_band_counts, qmul[1]); \
2260  a[x] = l[y] = !!res; \
2261  s->uveob[pl][n] = res; \
2262  } \
2263  }
2264 
2265  p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2266  c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2267  e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2268  w4 >>= 1;
2269  h4 >>= 1;
2270  end_x >>= 1;
2271  end_y >>= 1;
2272  for (pl = 0; pl < 2; pl++) {
2273  a = &s->above_uv_nnz_ctx[pl][col];
2274  l = &s->left_uv_nnz_ctx[pl][row & 7];
2275  switch (b->uvtx) {
2276  case TX_4X4:
2277  DECODE_UV_COEF_LOOP(1);
2278  break;
2279  case TX_8X8:
2280  MERGE_CTX(2, AV_RN16A);
2281  DECODE_UV_COEF_LOOP(2);
2282  SPLAT_CTX(2);
2283  break;
2284  case TX_16X16:
2285  MERGE_CTX(4, AV_RN32A);
2286  DECODE_UV_COEF_LOOP(4);
2287  SPLAT_CTX(4);
2288  break;
2289  case TX_32X32:
2290  MERGE_CTX(8, AV_RN64A);
2291  // a 64x64 (max) uv block can only ever contain 1 tx32x32 block
2292  // so there is no need to loop
2293  res = decode_coeffs_b32(&s->c, s->uvblock[pl],
2294  1024, c, e, p, a[0] + l[0],
2295  uvscan, uvnb, uv_band_counts, qmul[1]);
2296  a[0] = l[0] = !!res;
2297  AV_WN16A(&s->uveob[pl][0], res);
2298  SPLAT_CTX(8);
2299  break;
2300  }
2301  }
2302 }
2303 
2304 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2305  uint8_t *dst_edge, ptrdiff_t stride_edge,
2306  uint8_t *dst_inner, ptrdiff_t stride_inner,
2307  uint8_t *l, int col, int x, int w,
2308  int row, int y, enum TxfmMode tx,
2309  int p)
2310 {
2311  int have_top = row > 0 || y > 0;
2312  int have_left = col > s->tiling.tile_col_start || x > 0;
2313  int have_right = x < w - 1;
2314  static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2315  [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2316  { DC_127_PRED, VERT_PRED } },
2317  [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2318  { HOR_PRED, HOR_PRED } },
2319  [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2320  { LEFT_DC_PRED, DC_PRED } },
2321  [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2322  { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2323  [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2324  { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2325  [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2326  { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2327  [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2328  { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2329  [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2330  { DC_127_PRED, VERT_LEFT_PRED } },
2331  [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2332  { HOR_UP_PRED, HOR_UP_PRED } },
2333  [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2334  { HOR_PRED, TM_VP8_PRED } },
2335  };
2336  static const struct {
2337  uint8_t needs_left:1;
2338  uint8_t needs_top:1;
2339  uint8_t needs_topleft:1;
2340  uint8_t needs_topright:1;
2341  } edges[N_INTRA_PRED_MODES] = {
2342  [VERT_PRED] = { .needs_top = 1 },
2343  [HOR_PRED] = { .needs_left = 1 },
2344  [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2345  [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2346  [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2347  [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2348  [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2349  [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2350  [HOR_UP_PRED] = { .needs_left = 1 },
2351  [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2352  [LEFT_DC_PRED] = { .needs_left = 1 },
2353  [TOP_DC_PRED] = { .needs_top = 1 },
2354  [DC_128_PRED] = { 0 },
2355  [DC_127_PRED] = { 0 },
2356  [DC_129_PRED] = { 0 }
2357  };
2358 
2359  av_assert2(mode >= 0 && mode < 10);
2360  mode = mode_conv[mode][have_left][have_top];
2361  if (edges[mode].needs_top) {
2362  uint8_t *top, *topleft;
2363  int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2364  int n_px_need_tr = 0;
2365 
2366  if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2367  n_px_need_tr = 4;
2368 
2369  // if top of sb64-row, use s->intra_pred_data[] instead of
2370  // dst[-stride] for intra prediction (it contains pre- instead of
2371  // post-loopfilter data)
2372  if (have_top) {
2373  top = !(row & 7) && !y ?
2374  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2375  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2376  if (have_left)
2377  topleft = !(row & 7) && !y ?
2378  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2379  y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2380  &dst_inner[-stride_inner];
2381  }
2382 
2383  if (have_top &&
2384  (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2385  (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2386  n_px_need + n_px_need_tr <= n_px_have) {
2387  *a = top;
2388  } else {
2389  if (have_top) {
2390  if (n_px_need <= n_px_have) {
2391  memcpy(*a, top, n_px_need);
2392  } else {
2393  memcpy(*a, top, n_px_have);
2394  memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2395  n_px_need - n_px_have);
2396  }
2397  } else {
2398  memset(*a, 127, n_px_need);
2399  }
2400  if (edges[mode].needs_topleft) {
2401  if (have_left && have_top) {
2402  (*a)[-1] = topleft[-1];
2403  } else {
2404  (*a)[-1] = have_top ? 129 : 127;
2405  }
2406  }
2407  if (tx == TX_4X4 && edges[mode].needs_topright) {
2408  if (have_top && have_right &&
2409  n_px_need + n_px_need_tr <= n_px_have) {
2410  memcpy(&(*a)[4], &top[4], 4);
2411  } else {
2412  memset(&(*a)[4], (*a)[3], 4);
2413  }
2414  }
2415  }
2416  }
2417  if (edges[mode].needs_left) {
2418  if (have_left) {
2419  int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2420  uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2421  ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2422 
2423  if (n_px_need <= n_px_have) {
2424  for (i = 0; i < n_px_need; i++)
2425  l[n_px_need - 1 - i] = dst[i * stride - 1];
2426  } else {
2427  for (i = 0; i < n_px_have; i++)
2428  l[n_px_need - 1 - i] = dst[i * stride - 1];
2429  memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
2430  }
2431  } else {
2432  memset(l, 129, 4 << tx);
2433  }
2434  }
2435 
2436  return mode;
2437 }
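/* check_intra_mode() serves two purposes: it remaps the prediction mode via
 * mode_conv[] when the top and/or left neighbours are unavailable, and it
 * fills the a[]/l[] edge buffers, extending available samples or
 * synthesising flat edges (127/128/129) where needed, so the DSP intra
 * predictors can always assume both edges exist. */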
2438 
2439 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2440 {
2441  VP9Context *s = ctx->priv_data;
2442  VP9Block *b = s->b;
2443  int row = s->row, col = s->col;
2444  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2445  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2446  int end_x = FFMIN(2 * (s->cols - col), w4);
2447  int end_y = FFMIN(2 * (s->rows - row), h4);
2448  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2449  int uvstep1d = 1 << b->uvtx, p;
2450  uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2451  LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
2452  LOCAL_ALIGNED_16(uint8_t, l, [32]);
2453 
2454  for (n = 0, y = 0; y < end_y; y += step1d) {
2455  uint8_t *ptr = dst, *ptr_r = dst_r;
2456  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2457  ptr_r += 4 * step1d, n += step) {
2458  int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2459  y * 2 + x : 0];
2460  uint8_t *a = &a_buf[16];
2461  enum TxfmType txtp = vp9_intra_txfm_type[mode];
2462  int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2463 
2464  mode = check_intra_mode(s, mode, &a, ptr_r,
2465  s->frames[CUR_FRAME].tf.f->linesize[0],
2466  ptr, s->y_stride, l,
2467  col, x, w4, row, y, b->tx, 0);
2468  s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2469  if (eob)
2470  s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2471  s->block + 16 * n, eob);
2472  }
2473  dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2474  dst += 4 * step1d * s->y_stride;
2475  }
2476 
2477  // U/V
2478  h4 >>= 1;
2479  w4 >>= 1;
2480  end_x >>= 1;
2481  end_y >>= 1;
2482  step = 1 << (b->uvtx * 2);
2483  for (p = 0; p < 2; p++) {
2484  dst = s->dst[1 + p];
2485  dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2486  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2487  uint8_t *ptr = dst, *ptr_r = dst_r;
2488  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2489  ptr_r += 4 * uvstep1d, n += step) {
2490  int mode = b->uvmode;
2491  uint8_t *a = &a_buf[16];
2492  int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2493 
2494  mode = check_intra_mode(s, mode, &a, ptr_r,
2495  s->frames[CUR_FRAME].tf.f->linesize[1],
2496  ptr, s->uv_stride, l,
2497  col, x, w4, row, y, b->uvtx, p + 1);
2498  s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2499  if (eob)
2500  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2501  s->uvblock[p] + 16 * n, eob);
2502  }
2503  dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2504  dst += 4 * uvstep1d * s->uv_stride;
2505  }
2506  }
2507 }
2508 
2509 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2510  uint8_t *dst, ptrdiff_t dst_stride,
2511  const uint8_t *ref, ptrdiff_t ref_stride,
2512  ThreadFrame *ref_frame,
2513  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2514  int bw, int bh, int w, int h)
2515 {
2516  int mx = mv->x, my = mv->y, th;
2517 
2518  y += my >> 3;
2519  x += mx >> 3;
2520  ref += y * ref_stride + x;
2521  mx &= 7;
2522  my &= 7;
2523  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2524  // we use +7 because the last 7 pixels of each sbrow can be changed in
2525  // the longest loopfilter of the next sbrow
2526  th = (y + bh + 4 * !!my + 7) >> 6;
2527  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2528  if (x < !!mx * 3 || y < !!my * 3 ||
2529  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2530  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2531  ref - !!my * 3 * ref_stride - !!mx * 3,
2532  80, ref_stride,
2533  bw + !!mx * 7, bh + !!my * 7,
2534  x - !!mx * 3, y - !!my * 3, w, h);
2535  ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2536  ref_stride = 80;
2537  }
2538  mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2539 }
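/* Luma motion vectors are in 1/8-pel units: mv >> 3 is the integer offset and
 * mv & 7 the subpel phase, which is scaled by << 1 to the 0..15 phase the
 * shared MC functions expect (e.g. mv->x = 21 means 2 full pixels plus a
 * 5/8-pel offset). th above is the last reference row this block (plus its
 * filter margin, plus the 7 rows the next sbrow's loopfilter may still touch)
 * can read, in 64-pixel sbrow units, so we only wait for the reference rows
 * that are actually needed. */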
2540 
2541 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2542  uint8_t *dst_u, uint8_t *dst_v,
2543  ptrdiff_t dst_stride,
2544  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2545  const uint8_t *ref_v, ptrdiff_t src_stride_v,
2546  ThreadFrame *ref_frame,
2547  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2548  int bw, int bh, int w, int h)
2549 {
2550  int mx = mv->x, my = mv->y, th;
2551 
2552  y += my >> 4;
2553  x += mx >> 4;
2554  ref_u += y * src_stride_u + x;
2555  ref_v += y * src_stride_v + x;
2556  mx &= 15;
2557  my &= 15;
2558  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2559  // we use +7 because the last 7 pixels of each sbrow can be changed in
2560  // the longest loopfilter of the next sbrow
2561  th = (y + bh + 4 * !!my + 7) >> 5;
2562  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2563  if (x < !!mx * 3 || y < !!my * 3 ||
2564  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2565  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2566  ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2567  80, src_stride_u,
2568  bw + !!mx * 7, bh + !!my * 7,
2569  x - !!mx * 3, y - !!my * 3, w, h);
2570  ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2571  mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2572 
2573  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2574  ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2575  80, src_stride_v,
2576  bw + !!mx * 7, bh + !!my * 7,
2577  x - !!mx * 3, y - !!my * 3, w, h);
2578  ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2579  mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2580  } else {
2581  mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2582  mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2583  }
2584 }
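/* mc_chroma_dir() reuses the (averaged) luma-unit MV on the half-resolution
 * chroma planes, so the same vector has 1/16-pel precision here: mv >> 4 is
 * the integer offset and mv & 15 the subpel phase (e.g. mv->x = 21 gives
 * x += 1 and mx = 5), and the await threshold uses >> 5 since chroma rows
 * are half the luma height. */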
2585 
2586 static void inter_recon(AVCodecContext *ctx)
2587 {
2588  static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2589  { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2590  { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2591  };
2592  VP9Context *s = ctx->priv_data;
2593  VP9Block *b = s->b;
2594  int row = s->row, col = s->col;
2595  ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
2596  AVFrame *ref1 = tref1->f, *ref2;
2597  int w1 = ref1->width, h1 = ref1->height, w2, h2;
2598  ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2599 
2600  if (b->comp) {
2601  tref2 = &s->refs[s->refidx[b->ref[1]]];
2602  ref2 = tref2->f;
2603  w2 = ref2->width;
2604  h2 = ref2->height;
2605  }
2606 
2607  // y inter pred
2608  if (b->bs > BS_8x8) {
2609  if (b->bs == BS_8x4) {
2610  mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2611  ref1->data[0], ref1->linesize[0], tref1,
2612  row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
2613  mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2614  s->dst[0] + 4 * ls_y, ls_y,
2615  ref1->data[0], ref1->linesize[0], tref1,
2616  (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
2617 
2618  if (b->comp) {
2619  mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2620  ref2->data[0], ref2->linesize[0], tref2,
2621  row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
2622  mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2623  s->dst[0] + 4 * ls_y, ls_y,
2624  ref2->data[0], ref2->linesize[0], tref2,
2625  (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
2626  }
2627  } else if (b->bs == BS_4x8) {
2628  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2629  ref1->data[0], ref1->linesize[0], tref1,
2630  row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
2631  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2632  ref1->data[0], ref1->linesize[0], tref1,
2633  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
2634 
2635  if (b->comp) {
2636  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2637  ref2->data[0], ref2->linesize[0], tref2,
2638  row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
2639  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2640  ref2->data[0], ref2->linesize[0], tref2,
2641  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
2642  }
2643  } else {
2644  av_assert2(b->bs == BS_4x4);
2645 
2646  // FIXME if two horizontally adjacent blocks have the same MV,
2647  // do a w8 instead of a w4 call
2648  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2649  ref1->data[0], ref1->linesize[0], tref1,
2650  row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
2651  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2652  ref1->data[0], ref1->linesize[0], tref1,
2653  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
2654  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2655  s->dst[0] + 4 * ls_y, ls_y,
2656  ref1->data[0], ref1->linesize[0], tref1,
2657  (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
2658  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2659  s->dst[0] + 4 * ls_y + 4, ls_y,
2660  ref1->data[0], ref1->linesize[0], tref1,
2661  (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
2662 
2663  if (b->comp) {
2664  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2665  ref2->data[0], ref2->linesize[0], tref2,
2666  row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
2667  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2668  ref2->data[0], ref2->linesize[0], tref2,
2669  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
2670  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2671  s->dst[0] + 4 * ls_y, ls_y,
2672  ref2->data[0], ref2->linesize[0], tref2,
2673  (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
2674  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2675  s->dst[0] + 4 * ls_y + 4, ls_y,
2676  ref2->data[0], ref2->linesize[0], tref2,
2677  (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
2678  }
2679  }
2680  } else {
2681  int bwl = bwlog_tab[0][b->bs];
2682  int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2683 
2684  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2685  ref1->data[0], ref1->linesize[0], tref1,
2686  row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);
2687 
2688  if (b->comp)
2689  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2690  ref2->data[0], ref2->linesize[0], tref2,
2691  row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
2692  }
2693 
2694  // uv inter pred
2695  {
2696  int bwl = bwlog_tab[1][b->bs];
2697  int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2698  VP56mv mvuv;
2699 
2700  w1 = (w1 + 1) >> 1;
2701  h1 = (h1 + 1) >> 1;
2702  if (b->comp) {
2703  w2 = (w2 + 1) >> 1;
2704  h2 = (h2 + 1) >> 1;
2705  }
2706  if (b->bs > BS_8x8) {
2707  mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2708  mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2709  } else {
2710  mvuv = b->mv[0][0];
2711  }
2712 
2713  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2714  s->dst[1], s->dst[2], ls_uv,
2715  ref1->data[1], ref1->linesize[1],
2716  ref1->data[2], ref1->linesize[2], tref1,
2717  row << 2, col << 2, &mvuv, bw, bh, w1, h1);
2718 
2719  if (b->comp) {
2720  if (b->bs > BS_8x8) {
2721  mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2722  mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2723  } else {
2724  mvuv = b->mv[0][1];
2725  }
2726  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2727  s->dst[1], s->dst[2], ls_uv,
2728  ref2->data[1], ref2->linesize[1],
2729  ref2->data[2], ref2->linesize[2], tref2,
2730  row << 2, col << 2, &mvuv, bw, bh, w2, h2);
2731  }
2732  }
2733 
2734  if (!b->skip) {
2735  /* mostly copied from intra_recon() */
2736 
2737  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2738  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2739  int end_x = FFMIN(2 * (s->cols - col), w4);
2740  int end_y = FFMIN(2 * (s->rows - row), h4);
2741  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2742  int uvstep1d = 1 << b->uvtx, p;
2743  uint8_t *dst = s->dst[0];
2744 
2745  // y itxfm add
2746  for (n = 0, y = 0; y < end_y; y += step1d) {
2747  uint8_t *ptr = dst;
2748  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2749  int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2750 
2751  if (eob)
2752  s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2753  s->block + 16 * n, eob);
2754  }
2755  dst += 4 * s->y_stride * step1d;
2756  }
2757 
2758  // uv itxfm add
2759  h4 >>= 1;
2760  w4 >>= 1;
2761  end_x >>= 1;
2762  end_y >>= 1;
2763  step = 1 << (b->uvtx * 2);
2764  for (p = 0; p < 2; p++) {
2765  dst = s->dst[p + 1];
2766  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2767  uint8_t *ptr = dst;
2768  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2769  int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2770 
2771  if (eob)
2772  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2773  s->uvblock[p] + 16 * n, eob);
2774  }
2775  dst += 4 * uvstep1d * s->uv_stride;
2776  }
2777  }
2778  }
2779 }
2780 
2781 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2782  int row_and_7, int col_and_7,
2783  int w, int h, int col_end, int row_end,
2784  enum TxfmMode tx, int skip_inter)
2785 {
2786  // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2787  // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2788  // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2789  // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2790 
2791  // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2792  // edges. This means that for UV, we work on two subsampled blocks at
2793  // a time, and we only use the topleft block's mode information to set
2794  // things like block strength. Thus, for any block size smaller than
2795  // 16x16, ignore the odd portion of the block.
2796  if (tx == TX_4X4 && is_uv) {
2797  if (h == 1) {
2798  if (row_and_7 & 1)
2799  return;
2800  if (!row_end)
2801  h += 1;
2802  }
2803  if (w == 1) {
2804  if (col_and_7 & 1)
2805  return;
2806  if (!col_end)
2807  w += 1;
2808  }
2809  }
2810 
2811  if (tx == TX_4X4 && !skip_inter) {
2812  int t = 1 << col_and_7, m_col = (t << w) - t, y;
2813  int m_col_odd = (t << (w - 1)) - t;
2814 
2815  // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2816  if (is_uv) {
2817  int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2818 
2819  for (y = row_and_7; y < h + row_and_7; y++) {
2820  int col_mask_id = 2 - !(y & 7);
2821 
2822  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2823  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2824  // for odd lines, if the odd col is not being filtered,
2825  // skip odd row also:
2826  // .---. <-- a
2827  // | |
2828  // |___| <-- b
2829  // ^ ^
2830  // c d
2831  //
2832  // if a/c are even row/col and b/d are odd, and d is skipped,
2833  // e.g. right edge of size-66x66.webm, then skip b also (bug)
2834  if ((col_end & 1) && (y & 1)) {
2835  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2836  } else {
2837  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2838  }
2839  }
2840  } else {
2841  int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2842 
2843  for (y = row_and_7; y < h + row_and_7; y++) {
2844  int col_mask_id = 2 - !(y & 3);
2845 
2846  lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2847  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2848  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2849  lflvl->mask[is_uv][0][y][3] |= m_col;
2850  lflvl->mask[is_uv][1][y][3] |= m_col;
2851  }
2852  }
2853  } else {
2854  int y, t = 1 << col_and_7, m_col = (t << w) - t;
2855 
2856  if (!skip_inter) {
2857  int mask_id = (tx == TX_8X8);
2858  int l2 = tx + is_uv - 1, step1d = 1 << l2;
2859  static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2860  int m_row = m_col & masks[l2];
2861 
2862  // at odd UV col/row tx16/tx32 loopfilter edges, force the 8-wide
2863  // loopfilter to prevent going off the visible edge.
2864  if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2865  int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2866  int m_row_8 = m_row - m_row_16;
2867 
2868  for (y = row_and_7; y < h + row_and_7; y++) {
2869  lflvl->mask[is_uv][0][y][0] |= m_row_16;
2870  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2871  }
2872  } else {
2873  for (y = row_and_7; y < h + row_and_7; y++)
2874  lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2875  }
2876 
2877  if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2878  for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2879  lflvl->mask[is_uv][1][y][0] |= m_col;
2880  if (y - row_and_7 == h - 1)
2881  lflvl->mask[is_uv][1][y][1] |= m_col;
2882  } else {
2883  for (y = row_and_7; y < h + row_and_7; y += step1d)
2884  lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2885  }
2886  } else if (tx != TX_4X4) {
2887  int mask_id;
2888 
2889  mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2890  lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2891  mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2892  for (y = row_and_7; y < h + row_and_7; y++)
2893  lflvl->mask[is_uv][0][y][mask_id] |= t;
2894  } else if (is_uv) {
2895  int t8 = t & 0x01, t4 = t - t8;
2896 
2897  for (y = row_and_7; y < h + row_and_7; y++) {
2898  lflvl->mask[is_uv][0][y][2] |= t4;
2899  lflvl->mask[is_uv][0][y][1] |= t8;
2900  }
2901  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2902  } else {
2903  int t8 = t & 0x11, t4 = t - t8;
2904 
2905  for (y = row_and_7; y < h + row_and_7; y++) {
2906  lflvl->mask[is_uv][0][y][2] |= t4;
2907  lflvl->mask[is_uv][0][y][1] |= t8;
2908  }
2909  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
2910  }
2911  }
2912 }
2913 
2914 static void decode_b(AVCodecContext *ctx, int row, int col,
2915  struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2916  enum BlockLevel bl, enum BlockPartition bp)
2917 {
2918  VP9Context *s = ctx->priv_data;
2919  VP9Block *b = s->b;
2920  enum BlockSize bs = bl * 3 + bp;
2921  int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2922  int emu[2];
2923  AVFrame *f = s->frames[CUR_FRAME].tf.f;
2924 
2925  s->row = row;
2926  s->row7 = row & 7;
2927  s->col = col;
2928  s->col7 = col & 7;
2929  s->min_mv.x = -(128 + col * 64);
2930  s->min_mv.y = -(128 + row * 64);
2931  s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2932  s->max_mv.y = 128 + (s->rows - row - h4) * 64;
2933  if (s->pass < 2) {
2934  b->bs = bs;
2935  b->bl = bl;
2936  b->bp = bp;
2937  decode_mode(ctx);
2938  b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
2939 
2940  if (!b->skip) {
2941  decode_coeffs(ctx);
2942  } else {
2943  int row7 = s->row7;
2944 
2945 #define SPLAT_ZERO_CTX(v, n) \
2946  switch (n) { \
2947  case 1: v = 0; break; \
2948  case 2: AV_ZERO16(&v); break; \
2949  case 4: AV_ZERO32(&v); break; \
2950  case 8: AV_ZERO64(&v); break; \
2951  case 16: AV_ZERO128(&v); break; \
2952  }
2953 #define SPLAT_ZERO_YUV(dir, var, off, n) \
2954  do { \
2955  SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
2956  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
2957  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
2958  } while (0)
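/* SPLAT_ZERO_YUV clears the non-zero-coefficient contexts covered by a
 * skipped block: n * 2 bytes of the luma context (two entries per 8x8
 * column/row) and n bytes for each chroma plane, e.g. w4 = 4 zeroes 8 bytes
 * of above_y_nnz_ctx and 4 bytes of each above_uv_nnz_ctx plane. */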
2959 
2960  switch (w4) {
2961  case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
2962  case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
2963  case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
2964  case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
2965  }
2966  switch (h4) {
2967  case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
2968  case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
2969  case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
2970  case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
2971  }
2972  }
2973  if (s->pass == 1) {
2974  s->b++;
2975  s->block += w4 * h4 * 64;
2976  s->uvblock[0] += w4 * h4 * 16;
2977  s->uvblock[1] += w4 * h4 * 16;
2978  s->eob += 4 * w4 * h4;
2979  s->uveob[0] += w4 * h4;
2980  s->uveob[1] += w4 * h4;
2981 
2982  return;
2983  }
2984  }
2985 
2986  // emulate overhangs if the stride of the target buffer can't hold them. This
2987  // allows us to support emu-edge and so on even if we have large block
2988  // overhangs
2989  emu[0] = (col + w4) * 8 > f->linesize[0] ||
2990  (row + h4) > s->rows;
2991  emu[1] = (col + w4) * 4 > f->linesize[1] ||
2992  (row + h4) > s->rows;
2993  if (emu[0]) {
2994  s->dst[0] = s->tmp_y;
2995  s->y_stride = 64;
2996  } else {
2997  s->dst[0] = f->data[0] + yoff;
2998  s->y_stride = f->linesize[0];
2999  }
3000  if (emu[1]) {
3001  s->dst[1] = s->tmp_uv[0];
3002  s->dst[2] = s->tmp_uv[1];
3003  s->uv_stride = 32;
3004  } else {
3005  s->dst[1] = f->data[1] + uvoff;
3006  s->dst[2] = f->data[2] + uvoff;
3007  s->uv_stride = f->linesize[1];
3008  }
3009  if (b->intra) {
3010  intra_recon(ctx, yoff, uvoff);
3011  } else {
3012  inter_recon(ctx);
3013  }
3014  if (emu[0]) {
3015  int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3016 
3017  for (n = 0; o < w; n++) {
3018  int bw = 64 >> n;
3019 
3020  av_assert2(n <= 4);
3021  if (w & bw) {
3022  s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3023  s->tmp_y + o, 64, h, 0, 0);
3024  o += bw;
3025  }
3026  }
3027  }
3028  if (emu[1]) {
3029  int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
3030 
3031  for (n = 1; o < w; n++) {
3032  int bw = 64 >> n;
3033 
3034  av_assert2(n <= 4);
3035  if (w & bw) {
3036  s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3037  s->tmp_uv[0] + o, 32, h, 0, 0);
3038  s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3039  s->tmp_uv[1] + o, 32, h, 0, 0);
3040  o += bw;
3041  }
3042  }
3043  }
3044 
3045  // pick filter level and find edges to apply filter to
3046  if (s->filter.level &&
3047  (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3048  [b->mode[3] != ZEROMV]) > 0) {
3049  int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3050  int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3051 
3052  setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3053  mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3054  mask_edges(lflvl, 1, row7, col7, x_end, y_end,
3055  s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3056  s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3057  b->uvtx, skip_inter);
3058 
3059  if (!s->filter.lim_lut[lvl]) {
3060  int sharp = s->filter.sharpness;
3061  int limit = lvl;
3062 
3063  if (sharp > 0) {
3064  limit >>= (sharp + 3) >> 2;
3065  limit = FFMIN(limit, 9 - sharp);
3066  }
3067  limit = FFMAX(limit, 1);
3068 
3069  s->filter.lim_lut[lvl] = limit;
3070  s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3071  }
3072  }
3073 
3074  if (s->pass == 2) {
3075  s->b++;
3076  s->block += w4 * h4 * 64;
3077  s->uvblock[0] += w4 * h4 * 16;
3078  s->uvblock[1] += w4 * h4 * 16;
3079  s->eob += 4 * w4 * h4;
3080  s->uveob[0] += w4 * h4;
3081  s->uveob[1] += w4 * h4;
3082  }
3083 }
3084 
3085 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3086  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3087 {
3088  VP9Context *s = ctx->priv_data;
3089  int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3090  (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3091  const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
3092  s->prob.p.partition[bl][c];
3093  enum BlockPartition bp;
3094  ptrdiff_t hbs = 4 >> bl;
3095  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3096  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3097 
3098  if (bl == BL_8X8) {
3099  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3100  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3101  } else if (col + hbs < s->cols) { // FIXME why not <=?
3102  if (row + hbs < s->rows) { // FIXME why not <=?
3103  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3104  switch (bp) {
3105  case PARTITION_NONE:
3106  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3107  break;
3108  case PARTITION_H:
3109  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3110  yoff += hbs * 8 * y_stride;
3111  uvoff += hbs * 4 * uv_stride;
3112  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3113  break;
3114  case PARTITION_V:
3115  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3116  yoff += hbs * 8;
3117  uvoff += hbs * 4;
3118  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3119  break;
3120  case PARTITION_SPLIT:
3121  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3122  decode_sb(ctx, row, col + hbs, lflvl,
3123  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3124  yoff += hbs * 8 * y_stride;
3125  uvoff += hbs * 4 * uv_stride;
3126  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3127  decode_sb(ctx, row + hbs, col + hbs, lflvl,
3128  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3129  break;
3130  default:
3131  av_assert0(0);
3132  }
3133  } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3134  bp = PARTITION_SPLIT;
3135  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3136  decode_sb(ctx, row, col + hbs, lflvl,
3137  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3138  } else {
3139  bp = PARTITION_H;
3140  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3141  }
3142  } else if (row + hbs < s->rows) { // FIXME why not <=?
3143  if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3144  bp = PARTITION_SPLIT;
3145  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3146  yoff += hbs * 8 * y_stride;
3147  uvoff += hbs * 4 * uv_stride;
3148  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3149  } else {
3150  bp = PARTITION_V;
3151  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3152  }
3153  } else {
3154  bp = PARTITION_SPLIT;
3155  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3156  }
3157  s->counts.partition[bl][c][bp]++;
3158 }
3159 
3160 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3161  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3162 {
3163  VP9Context *s = ctx->priv_data;
3164  VP9Block *b = s->b;
3165  ptrdiff_t hbs = 4 >> bl;
3166  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3167  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3168 
3169  if (bl == BL_8X8) {
3170  av_assert2(b->bl == BL_8X8);
3171  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3172  } else if (s->b->bl == bl) {
3173  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3174  if (b->bp == PARTITION_H && row + hbs < s->rows) {
3175  yoff += hbs * 8 * y_stride;
3176  uvoff += hbs * 4 * uv_stride;
3177  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3178  } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3179  yoff += hbs * 8;
3180  uvoff += hbs * 4;
3181  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3182  }
3183  } else {
3184  decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3185  if (col + hbs < s->cols) { // FIXME why not <=?
3186  if (row + hbs < s->rows) {
3187  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
3188  uvoff + 4 * hbs, bl + 1);
3189  yoff += hbs * 8 * y_stride;
3190  uvoff += hbs * 4 * uv_stride;
3191  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3192  decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3193  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3194  } else {
3195  yoff += hbs * 8;
3196  uvoff += hbs * 4;
3197  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3198  }
3199  } else if (row + hbs < s->rows) {
3200  yoff += hbs * 8 * y_stride;
3201  uvoff += hbs * 4 * uv_stride;
3202  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3203  }
3204  }
3205 }
3206 
3207 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3208  int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3209 {
3210  VP9Context *s = ctx->priv_data;
3211  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3212  uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
3213  ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3214  int y, x, p;
3215 
3216  // FIXME to what extent can we interleave the v/h loopfilter calls? E.g.
3217  // if you think of them as acting on an 8x8 block max, we can interleave
3218  // each v/h within the single x loop, but that only works if we work on
3219  // 8 pixel blocks, and we won't always do that (we want at least 16px
3220  // to use SSE2 optimizations, perhaps 32 for AVX2)
3221 
3222  // filter edges between columns, Y plane (e.g. block1 | block2)
3223  for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
3224  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
3225  uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
3226  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3227  unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3228  unsigned hm = hm1 | hm2 | hm13 | hm23;
3229 
3230  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
3231  if (hm1 & x) {
3232  int L = *l, H = L >> 4;
3233  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3234 
3235  if (col || x > 1) {
3236  if (hmask1[0] & x) {
3237  if (hmask2[0] & x) {
3238  av_assert2(l[8] == L);
3239  s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
3240  } else {
3241  s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
3242  }
3243  } else if (hm2 & x) {
3244  L = l[8];
3245  H |= (L >> 4) << 8;
3246  E |= s->filter.mblim_lut[L] << 8;
3247  I |= s->filter.lim_lut[L] << 8;
3248  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3249  [!!(hmask2[1] & x)]
3250  [0](ptr, ls_y, E, I, H);
3251  } else {
3252  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3253  [0](ptr, ls_y, E, I, H);
3254  }
3255  }
3256  } else if (hm2 & x) {
3257  int L = l[8], H = L >> 4;
3258  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3259 
3260  if (col || x > 1) {
3261  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3262  [0](ptr + 8 * ls_y, ls_y, E, I, H);
3263  }
3264  }
3265  if (hm13 & x) {
3266  int L = *l, H = L >> 4;
3267  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3268 
3269  if (hm23 & x) {
3270  L = l[8];
3271  H |= (L >> 4) << 8;
3272  E |= s->filter.mblim_lut[L] << 8;
3273  I |= s->filter.lim_lut[L] << 8;
3274  s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
3275  } else {
3276  s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
3277  }
3278  } else if (hm23 & x) {
3279  int L = l[8], H = L >> 4;
3280  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3281 
3282  s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
3283  }
3284  }
3285  }
3286 
3287  // block1
3288  // filter edges between rows, Y plane (e.g. ------)
3289  // block2
3290  dst = f->data[0] + yoff;
3291  lvl = lflvl->level;
3292  for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
3293  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
3294  unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3295 
3296  for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
3297  if (row || y) {
3298  if (vm & x) {
3299  int L = *l, H = L >> 4;
3300  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3301 
3302  if (vmask[0] & x) {
3303  if (vmask[0] & (x << 1)) {
3304  av_assert2(l[1] == L);
3305  s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
3306  } else {
3307  s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
3308  }
3309  } else if (vm & (x << 1)) {
3310  L = l[1];
3311  H |= (L >> 4) << 8;
3312  E |= s->filter.mblim_lut[L] << 8;
3313  I |= s->filter.lim_lut[L] << 8;
3314  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3315  [!!(vmask[1] & (x << 1))]
3316  [1](ptr, ls_y, E, I, H);
3317  } else {
3318  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3319  [1](ptr, ls_y, E, I, H);
3320  }
3321  } else if (vm & (x << 1)) {
3322  int L = l[1], H = L >> 4;
3323  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3324 
3325  s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
3326  [1](ptr + 8, ls_y, E, I, H);
3327  }
3328  }
3329  if (vm3 & x) {
3330  int L = *l, H = L >> 4;
3331  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3332 
3333  if (vm3 & (x << 1)) {
3334  L = l[1];
3335  H |= (L >> 4) << 8;
3336  E |= s->filter.mblim_lut[L] << 8;
3337  I |= s->filter.lim_lut[L] << 8;
3338  s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
3339  } else {
3340  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
3341  }
3342  } else if (vm3 & (x << 1)) {
3343  int L = l[1], H = L >> 4;
3344  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3345 
3346  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
3347  }
3348  }
3349  }
3350 
3351  // same principle but for U/V planes
3352  for (p = 0; p < 2; p++) {
3353  lvl = lflvl->level;
3354  dst = f->data[1 + p] + uvoff;
3355  for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
3356  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
3357  uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
3358  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
3359  unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
3360 
3361  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
3362  if (col || x > 1) {
3363  if (hm1 & x) {
3364  int L = *l, H = L >> 4;
3365  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3366 
3367  if (hmask1[0] & x) {
3368  if (hmask2[0] & x) {
3369  av_assert2(l[16] == L);
3370  s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
3371  } else {
3372  s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
3373  }
3374  } else if (hm2 & x) {
3375  L = l[16];
3376  H |= (L >> 4) << 8;
3377  E |= s->filter.mblim_lut[L] << 8;
3378  I |= s->filter.lim_lut[L] << 8;
3379  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3380  [!!(hmask2[1] & x)]
3381  [0](ptr, ls_uv, E, I, H);
3382  } else {
3383  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3384  [0](ptr, ls_uv, E, I, H);
3385  }
3386  } else if (hm2 & x) {
3387  int L = l[16], H = L >> 4;
3388  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3389 
3390  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3391  [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
3392  }
3393  }
3394  if (x & 0xAA)
3395  l += 2;
3396  }
3397  }
3398  lvl = lflvl->level;
3399  dst = f->data[1 + p] + uvoff;
3400  for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
3401  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
3402  unsigned vm = vmask[0] | vmask[1] | vmask[2];
3403 
3404  for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
3405  if (row || y) {
3406  if (vm & x) {
3407  int L = *l, H = L >> 4;
3408  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3409 
3410  if (vmask[0] & x) {
3411  if (vmask[0] & (x << 2)) {
3412  av_assert2(l[2] == L);
3413  s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3414  } else {
3415  s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3416  }
3417  } else if (vm & (x << 2)) {
3418  L = l[2];
3419  H |= (L >> 4) << 8;
3420  E |= s->filter.mblim_lut[L] << 8;
3421  I |= s->filter.lim_lut[L] << 8;
3422  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3423  [!!(vmask[1] & (x << 2))]
3424  [1](ptr, ls_uv, E, I, H);
3425  } else {
3426  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3427  [1](ptr, ls_uv, E, I, H);
3428  }
3429  } else if (vm & (x << 2)) {
3430  int L = l[2], H = L >> 4;
3431  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3432 
3433  s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3434  [1](ptr + 8, ls_uv, E, I, H);
3435  }
3436  }
3437  }
3438  if (y & 1)
3439  lvl += 16;
3440  }
3441  }
3442 }
3443 
3444 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3445 {
3446  int sb_start = ( idx * n) >> log2_n;
3447  int sb_end = ((idx + 1) * n) >> log2_n;
3448  *start = FFMIN(sb_start, n) << 3;
3449  *end = FFMIN(sb_end, n) << 3;
3450 }
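/* Worked example for set_tile_offset(): tile boundaries are computed in
 * superblock units and returned in 8x8-block units (<< 3). With n = 11 sb64
 * columns and log2_n = 1 (two tile columns), tile 0 covers sb 0..4
 * (blocks 0..39) and tile 1 covers sb 5..10 (blocks 40..87). */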
3451 
3452 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3453  int max_count, int update_factor)
3454 {
3455  unsigned ct = ct0 + ct1, p2, p1;
3456 
3457  if (!ct)
3458  return;
3459 
3460  p1 = *p;
3461  p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3462  p2 = av_clip(p2, 1, 255);
3463  ct = FFMIN(ct, max_count);
3464  update_factor = FASTDIV(update_factor * ct, max_count);
3465 
3466  // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3467  *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3468 }
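/* Worked example (values chosen for illustration): with *p = 128, ct0 = 30,
 * ct1 = 10, max_count = 24 and update_factor = 128, we get
 * p2 = ((30 << 8) + 20) / 40 = 192; ct is clamped to 24, so update_factor
 * stays 128 and *p becomes 128 + (((192 - 128) * 128 + 128) >> 8) = 160,
 * i.e. the probability moves halfway towards the estimate from the counts. */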
3469 
3470 static void adapt_probs(VP9Context *s)
3471 {
3472  int i, j, k, l, m;
3473  prob_context *p = &s->prob_ctx[s->framectxid].p;
3474  int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3475 
3476  // coefficients
3477  for (i = 0; i < 4; i++)
3478  for (j = 0; j < 2; j++)
3479  for (k = 0; k < 2; k++)
3480  for (l = 0; l < 6; l++)
3481  for (m = 0; m < 6; m++) {
3482  uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3483  unsigned *e = s->counts.eob[i][j][k][l][m];
3484  unsigned *c = s->counts.coef[i][j][k][l][m];
3485 
3486  if (l == 0 && m >= 3) // dc only has 3 pt
3487  break;
3488 
3489  adapt_prob(&pp[0], e[0], e[1], 24, uf);
3490  adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3491  adapt_prob(&pp[2], c[1], c[2], 24, uf);
3492  }
3493 
3494  if (s->keyframe || s->intraonly) {
3495  memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3496  memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3497  memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3498  memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3499  return;
3500  }
3501 
3502  // skip flag
3503  for (i = 0; i < 3; i++)
3504  adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3505 
3506  // intra/inter flag
3507  for (i = 0; i < 4; i++)
3508  adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3509 
3510  // comppred flag
3511  if (s->comppredmode == PRED_SWITCHABLE) {
3512  for (i = 0; i < 5; i++)
3513  adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3514  }
3515 
3516  // reference frames
3517  if (s->comppredmode != PRED_SINGLEREF) {
3518  for (i = 0; i < 5; i++)
3519  adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3520  s->counts.comp_ref[i][1], 20, 128);
3521  }
3522 
3523  if (s->comppredmode != PRED_COMPREF) {
3524  for (i = 0; i < 5; i++) {
3525  uint8_t *pp = p->single_ref[i];
3526  unsigned (*c)[2] = s->counts.single_ref[i];
3527 
3528  adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3529  adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3530  }
3531  }
3532 
3533  // block partitioning
3534  for (i = 0; i < 4; i++)
3535  for (j = 0; j < 4; j++) {
3536  uint8_t *pp = p->partition[i][j];
3537  unsigned *c = s->counts.partition[i][j];
3538 
3539  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3540  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3541  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3542  }
3543 
3544  // tx size
3545  if (s->txfmmode == TX_SWITCHABLE) {
3546  for (i = 0; i < 2; i++) {
3547  unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3548 
3549  adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3550  adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3551  adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3552  adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3553  adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3554  adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3555  }
3556  }
3557 
3558  // interpolation filter
3559  if (s->filtermode == FILTER_SWITCHABLE) {
3560  for (i = 0; i < 4; i++) {
3561  uint8_t *pp = p->filter[i];
3562  unsigned *c = s->counts.filter[i];
3563 
3564  adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3565  adapt_prob(&pp[1], c[1], c[2], 20, 128);
3566  }
3567  }
3568 
3569  // inter modes
3570  for (i = 0; i < 7; i++) {
3571  uint8_t *pp = p->mv_mode[i];
3572  unsigned *c = s->counts.mv_mode[i];
3573 
3574  adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3575  adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3576  adapt_prob(&pp[2], c[1], c[3], 20, 128);
3577  }
3578 
3579  // mv joints
3580  {
3581  uint8_t *pp = p->mv_joint;
3582  unsigned *c = s->counts.mv_joint;
3583 
3584  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3585  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3586  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3587  }
3588 
3589  // mv components
3590  for (i = 0; i < 2; i++) {
3591  uint8_t *pp;
3592  unsigned *c, (*c2)[2], sum;
3593 
3594  adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3595  s->counts.mv_comp[i].sign[1], 20, 128);
3596 
3597  pp = p->mv_comp[i].classes;
3598  c = s->counts.mv_comp[i].classes;
3599  sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3600  adapt_prob(&pp[0], c[0], sum, 20, 128);
3601  sum -= c[1];
3602  adapt_prob(&pp[1], c[1], sum, 20, 128);
3603  sum -= c[2] + c[3];
3604  adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3605  adapt_prob(&pp[3], c[2], c[3], 20, 128);
3606  sum -= c[4] + c[5];
3607  adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3608  adapt_prob(&pp[5], c[4], c[5], 20, 128);
3609  sum -= c[6];
3610  adapt_prob(&pp[6], c[6], sum, 20, 128);
3611  adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3612  adapt_prob(&pp[8], c[7], c[8], 20, 128);
3613  adapt_prob(&pp[9], c[9], c[10], 20, 128);
3614 
3615  adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3616  s->counts.mv_comp[i].class0[1], 20, 128);
3617  pp = p->mv_comp[i].bits;
3618  c2 = s->counts.mv_comp[i].bits;
3619  for (j = 0; j < 10; j++)
3620  adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3621 
3622  for (j = 0; j < 2; j++) {
3623  pp = p->mv_comp[i].class0_fp[j];
3624  c = s->counts.mv_comp[i].class0_fp[j];
3625  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3626  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3627  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3628  }
3629  pp = p->mv_comp[i].fp;
3630  c = s->counts.mv_comp[i].fp;
3631  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3632  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3633  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3634 
3635  if (s->highprecisionmvs) {
3636  adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3637  s->counts.mv_comp[i].class0_hp[1], 20, 128);
3638  adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3639  s->counts.mv_comp[i].hp[1], 20, 128);
3640  }
3641  }
3642 
3643  // y intra modes
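  // the 10 intra modes are coded with a 9-node tree; each adapt_prob() call
  // below pairs one node's probability with the counts of the modes reached
  // through its two branches (DC vs. everything else at the root, then TM,
  // VERT, and so on), mirroring the uv loop that follows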
3644  for (i = 0; i < 4; i++) {
3645  uint8_t *pp = p->y_mode[i];
3646  unsigned *c = s->counts.y_mode[i], sum, s2;
3647 
3648  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3649  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3650  sum -= c[TM_VP8_PRED];
3651  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3652  sum -= c[VERT_PRED];
3653  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3654  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3655  sum -= s2;
3656  adapt_prob(&pp[3], s2, sum, 20, 128);
3657  s2 -= c[HOR_PRED];
3658  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3659  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3660  sum -= c[DIAG_DOWN_LEFT_PRED];
3661  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3662  sum -= c[VERT_LEFT_PRED];
3663  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3664  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3665  }
3666 
3667  // uv intra modes
3668  for (i = 0; i < 10; i++) {
3669  uint8_t *pp = p->uv_mode[i];
3670  unsigned *c = s->counts.uv_mode[i], sum, s2;
3671 
3672  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3673  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3674  sum -= c[TM_VP8_PRED];
3675  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3676  sum -= c[VERT_PRED];
3677  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3678  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3679  sum -= s2;
3680  adapt_prob(&pp[3], s2, sum, 20, 128);
3681  s2 -= c[HOR_PRED];
3682  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3683  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3684  sum -= c[DIAG_DOWN_LEFT_PRED];
3685  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3686  sum -= c[VERT_LEFT_PRED];
3687  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3688  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3689  }
3690 }
3691 
3692 static void free_buffers(VP9Context *s)
3693 {
3694  av_freep(&s->intra_pred_data[0]);
3695  av_freep(&s->b_base);
3696  av_freep(&s->block_base);
3697 }
3698 
3699 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3700 {
3701  VP9Context *s = ctx->priv_data;
3702  int i;
3703 
3704  for (i = 0; i < 2; i++) {
3705  if (s->frames[i].tf.f->data[0])
3706  vp9_unref_frame(ctx, &s->frames[i]);
3707  av_frame_free(&s->frames[i].tf.f);
3708  }
3709  for (i = 0; i < 8; i++) {
3710  if (s->refs[i].f->data[0])
3711  ff_thread_release_buffer(ctx, &s->refs[i]);
3712  av_frame_free(&s->refs[i].f);
3713  if (s->next_refs[i].f->data[0])
3714  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3715  av_frame_free(&s->next_refs[i].f);
3716  }
3717  free_buffers(s);
3718  av_freep(&s->c_b);
3719  s->c_b_size = 0;
3720 
3721  return 0;
3722 }
3723 
3724 
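/* Main decode entry point. A header-only packet (decode_frame_header()
 * returning 0) is a "show existing frame" request and the referenced frame
 * is returned directly. Otherwise the tile data is decoded in a single
 * pass, or in two passes when frame threading is combined with in-frame
 * context adaptation: pass 1 only parses symbols and gathers counts so the
 * adapted probabilities are ready before reconstruction starts, and pass 2
 * reconstructs and loop-filters. Progress is reported per superblock row
 * so other frame threads can start using this frame as a reference. */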
3725 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3726  int *got_frame, AVPacket *pkt)
3727 {
3728  const uint8_t *data = pkt->data;
3729  int size = pkt->size;
3730  VP9Context *s = ctx->priv_data;
3731  int res, tile_row, tile_col, i, ref, row, col;
3732  ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3733  AVFrame *f;
3734 
3735  if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3736  return res;
3737  } else if (res == 0) {
3738  if (!s->refs[ref].f->data[0]) {
3739  av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3740  return AVERROR_INVALIDDATA;
3741  }
3742  if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3743  return res;
3744  *got_frame = 1;
3745  return 0;
3746  }
3747  data += res;
3748  size -= res;
3749 
3750  if (s->frames[LAST_FRAME].tf.f->data[0])
3751  vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3752  if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3753  (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3754  return res;
3755  if (s->frames[CUR_FRAME].tf.f->data[0])
3756  vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3757  if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3758  return res;
3759  f = s->frames[CUR_FRAME].tf.f;
3760  f->key_frame = s->keyframe;
3762  ls_y = f->linesize[0];
3763  ls_uv = f->linesize[1];
3764 
3765  // ref frame setup
3766  for (i = 0; i < 8; i++) {
3767  if (s->next_refs[i].f->data[0])
3768  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3769  if (s->refreshrefmask & (1 << i)) {
3770  res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3771  } else {
3772  res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
3773  }
3774  if (res < 0)
3775  return res;
3776  }
3777 
3778  // main tile decode loop
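  // the above_* contexts span the full frame width and are cleared once per
  // frame here; the matching left_* contexts only cover the current 64-pixel
  // superblock row and are re-cleared for every tile column further down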
3779  memset(s->above_partition_ctx, 0, s->cols);
3780  memset(s->above_skip_ctx, 0, s->cols);
3781  if (s->keyframe || s->intraonly) {
3782  memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3783  } else {
3784  memset(s->above_mode_ctx, NEARESTMV, s->cols);
3785  }
3786  memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3787  memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3788  memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3789  memset(s->above_segpred_ctx, 0, s->cols);
3790  s->pass = s->uses_2pass =
3791  ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
3792  if ((res = update_block_buffers(ctx)) < 0) {
3793  av_log(ctx, AV_LOG_ERROR,
3794  "Failed to allocate block buffers\n");
3795  return res;
3796  }
3797  if (s->refreshctx && s->parallelmode) {
3798  int j, k, l, m;
3799 
3800  for (i = 0; i < 4; i++) {
3801  for (j = 0; j < 2; j++)
3802  for (k = 0; k < 2; k++)
3803  for (l = 0; l < 6; l++)
3804  for (m = 0; m < 6; m++)
3805  memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3806  s->prob.coef[i][j][k][l][m], 3);
3807  if (s->txfmmode == i)
3808  break;
3809  }
3810  s->prob_ctx[s->framectxid].p = s->prob.p;
3811  ff_thread_finish_setup(ctx);
3812  }
3813 
3814  do {
3815  yoff = uvoff = 0;
3816  s->b = s->b_base;
3817  s->block = s->block_base;
3818  s->uvblock[0] = s->uvblock_base[0];
3819  s->uvblock[1] = s->uvblock_base[1];
3820  s->eob = s->eob_base;
3821  s->uveob[0] = s->uveob_base[0];
3822  s->uveob[1] = s->uveob_base[1];
3823 
3824  for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3825  set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3826  tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3827  if (s->pass != 2) {
3828  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3829  unsigned tile_size;
3830 
3831  if (tile_col == s->tiling.tile_cols - 1 &&
3832  tile_row == s->tiling.tile_rows - 1) {
3833  tile_size = size;
3834  } else {
3835  tile_size = AV_RB32(data);
3836  data += 4;
3837  size -= 4;
3838  }
3839  if (tile_size > size) {
3840  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3841  return AVERROR_INVALIDDATA;
3842  }
3843  ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3844  if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
3845  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3846  return AVERROR_INVALIDDATA;
3847  }
3848  data += tile_size;
3849  size -= tile_size;
3850  }
3851  }
3852 
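  // row and col count 8x8 blocks, so one 64x64 superblock row advances row
  // by 8, the luma offset by 64 lines and the 4:2:0 chroma offset by 32 lines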
3853  for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3854  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3855  struct VP9Filter *lflvl_ptr = s->lflvl;
3856  ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3857 
3858  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3859  set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3860  tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3861 
3862  if (s->pass != 2) {
3863  memset(s->left_partition_ctx, 0, 8);
3864  memset(s->left_skip_ctx, 0, 8);
3865  if (s->keyframe || s->intraonly) {
3866  memset(s->left_mode_ctx, DC_PRED, 16);
3867  } else {
3868  memset(s->left_mode_ctx, NEARESTMV, 8);
3869  }
3870  memset(s->left_y_nnz_ctx, 0, 16);
3871  memset(s->left_uv_nnz_ctx, 0, 16);
3872  memset(s->left_segpred_ctx, 0, 8);
3873 
3874  memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3875  }
3876 
3877  for (col = s->tiling.tile_col_start;
3878  col < s->tiling.tile_col_end;
3879  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3880  // FIXME integrate with lf code (i.e. zero after each
3881  // use, similar to invtxfm coefficients, or similar)
3882  if (s->pass != 1) {
3883  memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3884  }
3885 
3886  if (s->pass == 2) {
3887  decode_sb_mem(ctx, row, col, lflvl_ptr,
3888  yoff2, uvoff2, BL_64X64);
3889  } else {
3890  decode_sb(ctx, row, col, lflvl_ptr,
3891  yoff2, uvoff2, BL_64X64);
3892  }
3893  }
3894  if (s->pass != 2) {
3895  memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3896  }
3897  }
3898 
3899  if (s->pass == 1) {
3900  continue;
3901  }
3902 
3903  // backup pre-loopfilter reconstruction data for intra
3904  // prediction of next row of sb64s
3905  if (row + 8 < s->rows) {
3906  memcpy(s->intra_pred_data[0],
3907  f->data[0] + yoff + 63 * ls_y,
3908  8 * s->cols);
3909  memcpy(s->intra_pred_data[1],
3910  f->data[1] + uvoff + 31 * ls_uv,
3911  4 * s->cols);
3912  memcpy(s->intra_pred_data[2],
3913  f->data[2] + uvoff + 31 * ls_uv,
3914  4 * s->cols);
3915  }
3916 
3917  // loopfilter one row
3918  if (s->filter.level) {
3919  yoff2 = yoff;
3920  uvoff2 = uvoff;
3921  lflvl_ptr = s->lflvl;
3922  for (col = 0; col < s->cols;
3923  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3924  loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3925  }
3926  }
3927 
3928  // FIXME maybe we can make this more finegrained by running the
3929  // loopfilter per-block instead of after each sbrow
3930  // In fact that would also make intra pred left preparation easier?
3931  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
3932  }
3933  }
3934 
3935  if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3936  adapt_probs(s);
3937  ff_thread_finish_setup(ctx);
3938  }
3939  } while (s->pass++ == 1);
3940  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3941 
3942  // ref frame setup
3943  for (i = 0; i < 8; i++) {
3944  if (s->refs[i].f->data[0])
3945  ff_thread_release_buffer(ctx, &s->refs[i]);
3946  ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
3947  }
3948 
3949  if (!s->invisible) {
3950  if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3951  return res;
3952  *got_frame = 1;
3953  }
3954 
3955  return 0;
3956 }
3957 
3958 static void vp9_decode_flush(AVCodecContext *ctx)
3959 {
3960  VP9Context *s = ctx->priv_data;
3961  int i;
3962 
3963  for (i = 0; i < 2; i++)
3964  vp9_unref_frame(ctx, &s->frames[i]);
3965  for (i = 0; i < 8; i++)
3966  ff_thread_release_buffer(ctx, &s->refs[i]);
3967 }
3968 
3969 static int init_frames(AVCodecContext *ctx)
3970 {
3971  VP9Context *s = ctx->priv_data;
3972  int i;
3973 
3974  for (i = 0; i < 2; i++) {
3975  s->frames[i].tf.f = av_frame_alloc();
3976  if (!s->frames[i].tf.f) {
3977  vp9_decode_free(ctx);
3978  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3979  return AVERROR(ENOMEM);
3980  }
3981  }
3982  for (i = 0; i < 8; i++) {
3983  s->refs[i].f = av_frame_alloc();
3984  s->next_refs[i].f = av_frame_alloc();
3985  if (!s->refs[i].f || !s->next_refs[i].f) {
3986  vp9_decode_free(ctx);
3987  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3988  return AVERROR(ENOMEM);
3989  }
3990  }
3991 
3992  return 0;
3993 }
3994 
3995 static av_cold int vp9_decode_init(AVCodecContext *ctx)
3996 {
3997  VP9Context *s = ctx->priv_data;
3998 
3999  ctx->internal->allocate_progress = 1;
4000  ctx->pix_fmt = AV_PIX_FMT_YUV420P;
4001  ff_vp9dsp_init(&s->dsp);
4002  ff_videodsp_init(&s->vdsp, 8);
4003  s->filter.sharpness = -1;
4004 
4005  return init_frames(ctx);
4006 }
4007 
4008 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4009 {
4010  return init_frames(avctx);
4011 }
4012 
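/* Frame-threading hook: copy into the destination context everything a
 * future frame may depend on, i.e. the current/last frames, the reference
 * slots, the saved probability contexts, loop-filter deltas and
 * segmentation feature data. Per-frame buffers are dropped first if the
 * other thread changed the frame dimensions. */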
4013 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4014 {
4015  int i, res;
4016  VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4017 
4018  // detect size changes in other threads
4019  if (s->intra_pred_data[0] &&
4020  (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
4021  free_buffers(s);
4022  }
4023 
4024  for (i = 0; i < 2; i++) {
4025  if (s->frames[i].tf.f->data[0])
4026  vp9_unref_frame(dst, &s->frames[i]);
4027  if (ssrc->frames[i].tf.f->data[0]) {
4028  if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
4029  return res;
4030  }
4031  }
4032  for (i = 0; i < 8; i++) {
4033  if (s->refs[i].f->data[0])
4034  ff_thread_release_buffer(dst, &s->refs[i]);
4035  if (ssrc->next_refs[i].f->data[0]) {
4036  if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
4037  return res;
4038  }
4039  }
4040 
4041  s->invisible = ssrc->invisible;
4042  s->keyframe = ssrc->keyframe;
4043  s->uses_2pass = ssrc->uses_2pass;
4044  memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4045  memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4046  if (ssrc->segmentation.enabled) {
4047  memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4048  sizeof(s->segmentation.feat));
4049  }
4050 
4051  return 0;
4052 }
4053 
4054 AVCodec ff_vp9_decoder = {
4055  .name = "vp9",
4056  .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4057  .type = AVMEDIA_TYPE_VIDEO,
4058  .id = AV_CODEC_ID_VP9,
4059  .priv_data_size = sizeof(VP9Context),
4060  .init = vp9_decode_init,
4061  .close = vp9_decode_free,
4062  .decode = vp9_decode_frame,
4063  .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4064  .flush = vp9_decode_flush,
4065  .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4066  .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4067 };