vp9.c
1 /*
2  * VP9 compatible video decoder
3  *
4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "avcodec.h"
25 #include "get_bits.h"
26 #include "internal.h"
27 #include "thread.h"
28 #include "videodsp.h"
29 #include "vp56.h"
30 #include "vp9.h"
31 #include "vp9data.h"
32 #include "vp9dsp.h"
33 #include "libavutil/avassert.h"
34 
35 #define VP9_SYNCCODE 0x498342
36 
37 enum CompPredMode {
38  PRED_SINGLEREF,
39  PRED_COMPREF,
40  PRED_SWITCHABLE,
41 };
42 
43 enum BlockLevel {
44  BL_64X64,
45  BL_32X32,
46  BL_16X16,
47  BL_8X8,
48 };
49 
50 enum BlockSize {
51  BS_64x64,
52  BS_64x32,
53  BS_32x64,
54  BS_32x32,
55  BS_32x16,
56  BS_16x32,
57  BS_16x16,
58  BS_16x8,
59  BS_8x16,
60  BS_8x8,
61  BS_8x4,
62  BS_4x8,
63  BS_4x4,
64  N_BS_SIZES,
65 };
66 
67 struct VP9mvrefPair {
68  VP56mv mv[2];
69  int8_t ref[2];
70 };
71 
72 typedef struct VP9Frame {
73  ThreadFrame tf;
74  AVBufferRef *extradata;
75  uint8_t *segmentation_map;
76  struct VP9mvrefPair *mv;
77 } VP9Frame;
78 
79 struct VP9Filter {
80  uint8_t level[8 * 8];
81  uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
82  [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
83 };
84 
85 typedef struct VP9Block {
86  uint8_t seg_id, intra, comp, ref[2], filter, mode[4],
87  uvmode, skip;
88  VP56mv mv[4 /* b_idx */][2 /* ref */];
89  enum BlockSize bs;
90  enum TxfmMode tx, uvtx;
91  enum BlockLevel bl;
92  enum BlockPartition bp;
93 } VP9Block;
94 
95 typedef struct VP9Context {
96  VP9DSPContext dsp;
97  VideoDSPContext vdsp;
98  GetBitContext gb;
99  VP56RangeCoder c;
100  VP56RangeCoder *c_b;
101  unsigned c_b_size;
102  int pass, uses_2pass, last_uses_2pass;
103  int block_alloc_using_2pass;
104  int row, row7, col, col7;
105  uint8_t *dst[3];
106  ptrdiff_t y_stride, uv_stride;
107 
108  // bitstream header
109  uint8_t profile;
110  uint8_t keyframe, last_keyframe;
111  uint8_t invisible;
112  uint8_t use_last_frame_mvs;
113  uint8_t errorres;
114  uint8_t colorspace;
115  uint8_t fullrange;
116  uint8_t intraonly;
117  uint8_t resetctx;
118  uint8_t refreshrefmask;
119  uint8_t highprecisionmvs;
120  enum FilterMode filtermode;
121  uint8_t allowcompinter;
122  uint8_t fixcompref;
123  uint8_t refreshctx;
124  uint8_t parallelmode;
125  uint8_t framectxid;
126  uint8_t refidx[3];
127  uint8_t signbias[3];
128  uint8_t varcompref[2];
129  ThreadFrame refs[8], next_refs[8];
130 #define CUR_FRAME 0
131 #define LAST_FRAME 1
132  VP9Frame frames[2];
133 
134  struct {
135  uint8_t level;
136  int8_t sharpness;
137  uint8_t lim_lut[64];
138  uint8_t mblim_lut[64];
139  } filter;
140  struct {
141  uint8_t enabled;
142  int8_t mode[2];
143  int8_t ref[4];
144  } lf_delta;
145  uint8_t yac_qi;
146  int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
147  uint8_t lossless;
148  struct {
149  uint8_t enabled;
150  uint8_t temporal;
151  uint8_t absolute_vals;
152  uint8_t update_map;
153  struct {
154  uint8_t q_enabled;
155  uint8_t lf_enabled;
156  uint8_t skip_enabled;
157  uint8_t ref_enabled;
158  int8_t ref_val;
159  int16_t q_val;
160  int8_t lf_val;
161  int16_t qmul[2][2];
162  uint8_t lflvl[4][2];
163  } feat[8];
164  } segmentation;
165  struct {
166  unsigned log2_tile_cols, log2_tile_rows;
167  unsigned tile_cols, tile_rows;
168  unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
169  } tiling;
170  unsigned sb_cols, sb_rows, rows, cols;
171  struct {
172  prob_context p;
173  uint8_t coef[4][2][2][6][6][3];
174  } prob_ctx[4];
175  struct {
176  prob_context p;
177  uint8_t coef[4][2][2][6][6][11];
178  uint8_t seg[7];
179  uint8_t segpred[3];
180  } prob;
181  struct {
182  unsigned y_mode[4][10];
183  unsigned uv_mode[10][10];
184  unsigned filter[4][3];
185  unsigned mv_mode[7][4];
186  unsigned intra[4][2];
187  unsigned comp[5][2];
188  unsigned single_ref[5][2][2];
189  unsigned comp_ref[5][2];
190  unsigned tx32p[2][4];
191  unsigned tx16p[2][3];
192  unsigned tx8p[2][2];
193  unsigned skip[3][2];
194  unsigned mv_joint[4];
195  struct {
196  unsigned sign[2];
197  unsigned classes[11];
198  unsigned class0[2];
199  unsigned bits[10][2];
200  unsigned class0_fp[2][4];
201  unsigned fp[4];
202  unsigned class0_hp[2];
203  unsigned hp[2];
204  } mv_comp[2];
205  unsigned partition[4][4][4];
206  unsigned coef[4][2][2][6][6][3];
207  unsigned eob[4][2][2][6][6][2];
208  } counts;
209  enum TxfmMode txfmmode;
210  enum CompPredMode comppredmode;
211 
212  // contextual (left/above) cache
213  DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
214  DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
215  DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
216  DECLARE_ALIGNED(8, uint8_t, left_uv_nnz_ctx)[2][8];
217  DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
218  DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
219  DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
220  DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
221  DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
222  DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
223  DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
224  DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
225  uint8_t *above_partition_ctx;
226  uint8_t *above_mode_ctx;
227  // FIXME maybe merge some of the below in a flags field?
228  uint8_t *above_y_nnz_ctx;
229  uint8_t *above_uv_nnz_ctx[2];
230  uint8_t *above_skip_ctx; // 1bit
231  uint8_t *above_txfm_ctx; // 2bit
232  uint8_t *above_segpred_ctx; // 1bit
233  uint8_t *above_intra_ctx; // 1bit
234  uint8_t *above_comp_ctx; // 1bit
235  uint8_t *above_ref_ctx; // 2bit
236  uint8_t *above_filter_ctx;
237  VP56mv (*above_mv_ctx)[2];
238 
239  // whole-frame cache
240  uint8_t *intra_pred_data[3];
241  struct VP9Filter *lflvl;
242  DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
243 
244  // block reconstruction intermediates
245  VP9Block *b_base, *b;
246  int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
247  uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
248  struct { int x, y; } min_mv, max_mv;
249  DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
250  DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
251 } VP9Context;
252 
253 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
254  {
255  { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
256  { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
257  }, {
258  { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
259  { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
260  }
261 };
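/* bwh_tab[0][bs] is a block's width/height in 4x4-pixel units and
 * bwh_tab[1][bs] the same in 8x8 units (rounded up); e.g. BS_16x8 yields
 * { 4, 2 } and { 2, 1 } respectively. */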
262 
263 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
264 {
265  VP9Context *s = ctx->priv_data;
266  int ret, sz;
267 
268  if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
269  return ret;
270  sz = 64 * s->sb_cols * s->sb_rows;
271  if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
272  ff_thread_release_buffer(ctx, &f->tf);
273  return AVERROR(ENOMEM);
274  }
275 
276  f->segmentation_map = f->extradata->data;
277  f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
278 
279  // retain segmentation map if it doesn't update
280  if (s->segmentation.enabled && !s->segmentation.update_map &&
281  !s->intraonly && !s->keyframe) {
282  memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
283  }
284 
285  return 0;
286 }
287 
288 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
289 {
290  ff_thread_release_buffer(ctx, &f->tf);
291  av_buffer_unref(&f->extradata);
292 }
293 
294 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
295 {
296  int res;
297 
298  if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
299  return res;
300  } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
301  vp9_unref_frame(ctx, dst);
302  return AVERROR(ENOMEM);
303  }
304 
305  dst->segmentation_map = src->segmentation_map;
306  dst->mv = src->mv;
307 
308  return 0;
309 }
310 
311 static int update_size(AVCodecContext *ctx, int w, int h)
312 {
313  VP9Context *s = ctx->priv_data;
314  uint8_t *p;
315 
316  av_assert0(w > 0 && h > 0);
317 
318  if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
319  return 0;
320 
321  ctx->width = w;
322  ctx->height = h;
323  s->sb_cols = (w + 63) >> 6;
324  s->sb_rows = (h + 63) >> 6;
325  s->cols = (w + 7) >> 3;
326  s->rows = (h + 7) >> 3;
327 
328 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
329  av_freep(&s->intra_pred_data[0]);
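/* One flat allocation carries all per-superblock-column "above" context
 * arrays; the 240 below is the sum of the per-column byte counts passed to
 * assign() (64 + 32 + 32 + 16 + 16 plus ten 8-byte arrays = 240), with the
 * lflvl and above_mv_ctx sizes added explicitly. */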
330  p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
331  if (!p)
332  return AVERROR(ENOMEM);
333  assign(s->intra_pred_data[0], uint8_t *, 64);
334  assign(s->intra_pred_data[1], uint8_t *, 32);
335  assign(s->intra_pred_data[2], uint8_t *, 32);
336  assign(s->above_y_nnz_ctx, uint8_t *, 16);
337  assign(s->above_mode_ctx, uint8_t *, 16);
338  assign(s->above_mv_ctx, VP56mv(*)[2], 16);
339  assign(s->above_partition_ctx, uint8_t *, 8);
340  assign(s->above_skip_ctx, uint8_t *, 8);
341  assign(s->above_txfm_ctx, uint8_t *, 8);
342  assign(s->above_uv_nnz_ctx[0], uint8_t *, 8);
343  assign(s->above_uv_nnz_ctx[1], uint8_t *, 8);
344  assign(s->above_segpred_ctx, uint8_t *, 8);
345  assign(s->above_intra_ctx, uint8_t *, 8);
346  assign(s->above_comp_ctx, uint8_t *, 8);
347  assign(s->above_ref_ctx, uint8_t *, 8);
348  assign(s->above_filter_ctx, uint8_t *, 8);
349  assign(s->lflvl, struct VP9Filter *, 1);
350 #undef assign
351 
352  // these will be re-allocated a little later
353  av_freep(&s->b_base);
354  av_freep(&s->block_base);
355 
356  return 0;
357 }
358 
359 static int update_block_buffers(AVCodecContext *ctx)
360 {
361  VP9Context *s = ctx->priv_data;
362 
363  if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
364  return 0;
365 
366  av_free(s->b_base);
367  av_free(s->block_base);
368  if (s->uses_2pass) {
369  int sbs = s->sb_cols * s->sb_rows;
370 
371  s->b_base = av_malloc(sizeof(VP9Block) * s->cols * s->rows);
372  s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
373  if (!s->b_base || !s->block_base)
374  return AVERROR(ENOMEM);
375  s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
376  s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
377  s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
378  s->uveob_base[0] = s->eob_base + 256 * sbs;
379  s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
380  } else {
381  s->b_base = av_malloc(sizeof(VP9Block));
382  s->block_base = av_mallocz((64 * 64 + 128) * 3);
383  if (!s->b_base || !s->block_base)
384  return AVERROR(ENOMEM);
385  s->uvblock_base[0] = s->block_base + 64 * 64;
386  s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
387  s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
388  s->uveob_base[0] = s->eob_base + 256;
389  s->uveob_base[1] = s->uveob_base[0] + 64;
390  }
391  s->block_alloc_using_2pass = s->uses_2pass;
392 
393  return 0;
394 }
395 
396 // for some reason the sign bit is at the end, not the start, of a bit sequence
397 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
398 {
399  int v = get_bits(gb, n);
400  return get_bits1(gb) ? -v : v;
401 }
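/* e.g. reading the 4 bits 0101 (= 5) followed by a set sign bit returns -5 */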
402 
403 static av_always_inline int inv_recenter_nonneg(int v, int m)
404 {
405  return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
406 }
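/* Folds a nonnegative code v back around the pivot m: with m = 10,
 * v = 0,1,2,3,4 maps to 10,9,11,8,12, i.e. odd codes step below m and even
 * codes above it, while v > 2*m passes through unchanged. */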
407 
408 // differential forward probability updates
409 static int update_prob(VP56RangeCoder *c, int p)
410 {
411  static const int inv_map_table[254] = {
412  7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
413  189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
414  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
415  25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
416  40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
417  55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
418  70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
419  86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
420  101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
421  116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
422  131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
423  146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
424  161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
425  177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
426  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
427  207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
428  222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
429  237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
430  252, 253,
431  };
432  int d;
433 
434  /* This code does a differential probability update. For a current
435  * probability A in the range [1, 255], the difference to any new
436  * probability lies in the range [1-A, 255-A]. Part of this absolute
437  * range exists on both the positive and the negative side, whereas
438  * the remainder exists in one half only. The shared part is coded
439  * differentially, i.e. times two, where the value of the lowest bit
440  * specifies the sign, and the one-sided part is then coded on top of
441  * this. The resulting absolute difference again lies in [0, 254], but
442  * a bigger value in this range indicates that we're further away from
443  * the original value A, so it can be coded as a VLC, since higher
444  * values are increasingly unlikely. The first 20 values in
445  * inv_map_table[] allow 'cheap, rough' updates, vs. the 'fine, exact'
446  * updates further down the range, which adds one extra dimension to
447  * this differential update model. */
448 
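/* Worked example of the scheme above: a 0 branch followed by the four bits
 * 0011 gives d = 3 and inv_map_table[3] = 46; for p = 128 the updated
 * probability is 1 + inv_recenter_nonneg(46, 127) = 151. */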
449  if (!vp8_rac_get(c)) {
450  d = vp8_rac_get_uint(c, 4) + 0;
451  } else if (!vp8_rac_get(c)) {
452  d = vp8_rac_get_uint(c, 4) + 16;
453  } else if (!vp8_rac_get(c)) {
454  d = vp8_rac_get_uint(c, 5) + 32;
455  } else {
456  d = vp8_rac_get_uint(c, 7);
457  if (d >= 65)
458  d = (d << 1) - 65 + vp8_rac_get(c);
459  d += 64;
460  }
461 
462  return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
463  255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
464 }
465 
466 static int decode_frame_header(AVCodecContext *ctx,
467  const uint8_t *data, int size, int *ref)
468 {
469  VP9Context *s = ctx->priv_data;
470  int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
471  int last_invisible;
472  const uint8_t *data2;
473 
474  /* general header */
475  if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
476  av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
477  return res;
478  }
479  if (get_bits(&s->gb, 2) != 0x2) { // frame marker
480  av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
481  return AVERROR_INVALIDDATA;
482  }
483  s->profile = get_bits1(&s->gb);
484  if (get_bits1(&s->gb)) { // reserved bit
485  av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
486  return AVERROR_INVALIDDATA;
487  }
488  if (get_bits1(&s->gb)) {
489  *ref = get_bits(&s->gb, 3);
490  return 0;
491  }
492  s->last_uses_2pass = s->uses_2pass;
493  s->last_keyframe = s->keyframe;
494  s->keyframe = !get_bits1(&s->gb);
495  last_invisible = s->invisible;
496  s->invisible = !get_bits1(&s->gb);
497  s->errorres = get_bits1(&s->gb);
498  s->use_last_frame_mvs = !s->errorres && !last_invisible;
499  if (s->keyframe) {
500  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
501  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
502  return AVERROR_INVALIDDATA;
503  }
504  s->colorspace = get_bits(&s->gb, 3);
505  if (s->colorspace == 7) { // RGB = profile 1
506  av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
507  return AVERROR_INVALIDDATA;
508  }
509  s->fullrange = get_bits1(&s->gb);
510  // for profile 1, the subsampling bits would follow here
511  s->refreshrefmask = 0xff;
512  w = get_bits(&s->gb, 16) + 1;
513  h = get_bits(&s->gb, 16) + 1;
514  if (get_bits1(&s->gb)) // display size
515  skip_bits(&s->gb, 32);
516  } else {
517  s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
518  s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
519  if (s->intraonly) {
520  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
521  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
522  return AVERROR_INVALIDDATA;
523  }
524  s->refreshrefmask = get_bits(&s->gb, 8);
525  w = get_bits(&s->gb, 16) + 1;
526  h = get_bits(&s->gb, 16) + 1;
527  if (get_bits1(&s->gb)) // display size
528  skip_bits(&s->gb, 32);
529  } else {
530  s->refreshrefmask = get_bits(&s->gb, 8);
531  s->refidx[0] = get_bits(&s->gb, 3);
532  s->signbias[0] = get_bits1(&s->gb);
533  s->refidx[1] = get_bits(&s->gb, 3);
534  s->signbias[1] = get_bits1(&s->gb);
535  s->refidx[2] = get_bits(&s->gb, 3);
536  s->signbias[2] = get_bits1(&s->gb);
537  if (!s->refs[s->refidx[0]].f->data[0] ||
538  !s->refs[s->refidx[1]].f->data[0] ||
539  !s->refs[s->refidx[2]].f->data[0]) {
540  av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
541  return AVERROR_INVALIDDATA;
542  }
543  if (get_bits1(&s->gb)) {
544  w = s->refs[s->refidx[0]].f->width;
545  h = s->refs[s->refidx[0]].f->height;
546  } else if (get_bits1(&s->gb)) {
547  w = s->refs[s->refidx[1]].f->width;
548  h = s->refs[s->refidx[1]].f->height;
549  } else if (get_bits1(&s->gb)) {
550  w = s->refs[s->refidx[2]].f->width;
551  h = s->refs[s->refidx[2]].f->height;
552  } else {
553  w = get_bits(&s->gb, 16) + 1;
554  h = get_bits(&s->gb, 16) + 1;
555  }
556  // Note that at this point in the code, "CUR_FRAME" has not yet been
557  // (re)allocated for the new frame, and thus actually represents
558  // the _last_ frame
559  s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
560  s->frames[CUR_FRAME].tf.f->height == h;
561  if (get_bits1(&s->gb)) // display size
562  skip_bits(&s->gb, 32);
563  s->highprecisionmvs = get_bits1(&s->gb);
564  s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
565  get_bits(&s->gb, 2);
566  s->allowcompinter = s->signbias[0] != s->signbias[1] ||
567  s->signbias[0] != s->signbias[2];
568  if (s->allowcompinter) {
569  if (s->signbias[0] == s->signbias[1]) {
570  s->fixcompref = 2;
571  s->varcompref[0] = 0;
572  s->varcompref[1] = 1;
573  } else if (s->signbias[0] == s->signbias[2]) {
574  s->fixcompref = 1;
575  s->varcompref[0] = 0;
576  s->varcompref[1] = 2;
577  } else {
578  s->fixcompref = 0;
579  s->varcompref[0] = 1;
580  s->varcompref[1] = 2;
581  }
582  }
583  }
584  }
585  s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
586  s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
587  s->framectxid = c = get_bits(&s->gb, 2);
588 
589  /* loopfilter header data */
590  s->filter.level = get_bits(&s->gb, 6);
591  sharp = get_bits(&s->gb, 3);
592  // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
593  // the old cache values since they are still valid
594  if (s->filter.sharpness != sharp)
595  memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
596  s->filter.sharpness = sharp;
597  if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
598  if (get_bits1(&s->gb)) {
599  for (i = 0; i < 4; i++)
600  if (get_bits1(&s->gb))
601  s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
602  for (i = 0; i < 2; i++)
603  if (get_bits1(&s->gb))
604  s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
605  }
606  } else {
607  memset(&s->lf_delta, 0, sizeof(s->lf_delta));
608  }
609 
610  /* quantization header data */
611  s->yac_qi = get_bits(&s->gb, 8);
612  s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
613  s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
614  s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
615  s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
616  s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
617 
618  /* segmentation header info */
619  if ((s->segmentation.enabled = get_bits1(&s->gb))) {
620  if ((s->segmentation.update_map = get_bits1(&s->gb))) {
621  for (i = 0; i < 7; i++)
622  s->prob.seg[i] = get_bits1(&s->gb) ?
623  get_bits(&s->gb, 8) : 255;
624  if ((s->segmentation.temporal = get_bits1(&s->gb))) {
625  for (i = 0; i < 3; i++)
626  s->prob.segpred[i] = get_bits1(&s->gb) ?
627  get_bits(&s->gb, 8) : 255;
628  }
629  }
630  if ((!s->segmentation.update_map || s->segmentation.temporal) &&
631  (w != s->frames[CUR_FRAME].tf.f->width ||
632  h != s->frames[CUR_FRAME].tf.f->height)) {
633  av_log(ctx, AV_LOG_ERROR,
634  "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
635  s->segmentation.temporal, s->segmentation.update_map);
636  return AVERROR_INVALIDDATA;
637  }
638 
639  if (get_bits1(&s->gb)) {
640  s->segmentation.absolute_vals = get_bits1(&s->gb);
641  for (i = 0; i < 8; i++) {
642  if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
643  s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
644  if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
645  s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
646  if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
647  s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
648  s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
649  }
650  }
651  } else {
652  s->segmentation.feat[0].q_enabled = 0;
653  s->segmentation.feat[0].lf_enabled = 0;
654  s->segmentation.feat[0].skip_enabled = 0;
655  s->segmentation.feat[0].ref_enabled = 0;
656  }
657 
658  // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
659  for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
660  int qyac, qydc, quvac, quvdc, lflvl, sh;
661 
662  if (s->segmentation.feat[i].q_enabled) {
663  if (s->segmentation.absolute_vals)
664  qyac = s->segmentation.feat[i].q_val;
665  else
666  qyac = s->yac_qi + s->segmentation.feat[i].q_val;
667  } else {
668  qyac = s->yac_qi;
669  }
670  qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
671  quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
672  quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
673  qyac = av_clip_uintp2(qyac, 8);
674 
675  s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
676  s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
677  s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
678  s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
679 
680  sh = s->filter.level >= 32;
681  if (s->segmentation.feat[i].lf_enabled) {
682  if (s->segmentation.absolute_vals)
683  lflvl = s->segmentation.feat[i].lf_val;
684  else
685  lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
686  } else {
687  lflvl = s->filter.level;
688  }
689  s->segmentation.feat[i].lflvl[0][0] =
690  s->segmentation.feat[i].lflvl[0][1] =
691  av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
692  for (j = 1; j < 4; j++) {
693  s->segmentation.feat[i].lflvl[j][0] =
694  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
695  s->lf_delta.mode[0]) << sh), 6);
696  s->segmentation.feat[i].lflvl[j][1] =
697  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
698  s->lf_delta.mode[1]) << sh), 6);
699  }
700  }
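/* e.g. with yac_qi = 60 and all three deltas zero, every segment ends up
 * with qmul[0] = { vp9_dc_qlookup[60], vp9_ac_qlookup[60] } and the same
 * values for chroma; nonzero deltas merely shift the (clipped) lookup
 * index per plane and coefficient type. */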
701 
702  /* tiling info */
703  if ((res = update_size(ctx, w, h)) < 0) {
704  av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
705  return res;
706  }
707  for (s->tiling.log2_tile_cols = 0;
708  (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
709  s->tiling.log2_tile_cols++) ;
710  for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
711  max = FFMAX(0, max - 1);
712  while (max > s->tiling.log2_tile_cols) {
713  if (get_bits1(&s->gb))
714  s->tiling.log2_tile_cols++;
715  else
716  break;
717  }
718  s->tiling.log2_tile_rows = decode012(&s->gb);
719  s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
720  if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
721  s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
722  s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
723  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
724  if (!s->c_b) {
725  av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
726  return AVERROR(ENOMEM);
727  }
728  }
729 
730  if (s->keyframe || s->errorres || s->intraonly) {
731  s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
732  s->prob_ctx[3].p = vp9_default_probs;
733  memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
734  sizeof(vp9_default_coef_probs));
735  memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
736  sizeof(vp9_default_coef_probs));
737  memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
738  sizeof(vp9_default_coef_probs));
739  memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
740  sizeof(vp9_default_coef_probs));
741  }
742 
743  // the next 16 bits are the size of the rest of the header (arith-coded)
744  size2 = get_bits(&s->gb, 16);
745  data2 = align_get_bits(&s->gb);
746  if (size2 > size - (data2 - data)) {
747  av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
748  return AVERROR_INVALIDDATA;
749  }
750  ff_vp56_init_range_decoder(&s->c, data2, size2);
751  if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
752  av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
753  return AVERROR_INVALIDDATA;
754  }
755 
756  if (s->keyframe || s->intraonly) {
757  memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
758  } else {
759  memset(&s->counts, 0, sizeof(s->counts));
760  }
761  // FIXME is it faster to not copy here, but do it down in the fw updates
762  // as explicit copies if the fw update is missing (and skip the copy upon
763  // fw update)?
764  s->prob.p = s->prob_ctx[c].p;
765 
766  // txfm updates
767  if (s->lossless) {
768  s->txfmmode = TX_4X4;
769  } else {
770  s->txfmmode = vp8_rac_get_uint(&s->c, 2);
771  if (s->txfmmode == 3)
772  s->txfmmode += vp8_rac_get(&s->c);
773 
774  if (s->txfmmode == TX_SWITCHABLE) {
775  for (i = 0; i < 2; i++)
776  if (vp56_rac_get_prob_branchy(&s->c, 252))
777  s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
778  for (i = 0; i < 2; i++)
779  for (j = 0; j < 2; j++)
780  if (vp56_rac_get_prob_branchy(&s->c, 252))
781  s->prob.p.tx16p[i][j] =
782  update_prob(&s->c, s->prob.p.tx16p[i][j]);
783  for (i = 0; i < 2; i++)
784  for (j = 0; j < 3; j++)
785  if (vp56_rac_get_prob_branchy(&s->c, 252))
786  s->prob.p.tx32p[i][j] =
787  update_prob(&s->c, s->prob.p.tx32p[i][j]);
788  }
789  }
790 
791  // coef updates
792  for (i = 0; i < 4; i++) {
793  uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
794  if (vp8_rac_get(&s->c)) {
795  for (j = 0; j < 2; j++)
796  for (k = 0; k < 2; k++)
797  for (l = 0; l < 6; l++)
798  for (m = 0; m < 6; m++) {
799  uint8_t *p = s->prob.coef[i][j][k][l][m];
800  uint8_t *r = ref[j][k][l][m];
801  if (m >= 3 && l == 0) // dc only has 3 pt
802  break;
803  for (n = 0; n < 3; n++) {
804  if (vp56_rac_get_prob_branchy(&s->c, 252)) {
805  p[n] = update_prob(&s->c, r[n]);
806  } else {
807  p[n] = r[n];
808  }
809  }
810  p[3] = 0;
811  }
812  } else {
813  for (j = 0; j < 2; j++)
814  for (k = 0; k < 2; k++)
815  for (l = 0; l < 6; l++)
816  for (m = 0; m < 6; m++) {
817  uint8_t *p = s->prob.coef[i][j][k][l][m];
818  uint8_t *r = ref[j][k][l][m];
819  if (m > 3 && l == 0) // dc only has 3 pt
820  break;
821  memcpy(p, r, 3);
822  p[3] = 0;
823  }
824  }
825  if (s->txfmmode == i)
826  break;
827  }
828 
829  // mode updates
830  for (i = 0; i < 3; i++)
831  if (vp56_rac_get_prob_branchy(&s->c, 252))
832  s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
833  if (!s->keyframe && !s->intraonly) {
834  for (i = 0; i < 7; i++)
835  for (j = 0; j < 3; j++)
836  if (vp56_rac_get_prob_branchy(&s->c, 252))
837  s->prob.p.mv_mode[i][j] =
838  update_prob(&s->c, s->prob.p.mv_mode[i][j]);
839 
840  if (s->filtermode == FILTER_SWITCHABLE)
841  for (i = 0; i < 4; i++)
842  for (j = 0; j < 2; j++)
843  if (vp56_rac_get_prob_branchy(&s->c, 252))
844  s->prob.p.filter[i][j] =
845  update_prob(&s->c, s->prob.p.filter[i][j]);
846 
847  for (i = 0; i < 4; i++)
848  if (vp56_rac_get_prob_branchy(&s->c, 252))
849  s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
850 
851  if (s->allowcompinter) {
852  s->comppredmode = vp8_rac_get(&s->c);
853  if (s->comppredmode)
854  s->comppredmode += vp8_rac_get(&s->c);
855  if (s->comppredmode == PRED_SWITCHABLE)
856  for (i = 0; i < 5; i++)
857  if (vp56_rac_get_prob_branchy(&s->c, 252))
858  s->prob.p.comp[i] =
859  update_prob(&s->c, s->prob.p.comp[i]);
860  } else {
861  s->comppredmode = PRED_SINGLEREF;
862  }
863 
864  if (s->comppredmode != PRED_COMPREF) {
865  for (i = 0; i < 5; i++) {
866  if (vp56_rac_get_prob_branchy(&s->c, 252))
867  s->prob.p.single_ref[i][0] =
868  update_prob(&s->c, s->prob.p.single_ref[i][0]);
869  if (vp56_rac_get_prob_branchy(&s->c, 252))
870  s->prob.p.single_ref[i][1] =
871  update_prob(&s->c, s->prob.p.single_ref[i][1]);
872  }
873  }
874 
875  if (s->comppredmode != PRED_SINGLEREF) {
876  for (i = 0; i < 5; i++)
877  if (vp56_rac_get_prob_branchy(&s->c, 252))
878  s->prob.p.comp_ref[i] =
879  update_prob(&s->c, s->prob.p.comp_ref[i]);
880  }
881 
882  for (i = 0; i < 4; i++)
883  for (j = 0; j < 9; j++)
884  if (vp56_rac_get_prob_branchy(&s->c, 252))
885  s->prob.p.y_mode[i][j] =
886  update_prob(&s->c, s->prob.p.y_mode[i][j]);
887 
888  for (i = 0; i < 4; i++)
889  for (j = 0; j < 4; j++)
890  for (k = 0; k < 3; k++)
891  if (vp56_rac_get_prob_branchy(&s->c, 252))
892  s->prob.p.partition[3 - i][j][k] =
893  update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
894 
895  // mv fields don't use the update_prob subexp model for some reason
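 // (each update below writes a fresh probability directly: a 7-bit value v
 // is stored as (v << 1) | 1, i.e. an odd number in 1..255, never zero)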
896  for (i = 0; i < 3; i++)
897  if (vp56_rac_get_prob_branchy(&s->c, 252))
898  s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
899 
900  for (i = 0; i < 2; i++) {
901  if (vp56_rac_get_prob_branchy(&s->c, 252))
902  s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
903 
904  for (j = 0; j < 10; j++)
905  if (vp56_rac_get_prob_branchy(&s->c, 252))
906  s->prob.p.mv_comp[i].classes[j] =
907  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
908 
909  if (vp56_rac_get_prob_branchy(&s->c, 252))
910  s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
911 
912  for (j = 0; j < 10; j++)
913  if (vp56_rac_get_prob_branchy(&s->c, 252))
914  s->prob.p.mv_comp[i].bits[j] =
915  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
916  }
917 
918  for (i = 0; i < 2; i++) {
919  for (j = 0; j < 2; j++)
920  for (k = 0; k < 3; k++)
921  if (vp56_rac_get_prob_branchy(&s->c, 252))
922  s->prob.p.mv_comp[i].class0_fp[j][k] =
923  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
924 
925  for (j = 0; j < 3; j++)
926  if (vp56_rac_get_prob_branchy(&s->c, 252))
927  s->prob.p.mv_comp[i].fp[j] =
928  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
929  }
930 
931  if (s->highprecisionmvs) {
932  for (i = 0; i < 2; i++) {
933  if (vp56_rac_get_prob_branchy(&s->c, 252))
934  s->prob.p.mv_comp[i].class0_hp =
935  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
936 
937  if (vp56_rac_get_prob_branchy(&s->c, 252))
938  s->prob.p.mv_comp[i].hp =
939  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
940  }
941  }
942  }
943 
944  return (data2 - data) + size2;
945 }
946 
947 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
948  VP9Context *s)
949 {
950  dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
951  dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
952 }
953 
954 static void find_ref_mvs(VP9Context *s,
955  VP56mv *pmv, int ref, int z, int idx, int sb)
956 {
957  static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
958  [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
959  { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
960  [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
961  { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
962  [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
963  { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
964  [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
965  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
966  [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
967  { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
968  [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
969  { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
970  [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
971  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
972  [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
973  { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
974  [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
975  { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
976  [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
977  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
978  [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
979  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
980  [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
981  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
982  [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
983  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
984  };
985  VP9Block *b = s->b;
986  int row = s->row, col = s->col, row7 = s->row7;
987  const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
988 #define INVALID_MV 0x80008000U
989  uint32_t mem = INVALID_MV;
990  int i;
991 
992 #define RETURN_DIRECT_MV(mv) \
993  do { \
994  uint32_t m = AV_RN32A(&mv); \
995  if (!idx) { \
996  AV_WN32A(pmv, m); \
997  return; \
998  } else if (mem == INVALID_MV) { \
999  mem = m; \
1000  } else if (m != mem) { \
1001  AV_WN32A(pmv, m); \
1002  return; \
1003  } \
1004  } while (0)
1005 
1006  if (sb >= 0) {
1007  if (sb == 2 || sb == 1) {
1008  RETURN_DIRECT_MV(b->mv[0][z]);
1009  } else if (sb == 3) {
1010  RETURN_DIRECT_MV(b->mv[2][z]);
1011  RETURN_DIRECT_MV(b->mv[1][z]);
1012  RETURN_DIRECT_MV(b->mv[0][z]);
1013  }
1014 
1015 #define RETURN_MV(mv) \
1016  do { \
1017  if (sb > 0) { \
1018  VP56mv tmp; \
1019  uint32_t m; \
1020  clamp_mv(&tmp, &mv, s); \
1021  m = AV_RN32A(&tmp); \
1022  if (!idx) { \
1023  AV_WN32A(pmv, m); \
1024  return; \
1025  } else if (mem == INVALID_MV) { \
1026  mem = m; \
1027  } else if (m != mem) { \
1028  AV_WN32A(pmv, m); \
1029  return; \
1030  } \
1031  } else { \
1032  uint32_t m = AV_RN32A(&mv); \
1033  if (!idx) { \
1034  clamp_mv(pmv, &mv, s); \
1035  return; \
1036  } else if (mem == INVALID_MV) { \
1037  mem = m; \
1038  } else if (m != mem) { \
1039  clamp_mv(pmv, &mv, s); \
1040  return; \
1041  } \
1042  } \
1043  } while (0)
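/* Both macros implement the idx selection used by find_ref_mvs(): with
 * idx == 0 the first candidate MV found is returned immediately; with
 * idx == 1 the first candidate is remembered in mem and the next candidate
 * that differs from it is the one returned. */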
1044 
1045  if (row > 0) {
1046  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1047  if (mv->ref[0] == ref) {
1048  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1049  } else if (mv->ref[1] == ref) {
1050  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1051  }
1052  }
1053  if (col > s->tiling.tile_col_start) {
1054  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1055  if (mv->ref[0] == ref) {
1056  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1057  } else if (mv->ref[1] == ref) {
1058  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1059  }
1060  }
1061  i = 2;
1062  } else {
1063  i = 0;
1064  }
1065 
1066  // previously coded MVs in this neighbourhood, using same reference frame
1067  for (; i < 8; i++) {
1068  int c = p[i][0] + col, r = p[i][1] + row;
1069 
1070  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1071  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1072 
1073  if (mv->ref[0] == ref) {
1074  RETURN_MV(mv->mv[0]);
1075  } else if (mv->ref[1] == ref) {
1076  RETURN_MV(mv->mv[1]);
1077  }
1078  }
1079  }
1080 
1081  // MV at this position in previous frame, using same reference frame
1082  if (s->use_last_frame_mvs) {
1083  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1084 
1085  if (!s->last_uses_2pass)
1086  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1087  if (mv->ref[0] == ref) {
1088  RETURN_MV(mv->mv[0]);
1089  } else if (mv->ref[1] == ref) {
1090  RETURN_MV(mv->mv[1]);
1091  }
1092  }
1093 
1094 #define RETURN_SCALE_MV(mv, scale) \
1095  do { \
1096  if (scale) { \
1097  VP56mv mv_temp = { -mv.x, -mv.y }; \
1098  RETURN_MV(mv_temp); \
1099  } else { \
1100  RETURN_MV(mv); \
1101  } \
1102  } while (0)
1103 
1104  // previously coded MVs in this neighbourhood, using different reference frame
1105  for (i = 0; i < 8; i++) {
1106  int c = p[i][0] + col, r = p[i][1] + row;
1107 
1108  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1109  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1110 
1111  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1112  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1113  }
1114  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1115  // BUG - libvpx has this condition regardless of whether
1116  // we used the first ref MV and pre-scaling
1117  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1118  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1119  }
1120  }
1121  }
1122 
1123  // MV at this position in previous frame, using different reference frame
1124  if (s->use_last_frame_mvs) {
1125  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1126 
1127  // no need to await_progress, because we already did that above
1128  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1129  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1130  }
1131  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1132  // BUG - libvpx has this condition regardless of whether
1133  // we used the first ref MV and pre-scaling
1134  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1135  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1136  }
1137  }
1138 
1139  AV_ZERO32(pmv);
1140 #undef INVALID_MV
1141 #undef RETURN_MV
1142 #undef RETURN_SCALE_MV
1143 }
1144 
1145 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1146 {
1147  int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1148  int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1149  s->prob.p.mv_comp[idx].classes);
1150 
1151  s->counts.mv_comp[idx].sign[sign]++;
1152  s->counts.mv_comp[idx].classes[c]++;
1153  if (c) {
1154  int m;
1155 
1156  for (n = 0, m = 0; m < c; m++) {
1157  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1158  n |= bit << m;
1159  s->counts.mv_comp[idx].bits[m][bit]++;
1160  }
1161  n <<= 3;
1162  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1163  n |= bit << 1;
1164  s->counts.mv_comp[idx].fp[bit]++;
1165  if (hp) {
1166  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1167  s->counts.mv_comp[idx].hp[bit]++;
1168  n |= bit;
1169  } else {
1170  n |= 1;
1171  // bug in libvpx - we count for bw entropy purposes even if the
1172  // bit wasn't coded
1173  s->counts.mv_comp[idx].hp[1]++;
1174  }
1175  n += 8 << c;
1176  } else {
1177  n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1178  s->counts.mv_comp[idx].class0[n]++;
1179  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1180  s->prob.p.mv_comp[idx].class0_fp[n]);
1181  s->counts.mv_comp[idx].class0_fp[n][bit]++;
1182  n = (n << 3) | (bit << 1);
1183  if (hp) {
1184  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1185  s->counts.mv_comp[idx].class0_hp[bit]++;
1186  n |= bit;
1187  } else {
1188  n |= 1;
1189  // bug in libvpx - we count for bw entropy purposes even if the
1190  // bit wasn't coded
1191  s->counts.mv_comp[idx].class0_hp[1]++;
1192  }
1193  }
1194 
1195  return sign ? -(n + 1) : (n + 1);
1196 }
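/* Worked example: sign = 0, class c = 1, one integer bit = 0, fp = 2,
 * hp = 1 gives n = ((0 << 3) | (2 << 1) | 1) + (8 << 1) = 21, so the
 * function returns n + 1 = 22 (in the bitstream's 1/8-pel units). */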
1197 
1198 static void fill_mv(VP9Context *s,
1199  VP56mv *mv, int mode, int sb)
1200 {
1201  VP9Block *b = s->b;
1202 
1203  if (mode == ZEROMV) {
1204  AV_ZERO64(mv);
1205  } else {
1206  int hp;
1207 
1208  // FIXME cache this value and reuse for other subblocks
1209  find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1210  mode == NEWMV ? -1 : sb);
1211  // FIXME maybe move this code into find_ref_mvs()
1212  if ((mode == NEWMV || sb == -1) &&
1213  !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1214  if (mv[0].y & 1) {
1215  if (mv[0].y < 0)
1216  mv[0].y++;
1217  else
1218  mv[0].y--;
1219  }
1220  if (mv[0].x & 1) {
1221  if (mv[0].x < 0)
1222  mv[0].x++;
1223  else
1224  mv[0].x--;
1225  }
1226  }
1227  if (mode == NEWMV) {
1228  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1229  s->prob.p.mv_joint);
1230 
1231  s->counts.mv_joint[j]++;
1232  if (j >= MV_JOINT_V)
1233  mv[0].y += read_mv_component(s, 0, hp);
1234  if (j & 1)
1235  mv[0].x += read_mv_component(s, 1, hp);
1236  }
1237 
1238  if (b->comp) {
1239  // FIXME cache this value and reuse for other subblocks
1240  find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1241  mode == NEWMV ? -1 : sb);
1242  if ((mode == NEWMV || sb == -1) &&
1243  !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1244  if (mv[1].y & 1) {
1245  if (mv[1].y < 0)
1246  mv[1].y++;
1247  else
1248  mv[1].y--;
1249  }
1250  if (mv[1].x & 1) {
1251  if (mv[1].x < 0)
1252  mv[1].x++;
1253  else
1254  mv[1].x--;
1255  }
1256  }
1257  if (mode == NEWMV) {
1258  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1259  s->prob.p.mv_joint);
1260 
1261  s->counts.mv_joint[j]++;
1262  if (j >= MV_JOINT_V)
1263  mv[1].y += read_mv_component(s, 0, hp);
1264  if (j & 1)
1265  mv[1].x += read_mv_component(s, 1, hp);
1266  }
1267  }
1268  }
1269 }
1270 
1271 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1272  ptrdiff_t stride, int v)
1273 {
1274  switch (w) {
1275  case 1:
1276  do {
1277  *ptr = v;
1278  ptr += stride;
1279  } while (--h);
1280  break;
1281  case 2: {
1282  int v16 = v * 0x0101;
1283  do {
1284  AV_WN16A(ptr, v16);
1285  ptr += stride;
1286  } while (--h);
1287  break;
1288  }
1289  case 4: {
1290  uint32_t v32 = v * 0x01010101;
1291  do {
1292  AV_WN32A(ptr, v32);
1293  ptr += stride;
1294  } while (--h);
1295  break;
1296  }
1297  case 8: {
1298 #if HAVE_FAST_64BIT
1299  uint64_t v64 = v * 0x0101010101010101ULL;
1300  do {
1301  AV_WN64A(ptr, v64);
1302  ptr += stride;
1303  } while (--h);
1304 #else
1305  uint32_t v32 = v * 0x01010101;
1306  do {
1307  AV_WN32A(ptr, v32);
1308  AV_WN32A(ptr + 4, v32);
1309  ptr += stride;
1310  } while (--h);
1311 #endif
1312  break;
1313  }
1314  }
1315 }
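/* setctx_2d() splats a single context value over a w x h cell rectangle,
 * using progressively wider aligned stores as w grows. */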
1316 
1317 static void decode_mode(AVCodecContext *ctx)
1318 {
1319  static const uint8_t left_ctx[N_BS_SIZES] = {
1320  0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1321  };
1322  static const uint8_t above_ctx[N_BS_SIZES] = {
1323  0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1324  };
1325  static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1326  TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1327  TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1328  };
1329  VP9Context *s = ctx->priv_data;
1330  VP9Block *b = s->b;
1331  int row = s->row, col = s->col, row7 = s->row7;
1332  enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1333  int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1334  int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1335  int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1336  int vref, filter_id;
1337 
1338  if (!s->segmentation.enabled) {
1339  b->seg_id = 0;
1340  } else if (s->keyframe || s->intraonly) {
1341  b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
1342  } else if (!s->segmentation.update_map ||
1343  (s->segmentation.temporal &&
1344  vp56_rac_get_prob_branchy(&s->c,
1345  s->prob.segpred[s->above_segpred_ctx[col] +
1346  s->left_segpred_ctx[row7]]))) {
1347  int pred = 8, x;
1348  uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1349 
1350  if (!s->last_uses_2pass)
1351  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1352  for (y = 0; y < h4; y++)
1353  for (x = 0; x < w4; x++)
1354  pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
1355  av_assert1(pred < 8);
1356  b->seg_id = pred;
1357 
1358  memset(&s->above_segpred_ctx[col], 1, w4);
1359  memset(&s->left_segpred_ctx[row7], 1, h4);
1360  } else {
1361  b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1362  s->prob.seg);
1363 
1364  memset(&s->above_segpred_ctx[col], 0, w4);
1365  memset(&s->left_segpred_ctx[row7], 0, h4);
1366  }
1367  if (s->segmentation.enabled &&
1368  (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1369  setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1370  w4, h4, 8 * s->sb_cols, b->seg_id);
1371  }
1372 
1373  b->skip = s->segmentation.enabled &&
1374  s->segmentation.feat[b->seg_id].skip_enabled;
1375  if (!b->skip) {
1376  int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1377  b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1378  s->counts.skip[c][b->skip]++;
1379  }
1380 
1381  if (s->keyframe || s->intraonly) {
1382  b->intra = 1;
1383  } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1384  b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1385  } else {
1386  int c, bit;
1387 
1388  if (have_a && have_l) {
1389  c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1390  c += (c == 2);
1391  } else {
1392  c = have_a ? 2 * s->above_intra_ctx[col] :
1393  have_l ? 2 * s->left_intra_ctx[row7] : 0;
1394  }
1395  bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1396  s->counts.intra[c][bit]++;
1397  b->intra = !bit;
1398  }
1399 
1400  if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1401  int c;
1402  if (have_a) {
1403  if (have_l) {
1404  c = (s->above_skip_ctx[col] ? max_tx :
1405  s->above_txfm_ctx[col]) +
1406  (s->left_skip_ctx[row7] ? max_tx :
1407  s->left_txfm_ctx[row7]) > max_tx;
1408  } else {
1409  c = s->above_skip_ctx[col] ? 1 :
1410  (s->above_txfm_ctx[col] * 2 > max_tx);
1411  }
1412  } else if (have_l) {
1413  c = s->left_skip_ctx[row7] ? 1 :
1414  (s->left_txfm_ctx[row7] * 2 > max_tx);
1415  } else {
1416  c = 1;
1417  }
1418  switch (max_tx) {
1419  case TX_32X32:
1420  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1421  if (b->tx) {
1422  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1423  if (b->tx == 2)
1424  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1425  }
1426  s->counts.tx32p[c][b->tx]++;
1427  break;
1428  case TX_16X16:
1429  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1430  if (b->tx)
1431  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1432  s->counts.tx16p[c][b->tx]++;
1433  break;
1434  case TX_8X8:
1435  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1436  s->counts.tx8p[c][b->tx]++;
1437  break;
1438  case TX_4X4:
1439  b->tx = TX_4X4;
1440  break;
1441  }
1442  } else {
1443  b->tx = FFMIN(max_tx, s->txfmmode);
1444  }
1445 
1446  if (s->keyframe || s->intraonly) {
1447  uint8_t *a = &s->above_mode_ctx[col * 2];
1448  uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1449 
1450  b->comp = 0;
1451  if (b->bs > BS_8x8) {
1452  // FIXME the memory storage intermediates here aren't really
1453  // necessary, they're just there to make the code slightly
1454  // simpler for now
1455  b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1456  vp9_default_kf_ymode_probs[a[0]][l[0]]);
1457  if (b->bs != BS_8x4) {
1458  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1459  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1460  l[0] = a[1] = b->mode[1];
1461  } else {
1462  l[0] = a[1] = b->mode[1] = b->mode[0];
1463  }
1464  if (b->bs != BS_4x8) {
1465  b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1466  vp9_default_kf_ymode_probs[a[0]][l[1]]);
1467  if (b->bs != BS_8x4) {
1468  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1469  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1470  l[1] = a[1] = b->mode[3];
1471  } else {
1472  l[1] = a[1] = b->mode[3] = b->mode[2];
1473  }
1474  } else {
1475  b->mode[2] = b->mode[0];
1476  l[1] = a[1] = b->mode[3] = b->mode[1];
1477  }
1478  } else {
1479  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1480  vp9_default_kf_ymode_probs[*a][*l]);
1481  b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1482  // FIXME this can probably be optimized
1483  memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1484  memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1485  }
1486  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1487  vp9_default_kf_uvmode_probs[b->mode[3]]);
1488  } else if (b->intra) {
1489  b->comp = 0;
1490  if (b->bs > BS_8x8) {
1491  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1492  s->prob.p.y_mode[0]);
1493  s->counts.y_mode[0][b->mode[0]]++;
1494  if (b->bs != BS_8x4) {
1495  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1496  s->prob.p.y_mode[0]);
1497  s->counts.y_mode[0][b->mode[1]]++;
1498  } else {
1499  b->mode[1] = b->mode[0];
1500  }
1501  if (b->bs != BS_4x8) {
1502  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1503  s->prob.p.y_mode[0]);
1504  s->counts.y_mode[0][b->mode[2]]++;
1505  if (b->bs != BS_8x4) {
1506  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1507  s->prob.p.y_mode[0]);
1508  s->counts.y_mode[0][b->mode[3]]++;
1509  } else {
1510  b->mode[3] = b->mode[2];
1511  }
1512  } else {
1513  b->mode[2] = b->mode[0];
1514  b->mode[3] = b->mode[1];
1515  }
1516  } else {
1517  static const uint8_t size_group[10] = {
1518  3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1519  };
1520  int sz = size_group[b->bs];
1521 
1522  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1523  s->prob.p.y_mode[sz]);
1524  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1525  s->counts.y_mode[sz][b->mode[3]]++;
1526  }
1527  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1528  s->prob.p.uv_mode[b->mode[3]]);
1529  s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1530  } else {
1531  static const uint8_t inter_mode_ctx_lut[14][14] = {
1532  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1533  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1534  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1535  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1536  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1537  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1538  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1539  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1540  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1541  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1542  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1543  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1544  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1545  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1546  };
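/* The LUT above is indexed as [above mode][left mode]; indices 0-9 are the
 * ten intra prediction modes, 10-13 the four inter modes, and the stored
 * value is the context (0-6) selecting the mv_mode probability set. */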
1547 
1548  if (s->segmentation.feat[b->seg_id].ref_enabled) {
1549  av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1550  b->comp = 0;
1551  b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1552  } else {
1553  // read comp_pred flag
1554  if (s->comppredmode != PRED_SWITCHABLE) {
1555  b->comp = s->comppredmode == PRED_COMPREF;
1556  } else {
1557  int c;
1558 
1559  // FIXME add intra as ref=0xff (or -1) to make these easier?
1560  if (have_a) {
1561  if (have_l) {
1562  if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1563  c = 4;
1564  } else if (s->above_comp_ctx[col]) {
1565  c = 2 + (s->left_intra_ctx[row7] ||
1566  s->left_ref_ctx[row7] == s->fixcompref);
1567  } else if (s->left_comp_ctx[row7]) {
1568  c = 2 + (s->above_intra_ctx[col] ||
1569  s->above_ref_ctx[col] == s->fixcompref);
1570  } else {
1571  c = (!s->above_intra_ctx[col] &&
1572  s->above_ref_ctx[col] == s->fixcompref) ^
1573  (!s->left_intra_ctx[row7] &&
1574  s->left_ref_ctx[row & 7] == s->fixcompref);
1575  }
1576  } else {
1577  c = s->above_comp_ctx[col] ? 3 :
1578  (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1579  }
1580  } else if (have_l) {
1581  c = s->left_comp_ctx[row7] ? 3 :
1582  (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1583  } else {
1584  c = 1;
1585  }
1586  b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1587  s->counts.comp[c][b->comp]++;
1588  }
1589 
1590  // read actual references
1591  // FIXME probably cache a few variables here to prevent repetitive
1592  // memory accesses below
1593  if (b->comp) /* two references */ {
1594  int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1595 
1596  b->ref[fix_idx] = s->fixcompref;
1597  // FIXME can this codeblob be replaced by some sort of LUT?
1598  if (have_a) {
1599  if (have_l) {
1600  if (s->above_intra_ctx[col]) {
1601  if (s->left_intra_ctx[row7]) {
1602  c = 2;
1603  } else {
1604  c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1605  }
1606  } else if (s->left_intra_ctx[row7]) {
1607  c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1608  } else {
1609  int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1610 
1611  if (refl == refa && refa == s->varcompref[1]) {
1612  c = 0;
1613  } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1614  if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1615  (refl == s->fixcompref && refa == s->varcompref[0])) {
1616  c = 4;
1617  } else {
1618  c = (refa == refl) ? 3 : 1;
1619  }
1620  } else if (!s->left_comp_ctx[row7]) {
1621  if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1622  c = 1;
1623  } else {
1624  c = (refl == s->varcompref[1] &&
1625  refa != s->varcompref[1]) ? 2 : 4;
1626  }
1627  } else if (!s->above_comp_ctx[col]) {
1628  if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1629  c = 1;
1630  } else {
1631  c = (refa == s->varcompref[1] &&
1632  refl != s->varcompref[1]) ? 2 : 4;
1633  }
1634  } else {
1635  c = (refl == refa) ? 4 : 2;
1636  }
1637  }
1638  } else {
1639  if (s->above_intra_ctx[col]) {
1640  c = 2;
1641  } else if (s->above_comp_ctx[col]) {
1642  c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1643  } else {
1644  c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1645  }
1646  }
1647  } else if (have_l) {
1648  if (s->left_intra_ctx[row7]) {
1649  c = 2;
1650  } else if (s->left_comp_ctx[row7]) {
1651  c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1652  } else {
1653  c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1654  }
1655  } else {
1656  c = 2;
1657  }
1658  bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1659  b->ref[var_idx] = s->varcompref[bit];
1660  s->counts.comp_ref[c][bit]++;
1661  } else /* single reference */ {
1662  int bit, c;
1663 
1664  if (have_a && !s->above_intra_ctx[col]) {
1665  if (have_l && !s->left_intra_ctx[row7]) {
1666  if (s->left_comp_ctx[row7]) {
1667  if (s->above_comp_ctx[col]) {
1668  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1669  !s->above_ref_ctx[col]);
1670  } else {
1671  c = (3 * !s->above_ref_ctx[col]) +
1672  (!s->fixcompref || !s->left_ref_ctx[row7]);
1673  }
1674  } else if (s->above_comp_ctx[col]) {
1675  c = (3 * !s->left_ref_ctx[row7]) +
1676  (!s->fixcompref || !s->above_ref_ctx[col]);
1677  } else {
1678  c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1679  }
1680  } else if (s->above_intra_ctx[col]) {
1681  c = 2;
1682  } else if (s->above_comp_ctx[col]) {
1683  c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1684  } else {
1685  c = 4 * (!s->above_ref_ctx[col]);
1686  }
1687  } else if (have_l && !s->left_intra_ctx[row7]) {
1688  if (s->left_intra_ctx[row7]) {
1689  c = 2;
1690  } else if (s->left_comp_ctx[row7]) {
1691  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1692  } else {
1693  c = 4 * (!s->left_ref_ctx[row7]);
1694  }
1695  } else {
1696  c = 2;
1697  }
1698  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1699  s->counts.single_ref[c][0][bit]++;
1700  if (!bit) {
1701  b->ref[0] = 0;
1702  } else {
1703  // FIXME can this codeblob be replaced by some sort of LUT?
1704  if (have_a) {
1705  if (have_l) {
1706  if (s->left_intra_ctx[row7]) {
1707  if (s->above_intra_ctx[col]) {
1708  c = 2;
1709  } else if (s->above_comp_ctx[col]) {
1710  c = 1 + 2 * (s->fixcompref == 1 ||
1711  s->above_ref_ctx[col] == 1);
1712  } else if (!s->above_ref_ctx[col]) {
1713  c = 3;
1714  } else {
1715  c = 4 * (s->above_ref_ctx[col] == 1);
1716  }
1717  } else if (s->above_intra_ctx[col]) {
1718  if (s->left_intra_ctx[row7]) {
1719  c = 2;
1720  } else if (s->left_comp_ctx[row7]) {
1721  c = 1 + 2 * (s->fixcompref == 1 ||
1722  s->left_ref_ctx[row7] == 1);
1723  } else if (!s->left_ref_ctx[row7]) {
1724  c = 3;
1725  } else {
1726  c = 4 * (s->left_ref_ctx[row7] == 1);
1727  }
1728  } else if (s->above_comp_ctx[col]) {
1729  if (s->left_comp_ctx[row7]) {
1730  if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1731  c = 3 * (s->fixcompref == 1 ||
1732  s->left_ref_ctx[row7] == 1);
1733  } else {
1734  c = 2;
1735  }
1736  } else if (!s->left_ref_ctx[row7]) {
1737  c = 1 + 2 * (s->fixcompref == 1 ||
1738  s->above_ref_ctx[col] == 1);
1739  } else {
1740  c = 3 * (s->left_ref_ctx[row7] == 1) +
1741  (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1742  }
1743  } else if (s->left_comp_ctx[row7]) {
1744  if (!s->above_ref_ctx[col]) {
1745  c = 1 + 2 * (s->fixcompref == 1 ||
1746  s->left_ref_ctx[row7] == 1);
1747  } else {
1748  c = 3 * (s->above_ref_ctx[col] == 1) +
1749  (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1750  }
1751  } else if (!s->above_ref_ctx[col]) {
1752  if (!s->left_ref_ctx[row7]) {
1753  c = 3;
1754  } else {
1755  c = 4 * (s->left_ref_ctx[row7] == 1);
1756  }
1757  } else if (!s->left_ref_ctx[row7]) {
1758  c = 4 * (s->above_ref_ctx[col] == 1);
1759  } else {
1760  c = 2 * (s->left_ref_ctx[row7] == 1) +
1761  2 * (s->above_ref_ctx[col] == 1);
1762  }
1763  } else {
1764  if (s->above_intra_ctx[col] ||
1765  (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1766  c = 2;
1767  } else if (s->above_comp_ctx[col]) {
1768  c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1769  } else {
1770  c = 4 * (s->above_ref_ctx[col] == 1);
1771  }
1772  }
1773  } else if (have_l) {
1774  if (s->left_intra_ctx[row7] ||
1775  (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1776  c = 2;
1777  } else if (s->left_comp_ctx[row7]) {
1778  c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1779  } else {
1780  c = 4 * (s->left_ref_ctx[row7] == 1);
1781  }
1782  } else {
1783  c = 2;
1784  }
1785  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1786  s->counts.single_ref[c][1][bit]++;
1787  b->ref[0] = 1 + bit;
1788  }
1789  }
1790  }
1791 
1792  if (b->bs <= BS_8x8) {
1793  if (s->segmentation.feat[b->seg_id].skip_enabled) {
1794  b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1795  } else {
1796  static const uint8_t off[10] = {
1797  3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1798  };
1799 
1800  // FIXME this needs to use the LUT tables from find_ref_mvs
1801  // because not all are -1,0/0,-1
1802  int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1803  [s->left_mode_ctx[row7 + off[b->bs]]];
1804 
1805  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1806  s->prob.p.mv_mode[c]);
1807  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1808  s->counts.mv_mode[c][b->mode[0] - 10]++;
1809  }
1810  }
1811 
1812  if (s->filtermode == FILTER_SWITCHABLE) {
1813  int c;
1814 
1815  if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1816  if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1817  c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1818  s->left_filter_ctx[row7] : 3;
1819  } else {
1820  c = s->above_filter_ctx[col];
1821  }
1822  } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1823  c = s->left_filter_ctx[row7];
1824  } else {
1825  c = 3;
1826  }
1827 
1828  filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1829  s->prob.p.filter[c]);
1830  s->counts.filter[c][filter_id]++;
1831  b->filter = vp9_filter_lut[filter_id];
1832  } else {
1833  b->filter = s->filtermode;
1834  }
1835 
1836  if (b->bs > BS_8x8) {
1837  int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1838 
1839  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1840  s->prob.p.mv_mode[c]);
1841  s->counts.mv_mode[c][b->mode[0] - 10]++;
1842  fill_mv(s, b->mv[0], b->mode[0], 0);
1843 
1844  if (b->bs != BS_8x4) {
1845  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1846  s->prob.p.mv_mode[c]);
1847  s->counts.mv_mode[c][b->mode[1] - 10]++;
1848  fill_mv(s, b->mv[1], b->mode[1], 1);
1849  } else {
1850  b->mode[1] = b->mode[0];
1851  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1852  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1853  }
1854 
1855  if (b->bs != BS_4x8) {
1856  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1857  s->prob.p.mv_mode[c]);
1858  s->counts.mv_mode[c][b->mode[2] - 10]++;
1859  fill_mv(s, b->mv[2], b->mode[2], 2);
1860 
1861  if (b->bs != BS_8x4) {
1862  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1863  s->prob.p.mv_mode[c]);
1864  s->counts.mv_mode[c][b->mode[3] - 10]++;
1865  fill_mv(s, b->mv[3], b->mode[3], 3);
1866  } else {
1867  b->mode[3] = b->mode[2];
1868  AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1869  AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1870  }
1871  } else {
1872  b->mode[2] = b->mode[0];
1873  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1874  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1875  b->mode[3] = b->mode[1];
1876  AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1877  AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1878  }
1879  } else {
1880  fill_mv(s, b->mv[0], b->mode[0], -1);
1881  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1882  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1883  AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1884  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1885  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1886  AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1887  }
1888 
1889  vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1890  }
1891 
1892 #if HAVE_FAST_64BIT
1893 #define SPLAT_CTX(var, val, n) \
1894  switch (n) { \
1895  case 1: var = val; break; \
1896  case 2: AV_WN16A(&var, val * 0x0101); break; \
1897  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1898  case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
1899  case 16: { \
1900  uint64_t v64 = val * 0x0101010101010101ULL; \
1901  AV_WN64A( &var, v64); \
1902  AV_WN64A(&((uint8_t *) &var)[8], v64); \
1903  break; \
1904  } \
1905  }
1906 #else
1907 #define SPLAT_CTX(var, val, n) \
1908  switch (n) { \
1909  case 1: var = val; break; \
1910  case 2: AV_WN16A(&var, val * 0x0101); break; \
1911  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1912  case 8: { \
1913  uint32_t v32 = val * 0x01010101; \
1914  AV_WN32A( &var, v32); \
1915  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1916  break; \
1917  } \
1918  case 16: { \
1919  uint32_t v32 = val * 0x01010101; \
1920  AV_WN32A( &var, v32); \
1921  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1922  AV_WN32A(&((uint8_t *) &var)[8], v32); \
1923  AV_WN32A(&((uint8_t *) &var)[12], v32); \
1924  break; \
1925  } \
1926  }
1927 #endif
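/* [Editor's note] A standalone sketch of the byte-splat trick behind
 * SPLAT_CTX above: multiplying a byte by 0x0101... replicates it into every
 * byte of a wider word, so a single aligned store fills several context
 * entries at once. splat8/splat_demo are hypothetical names for illustration
 * only, not part of vp9.c. */
#include <stdint.h>
#include <string.h>
#include <assert.h>

static void splat8(uint8_t *dst, uint8_t val)
{
    uint64_t v64 = val * 0x0101010101010101ULL; /* val copied to all 8 bytes */
    memcpy(dst, &v64, 8);                       /* one store fills 8 entries */
}

static void splat_demo(void)
{
    uint8_t ctx[8];
    splat8(ctx, 0x5a);
    for (int i = 0; i < 8; i++)
        assert(ctx[i] == 0x5a);                 /* every entry got the value */
}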
1928 
1929  switch (bwh_tab[1][b->bs][0]) {
1930 #define SET_CTXS(dir, off, n) \
1931  do { \
1932  SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
1933  SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
1934  SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
1935  if (!s->keyframe && !s->intraonly) { \
1936  SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
1937  SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
1938  SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
1939  if (!b->intra) { \
1940  SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
1941  if (s->filtermode == FILTER_SWITCHABLE) { \
1942  SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
1943  } \
1944  } \
1945  } \
1946  } while (0)
1947  case 1: SET_CTXS(above, col, 1); break;
1948  case 2: SET_CTXS(above, col, 2); break;
1949  case 4: SET_CTXS(above, col, 4); break;
1950  case 8: SET_CTXS(above, col, 8); break;
1951  }
1952  switch (bwh_tab[1][b->bs][1]) {
1953  case 1: SET_CTXS(left, row7, 1); break;
1954  case 2: SET_CTXS(left, row7, 2); break;
1955  case 4: SET_CTXS(left, row7, 4); break;
1956  case 8: SET_CTXS(left, row7, 8); break;
1957  }
1958 #undef SPLAT_CTX
1959 #undef SET_CTXS
1960 
1961  if (!s->keyframe && !s->intraonly) {
1962  if (b->bs > BS_8x8) {
1963  int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1964 
1965  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1966  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1967  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1968  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1969  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1970  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1971  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1972  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1973  } else {
1974  int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1975 
1976  for (n = 0; n < w4 * 2; n++) {
1977  AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1978  AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1979  }
1980  for (n = 0; n < h4 * 2; n++) {
1981  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1982  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1983  }
1984  }
1985  }
1986 
1987  // FIXME kinda ugly
1988  for (y = 0; y < h4; y++) {
1989  int x, o = (row + y) * s->sb_cols * 8 + col;
1990  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
1991 
1992  if (b->intra) {
1993  for (x = 0; x < w4; x++) {
1994  mv[x].ref[0] =
1995  mv[x].ref[1] = -1;
1996  }
1997  } else if (b->comp) {
1998  for (x = 0; x < w4; x++) {
1999  mv[x].ref[0] = b->ref[0];
2000  mv[x].ref[1] = b->ref[1];
2001  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2002  AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2003  }
2004  } else {
2005  for (x = 0; x < w4; x++) {
2006  mv[x].ref[0] = b->ref[0];
2007  mv[x].ref[1] = -1;
2008  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2009  }
2010  }
2011  }
2012 }
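/* [Editor's note] Sketch of the MV-grid indexing used at the end of the
 * function above: s->frames[CUR_FRAME].mv holds one VP9mvrefPair per 8x8
 * block in raster order, with a row stride of sb_cols * 8 entries (eight
 * 8x8 blocks per 64px superblock column). mv_grid_index is a hypothetical
 * helper for illustration only. */
static inline int mv_grid_index(int row8, int col8, int sb_cols)
{
    return row8 * sb_cols * 8 + col8; /* row8/col8 count 8x8 blocks */
}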
2013 
2014 // FIXME merge cnt/eob arguments?
2015 static av_always_inline int
2016 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2017  int is_tx32x32, unsigned (*cnt)[6][3],
2018  unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2019  int nnz, const int16_t *scan, const int16_t (*nb)[2],
2020  const int16_t *band_counts, const int16_t *qmul)
2021 {
2022  int i = 0, band = 0, band_left = band_counts[band];
2023  uint8_t *tp = p[0][nnz];
2024  uint8_t cache[1024];
2025 
2026  do {
2027  int val, rc;
2028 
2029  val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2030  eob[band][nnz][val]++;
2031  if (!val)
2032  break;
2033 
2034  skip_eob:
2035  if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2036  cnt[band][nnz][0]++;
2037  if (!--band_left)
2038  band_left = band_counts[++band];
2039  cache[scan[i]] = 0;
2040  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2041  tp = p[band][nnz];
2042  if (++i == n_coeffs)
2043  break; // invalid input; blocks should end with EOB
2044  goto skip_eob;
2045  }
2046 
2047  rc = scan[i];
2048  if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2049  cnt[band][nnz][1]++;
2050  val = 1;
2051  cache[rc] = 1;
2052  } else {
2053  // fill in p[3-10] (model fill) - only once per frame for each pos
2054  if (!tp[3])
2055  memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2056 
2057  cnt[band][nnz][2]++;
2058  if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2059  if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2060  cache[rc] = val = 2;
2061  } else {
2062  val = 3 + vp56_rac_get_prob(c, tp[5]);
2063  cache[rc] = 3;
2064  }
2065  } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2066  cache[rc] = 4;
2067  if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2068  val = 5 + vp56_rac_get_prob(c, 159);
2069  } else {
2070  val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2071  val += vp56_rac_get_prob(c, 145);
2072  }
2073  } else { // cat 3-6
2074  cache[rc] = 5;
2075  if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2076  if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2077  val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2078  val += (vp56_rac_get_prob(c, 148) << 1);
2079  val += vp56_rac_get_prob(c, 140);
2080  } else {
2081  val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2082  val += (vp56_rac_get_prob(c, 155) << 2);
2083  val += (vp56_rac_get_prob(c, 140) << 1);
2084  val += vp56_rac_get_prob(c, 135);
2085  }
2086  } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2087  val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2088  val += (vp56_rac_get_prob(c, 157) << 3);
2089  val += (vp56_rac_get_prob(c, 141) << 2);
2090  val += (vp56_rac_get_prob(c, 134) << 1);
2091  val += vp56_rac_get_prob(c, 130);
2092  } else {
2093  val = 67 + (vp56_rac_get_prob(c, 254) << 13);
2094  val += (vp56_rac_get_prob(c, 254) << 12);
2095  val += (vp56_rac_get_prob(c, 254) << 11);
2096  val += (vp56_rac_get_prob(c, 252) << 10);
2097  val += (vp56_rac_get_prob(c, 249) << 9);
2098  val += (vp56_rac_get_prob(c, 243) << 8);
2099  val += (vp56_rac_get_prob(c, 230) << 7);
2100  val += (vp56_rac_get_prob(c, 196) << 6);
2101  val += (vp56_rac_get_prob(c, 177) << 5);
2102  val += (vp56_rac_get_prob(c, 153) << 4);
2103  val += (vp56_rac_get_prob(c, 140) << 3);
2104  val += (vp56_rac_get_prob(c, 133) << 2);
2105  val += (vp56_rac_get_prob(c, 130) << 1);
2106  val += vp56_rac_get_prob(c, 129);
2107  }
2108  }
2109  }
2110  if (!--band_left)
2111  band_left = band_counts[++band];
2112  if (is_tx32x32)
2113  coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
2114  else
2115  coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
2116  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2117  tp = p[band][nnz];
2118  } while (++i < n_coeffs);
2119 
2120  return i;
2121 }
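/* [Editor's note] The "cat" branches above decode a base value plus a run of
 * extra bits. A standalone sketch of the value ranges they cover, derived
 * from the constants in decode_coeffs_b_generic; coef_cat/check_cat_ranges
 * are hypothetical names for illustration only. */
#include <assert.h>

struct coef_cat { int base, extra_bits; };

static const struct coef_cat coef_cats[6] = {
    { 5,  1 },  /* cat1: 5..6 (one bit at prob 159)     */
    { 7,  2 },  /* cat2: 7..10                          */
    { 11, 3 },  /* cat3: 11..18                         */
    { 19, 4 },  /* cat4: 19..34                         */
    { 35, 5 },  /* cat5: 35..66                         */
    { 67, 14 }, /* cat6: 67..16450 (shifts 13..0 above) */
};

static void check_cat_ranges(void)
{
    /* each category starts right where the previous one ends */
    for (int i = 1; i < 6; i++)
        assert(coef_cats[i].base ==
               coef_cats[i - 1].base + (1 << coef_cats[i - 1].extra_bits));
}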
2122 
2123 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2124  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2125  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2126  const int16_t (*nb)[2], const int16_t *band_counts,
2127  const int16_t *qmul)
2128 {
2129  return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
2130  nnz, scan, nb, band_counts, qmul);
2131 }
2132 
2133 static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2134  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2135  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2136  const int16_t (*nb)[2], const int16_t *band_counts,
2137  const int16_t *qmul)
2138 {
2139  return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
2140  nnz, scan, nb, band_counts, qmul);
2141 }
2142 
2143 static void decode_coeffs(AVCodecContext *ctx)
2144 {
2145  VP9Context *s = ctx->priv_data;
2146  VP9Block *b = s->b;
2147  int row = s->row, col = s->col;
2148  uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2149  unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2150  unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2151  int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2152  int end_x = FFMIN(2 * (s->cols - col), w4);
2153  int end_y = FFMIN(2 * (s->rows - row), h4);
2154  int n, pl, x, y, res;
2155  int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
2156  int tx = 4 * s->lossless + b->tx;
2157  const int16_t * const *yscans = vp9_scans[tx];
2158  const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2159  const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2160  const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2161  uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2162  uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2163  static const int16_t band_counts[4][8] = {
2164  { 1, 2, 3, 4, 3, 16 - 13 },
2165  { 1, 2, 3, 4, 11, 64 - 21 },
2166  { 1, 2, 3, 4, 11, 256 - 21 },
2167  { 1, 2, 3, 4, 11, 1024 - 21 },
2168  };
2169  const int16_t *y_band_counts = band_counts[b->tx];
2170  const int16_t *uv_band_counts = band_counts[b->uvtx];
2171 
2172 #define MERGE(la, end, step, rd) \
2173  for (n = 0; n < end; n += step) \
2174  la[n] = !!rd(&la[n])
2175 #define MERGE_CTX(step, rd) \
2176  do { \
2177  MERGE(l, end_y, step, rd); \
2178  MERGE(a, end_x, step, rd); \
2179  } while (0)
2180 
2181 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2182  for (n = 0, y = 0; y < end_y; y += step) { \
2183  for (x = 0; x < end_x; x += step, n += step * step) { \
2184  enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2185  res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
2186  c, e, p, a[x] + l[y], yscans[txtp], \
2187  ynbs[txtp], y_band_counts, qmul[0]); \
2188  a[x] = l[y] = !!res; \
2189  if (step >= 4) { \
2190  AV_WN16A(&s->eob[n], res); \
2191  } else { \
2192  s->eob[n] = res; \
2193  } \
2194  } \
2195  }
2196 
2197 #define SPLAT(la, end, step, cond) \
2198  if (step == 2) { \
2199  for (n = 1; n < end; n += step) \
2200  la[n] = la[n - 1]; \
2201  } else if (step == 4) { \
2202  if (cond) { \
2203  for (n = 0; n < end; n += step) \
2204  AV_WN32A(&la[n], la[n] * 0x01010101); \
2205  } else { \
2206  for (n = 0; n < end; n += step) \
2207  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2208  } \
2209  } else /* step == 8 */ { \
2210  if (cond) { \
2211  if (HAVE_FAST_64BIT) { \
2212  for (n = 0; n < end; n += step) \
2213  AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2214  } else { \
2215  for (n = 0; n < end; n += step) { \
2216  uint32_t v32 = la[n] * 0x01010101; \
2217  AV_WN32A(&la[n], v32); \
2218  AV_WN32A(&la[n + 4], v32); \
2219  } \
2220  } \
2221  } else { \
2222  for (n = 0; n < end; n += step) \
2223  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2224  } \
2225  }
2226 #define SPLAT_CTX(step) \
2227  do { \
2228  SPLAT(a, end_x, step, end_x == w4); \
2229  SPLAT(l, end_y, step, end_y == h4); \
2230  } while (0)
2231 
2232  /* y tokens */
2233  switch (b->tx) {
2234  case TX_4X4:
2235  DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2236  break;
2237  case TX_8X8:
2238  MERGE_CTX(2, AV_RN16A);
2239  DECODE_Y_COEF_LOOP(2, 0,);
2240  SPLAT_CTX(2);
2241  break;
2242  case TX_16X16:
2243  MERGE_CTX(4, AV_RN32A);
2244  DECODE_Y_COEF_LOOP(4, 0,);
2245  SPLAT_CTX(4);
2246  break;
2247  case TX_32X32:
2248  MERGE_CTX(8, AV_RN64A);
2249  DECODE_Y_COEF_LOOP(8, 0, 32);
2250  SPLAT_CTX(8);
2251  break;
2252  }
2253 
2254 #define DECODE_UV_COEF_LOOP(step) \
2255  for (n = 0, y = 0; y < end_y; y += step) { \
2256  for (x = 0; x < end_x; x += step, n += step * step) { \
2257  res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
2258  16 * step * step, c, e, p, a[x] + l[y], \
2259  uvscan, uvnb, uv_band_counts, qmul[1]); \
2260  a[x] = l[y] = !!res; \
2261  if (step >= 4) { \
2262  AV_WN16A(&s->uveob[pl][n], res); \
2263  } else { \
2264  s->uveob[pl][n] = res; \
2265  } \
2266  } \
2267  }
2268 
2269  p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2270  c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2271  e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2272  w4 >>= 1;
2273  h4 >>= 1;
2274  end_x >>= 1;
2275  end_y >>= 1;
2276  for (pl = 0; pl < 2; pl++) {
2277  a = &s->above_uv_nnz_ctx[pl][col];
2278  l = &s->left_uv_nnz_ctx[pl][row & 7];
2279  switch (b->uvtx) {
2280  case TX_4X4:
2281  DECODE_UV_COEF_LOOP(1);
2282  break;
2283  case TX_8X8:
2284  MERGE_CTX(2, AV_RN16A);
2285  DECODE_UV_COEF_LOOP(2);
2286  SPLAT_CTX(2);
2287  break;
2288  case TX_16X16:
2289  MERGE_CTX(4, AV_RN32A);
2290  DECODE_UV_COEF_LOOP(4);
2291  SPLAT_CTX(4);
2292  break;
2293  case TX_32X32:
2294  MERGE_CTX(8, AV_RN64A);
2295  // a 64x64 (max) uv block can only ever contain 1 tx32x32 block,
2296  // so there is no need to loop
2297  res = decode_coeffs_b32(&s->c, s->uvblock[pl],
2298  1024, c, e, p, a[0] + l[0],
2299  uvscan, uvnb, uv_band_counts, qmul[1]);
2300  a[0] = l[0] = !!res;
2301  AV_WN16A(&s->uveob[pl][0], res);
2302  SPLAT_CTX(8);
2303  break;
2304  }
2305  }
2306 }
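/* [Editor's note] MERGE_CTX/SPLAT_CTX above collapse the per-4x4 nnz context
 * entries to one flag per transform block before decoding, then replicate
 * the result back out afterwards. Standalone sketch of that round trip;
 * merge_then_splat is a hypothetical helper for illustration only. */
#include <stdint.h>

static void merge_then_splat(uint8_t *ctx, int end, int step)
{
    for (int n = 0; n < end; n += step) {
        uint8_t any = 0;
        for (int i = 0; i < step; i++)  /* MERGE: was any sub-block coded? */
            any |= ctx[n + i];
        for (int i = 0; i < step; i++)  /* SPLAT: write the flag back to all */
            ctx[n + i] = !!any;
    }
}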
2307 
2308 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2309  uint8_t *dst_edge, ptrdiff_t stride_edge,
2310  uint8_t *dst_inner, ptrdiff_t stride_inner,
2311  uint8_t *l, int col, int x, int w,
2312  int row, int y, enum TxfmMode tx,
2313  int p)
2314 {
2315  int have_top = row > 0 || y > 0;
2316  int have_left = col > s->tiling.tile_col_start || x > 0;
2317  int have_right = x < w - 1;
2318  static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2319  [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2320  { DC_127_PRED, VERT_PRED } },
2321  [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2322  { HOR_PRED, HOR_PRED } },
2323  [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2324  { LEFT_DC_PRED, DC_PRED } },
2325  [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2326  { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2327  [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2328  { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2329  [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2330  { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2331  [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2332  { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2333  [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2334  { DC_127_PRED, VERT_LEFT_PRED } },
2335  [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2336  { HOR_UP_PRED, HOR_UP_PRED } },
2337  [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2338  { HOR_PRED, TM_VP8_PRED } },
2339  };
2340  static const struct {
2341  uint8_t needs_left:1;
2342  uint8_t needs_top:1;
2343  uint8_t needs_topleft:1;
2344  uint8_t needs_topright:1;
2345  } edges[N_INTRA_PRED_MODES] = {
2346  [VERT_PRED] = { .needs_top = 1 },
2347  [HOR_PRED] = { .needs_left = 1 },
2348  [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2349  [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2350  [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2351  [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2352  [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2353  [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2354  [HOR_UP_PRED] = { .needs_left = 1 },
2355  [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2356  [LEFT_DC_PRED] = { .needs_left = 1 },
2357  [TOP_DC_PRED] = { .needs_top = 1 },
2358  [DC_128_PRED] = { 0 },
2359  [DC_127_PRED] = { 0 },
2360  [DC_129_PRED] = { 0 }
2361  };
2362 
2363  av_assert2(mode >= 0 && mode < 10);
2364  mode = mode_conv[mode][have_left][have_top];
2365  if (edges[mode].needs_top) {
2366  uint8_t *top, *topleft;
2367  int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2368  int n_px_need_tr = 0;
2369 
2370  if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2371  n_px_need_tr = 4;
2372 
2373  // if top of sb64-row, use s->intra_pred_data[] instead of
2374  // dst[-stride] for intra prediction (it contains pre- instead of
2375  // post-loopfilter data)
2376  if (have_top) {
2377  top = !(row & 7) && !y ?
2378  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2379  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2380  if (have_left)
2381  topleft = !(row & 7) && !y ?
2382  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2383  y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2384  &dst_inner[-stride_inner];
2385  }
2386 
2387  if (have_top &&
2388  (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2389  (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2390  n_px_need + n_px_need_tr <= n_px_have) {
2391  *a = top;
2392  } else {
2393  if (have_top) {
2394  if (n_px_need <= n_px_have) {
2395  memcpy(*a, top, n_px_need);
2396  } else {
2397  memcpy(*a, top, n_px_have);
2398  memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2399  n_px_need - n_px_have);
2400  }
2401  } else {
2402  memset(*a, 127, n_px_need);
2403  }
2404  if (edges[mode].needs_topleft) {
2405  if (have_left && have_top) {
2406  (*a)[-1] = topleft[-1];
2407  } else {
2408  (*a)[-1] = have_top ? 129 : 127;
2409  }
2410  }
2411  if (tx == TX_4X4 && edges[mode].needs_topright) {
2412  if (have_top && have_right &&
2413  n_px_need + n_px_need_tr <= n_px_have) {
2414  memcpy(&(*a)[4], &top[4], 4);
2415  } else {
2416  memset(&(*a)[4], (*a)[3], 4);
2417  }
2418  }
2419  }
2420  }
2421  if (edges[mode].needs_left) {
2422  if (have_left) {
2423  int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2424  uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2425  ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2426 
2427  if (n_px_need <= n_px_have) {
2428  for (i = 0; i < n_px_need; i++)
2429  l[n_px_need - 1 - i] = dst[i * stride - 1];
2430  } else {
2431  for (i = 0; i < n_px_have; i++)
2432  l[n_px_need - 1 - i] = dst[i * stride - 1];
2433  memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
2434  }
2435  } else {
2436  memset(l, 129, 4 << tx);
2437  }
2438  }
2439 
2440  return mode;
2441 }
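/* [Editor's note] Standalone sketch of the edge-fallback convention used by
 * check_intra_mode above: a missing top edge is synthesized as 127, a
 * missing left edge as 129, and mode_conv downgrades the prediction mode
 * itself (e.g. VERT_PRED with no top row becomes DC_127_PRED).
 * fill_missing_edges is a hypothetical helper for illustration only. */
#include <stdint.h>

static void fill_missing_edges(uint8_t *top, uint8_t *left, int n,
                               int have_top, int have_left)
{
    for (int i = 0; !have_top && i < n; i++)
        top[i] = 127;  /* same constant as the memset(*a, 127, ...) above */
    for (int i = 0; !have_left && i < n; i++)
        left[i] = 129; /* same constant as the memset(l, 129, ...) above */
}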
2442 
2443 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2444 {
2445  VP9Context *s = ctx->priv_data;
2446  VP9Block *b = s->b;
2447  int row = s->row, col = s->col;
2448  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2449  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2450  int end_x = FFMIN(2 * (s->cols - col), w4);
2451  int end_y = FFMIN(2 * (s->rows - row), h4);
2452  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2453  int uvstep1d = 1 << b->uvtx, p;
2454  uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2455  LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
2456  LOCAL_ALIGNED_32(uint8_t, l, [32]);
2457 
2458  for (n = 0, y = 0; y < end_y; y += step1d) {
2459  uint8_t *ptr = dst, *ptr_r = dst_r;
2460  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2461  ptr_r += 4 * step1d, n += step) {
2462  int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2463  y * 2 + x : 0];
2464  uint8_t *a = &a_buf[32];
2465  enum TxfmType txtp = vp9_intra_txfm_type[mode];
2466  int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2467 
2468  mode = check_intra_mode(s, mode, &a, ptr_r,
2469  s->frames[CUR_FRAME].tf.f->linesize[0],
2470  ptr, s->y_stride, l,
2471  col, x, w4, row, y, b->tx, 0);
2472  s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2473  if (eob)
2474  s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2475  s->block + 16 * n, eob);
2476  }
2477  dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2478  dst += 4 * step1d * s->y_stride;
2479  }
2480 
2481  // U/V
2482  w4 >>= 1;
2483  end_x >>= 1;
2484  end_y >>= 1;
2485  step = 1 << (b->uvtx * 2);
2486  for (p = 0; p < 2; p++) {
2487  dst = s->dst[1 + p];
2488  dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2489  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2490  uint8_t *ptr = dst, *ptr_r = dst_r;
2491  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2492  ptr_r += 4 * uvstep1d, n += step) {
2493  int mode = b->uvmode;
2494  uint8_t *a = &a_buf[16];
2495  int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2496 
2497  mode = check_intra_mode(s, mode, &a, ptr_r,
2498  s->frames[CUR_FRAME].tf.f->linesize[1],
2499  ptr, s->uv_stride, l,
2500  col, x, w4, row, y, b->uvtx, p + 1);
2501  s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2502  if (eob)
2503  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2504  s->uvblock[p] + 16 * n, eob);
2505  }
2506  dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2507  dst += 4 * uvstep1d * s->uv_stride;
2508  }
2509  }
2510 }
2511 
2512 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2513  uint8_t *dst, ptrdiff_t dst_stride,
2514  const uint8_t *ref, ptrdiff_t ref_stride,
2515  ThreadFrame *ref_frame,
2516  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2517  int bw, int bh, int w, int h)
2518 {
2519  int mx = mv->x, my = mv->y, th;
2520 
2521  y += my >> 3;
2522  x += mx >> 3;
2523  ref += y * ref_stride + x;
2524  mx &= 7;
2525  my &= 7;
2526  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2527  // we use +7 because the last 7 pixels of each sbrow can be changed in
2528  // the longest loopfilter of the next sbrow
2529  th = (y + bh + 4 * !!my + 7) >> 6;
2530  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2531  if (x < !!mx * 3 || y < !!my * 3 ||
2532  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2533  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2534  ref - !!my * 3 * ref_stride - !!mx * 3,
2535  80, ref_stride,
2536  bw + !!mx * 7, bh + !!my * 7,
2537  x - !!mx * 3, y - !!my * 3, w, h);
2538  ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2539  ref_stride = 80;
2540  }
2541  mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2542 }
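/* [Editor's note] Worked example of the sub-pel split at the top of
 * mc_luma_dir, assuming VP9's 1/8-pel luma MV units (mc_chroma_dir below
 * uses >>4 and &15 because the chroma plane is subsampled 2x).
 * subpel_split/subpel_demo are hypothetical names for illustration only. */
#include <assert.h>

static void subpel_split(int mv, int *ipart, int *frac)
{
    *ipart = mv >> 3; /* arithmetic shift: floor division by 8 */
    *frac  = mv & 7;  /* always 0..7, even for negative MVs */
}

static void subpel_demo(void)
{
    int ip, fr;
    subpel_split(-13, &ip, &fr); /* -13 = -2 * 8 + 3 */
    assert(ip == -2 && fr == 3);
}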
2543 
2544 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2545  uint8_t *dst_u, uint8_t *dst_v,
2546  ptrdiff_t dst_stride,
2547  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2548  const uint8_t *ref_v, ptrdiff_t src_stride_v,
2549  ThreadFrame *ref_frame,
2550  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2551  int bw, int bh, int w, int h)
2552 {
2553  int mx = mv->x, my = mv->y, th;
2554 
2555  y += my >> 4;
2556  x += mx >> 4;
2557  ref_u += y * src_stride_u + x;
2558  ref_v += y * src_stride_v + x;
2559  mx &= 15;
2560  my &= 15;
2561  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2562  // we use +7 because the last 7 pixels of each sbrow can be changed in
2563  // the longest loopfilter of the next sbrow
2564  th = (y + bh + 4 * !!my + 7) >> 5;
2565  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2566  if (x < !!mx * 3 || y < !!my * 3 ||
2567  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2568  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2569  ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2570  80, src_stride_u,
2571  bw + !!mx * 7, bh + !!my * 7,
2572  x - !!mx * 3, y - !!my * 3, w, h);
2573  ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2574  mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2575 
2576  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2577  ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2578  80, src_stride_v,
2579  bw + !!mx * 7, bh + !!my * 7,
2580  x - !!mx * 3, y - !!my * 3, w, h);
2581  ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2582  mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2583  } else {
2584  mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2585  mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2586  }
2587 }
2588 
2589 static void inter_recon(AVCodecContext *ctx)
2590 {
2591  static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2592  { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2593  { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2594  };
2595  VP9Context *s = ctx->priv_data;
2596  VP9Block *b = s->b;
2597  int row = s->row, col = s->col;
2598  ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
2599  AVFrame *ref1 = tref1->f, *ref2;
2600  int w1 = ref1->width, h1 = ref1->height, w2, h2;
2601  ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2602 
2603  if (b->comp) {
2604  tref2 = &s->refs[s->refidx[b->ref[1]]];
2605  ref2 = tref2->f;
2606  w2 = ref2->width;
2607  h2 = ref2->height;
2608  }
2609 
2610  // y inter pred
2611  if (b->bs > BS_8x8) {
2612  if (b->bs == BS_8x4) {
2613  mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2614  ref1->data[0], ref1->linesize[0], tref1,
2615  row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
2616  mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2617  s->dst[0] + 4 * ls_y, ls_y,
2618  ref1->data[0], ref1->linesize[0], tref1,
2619  (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
2620 
2621  if (b->comp) {
2622  mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2623  ref2->data[0], ref2->linesize[0], tref2,
2624  row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
2625  mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2626  s->dst[0] + 4 * ls_y, ls_y,
2627  ref2->data[0], ref2->linesize[0], tref2,
2628  (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
2629  }
2630  } else if (b->bs == BS_4x8) {
2631  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2632  ref1->data[0], ref1->linesize[0], tref1,
2633  row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
2634  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2635  ref1->data[0], ref1->linesize[0], tref1,
2636  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
2637 
2638  if (b->comp) {
2639  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2640  ref2->data[0], ref2->linesize[0], tref2,
2641  row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
2642  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2643  ref2->data[0], ref2->linesize[0], tref2,
2644  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
2645  }
2646  } else {
2647  av_assert2(b->bs == BS_4x4);
2648 
2649  // FIXME if two horizontally adjacent blocks have the same MV,
2650  // do a w8 instead of a w4 call
2651  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2652  ref1->data[0], ref1->linesize[0], tref1,
2653  row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
2654  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2655  ref1->data[0], ref1->linesize[0], tref1,
2656  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
2657  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2658  s->dst[0] + 4 * ls_y, ls_y,
2659  ref1->data[0], ref1->linesize[0], tref1,
2660  (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
2661  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2662  s->dst[0] + 4 * ls_y + 4, ls_y,
2663  ref1->data[0], ref1->linesize[0], tref1,
2664  (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
2665 
2666  if (b->comp) {
2667  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2668  ref2->data[0], ref2->linesize[0], tref2,
2669  row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
2670  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2671  ref2->data[0], ref2->linesize[0], tref2,
2672  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
2673  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2674  s->dst[0] + 4 * ls_y, ls_y,
2675  ref2->data[0], ref2->linesize[0], tref2,
2676  (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
2677  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2678  s->dst[0] + 4 * ls_y + 4, ls_y,
2679  ref2->data[0], ref2->linesize[0], tref2,
2680  (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
2681  }
2682  }
2683  } else {
2684  int bwl = bwlog_tab[0][b->bs];
2685  int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2686 
2687  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2688  ref1->data[0], ref1->linesize[0], tref1,
2689  row << 3, col << 3, &b->mv[0][0], bw, bh, w1, h1);
2690 
2691  if (b->comp)
2692  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2693  ref2->data[0], ref2->linesize[0], tref2,
2694  row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
2695  }
2696 
2697  // uv inter pred
2698  {
2699  int bwl = bwlog_tab[1][b->bs];
2700  int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2701  VP56mv mvuv;
2702 
2703  w1 = (w1 + 1) >> 1;
2704  h1 = (h1 + 1) >> 1;
2705  if (b->comp) {
2706  w2 = (w2 + 1) >> 1;
2707  h2 = (h2 + 1) >> 1;
2708  }
2709  if (b->bs > BS_8x8) {
2710  mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2711  mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2712  } else {
2713  mvuv = b->mv[0][0];
2714  }
2715 
2716  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2717  s->dst[1], s->dst[2], ls_uv,
2718  ref1->data[1], ref1->linesize[1],
2719  ref1->data[2], ref1->linesize[2], tref1,
2720  row << 2, col << 2, &mvuv, bw, bh, w1, h1);
2721 
2722  if (b->comp) {
2723  if (b->bs > BS_8x8) {
2724  mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2725  mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2726  } else {
2727  mvuv = b->mv[0][1];
2728  }
2729  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2730  s->dst[1], s->dst[2], ls_uv,
2731  ref2->data[1], ref2->linesize[1],
2732  ref2->data[2], ref2->linesize[2], tref2,
2733  row << 2, col << 2, &mvuv, bw, bh, w2, h2);
2734  }
2735  }
2736 
2737  if (!b->skip) {
2738  /* mostly copied from intra_recon() */
2739 
2740  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2741  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2742  int end_x = FFMIN(2 * (s->cols - col), w4);
2743  int end_y = FFMIN(2 * (s->rows - row), h4);
2744  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2745  int uvstep1d = 1 << b->uvtx, p;
2746  uint8_t *dst = s->dst[0];
2747 
2748  // y itxfm add
2749  for (n = 0, y = 0; y < end_y; y += step1d) {
2750  uint8_t *ptr = dst;
2751  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2752  int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2753 
2754  if (eob)
2755  s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2756  s->block + 16 * n, eob);
2757  }
2758  dst += 4 * s->y_stride * step1d;
2759  }
2760 
2761  // uv itxfm add
2762  end_x >>= 1;
2763  end_y >>= 1;
2764  step = 1 << (b->uvtx * 2);
2765  for (p = 0; p < 2; p++) {
2766  dst = s->dst[p + 1];
2767  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2768  uint8_t *ptr = dst;
2769  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2770  int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2771 
2772  if (eob)
2773  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2774  s->uvblock[p] + 16 * n, eob);
2775  }
2776  dst += 4 * uvstep1d * s->uv_stride;
2777  }
2778  }
2779  }
2780 }
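/* [Editor's note] For sub-8x8 blocks, the chroma MV above is the rounded
 * average of the four luma sub-block MVs. Worked example (illustration
 * only): averaging x components {-5, -3, -2, 2} gives
 * ROUNDED_DIV(-8, 4) = (-8 - 2) / 4 = -2. */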
2781 
2782 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2783  int row_and_7, int col_and_7,
2784  int w, int h, int col_end, int row_end,
2785  enum TxfmMode tx, int skip_inter)
2786 {
2787  // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2788  // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2789  // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2790  // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2791 
2792  // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2793  // edges. This means that for UV, we work on two subsampled blocks at
2794  // a time, and we only use the topleft block's mode information to set
2795  // things like block strength. Thus, for any block size smaller than
2796  // 16x16, ignore the odd portion of the block.
2797  if (tx == TX_4X4 && is_uv) {
2798  if (h == 1) {
2799  if (row_and_7 & 1)
2800  return;
2801  if (!row_end)
2802  h += 1;
2803  }
2804  if (w == 1) {
2805  if (col_and_7 & 1)
2806  return;
2807  if (!col_end)
2808  w += 1;
2809  }
2810  }
2811 
2812  if (tx == TX_4X4 && !skip_inter) {
2813  int t = 1 << col_and_7, m_col = (t << w) - t, y;
2814  int m_col_odd = (t << (w - 1)) - t;
2815 
2816  // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2817  if (is_uv) {
2818  int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2819 
2820  for (y = row_and_7; y < h + row_and_7; y++) {
2821  int col_mask_id = 2 - !(y & 7);
2822 
2823  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2824  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2825  // for odd lines, if the odd col is not being filtered,
2826  // skip odd row also:
2827  // .---. <-- a
2828  // | |
2829  // |___| <-- b
2830  // ^ ^
2831  // c d
2832  //
2833  // if a/c are even row/col and b/d are odd, and d is skipped,
2834  // e.g. right edge of size-66x66.webm, then skip b also (bug)
2835  if ((col_end & 1) && (y & 1)) {
2836  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2837  } else {
2838  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2839  }
2840  }
2841  } else {
2842  int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2843 
2844  for (y = row_and_7; y < h + row_and_7; y++) {
2845  int col_mask_id = 2 - !(y & 3);
2846 
2847  lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2848  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2849  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2850  lflvl->mask[is_uv][0][y][3] |= m_col;
2851  lflvl->mask[is_uv][1][y][3] |= m_col;
2852  }
2853  }
2854  } else {
2855  int y, t = 1 << col_and_7, m_col = (t << w) - t;
2856 
2857  if (!skip_inter) {
2858  int mask_id = (tx == TX_8X8);
2859  int l2 = tx + is_uv - 1, step1d = 1 << l2;
2860  static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2861  int m_row = m_col & masks[l2];
2862 
2863  // at odd UV col/row edges of tx16/tx32 blocks, force the 8-wide
2864  // loopfilter to prevent going off the visible edge.
2865  if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2866  int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2867  int m_row_8 = m_row - m_row_16;
2868 
2869  for (y = row_and_7; y < h + row_and_7; y++) {
2870  lflvl->mask[is_uv][0][y][0] |= m_row_16;
2871  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2872  }
2873  } else {
2874  for (y = row_and_7; y < h + row_and_7; y++)
2875  lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2876  }
2877 
2878  if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2879  for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2880  lflvl->mask[is_uv][1][y][0] |= m_col;
2881  if (y - row_and_7 == h - 1)
2882  lflvl->mask[is_uv][1][y][1] |= m_col;
2883  } else {
2884  for (y = row_and_7; y < h + row_and_7; y += step1d)
2885  lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2886  }
2887  } else if (tx != TX_4X4) {
2888  int mask_id;
2889 
2890  mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2891  lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2892  mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2893  for (y = row_and_7; y < h + row_and_7; y++)
2894  lflvl->mask[is_uv][0][y][mask_id] |= t;
2895  } else if (is_uv) {
2896  int t8 = t & 0x01, t4 = t - t8;
2897 
2898  for (y = row_and_7; y < h + row_and_7; y++) {
2899  lflvl->mask[is_uv][0][y][2] |= t4;
2900  lflvl->mask[is_uv][0][y][1] |= t8;
2901  }
2902  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2903  } else {
2904  int t8 = t & 0x11, t4 = t - t8;
2905 
2906  for (y = row_and_7; y < h + row_and_7; y++) {
2907  lflvl->mask[is_uv][0][y][2] |= t4;
2908  lflvl->mask[is_uv][0][y][1] |= t8;
2909  }
2910  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
2911  }
2912  }
2913 }
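/* [Editor's note] Standalone sketch of the mask arithmetic used throughout
 * mask_edges above: with t = 1 << pos, (t << w) - t yields a run of w set
 * bits starting at bit pos, i.e. one bit per 8px column of the superblock.
 * bit_run/bit_run_demo are hypothetical names for illustration only. */
#include <assert.h>

static unsigned bit_run(int pos, int w)
{
    unsigned t = 1u << pos;
    return (t << w) - t; /* w consecutive bits starting at bit pos */
}

static void bit_run_demo(void)
{
    assert(bit_run(2, 3) == 0x1c); /* bits 2..4 set */
    assert(bit_run(0, 8) == 0xff); /* a full 64px-wide row */
}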
2914 
2915 static void decode_b(AVCodecContext *ctx, int row, int col,
2916  struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2917  enum BlockLevel bl, enum BlockPartition bp)
2918 {
2919  VP9Context *s = ctx->priv_data;
2920  VP9Block *b = s->b;
2921  enum BlockSize bs = bl * 3 + bp;
2922  int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2923  int emu[2];
2924  AVFrame *f = s->frames[CUR_FRAME].tf.f;
2925 
2926  s->row = row;
2927  s->row7 = row & 7;
2928  s->col = col;
2929  s->col7 = col & 7;
2930  s->min_mv.x = -(128 + col * 64);
2931  s->min_mv.y = -(128 + row * 64);
2932  s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2933  s->max_mv.y = 128 + (s->rows - row - h4) * 64;
2934  if (s->pass < 2) {
2935  b->bs = bs;
2936  b->bl = bl;
2937  b->bp = bp;
2938  decode_mode(ctx);
2939  b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
2940 
2941  if (!b->skip) {
2942  decode_coeffs(ctx);
2943  } else {
2944  int row7 = s->row7;
2945 
2946 #define SPLAT_ZERO_CTX(v, n) \
2947  switch (n) { \
2948  case 1: v = 0; break; \
2949  case 2: AV_ZERO16(&v); break; \
2950  case 4: AV_ZERO32(&v); break; \
2951  case 8: AV_ZERO64(&v); break; \
2952  case 16: AV_ZERO128(&v); break; \
2953  }
2954 #define SPLAT_ZERO_YUV(dir, var, off, n) \
2955  do { \
2956  SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
2957  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
2958  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
2959  } while (0)
2960 
2961  switch (w4) {
2962  case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
2963  case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
2964  case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
2965  case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
2966  }
2967  switch (h4) {
2968  case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
2969  case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
2970  case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
2971  case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
2972  }
2973  }
2974  if (s->pass == 1) {
2975  s->b++;
2976  s->block += w4 * h4 * 64;
2977  s->uvblock[0] += w4 * h4 * 16;
2978  s->uvblock[1] += w4 * h4 * 16;
2979  s->eob += 4 * w4 * h4;
2980  s->uveob[0] += w4 * h4;
2981  s->uveob[1] += w4 * h4;
2982 
2983  return;
2984  }
2985  }
2986 
2987  // emulate overhangs if the stride of the target buffer can't hold them.
2988  // This allows us to support emu-edge and so on even if we have large
2989  // block overhangs
2990  emu[0] = (col + w4) * 8 > f->linesize[0] ||
2991  (row + h4) > s->rows;
2992  emu[1] = (col + w4) * 4 > f->linesize[1] ||
2993  (row + h4) > s->rows;
2994  if (emu[0]) {
2995  s->dst[0] = s->tmp_y;
2996  s->y_stride = 64;
2997  } else {
2998  s->dst[0] = f->data[0] + yoff;
2999  s->y_stride = f->linesize[0];
3000  }
3001  if (emu[1]) {
3002  s->dst[1] = s->tmp_uv[0];
3003  s->dst[2] = s->tmp_uv[1];
3004  s->uv_stride = 32;
3005  } else {
3006  s->dst[1] = f->data[1] + uvoff;
3007  s->dst[2] = f->data[2] + uvoff;
3008  s->uv_stride = f->linesize[1];
3009  }
3010  if (b->intra) {
3011  intra_recon(ctx, yoff, uvoff);
3012  } else {
3013  inter_recon(ctx);
3014  }
3015  if (emu[0]) {
3016  int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3017 
3018  for (n = 0; o < w; n++) {
3019  int bw = 64 >> n;
3020 
3021  av_assert2(n <= 4);
3022  if (w & bw) {
3023  s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3024  s->tmp_y + o, 64, h, 0, 0);
3025  o += bw;
3026  }
3027  }
3028  }
3029  if (emu[1]) {
3030  int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
3031 
3032  for (n = 1; o < w; n++) {
3033  int bw = 64 >> n;
3034 
3035  av_assert2(n <= 4);
3036  if (w & bw) {
3037  s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3038  s->tmp_uv[0] + o, 32, h, 0, 0);
3039  s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3040  s->tmp_uv[1] + o, 32, h, 0, 0);
3041  o += bw;
3042  }
3043  }
3044  }
3045 
3046  // pick filter level and find edges to apply filter to
3047  if (s->filter.level &&
3048  (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3049  [b->mode[3] != ZEROMV]) > 0) {
3050  int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3051  int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3052 
3053  setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3054  mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3055  mask_edges(lflvl, 1, row7, col7, x_end, y_end,
3056  s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3057  s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3058  b->uvtx, skip_inter);
3059 
3060  if (!s->filter.lim_lut[lvl]) {
3061  int sharp = s->filter.sharpness;
3062  int limit = lvl;
3063 
3064  if (sharp > 0) {
3065  limit >>= (sharp + 3) >> 2;
3066  limit = FFMIN(limit, 9 - sharp);
3067  }
3068  limit = FFMAX(limit, 1);
3069 
3070  s->filter.lim_lut[lvl] = limit;
3071  s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3072  }
3073  }
3074 
3075  if (s->pass == 2) {
3076  s->b++;
3077  s->block += w4 * h4 * 64;
3078  s->uvblock[0] += w4 * h4 * 16;
3079  s->uvblock[1] += w4 * h4 * 16;
3080  s->eob += 4 * w4 * h4;
3081  s->uveob[0] += w4 * h4;
3082  s->uveob[1] += w4 * h4;
3083  }
3084 }
3085 
3086 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3087  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3088 {
3089  VP9Context *s = ctx->priv_data;
3090  int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3091  (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3092  const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
3093  s->prob.p.partition[bl][c];
3094  enum BlockPartition bp;
3095  ptrdiff_t hbs = 4 >> bl;
3096  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3097  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3098 
3099  if (bl == BL_8X8) {
3100  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3101  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3102  } else if (col + hbs < s->cols) { // FIXME why not <=?
3103  if (row + hbs < s->rows) { // FIXME why not <=?
3104  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3105  switch (bp) {
3106  case PARTITION_NONE:
3107  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3108  break;
3109  case PARTITION_H:
3110  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3111  yoff += hbs * 8 * y_stride;
3112  uvoff += hbs * 4 * uv_stride;
3113  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3114  break;
3115  case PARTITION_V:
3116  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3117  yoff += hbs * 8;
3118  uvoff += hbs * 4;
3119  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3120  break;
3121  case PARTITION_SPLIT:
3122  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3123  decode_sb(ctx, row, col + hbs, lflvl,
3124  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3125  yoff += hbs * 8 * y_stride;
3126  uvoff += hbs * 4 * uv_stride;
3127  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3128  decode_sb(ctx, row + hbs, col + hbs, lflvl,
3129  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3130  break;
3131  default:
3132  av_assert0(0);
3133  }
3134  } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3135  bp = PARTITION_SPLIT;
3136  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3137  decode_sb(ctx, row, col + hbs, lflvl,
3138  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3139  } else {
3140  bp = PARTITION_H;
3141  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3142  }
3143  } else if (row + hbs < s->rows) { // FIXME why not <=?
3144  if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3145  bp = PARTITION_SPLIT;
3146  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3147  yoff += hbs * 8 * y_stride;
3148  uvoff += hbs * 4 * uv_stride;
3149  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3150  } else {
3151  bp = PARTITION_V;
3152  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3153  }
3154  } else {
3155  bp = PARTITION_SPLIT;
3156  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3157  }
3158  s->counts.partition[bl][c][bp]++;
3159 }
3160 
3161 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3162  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3163 {
3164  VP9Context *s = ctx->priv_data;
3165  VP9Block *b = s->b;
3166  ptrdiff_t hbs = 4 >> bl;
3167  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3168  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3169 
3170  if (bl == BL_8X8) {
3171  av_assert2(b->bl == BL_8X8);
3172  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3173  } else if (s->b->bl == bl) {
3174  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3175  if (b->bp == PARTITION_H && row + hbs < s->rows) {
3176  yoff += hbs * 8 * y_stride;
3177  uvoff += hbs * 4 * uv_stride;
3178  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3179  } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3180  yoff += hbs * 8;
3181  uvoff += hbs * 4;
3182  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3183  }
3184  } else {
3185  decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3186  if (col + hbs < s->cols) { // FIXME why not <=?
3187  if (row + hbs < s->rows) {
3188  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
3189  uvoff + 4 * hbs, bl + 1);
3190  yoff += hbs * 8 * y_stride;
3191  uvoff += hbs * 4 * uv_stride;
3192  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3193  decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3194  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3195  } else {
3196  yoff += hbs * 8;
3197  uvoff += hbs * 4;
3198  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3199  }
3200  } else if (row + hbs < s->rows) {
3201  yoff += hbs * 8 * y_stride;
3202  uvoff += hbs * 4 * uv_stride;
3203  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3204  }
3205  }
3206 }
3207 
3208 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3209  int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3210 {
3211  VP9Context *s = ctx->priv_data;
3212  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3213  uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
3214  ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3215  int y, x, p;
3216 
3217  // FIXME to what extent can we interleave the v/h loopfilter calls? E.g.
3218  // if you think of them as acting on an 8x8 block max, we can interleave
3219  // each v/h within the single x loop, but that only works if we work on
3220  // 8 pixel blocks, and we won't always do that (we want at least 16px
3221  // to use SSE2 optimizations, perhaps 32 for AVX2)
3222 
3223  // filter edges between columns, Y plane (e.g. block1 | block2)
3224  for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
3225  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
3226  uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
3227  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3228  unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3229  unsigned hm = hm1 | hm2 | hm13 | hm23;
3230 
3231  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
3232  if (hm1 & x) {
3233  int L = *l, H = L >> 4;
3234  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3235 
3236  if (col || x > 1) {
3237  if (hmask1[0] & x) {
3238  if (hmask2[0] & x) {
3239  av_assert2(l[8] == L);
3240  s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
3241  } else {
3242  s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
3243  }
3244  } else if (hm2 & x) {
3245  L = l[8];
3246  H |= (L >> 4) << 8;
3247  E |= s->filter.mblim_lut[L] << 8;
3248  I |= s->filter.lim_lut[L] << 8;
3249  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3250  [!!(hmask2[1] & x)]
3251  [0](ptr, ls_y, E, I, H);
3252  } else {
3253  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3254  [0](ptr, ls_y, E, I, H);
3255  }
3256  }
3257  } else if (hm2 & x) {
3258  int L = l[8], H = L >> 4;
3259  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3260 
3261  if (col || x > 1) {
3262  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3263  [0](ptr + 8 * ls_y, ls_y, E, I, H);
3264  }
3265  }
3266  if (hm13 & x) {
3267  int L = *l, H = L >> 4;
3268  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3269 
3270  if (hm23 & x) {
3271  L = l[8];
3272  H |= (L >> 4) << 8;
3273  E |= s->filter.mblim_lut[L] << 8;
3274  I |= s->filter.lim_lut[L] << 8;
3275  s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
3276  } else {
3277  s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
3278  }
3279  } else if (hm23 & x) {
3280  int L = l[8], H = L >> 4;
3281  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3282 
3283  s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
3284  }
3285  }
3286  }
3287 
3288  // block1
3289  // filter edges between rows, Y plane (e.g. ------)
3290  // block2
3291  dst = f->data[0] + yoff;
3292  lvl = lflvl->level;
3293  for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
3294  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
3295  unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3296 
3297  for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
3298  if (row || y) {
3299  if (vm & x) {
3300  int L = *l, H = L >> 4;
3301  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3302 
3303  if (vmask[0] & x) {
3304  if (vmask[0] & (x << 1)) {
3305  av_assert2(l[1] == L);
3306  s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
3307  } else {
3308  s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
3309  }
3310  } else if (vm & (x << 1)) {
3311  L = l[1];
3312  H |= (L >> 4) << 8;
3313  E |= s->filter.mblim_lut[L] << 8;
3314  I |= s->filter.lim_lut[L] << 8;
3315  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3316  [!!(vmask[1] & (x << 1))]
3317  [1](ptr, ls_y, E, I, H);
3318  } else {
3319  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3320  [1](ptr, ls_y, E, I, H);
3321  }
3322  } else if (vm & (x << 1)) {
3323  int L = l[1], H = L >> 4;
3324  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3325 
3326  s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
3327  [1](ptr + 8, ls_y, E, I, H);
3328  }
3329  }
3330  if (vm3 & x) {
3331  int L = *l, H = L >> 4;
3332  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3333 
3334  if (vm3 & (x << 1)) {
3335  L = l[1];
3336  H |= (L >> 4) << 8;
3337  E |= s->filter.mblim_lut[L] << 8;
3338  I |= s->filter.lim_lut[L] << 8;
3339  s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
3340  } else {
3341  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
3342  }
3343  } else if (vm3 & (x << 1)) {
3344  int L = l[1], H = L >> 4;
3345  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3346 
3347  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
3348  }
3349  }
3350  }
3351 
3352  // same principle but for U/V planes
3353  for (p = 0; p < 2; p++) {
3354  lvl = lflvl->level;
3355  dst = f->data[1 + p] + uvoff;
3356  for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
3357  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
3358  uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
3359  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
3360  unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
3361 
3362  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
3363  if (col || x > 1) {
3364  if (hm1 & x) {
3365  int L = *l, H = L >> 4;
3366  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3367 
3368  if (hmask1[0] & x) {
3369  if (hmask2[0] & x) {
3370  av_assert2(l[16] == L);
3371  s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
3372  } else {
3373  s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
3374  }
3375  } else if (hm2 & x) {
3376  L = l[16];
3377  H |= (L >> 4) << 8;
3378  E |= s->filter.mblim_lut[L] << 8;
3379  I |= s->filter.lim_lut[L] << 8;
3380  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3381  [!!(hmask2[1] & x)]
3382  [0](ptr, ls_uv, E, I, H);
3383  } else {
3384  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3385  [0](ptr, ls_uv, E, I, H);
3386  }
3387  } else if (hm2 & x) {
3388  int L = l[16], H = L >> 4;
3389  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3390 
3391  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3392  [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
3393  }
3394  }
3395  if (x & 0xAA)
3396  l += 2;
3397  }
3398  }
3399  lvl = lflvl->level;
3400  dst = f->data[1 + p] + uvoff;
3401  for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
3402  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
3403  unsigned vm = vmask[0] | vmask[1] | vmask[2];
3404 
3405  for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
3406  if (row || y) {
3407  if (vm & x) {
3408  int L = *l, H = L >> 4;
3409  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3410 
3411  if (vmask[0] & x) {
3412  if (vmask[0] & (x << 2)) {
3413  av_assert2(l[2] == L);
3414  s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3415  } else {
3416  s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3417  }
3418  } else if (vm & (x << 2)) {
3419  L = l[2];
3420  H |= (L >> 4) << 8;
3421  E |= s->filter.mblim_lut[L] << 8;
3422  I |= s->filter.lim_lut[L] << 8;
3423  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3424  [!!(vmask[1] & (x << 2))]
3425  [1](ptr, ls_uv, E, I, H);
3426  } else {
3427  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3428  [1](ptr, ls_uv, E, I, H);
3429  }
3430  } else if (vm & (x << 2)) {
3431  int L = l[2], H = L >> 4;
3432  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3433 
3434  s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3435  [1](ptr + 8, ls_uv, E, I, H);
3436  }
3437  }
3438  }
3439  if (y & 1)
3440  lvl += 16;
3441  }
3442  }
3443 }
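/* [Editor's note] Sketch of the parameter packing used in loopfilter_sb
 * above: when two adjacent edges are filtered by a single loop_filter_mix2
 * call, the second edge's E/I/H thresholds are OR-ed in as a second 8-bit
 * lane. pack_two_lanes is a hypothetical helper for illustration only. */
static inline int pack_two_lanes(int first, int second)
{
    return first | (second << 8); /* lane 0: first edge, lane 1: second */
}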
3444 
3445 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3446 {
3447  int sb_start = ( idx * n) >> log2_n;
3448  int sb_end = ((idx + 1) * n) >> log2_n;
3449  *start = FFMIN(sb_start, n) << 3;
3450  *end = FFMIN(sb_end, n) << 3;
3451 }
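/* [Editor's note] Worked example for set_tile_offset, assuming log2_n = 1
 * (two tile columns) and n = 9 superblocks: tile 0 covers sb 0..4 and
 * tile 1 covers sb 4..9, each bound then scaled by 8 into row/col units.
 * Illustration only:
 *
 *   idx=0: start = ((0*9)>>1)<<3 = 0,  end = ((1*9)>>1)<<3 = 32
 *   idx=1: start = ((1*9)>>1)<<3 = 32, end = ((2*9)>>1)<<3 = 72
 */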
3452 
3453 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3454  int max_count, int update_factor)
3455 {
3456  unsigned ct = ct0 + ct1, p2, p1;
3457 
3458  if (!ct)
3459  return;
3460 
3461  p1 = *p;
3462  p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3463  p2 = av_clip(p2, 1, 255);
3464  ct = FFMIN(ct, max_count);
3465  update_factor = FASTDIV(update_factor * ct, max_count);
3466 
3467  // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3468  *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3469 }
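/* [Editor's note] Worked example of adapt_prob above, assuming the
 * (max_count = 24, update_factor = 112) pair used for coefficient probs
 * and an old prob of 128 with counts ct0 = 30, ct1 = 10 (illustrative
 * arithmetic only):
 *
 *   p2 = (30*256 + 40/2) / 40            = 192  (rounded 256*ct0/ct)
 *   ct = FFMIN(40, 24)                   = 24
 *   uf = 112 * 24 / 24                   = 112
 *   p  = 128 + (((192-128)*112+128)>>8)  = 156  (moves 112/256 toward p2)
 */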
3470 
3471 static void adapt_probs(VP9Context *s)
3472 {
3473  int i, j, k, l, m;
3474  prob_context *p = &s->prob_ctx[s->framectxid].p;
3475  int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3476 
3477  // coefficients
3478  for (i = 0; i < 4; i++)
3479  for (j = 0; j < 2; j++)
3480  for (k = 0; k < 2; k++)
3481  for (l = 0; l < 6; l++)
3482  for (m = 0; m < 6; m++) {
3483  uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3484  unsigned *e = s->counts.eob[i][j][k][l][m];
3485  unsigned *c = s->counts.coef[i][j][k][l][m];
3486 
3487  if (l == 0 && m >= 3) // dc only has 3 pt
3488  break;
3489 
3490  adapt_prob(&pp[0], e[0], e[1], 24, uf);
3491  adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3492  adapt_prob(&pp[2], c[1], c[2], 24, uf);
3493  }
3494 
3495  if (s->keyframe || s->intraonly) {
3496  memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3497  memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3498  memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3499  memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3500  return;
3501  }
3502 
3503  // skip flag
3504  for (i = 0; i < 3; i++)
3505  adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3506 
3507  // intra/inter flag
3508  for (i = 0; i < 4; i++)
3509  adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3510 
3511  // comppred flag
3512  if (s->comppredmode == PRED_SWITCHABLE) {
3513  for (i = 0; i < 5; i++)
3514  adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3515  }
3516 
3517  // reference frames
3518  if (s->comppredmode != PRED_SINGLEREF) {
3519  for (i = 0; i < 5; i++)
3520  adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3521  s->counts.comp_ref[i][1], 20, 128);
3522  }
3523 
3524  if (s->comppredmode != PRED_COMPREF) {
3525  for (i = 0; i < 5; i++) {
3526  uint8_t *pp = p->single_ref[i];
3527  unsigned (*c)[2] = s->counts.single_ref[i];
3528 
3529  adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3530  adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3531  }
3532  }
3533 
3534  // block partitioning
3535  for (i = 0; i < 4; i++)
3536  for (j = 0; j < 4; j++) {
3537  uint8_t *pp = p->partition[i][j];
3538  unsigned *c = s->counts.partition[i][j];
3539 
3540  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3541  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3542  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3543  }
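 /* Each adapt_prob() group here updates one node of the binary tree used
  * to code the symbol: the two count arguments are "symbols taking the
  * left branch at this node" versus "symbols in the remaining right
  * subtree", which is why the right-hand sums shrink on the way down the
  * tree. */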
3544 
3545  // tx size
3546  if (s->txfmmode == TX_SWITCHABLE) {
3547  for (i = 0; i < 2; i++) {
3548  unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3549 
3550  adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3551  adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3552  adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3553  adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3554  adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3555  adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3556  }
3557  }
3558 
3559  // interpolation filter
3560  if (s->filtermode == FILTER_SWITCHABLE) {
3561  for (i = 0; i < 4; i++) {
3562  uint8_t *pp = p->filter[i];
3563  unsigned *c = s->counts.filter[i];
3564 
3565  adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3566  adapt_prob(&pp[1], c[1], c[2], 20, 128);
3567  }
3568  }
3569 
3570  // inter modes
3571  for (i = 0; i < 7; i++) {
3572  uint8_t *pp = p->mv_mode[i];
3573  unsigned *c = s->counts.mv_mode[i];
3574 
3575  adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3576  adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3577  adapt_prob(&pp[2], c[1], c[3], 20, 128);
3578  }
3579 
3580  // mv joints
3581  {
3582  uint8_t *pp = p->mv_joint;
3583  unsigned *c = s->counts.mv_joint;
3584 
3585  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3586  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3587  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3588  }
3589 
3590  // mv components
3591  for (i = 0; i < 2; i++) {
3592  uint8_t *pp;
3593  unsigned *c, (*c2)[2], sum;
3594 
3595  adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3596  s->counts.mv_comp[i].sign[1], 20, 128);
3597 
3598  pp = p->mv_comp[i].classes;
3599  c = s->counts.mv_comp[i].classes;
3600  sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3601  adapt_prob(&pp[0], c[0], sum, 20, 128);
3602  sum -= c[1];
3603  adapt_prob(&pp[1], c[1], sum, 20, 128);
3604  sum -= c[2] + c[3];
3605  adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3606  adapt_prob(&pp[3], c[2], c[3], 20, 128);
3607  sum -= c[4] + c[5];
3608  adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3609  adapt_prob(&pp[5], c[4], c[5], 20, 128);
3610  sum -= c[6];
3611  adapt_prob(&pp[6], c[6], sum, 20, 128);
3612  adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3613  adapt_prob(&pp[8], c[7], c[8], 20, 128);
3614  adapt_prob(&pp[9], c[9], c[10], 20, 128);
3615 
3616  adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3617  s->counts.mv_comp[i].class0[1], 20, 128);
3618  pp = p->mv_comp[i].bits;
3619  c2 = s->counts.mv_comp[i].bits;
3620  for (j = 0; j < 10; j++)
3621  adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3622 
3623  for (j = 0; j < 2; j++) {
3624  pp = p->mv_comp[i].class0_fp[j];
3625  c = s->counts.mv_comp[i].class0_fp[j];
3626  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3627  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3628  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3629  }
3630  pp = p->mv_comp[i].fp;
3631  c = s->counts.mv_comp[i].fp;
3632  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3633  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3634  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3635 
3636  if (s->highprecisionmvs) {
3637  adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3638  s->counts.mv_comp[i].class0_hp[1], 20, 128);
3639  adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3640  s->counts.mv_comp[i].hp[1], 20, 128);
3641  }
3642  }
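 /* class0_hp/hp code the 1/8-pel bit of a motion vector component; when
  * the frame header disallows high-precision MVs that bit is never coded,
  * no counts accumulate, and the probabilities are left untouched. */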
3643 
3644  // y intra modes
3645  for (i = 0; i < 4; i++) {
3646  uint8_t *pp = p->y_mode[i];
3647  unsigned *c = s->counts.y_mode[i], sum, s2;
3648 
3649  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3650  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3651  sum -= c[TM_VP8_PRED];
3652  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3653  sum -= c[VERT_PRED];
3654  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3655  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3656  sum -= s2;
3657  adapt_prob(&pp[3], s2, sum, 20, 128);
3658  s2 -= c[HOR_PRED];
3659  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3660  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3661  sum -= c[DIAG_DOWN_LEFT_PRED];
3662  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3663  sum -= c[VERT_LEFT_PRED];
3664  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3665  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3666  }
3667 
3668  // uv intra modes
3669  for (i = 0; i < 10; i++) {
3670  uint8_t *pp = p->uv_mode[i];
3671  unsigned *c = s->counts.uv_mode[i], sum, s2;
3672 
3673  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3674  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3675  sum -= c[TM_VP8_PRED];
3676  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3677  sum -= c[VERT_PRED];
3678  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3679  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3680  sum -= s2;
3681  adapt_prob(&pp[3], s2, sum, 20, 128);
3682  s2 -= c[HOR_PRED];
3683  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3684  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3685  sum -= c[DIAG_DOWN_LEFT_PRED];
3686  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3687  sum -= c[VERT_LEFT_PRED];
3688  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3689  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3690  }
3691 }
3692 
3693 static void free_buffers(VP9Context *s)
3694 {
3695  av_freep(&s->intra_pred_data[0]);
3696  av_freep(&s->b_base);
3697  av_freep(&s->block_base);
3698 }
3699 
3700 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3701 {
3702  VP9Context *s = ctx->priv_data;
3703  int i;
3704 
3705  for (i = 0; i < 2; i++) {
3706  if (s->frames[i].tf.f->data[0])
3707  vp9_unref_frame(ctx, &s->frames[i]);
3708  av_frame_free(&s->frames[i].tf.f);
3709  }
3710  for (i = 0; i < 8; i++) {
3711  if (s->refs[i].f->data[0])
3712  ff_thread_release_buffer(ctx, &s->refs[i]);
3713  av_frame_free(&s->refs[i].f);
3714  if (s->next_refs[i].f->data[0])
3715  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3716  av_frame_free(&s->next_refs[i].f);
3717  }
3718  free_buffers(s);
3719  av_freep(&s->c_b);
3720  s->c_b_size = 0;
3721 
3722  return 0;
3723 }
3724 
3725 
3726 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3727  int *got_frame, AVPacket *pkt)
3728 {
3729  const uint8_t *data = pkt->data;
3730  int size = pkt->size;
3731  VP9Context *s = ctx->priv_data;
3732  int res, tile_row, tile_col, i, ref, row, col;
3733  ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3734  AVFrame *f;
3735 
3736  if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3737  return res;
3738  } else if (res == 0) {
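 /* res == 0 means the header was a "show existing frame" marker: the
  * packet carries no coded frame data and merely selects a reference
  * slot to be output again. */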
3739  if (!s->refs[ref].f->data[0]) {
3740  av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3741  return AVERROR_INVALIDDATA;
3742  }
3743  if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3744  return res;
3745  *got_frame = 1;
3746  return 0;
3747  }
3748  data += res;
3749  size -= res;
3750 
3751  if (s->frames[LAST_FRAME].tf.f->data[0])
3752  vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3753  if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3754  (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3755  return res;
3756  if (s->frames[CUR_FRAME].tf.f->data[0])
3757  vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3758  if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3759  return res;
3760  f = s->frames[CUR_FRAME].tf.f;
3761  f->key_frame = s->keyframe;
3762  f->pict_type = (s->keyframe || s->intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3763  ls_y = f->linesize[0];
3764  ls_uv = f->linesize[1];
3765 
3766  // ref frame setup
3767  for (i = 0; i < 8; i++) {
3768  if (s->next_refs[i].f->data[0])
3769  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3770  if (s->refreshrefmask & (1 << i)) {
3771  res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3772  } else {
3773  res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
3774  }
3775  if (res < 0)
3776  return res;
3777  }
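 /* Rotate the reference pool: slots whose bit is set in refreshrefmask
  * will point at the frame being decoded now, the others keep their old
  * contents. The result is staged in next_refs[] and only promoted to
  * refs[] once the whole frame has decoded (see below). */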
3778 
3779  // main tile decode loop
3780  memset(s->above_partition_ctx, 0, s->cols);
3781  memset(s->above_skip_ctx, 0, s->cols);
3782  if (s->keyframe || s->intraonly) {
3783  memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3784  } else {
3785  memset(s->above_mode_ctx, NEARESTMV, s->cols);
3786  }
3787  memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3788  memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3789  memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3790  memset(s->above_segpred_ctx, 0, s->cols);
3791  s->pass = s->uses_2pass =
3792  ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
3793  if ((res = update_block_buffers(ctx)) < 0) {
3794  av_log(ctx, AV_LOG_ERROR,
3795  "Failed to allocate block buffers\n");
3796  return res;
3797  }
3798  if (s->refreshctx && s->parallelmode) {
3799  int j, k, l, m;
3800 
3801  for (i = 0; i < 4; i++) {
3802  for (j = 0; j < 2; j++)
3803  for (k = 0; k < 2; k++)
3804  for (l = 0; l < 6; l++)
3805  for (m = 0; m < 6; m++)
3806  memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3807  s->prob.coef[i][j][k][l][m], 3);
3808  if (s->txfmmode == i)
3809  break;
3810  }
3811  s->prob_ctx[s->framectxid].p = s->prob.p;
3812  ff_thread_finish_setup(ctx);
3813  }
3814 
3815  do {
3816  yoff = uvoff = 0;
3817  s->b = s->b_base;
3818  s->block = s->block_base;
3819  s->uvblock[0] = s->uvblock_base[0];
3820  s->uvblock[1] = s->uvblock_base[1];
3821  s->eob = s->eob_base;
3822  s->uveob[0] = s->uveob_base[0];
3823  s->uveob[1] = s->uveob_base[1];
3824 
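 /* Tile payloads follow back to back; every tile except the last is
  * prefixed with its byte size as a 32-bit big-endian value, and the
  * last one consumes whatever remains of the packet. Each tile column
  * gets its own range decoder state in s->c_b[], saved and restored per
  * superblock row, so tile columns stay independent bitstreams. */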
3825  for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3826  set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3827  tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3828  if (s->pass != 2) {
3829  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3830  unsigned tile_size;
3831 
3832  if (tile_col == s->tiling.tile_cols - 1 &&
3833  tile_row == s->tiling.tile_rows - 1) {
3834  tile_size = size;
3835  } else {
3836  tile_size = AV_RB32(data);
3837  data += 4;
3838  size -= 4;
3839  }
3840  if (tile_size > size) {
3841  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3842  return AVERROR_INVALIDDATA;
3843  }
3844  ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3845  if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
3846  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3847  return AVERROR_INVALIDDATA;
3848  }
3849  data += tile_size;
3850  size -= tile_size;
3851  }
3852  }
3853 
3854  for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3855  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3856  struct VP9Filter *lflvl_ptr = s->lflvl;
3857  ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3858 
3859  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3860  set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3861  tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3862 
3863  if (s->pass != 2) {
3864  memset(s->left_partition_ctx, 0, 8);
3865  memset(s->left_skip_ctx, 0, 8);
3866  if (s->keyframe || s->intraonly) {
3867  memset(s->left_mode_ctx, DC_PRED, 16);
3868  } else {
3869  memset(s->left_mode_ctx, NEARESTMV, 8);
3870  }
3871  memset(s->left_y_nnz_ctx, 0, 16);
3872  memset(s->left_uv_nnz_ctx, 0, 16);
3873  memset(s->left_segpred_ctx, 0, 8);
3874 
3875  memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3876  }
3877 
3878  for (col = s->tiling.tile_col_start;
3879  col < s->tiling.tile_col_end;
3880  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3881  // FIXME integrate with lf code (i.e. zero after each
3882  // use, similar to invtxfm coefficients, or similar)
3883  if (s->pass != 1) {
3884  memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3885  }
3886 
3887  if (s->pass == 2) {
3888  decode_sb_mem(ctx, row, col, lflvl_ptr,
3889  yoff2, uvoff2, BL_64X64);
3890  } else {
3891  decode_sb(ctx, row, col, lflvl_ptr,
3892  yoff2, uvoff2, BL_64X64);
3893  }
3894  }
3895  if (s->pass != 2) {
3896  memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3897  }
3898  }
3899 
3900  if (s->pass == 1) {
3901  continue;
3902  }
3903 
3904  // backup pre-loopfilter reconstruction data for intra
3905  // prediction of next row of sb64s
3906  if (row + 8 < s->rows) {
3907  memcpy(s->intra_pred_data[0],
3908  f->data[0] + yoff + 63 * ls_y,
3909  8 * s->cols);
3910  memcpy(s->intra_pred_data[1],
3911  f->data[1] + uvoff + 31 * ls_uv,
3912  4 * s->cols);
3913  memcpy(s->intra_pred_data[2],
3914  f->data[2] + uvoff + 31 * ls_uv,
3915  4 * s->cols);
3916  }
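 /* s->cols counts 8x8 blocks, so 8 * s->cols and 4 * s->cols are the
  * frame width in luma respectively chroma pixels: only the bottommost
  * pixel row of this superblock row is kept as "above" context for the
  * next one. */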
3917 
3918  // loopfilter one row
3919  if (s->filter.level) {
3920  yoff2 = yoff;
3921  uvoff2 = uvoff;
3922  lflvl_ptr = s->lflvl;
3923  for (col = 0; col < s->cols;
3924  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3925  loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3926  }
3927  }
3928 
3929  // FIXME maybe we can make this more finegrained by running the
3930  // loopfilter per-block instead of after each sbrow
3931  // In fact that would also make intra pred left preparation easier?
3932  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
3933  }
3934  }
3935 
3936  if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3937  adapt_probs(s);
3938  ff_thread_finish_setup(ctx);
3939  }
3940  } while (s->pass++ == 1);
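 /* With frame threading and backward probability adaptation enabled the
  * loop runs twice: pass 1 only parses symbols and gathers the counts so
  * adapt_probs() can release the next frame thread early, and pass 2
  * performs the actual reconstruction from the stored block data. */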
3941  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3942 
3943  // ref frame setup
3944  for (i = 0; i < 8; i++) {
3945  if (s->refs[i].f->data[0])
3946  ff_thread_release_buffer(ctx, &s->refs[i]);
3947  ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
3948  }
3949 
3950  if (!s->invisible) {
3951  if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3952  return res;
3953  *got_frame = 1;
3954  }
3955 
3956  return 0;
3957 }
3958 
3959 static void vp9_decode_flush(AVCodecContext *ctx)
3960 {
3961  VP9Context *s = ctx->priv_data;
3962  int i;
3963 
3964  for (i = 0; i < 2; i++)
3965  vp9_unref_frame(ctx, &s->frames[i]);
3966  for (i = 0; i < 8; i++)
3967  ff_thread_release_buffer(ctx, &s->refs[i]);
3968 }
3969 
3970 static int init_frames(AVCodecContext *ctx)
3971 {
3972  VP9Context *s = ctx->priv_data;
3973  int i;
3974 
3975  for (i = 0; i < 2; i++) {
3976  s->frames[i].tf.f = av_frame_alloc();
3977  if (!s->frames[i].tf.f) {
3978  vp9_decode_free(ctx);
3979  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3980  return AVERROR(ENOMEM);
3981  }
3982  }
3983  for (i = 0; i < 8; i++) {
3984  s->refs[i].f = av_frame_alloc();
3985  s->next_refs[i].f = av_frame_alloc();
3986  if (!s->refs[i].f || !s->next_refs[i].f) {
3987  vp9_decode_free(ctx);
3988  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3989  return AVERROR(ENOMEM);
3990  }
3991  }
3992 
3993  return 0;
3994 }
3995 
3996 static av_cold int vp9_decode_init(AVCodecContext *ctx)
3997 {
3998  VP9Context *s = ctx->priv_data;
3999 
4000  ctx->internal->allocate_progress = 1;
4001  ctx->pix_fmt = AV_PIX_FMT_YUV420P;
4002  ff_vp9dsp_init(&s->dsp);
4003  ff_videodsp_init(&s->vdsp, 8);
4004  s->filter.sharpness = -1;
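 /* -1 is not a valid sharpness value, which guarantees that the loop
  * filter limit LUTs are recomputed when the first frame header is
  * parsed. */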
4005 
4006  return init_frames(ctx);
4007 }
4008 
4009 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4010 {
4011  return init_frames(avctx);
4012 }
4013 
4014 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4015 {
4016  int i, res;
4017  VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4018 
4019  // detect size changes in other threads
4020  if (s->intra_pred_data[0] &&
4021  (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
4022  free_buffers(s);
4023  }
4024 
4025  for (i = 0; i < 2; i++) {
4026  if (s->frames[i].tf.f->data[0])
4027  vp9_unref_frame(dst, &s->frames[i]);
4028  if (ssrc->frames[i].tf.f->data[0]) {
4029  if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
4030  return res;
4031  }
4032  }
4033  for (i = 0; i < 8; i++) {
4034  if (s->refs[i].f->data[0])
4035  ff_thread_release_buffer(dst, &s->refs[i]);
4036  if (ssrc->next_refs[i].f->data[0]) {
4037  if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
4038  return res;
4039  }
4040  }
4041 
4042  s->invisible = ssrc->invisible;
4043  s->keyframe = ssrc->keyframe;
4044  s->uses_2pass = ssrc->uses_2pass;
4045  memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4046  memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4047  if (ssrc->segmentation.enabled) {
4048  memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4049  sizeof(s->segmentation.feat));
4050  }
4051 
4052  return 0;
4053 }
4054 
4055 AVCodec ff_vp9_decoder = {
4056  .name = "vp9",
4057  .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4058  .type = AVMEDIA_TYPE_VIDEO,
4059  .id = AV_CODEC_ID_VP9,
4060  .priv_data_size = sizeof(VP9Context),
4061  .init = vp9_decode_init,
4062  .close = vp9_decode_free,
4063  .decode = vp9_decode_frame,
4064  .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4065  .flush = vp9_decode_flush,
4066  .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4067  .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4068 };