FFmpeg
vp9.c
1 /*
2  * VP9 compatible video decoder
3  *
4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "avcodec.h"
25 #include "get_bits.h"
26 #include "internal.h"
27 #include "profiles.h"
28 #include "thread.h"
29 #include "videodsp.h"
30 #include "vp56.h"
31 #include "vp9.h"
32 #include "vp9data.h"
33 #include "vp9dsp.h"
34 #include "libavutil/avassert.h"
35 #include "libavutil/pixdesc.h"
36 
37 #define VP9_SYNCCODE 0x498342
38 
39 struct VP9Filter {
40  uint8_t level[8 * 8];
41  uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
42  [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
43 };
44 
45 typedef struct VP9Block {
48  VP56mv mv[4 /* b_idx */][2 /* ref */];
49  enum BlockSize bs;
50  enum TxfmMode tx, uvtx;
51  enum BlockLevel bl;
53 } VP9Block;
54 
55 typedef struct VP9Context {
57 
63  unsigned c_b_size;
65  int pass;
66  int row, row7, col, col7;
67  uint8_t *dst[3];
68  ptrdiff_t y_stride, uv_stride;
69 
73  // sb_cols/rows, rows/cols and last_fmt are used for allocating all internal
74  // arrays, and are thus per-thread. w/h and gf_fmt are synced between threads
75  // and are therefore per-stream. pix_fmt represents the value in the header
76  // of the currently processed frame.
77  int w, h;
78  enum AVPixelFormat pix_fmt, last_fmt, gf_fmt;
79  unsigned sb_cols, sb_rows, rows, cols;
81 
82  struct {
85  } filter_lut;
87  struct {
89  uint8_t coef[4][2][2][6][6][3];
90  } prob_ctx[4];
91  struct {
93  uint8_t coef[4][2][2][6][6][11];
94  } prob;
95  struct {
96  unsigned y_mode[4][10];
97  unsigned uv_mode[10][10];
98  unsigned filter[4][3];
99  unsigned mv_mode[7][4];
100  unsigned intra[4][2];
101  unsigned comp[5][2];
102  unsigned single_ref[5][2][2];
103  unsigned comp_ref[5][2];
104  unsigned tx32p[2][4];
105  unsigned tx16p[2][3];
106  unsigned tx8p[2][2];
107  unsigned skip[3][2];
108  unsigned mv_joint[4];
109  struct {
110  unsigned sign[2];
111  unsigned classes[11];
112  unsigned class0[2];
113  unsigned bits[10][2];
114  unsigned class0_fp[2][4];
115  unsigned fp[4];
116  unsigned class0_hp[2];
117  unsigned hp[2];
118  } mv_comp[2];
119  unsigned partition[4][4][4];
120  unsigned coef[4][2][2][6][6][3];
121  unsigned eob[4][2][2][6][6][2];
122  } counts;
123 
124  // contextual (left/above) cache
139  // FIXME maybe merge some of the below in a flags field?
150 
151  // whole-frame cache
153  struct VP9Filter *lflvl;
155 
156  // block reconstruction intermediates
158  int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
160  struct { int x, y; } min_mv, max_mv;
161  DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
162  DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
163  uint16_t mvscale[3][2];
165 } VP9Context;
166 
167 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
168  {
169  { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
170  { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
171  }, {
172  { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
173  { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
174  }
175 };
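/* Indexing sketch for bwh_tab[] (inferred from its call sites in this
 * file, e.g. decode_mode() below): bwh_tab[0][bs] gives the block's
 * width/height in 4x4-pixel units and bwh_tab[1][bs] in 8x8-pixel units,
 * both clamped to a minimum of 1:
 *
 *   int bw4 = bwh_tab[1][b->bs][0];  // e.g. 8 for BS_64x64 (8 * 8 = 64 px)
 *   int bh4 = bwh_tab[1][b->bs][1];
 */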
176 
 177 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
 178 {
179  ff_thread_release_buffer(ctx, &f->tf);
182  f->segmentation_map = NULL;
184 }
185 
 186 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
 187 {
188  VP9Context *s = ctx->priv_data;
189  int ret, sz;
190 
191  if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
192  return ret;
193  sz = 64 * s->sb_cols * s->sb_rows;
194  if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
195  goto fail;
196  }
197 
199  f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
200 
201  if (ctx->hwaccel) {
202  const AVHWAccel *hwaccel = ctx->hwaccel;
204  if (hwaccel->frame_priv_data_size) {
206  if (!f->hwaccel_priv_buf)
207  goto fail;
209  }
210  }
211 
212  return 0;
213 
214 fail:
215  vp9_unref_frame(ctx, f);
216  return AVERROR(ENOMEM);
217 }
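/* Layout sketch for the extradata buffer allocated above, as implied by
 * the pointer arithmetic in this function: with sz = 64 * sb_cols * sb_rows
 * (one byte per 8x8 block), the buffer holds
 *
 *   extradata->data + 0    ->  segmentation map, sz bytes
 *   extradata->data + sz   ->  f->mv, sz entries of struct VP9mvrefPair
 *
 * i.e. one segment id and one MV/reference pair per 8x8 block of the frame.
 */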
218 
 219 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
 220 {
221  int res;
222 
223  if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
224  return res;
225  } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
226  goto fail;
227  }
228 
230  dst->mv = src->mv;
231  dst->uses_2pass = src->uses_2pass;
232 
233  if (src->hwaccel_picture_private) {
235  if (!dst->hwaccel_priv_buf)
236  goto fail;
238  }
239 
240  return 0;
241 
242 fail:
243  vp9_unref_frame(ctx, dst);
244  return AVERROR(ENOMEM);
245 }
246 
247 static int update_size(AVCodecContext *ctx, int w, int h)
248 {
249 #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL + CONFIG_VP9_VAAPI_HWACCEL)
250  enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
251  VP9Context *s = ctx->priv_data;
252  uint8_t *p;
253  int bytesperpixel = s->bytesperpixel, res, cols, rows;
254 
255  av_assert0(w > 0 && h > 0);
256 
257  if (!(s->pix_fmt == s->gf_fmt && w == s->w && h == s->h)) {
258  if ((res = ff_set_dimensions(ctx, w, h)) < 0)
259  return res;
260 
261  if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
262 #if CONFIG_VP9_DXVA2_HWACCEL
263  *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
264 #endif
265 #if CONFIG_VP9_D3D11VA_HWACCEL
266  *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
267 #endif
268 #if CONFIG_VP9_VAAPI_HWACCEL
269  *fmtp++ = AV_PIX_FMT_VAAPI;
270 #endif
271  }
272 
273  *fmtp++ = s->pix_fmt;
274  *fmtp = AV_PIX_FMT_NONE;
275 
276  res = ff_thread_get_format(ctx, pix_fmts);
277  if (res < 0)
278  return res;
279 
280  ctx->pix_fmt = res;
281  s->gf_fmt = s->pix_fmt;
282  s->w = w;
283  s->h = h;
284  }
285 
286  cols = (w + 7) >> 3;
287  rows = (h + 7) >> 3;
288 
289  if (s->intra_pred_data[0] && cols == s->cols && rows == s->rows && s->pix_fmt == s->last_fmt)
290  return 0;
291 
292  s->last_fmt = s->pix_fmt;
293  s->sb_cols = (w + 63) >> 6;
294  s->sb_rows = (h + 63) >> 6;
295  s->cols = (w + 7) >> 3;
296  s->rows = (h + 7) >> 3;
297 
298 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
299  av_freep(&s->intra_pred_data[0]);
300  // FIXME we slightly over-allocate here for subsampled chroma, but a little
301  // bit of padding shouldn't affect performance...
302  p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
303  sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
304  if (!p)
305  return AVERROR(ENOMEM);
306  assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
307  assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
308  assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
309  assign(s->above_y_nnz_ctx, uint8_t *, 16);
310  assign(s->above_mode_ctx, uint8_t *, 16);
311  assign(s->above_mv_ctx, VP56mv(*)[2], 16);
312  assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
313  assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
315  assign(s->above_skip_ctx, uint8_t *, 8);
316  assign(s->above_txfm_ctx, uint8_t *, 8);
317  assign(s->above_segpred_ctx, uint8_t *, 8);
318  assign(s->above_intra_ctx, uint8_t *, 8);
319  assign(s->above_comp_ctx, uint8_t *, 8);
320  assign(s->above_ref_ctx, uint8_t *, 8);
321  assign(s->above_filter_ctx, uint8_t *, 8);
322  assign(s->lflvl, struct VP9Filter *, 1);
323 #undef assign
324 
325  // these will be re-allocated a little later
326  av_freep(&s->b_base);
327  av_freep(&s->block_base);
328 
329  if (s->bpp != s->last_bpp) {
331  ff_videodsp_init(&s->vdsp, s->bpp);
332  s->last_bpp = s->bpp;
333  }
334 
335  return 0;
336 }
337 
 338 static int update_block_buffers(AVCodecContext *ctx)
 339 {
340  VP9Context *s = ctx->priv_data;
341  int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
342 
344  return 0;
345 
346  av_free(s->b_base);
347  av_free(s->block_base);
348  chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
349  chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
350  if (s->s.frames[CUR_FRAME].uses_2pass) {
351  int sbs = s->sb_cols * s->sb_rows;
352 
353  s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
354  s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
355  16 * 16 + 2 * chroma_eobs) * sbs);
356  if (!s->b_base || !s->block_base)
357  return AVERROR(ENOMEM);
358  s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
359  s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
360  s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
361  s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
362  s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
363  } else {
364  s->b_base = av_malloc(sizeof(VP9Block));
365  s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
366  16 * 16 + 2 * chroma_eobs);
367  if (!s->b_base || !s->block_base)
368  return AVERROR(ENOMEM);
369  s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
370  s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
371  s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
372  s->uveob_base[0] = s->eob_base + 16 * 16;
373  s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
374  }
376 
377  return 0;
378 }
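/* Sketch of the scratch buffer layout set up above (single-pass case; in
 * the 2-pass case each region is additionally multiplied by the number of
 * superblocks, sbs):
 *
 *   int16_t coefficients (entries, each scaled by bytesperpixel):
 *     block_base       64 * 64        luma
 *     uvblock_base[0]  chroma_blocks  U
 *     uvblock_base[1]  chroma_blocks  V
 *   uint8_t end-of-block counts:
 *     eob_base         16 * 16        luma
 *     uveob_base[0]    chroma_eobs    U
 *     uveob_base[1]    chroma_eobs    V
 *
 * with chroma_blocks = 64 * 64 >> (ss_h + ss_v) and
 *      chroma_eobs   = 16 * 16 >> (ss_h + ss_v).
 */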
379 
380 // for some reason the sign bit is at the end, not the start, of a bit sequence
 381 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
 382 {
383  int v = get_bits(gb, n);
384  return get_bits1(gb) ? -v : v;
385 }
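/* Worked example (illustrative, not from the original source): with
 * n == 4, a raw bit sequence of 0101 followed by a sign bit of 1 decodes
 * as v = 5, sign = 1, i.e. get_sbits_inv() returns -5; with a trailing
 * sign bit of 0 it would return +5. */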
386 
387 static av_always_inline int inv_recenter_nonneg(int v, int m)
388 {
389  return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
390 }
391 
392 // differential forward probability updates
393 static int update_prob(VP56RangeCoder *c, int p)
394 {
395  static const int inv_map_table[255] = {
396  7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
397  189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
398  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
399  25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
400  40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
401  55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
402  70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
403  86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
404  101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
405  116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
406  131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
407  146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
408  161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
409  177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
410  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
411  207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
412  222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
413  237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
414  252, 253, 253,
415  };
416  int d;
417 
 418  /* This code performs a differential probability update. For a
 419  * current probability A in the range [1, 255], the difference to any
 420  * new probability lies in the range [1-A, 255-A]. Part of that
 421  * absolute range exists on both the positive and the negative side,
 422  * while the rest exists on only one side. The shared part is coded
 423  * differentially, i.e. doubled, with the lowest bit carrying the
 424  * sign, and the one-sided part is then coded on top of that. The
 425  * resulting absolute difference again lies in [0, 254], where a
 426  * larger value means we are further away from the original value A,
 427  * so it can be coded as a VLC, since higher values are increasingly
 428  * unlikely. The first 20 entries in inv_map_table[] allow 'cheap,
 429  * rough' updates, whereas the entries further down the table give
 430  * 'fine, exact' updates, which adds one extra dimension to this
 431  * differential update model. */
432 
433  if (!vp8_rac_get(c)) {
434  d = vp8_rac_get_uint(c, 4) + 0;
435  } else if (!vp8_rac_get(c)) {
436  d = vp8_rac_get_uint(c, 4) + 16;
437  } else if (!vp8_rac_get(c)) {
438  d = vp8_rac_get_uint(c, 5) + 32;
439  } else {
440  d = vp8_rac_get_uint(c, 7);
441  if (d >= 65)
442  d = (d << 1) - 65 + vp8_rac_get(c);
443  d += 64;
444  av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
445  }
446 
447  return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
448  255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
449 }
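/* Worked example of the decode path above (illustrative numbers, not from
 * the original source). Assume the current probability is p = 100 and the
 * range coder takes the first, "cheap, rough" branch with
 * vp8_rac_get_uint(c, 4) == 3, so d = 3:
 *
 *   inv_map_table[3] = 46
 *   p <= 128, so the result is 1 + inv_recenter_nonneg(46, 99)
 *   inv_recenter_nonneg(46, 99): 46 <= 2 * 99 and 46 is even,
 *                                so it returns 99 + (46 >> 1) = 122
 *   new probability = 1 + 122 = 123
 *
 * By contrast, d = 20 maps to inv_map_table[20] = 1, which gives
 * 1 + inv_recenter_nonneg(1, 99) = 1 + (99 - 1) = 99, i.e. the smallest
 * possible step away from p = 100. */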
450 
 451 static int read_colorspace_details(AVCodecContext *ctx)
 452 {
453  static const enum AVColorSpace colorspaces[8] = {
456  };
457  VP9Context *s = ctx->priv_data;
458  int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
459 
460  s->bpp_index = bits;
461  s->bpp = 8 + bits * 2;
462  s->bytesperpixel = (7 + s->bpp) >> 3;
463  ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
464  if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
465  static const enum AVPixelFormat pix_fmt_rgb[3] = {
467  };
468  s->ss_h = s->ss_v = 0;
470  s->pix_fmt = pix_fmt_rgb[bits];
471  if (ctx->profile & 1) {
472  if (get_bits1(&s->gb)) {
473  av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
474  return AVERROR_INVALIDDATA;
475  }
476  } else {
477  av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
478  ctx->profile);
479  return AVERROR_INVALIDDATA;
480  }
481  } else {
482  static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
489  };
491  if (ctx->profile & 1) {
492  s->ss_h = get_bits1(&s->gb);
493  s->ss_v = get_bits1(&s->gb);
494  s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
495  if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
496  av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
497  ctx->profile);
498  return AVERROR_INVALIDDATA;
499  } else if (get_bits1(&s->gb)) {
500  av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
501  ctx->profile);
502  return AVERROR_INVALIDDATA;
503  }
504  } else {
505  s->ss_h = s->ss_v = 1;
506  s->pix_fmt = pix_fmt_for_ss[bits][1][1];
507  }
508  }
509 
510  return 0;
511 }
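/* Summary of the profile/format combinations handled above (a reading aid,
 * not part of the original source): profiles 0 and 1 are 8-bit (bits == 0),
 * while profiles 2 and 3 signal 10- or 12-bit via one extra bit; even
 * profiles (0 and 2) are always 4:2:0 with implicit subsampling, whereas
 * odd profiles (1 and 3) code the subsampling (or RGB) explicitly and
 * reject 4:2:0, which is exactly what the error paths above enforce. */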
512 
 513 static int decode_frame_header(AVCodecContext *ctx,
 514  const uint8_t *data, int size, int *ref)
515 {
516  VP9Context *s = ctx->priv_data;
517  int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
518  int last_invisible;
519  const uint8_t *data2;
520 
521  /* general header */
522  if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
523  av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
524  return res;
525  }
526  if (get_bits(&s->gb, 2) != 0x2) { // frame marker
527  av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
528  return AVERROR_INVALIDDATA;
529  }
530  ctx->profile = get_bits1(&s->gb);
531  ctx->profile |= get_bits1(&s->gb) << 1;
532  if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
533  if (ctx->profile > 3) {
534  av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
535  return AVERROR_INVALIDDATA;
536  }
537  s->s.h.profile = ctx->profile;
538  if (get_bits1(&s->gb)) {
539  *ref = get_bits(&s->gb, 3);
540  return 0;
541  }
542  s->last_keyframe = s->s.h.keyframe;
543  s->s.h.keyframe = !get_bits1(&s->gb);
544  last_invisible = s->s.h.invisible;
545  s->s.h.invisible = !get_bits1(&s->gb);
546  s->s.h.errorres = get_bits1(&s->gb);
547  s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
548  if (s->s.h.keyframe) {
549  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
550  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
551  return AVERROR_INVALIDDATA;
552  }
553  if ((res = read_colorspace_details(ctx)) < 0)
554  return res;
 555  // for profile 1, here follow the subsampling bits
556  s->s.h.refreshrefmask = 0xff;
557  w = get_bits(&s->gb, 16) + 1;
558  h = get_bits(&s->gb, 16) + 1;
559  if (get_bits1(&s->gb)) // display size
560  skip_bits(&s->gb, 32);
561  } else {
562  s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0;
563  s->s.h.resetctx = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
564  if (s->s.h.intraonly) {
565  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
566  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
567  return AVERROR_INVALIDDATA;
568  }
569  if (ctx->profile >= 1) {
570  if ((res = read_colorspace_details(ctx)) < 0)
571  return res;
572  } else {
573  s->ss_h = s->ss_v = 1;
574  s->bpp = 8;
575  s->bpp_index = 0;
576  s->bytesperpixel = 1;
577  s->pix_fmt = AV_PIX_FMT_YUV420P;
580  }
581  s->s.h.refreshrefmask = get_bits(&s->gb, 8);
582  w = get_bits(&s->gb, 16) + 1;
583  h = get_bits(&s->gb, 16) + 1;
584  if (get_bits1(&s->gb)) // display size
585  skip_bits(&s->gb, 32);
586  } else {
587  s->s.h.refreshrefmask = get_bits(&s->gb, 8);
588  s->s.h.refidx[0] = get_bits(&s->gb, 3);
589  s->s.h.signbias[0] = get_bits1(&s->gb) && !s->s.h.errorres;
590  s->s.h.refidx[1] = get_bits(&s->gb, 3);
591  s->s.h.signbias[1] = get_bits1(&s->gb) && !s->s.h.errorres;
592  s->s.h.refidx[2] = get_bits(&s->gb, 3);
593  s->s.h.signbias[2] = get_bits1(&s->gb) && !s->s.h.errorres;
594  if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
595  !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
596  !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
597  av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
598  return AVERROR_INVALIDDATA;
599  }
600  if (get_bits1(&s->gb)) {
601  w = s->s.refs[s->s.h.refidx[0]].f->width;
602  h = s->s.refs[s->s.h.refidx[0]].f->height;
603  } else if (get_bits1(&s->gb)) {
604  w = s->s.refs[s->s.h.refidx[1]].f->width;
605  h = s->s.refs[s->s.h.refidx[1]].f->height;
606  } else if (get_bits1(&s->gb)) {
607  w = s->s.refs[s->s.h.refidx[2]].f->width;
608  h = s->s.refs[s->s.h.refidx[2]].f->height;
609  } else {
610  w = get_bits(&s->gb, 16) + 1;
611  h = get_bits(&s->gb, 16) + 1;
612  }
 613  // Note that at this point, "CUR_FRAME" has not yet been (re)allocated
 614  // for the frame currently being decoded, and thus actually refers to
 615  // the _last_ (previously decoded) frame
616  s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
617  s->s.frames[CUR_FRAME].tf.f->height == h;
618  if (get_bits1(&s->gb)) // display size
619  skip_bits(&s->gb, 32);
620  s->s.h.highprecisionmvs = get_bits1(&s->gb);
622  get_bits(&s->gb, 2);
623  s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
624  s->s.h.signbias[0] != s->s.h.signbias[2];
625  if (s->s.h.allowcompinter) {
626  if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
627  s->s.h.fixcompref = 2;
628  s->s.h.varcompref[0] = 0;
629  s->s.h.varcompref[1] = 1;
630  } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
631  s->s.h.fixcompref = 1;
632  s->s.h.varcompref[0] = 0;
633  s->s.h.varcompref[1] = 2;
634  } else {
635  s->s.h.fixcompref = 0;
636  s->s.h.varcompref[0] = 1;
637  s->s.h.varcompref[1] = 2;
638  }
639  }
640  }
641  }
642  s->s.h.refreshctx = s->s.h.errorres ? 0 : get_bits1(&s->gb);
643  s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
644  s->s.h.framectxid = c = get_bits(&s->gb, 2);
645 
646  /* loopfilter header data */
647  if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
648  // reset loopfilter defaults
649  s->s.h.lf_delta.ref[0] = 1;
650  s->s.h.lf_delta.ref[1] = 0;
651  s->s.h.lf_delta.ref[2] = -1;
652  s->s.h.lf_delta.ref[3] = -1;
653  s->s.h.lf_delta.mode[0] = 0;
654  s->s.h.lf_delta.mode[1] = 0;
655  memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
656  }
657  s->s.h.filter.level = get_bits(&s->gb, 6);
658  sharp = get_bits(&s->gb, 3);
659  // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
660  // the old cache values since they are still valid
661  if (s->s.h.filter.sharpness != sharp)
662  memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
663  s->s.h.filter.sharpness = sharp;
664  if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
665  if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
666  for (i = 0; i < 4; i++)
667  if (get_bits1(&s->gb))
668  s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
669  for (i = 0; i < 2; i++)
670  if (get_bits1(&s->gb))
671  s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
672  }
673  }
674 
675  /* quantization header data */
676  s->s.h.yac_qi = get_bits(&s->gb, 8);
677  s->s.h.ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
678  s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
679  s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
680  s->s.h.lossless = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
681  s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
682  if (s->s.h.lossless)
684 
685  /* segmentation header info */
686  if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
687  if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
688  for (i = 0; i < 7; i++)
689  s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
690  get_bits(&s->gb, 8) : 255;
691  if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
692  for (i = 0; i < 3; i++)
693  s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
694  get_bits(&s->gb, 8) : 255;
695  }
696  }
697 
698  if (get_bits1(&s->gb)) {
700  for (i = 0; i < 8; i++) {
701  if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
702  s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
703  if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
704  s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
705  if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
706  s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
707  s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
708  }
709  }
710  }
711 
712  // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
713  for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
714  int qyac, qydc, quvac, quvdc, lflvl, sh;
715 
716  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
717  if (s->s.h.segmentation.absolute_vals)
718  qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
719  else
720  qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
721  } else {
722  qyac = s->s.h.yac_qi;
723  }
724  qydc = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
725  quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
726  quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
727  qyac = av_clip_uintp2(qyac, 8);
728 
729  s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
730  s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
731  s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
732  s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
733 
734  sh = s->s.h.filter.level >= 32;
735  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
736  if (s->s.h.segmentation.absolute_vals)
737  lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
738  else
739  lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
740  } else {
741  lflvl = s->s.h.filter.level;
742  }
743  if (s->s.h.lf_delta.enabled) {
744  s->s.h.segmentation.feat[i].lflvl[0][0] =
745  s->s.h.segmentation.feat[i].lflvl[0][1] =
746  av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] << sh), 6);
747  for (j = 1; j < 4; j++) {
748  s->s.h.segmentation.feat[i].lflvl[j][0] =
749  av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
750  s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
751  s->s.h.segmentation.feat[i].lflvl[j][1] =
752  av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
753  s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
754  }
755  } else {
756  memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
757  sizeof(s->s.h.segmentation.feat[i].lflvl));
758  }
759  }
760 
761  /* tiling info */
762  if ((res = update_size(ctx, w, h)) < 0) {
763  av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
764  w, h, s->pix_fmt);
765  return res;
766  }
767  for (s->s.h.tiling.log2_tile_cols = 0;
768  s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
769  s->s.h.tiling.log2_tile_cols++) ;
770  for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
771  max = FFMAX(0, max - 1);
772  while (max > s->s.h.tiling.log2_tile_cols) {
773  if (get_bits1(&s->gb))
774  s->s.h.tiling.log2_tile_cols++;
775  else
776  break;
777  }
778  s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
779  s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
780  if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
781  s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
782  s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
783  sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
784  if (!s->c_b) {
785  av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
786  return AVERROR(ENOMEM);
787  }
788  }
789 
790  /* check reference frames */
791  if (!s->s.h.keyframe && !s->s.h.intraonly) {
792  for (i = 0; i < 3; i++) {
793  AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
794  int refw = ref->width, refh = ref->height;
795 
796  if (ref->format != ctx->pix_fmt) {
797  av_log(ctx, AV_LOG_ERROR,
798  "Ref pixfmt (%s) did not match current frame (%s)",
801  return AVERROR_INVALIDDATA;
802  } else if (refw == w && refh == h) {
803  s->mvscale[i][0] = s->mvscale[i][1] = 0;
804  } else {
805  if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
806  av_log(ctx, AV_LOG_ERROR,
807  "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
808  refw, refh, w, h);
809  return AVERROR_INVALIDDATA;
810  }
811  s->mvscale[i][0] = (refw << 14) / w;
812  s->mvscale[i][1] = (refh << 14) / h;
813  s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
814  s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
815  }
816  }
817  }
818 
819  if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
820  s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
821  s->prob_ctx[3].p = vp9_default_probs;
822  memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
823  sizeof(vp9_default_coef_probs));
824  memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
825  sizeof(vp9_default_coef_probs));
826  memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
827  sizeof(vp9_default_coef_probs));
828  memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
829  sizeof(vp9_default_coef_probs));
830  } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
832  memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
833  sizeof(vp9_default_coef_probs));
834  }
835 
 836  // the next 16 bits give the size of the rest of the header (arith-coded)
837  s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
838  s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;
839 
840  data2 = align_get_bits(&s->gb);
841  if (size2 > size - (data2 - data)) {
842  av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
843  return AVERROR_INVALIDDATA;
844  }
845  ff_vp56_init_range_decoder(&s->c, data2, size2);
846  if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
847  av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
848  return AVERROR_INVALIDDATA;
849  }
850 
851  if (s->s.h.keyframe || s->s.h.intraonly) {
852  memset(s->counts.coef, 0, sizeof(s->counts.coef));
853  memset(s->counts.eob, 0, sizeof(s->counts.eob));
854  } else {
855  memset(&s->counts, 0, sizeof(s->counts));
856  }
857  // FIXME is it faster to not copy here, but do it down in the fw updates
858  // as explicit copies if the fw update is missing (and skip the copy upon
859  // fw update)?
860  s->prob.p = s->prob_ctx[c].p;
861 
862  // txfm updates
863  if (s->s.h.lossless) {
864  s->s.h.txfmmode = TX_4X4;
865  } else {
866  s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
867  if (s->s.h.txfmmode == 3)
868  s->s.h.txfmmode += vp8_rac_get(&s->c);
869 
870  if (s->s.h.txfmmode == TX_SWITCHABLE) {
871  for (i = 0; i < 2; i++)
872  if (vp56_rac_get_prob_branchy(&s->c, 252))
873  s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
874  for (i = 0; i < 2; i++)
875  for (j = 0; j < 2; j++)
876  if (vp56_rac_get_prob_branchy(&s->c, 252))
877  s->prob.p.tx16p[i][j] =
878  update_prob(&s->c, s->prob.p.tx16p[i][j]);
879  for (i = 0; i < 2; i++)
880  for (j = 0; j < 3; j++)
881  if (vp56_rac_get_prob_branchy(&s->c, 252))
882  s->prob.p.tx32p[i][j] =
883  update_prob(&s->c, s->prob.p.tx32p[i][j]);
884  }
885  }
886 
887  // coef updates
888  for (i = 0; i < 4; i++) {
889  uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
890  if (vp8_rac_get(&s->c)) {
891  for (j = 0; j < 2; j++)
892  for (k = 0; k < 2; k++)
893  for (l = 0; l < 6; l++)
894  for (m = 0; m < 6; m++) {
895  uint8_t *p = s->prob.coef[i][j][k][l][m];
896  uint8_t *r = ref[j][k][l][m];
897  if (m >= 3 && l == 0) // dc only has 3 pt
898  break;
899  for (n = 0; n < 3; n++) {
900  if (vp56_rac_get_prob_branchy(&s->c, 252)) {
901  p[n] = update_prob(&s->c, r[n]);
902  } else {
903  p[n] = r[n];
904  }
905  }
906  p[3] = 0;
907  }
908  } else {
909  for (j = 0; j < 2; j++)
910  for (k = 0; k < 2; k++)
911  for (l = 0; l < 6; l++)
912  for (m = 0; m < 6; m++) {
913  uint8_t *p = s->prob.coef[i][j][k][l][m];
914  uint8_t *r = ref[j][k][l][m];
915  if (m > 3 && l == 0) // dc only has 3 pt
916  break;
917  memcpy(p, r, 3);
918  p[3] = 0;
919  }
920  }
921  if (s->s.h.txfmmode == i)
922  break;
923  }
924 
925  // mode updates
926  for (i = 0; i < 3; i++)
927  if (vp56_rac_get_prob_branchy(&s->c, 252))
928  s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
929  if (!s->s.h.keyframe && !s->s.h.intraonly) {
930  for (i = 0; i < 7; i++)
931  for (j = 0; j < 3; j++)
932  if (vp56_rac_get_prob_branchy(&s->c, 252))
933  s->prob.p.mv_mode[i][j] =
934  update_prob(&s->c, s->prob.p.mv_mode[i][j]);
935 
936  if (s->s.h.filtermode == FILTER_SWITCHABLE)
937  for (i = 0; i < 4; i++)
938  for (j = 0; j < 2; j++)
939  if (vp56_rac_get_prob_branchy(&s->c, 252))
940  s->prob.p.filter[i][j] =
941  update_prob(&s->c, s->prob.p.filter[i][j]);
942 
943  for (i = 0; i < 4; i++)
944  if (vp56_rac_get_prob_branchy(&s->c, 252))
945  s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
946 
947  if (s->s.h.allowcompinter) {
948  s->s.h.comppredmode = vp8_rac_get(&s->c);
949  if (s->s.h.comppredmode)
950  s->s.h.comppredmode += vp8_rac_get(&s->c);
951  if (s->s.h.comppredmode == PRED_SWITCHABLE)
952  for (i = 0; i < 5; i++)
953  if (vp56_rac_get_prob_branchy(&s->c, 252))
954  s->prob.p.comp[i] =
955  update_prob(&s->c, s->prob.p.comp[i]);
956  } else {
958  }
959 
960  if (s->s.h.comppredmode != PRED_COMPREF) {
961  for (i = 0; i < 5; i++) {
962  if (vp56_rac_get_prob_branchy(&s->c, 252))
963  s->prob.p.single_ref[i][0] =
964  update_prob(&s->c, s->prob.p.single_ref[i][0]);
965  if (vp56_rac_get_prob_branchy(&s->c, 252))
966  s->prob.p.single_ref[i][1] =
967  update_prob(&s->c, s->prob.p.single_ref[i][1]);
968  }
969  }
970 
971  if (s->s.h.comppredmode != PRED_SINGLEREF) {
972  for (i = 0; i < 5; i++)
973  if (vp56_rac_get_prob_branchy(&s->c, 252))
974  s->prob.p.comp_ref[i] =
975  update_prob(&s->c, s->prob.p.comp_ref[i]);
976  }
977 
978  for (i = 0; i < 4; i++)
979  for (j = 0; j < 9; j++)
980  if (vp56_rac_get_prob_branchy(&s->c, 252))
981  s->prob.p.y_mode[i][j] =
982  update_prob(&s->c, s->prob.p.y_mode[i][j]);
983 
984  for (i = 0; i < 4; i++)
985  for (j = 0; j < 4; j++)
986  for (k = 0; k < 3; k++)
987  if (vp56_rac_get_prob_branchy(&s->c, 252))
988  s->prob.p.partition[3 - i][j][k] =
989  update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
990 
991  // mv fields don't use the update_prob subexp model for some reason
992  for (i = 0; i < 3; i++)
993  if (vp56_rac_get_prob_branchy(&s->c, 252))
994  s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
995 
996  for (i = 0; i < 2; i++) {
997  if (vp56_rac_get_prob_branchy(&s->c, 252))
998  s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
999 
1000  for (j = 0; j < 10; j++)
1001  if (vp56_rac_get_prob_branchy(&s->c, 252))
1002  s->prob.p.mv_comp[i].classes[j] =
1003  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1004 
1005  if (vp56_rac_get_prob_branchy(&s->c, 252))
1006  s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1007 
1008  for (j = 0; j < 10; j++)
1009  if (vp56_rac_get_prob_branchy(&s->c, 252))
1010  s->prob.p.mv_comp[i].bits[j] =
1011  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1012  }
1013 
1014  for (i = 0; i < 2; i++) {
1015  for (j = 0; j < 2; j++)
1016  for (k = 0; k < 3; k++)
1017  if (vp56_rac_get_prob_branchy(&s->c, 252))
1018  s->prob.p.mv_comp[i].class0_fp[j][k] =
1019  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1020 
1021  for (j = 0; j < 3; j++)
1022  if (vp56_rac_get_prob_branchy(&s->c, 252))
1023  s->prob.p.mv_comp[i].fp[j] =
1024  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1025  }
1026 
1027  if (s->s.h.highprecisionmvs) {
1028  for (i = 0; i < 2; i++) {
1029  if (vp56_rac_get_prob_branchy(&s->c, 252))
1030  s->prob.p.mv_comp[i].class0_hp =
1031  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1032 
1033  if (vp56_rac_get_prob_branchy(&s->c, 252))
1034  s->prob.p.mv_comp[i].hp =
1035  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1036  }
1037  }
1038  }
1039 
1040  return (data2 - data) + size2;
1041 }
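/* Worked example for the per-segment quantizer setup in the header parser
 * above (illustrative numbers, not from the original source): with
 * yac_qi = 100, ydc_qdelta = -4, both UV deltas 0, and a segment whose q
 * feature is enabled with a relative q_val of +20:
 *
 *   qyac  = av_clip_uintp2(100 + 20, 8) = 120
 *   qydc  = av_clip_uintp2(120 - 4, 8)  = 116
 *   quvdc = quvac = 120
 *
 * and qmul[0][0]/qmul[0][1] become the DC/AC dequant factors looked up at
 * those indices in vp9_dc_qlookup/vp9_ac_qlookup for the current bit depth
 * (bpp_index). */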
1042 
1043 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1044  VP9Context *s)
1045 {
1046  dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1047  dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
1048 }
1049 
 1050 static void find_ref_mvs(VP9Context *s,
 1051  VP56mv *pmv, int ref, int z, int idx, int sb)
1052 {
1053  static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1054  [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
1055  { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
1056  [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
1057  { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
1058  [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
1059  { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
1060  [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
1061  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1062  [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
1063  { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1064  [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
1065  { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
1066  [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
1067  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1068  [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
1069  { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
1070  [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
1071  { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
1072  [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1073  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1074  [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1075  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1076  [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1077  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1078  [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1079  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1080  };
1081  VP9Block *b = s->b;
1082  int row = s->row, col = s->col, row7 = s->row7;
1083  const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
1084 #define INVALID_MV 0x80008000U
1085  uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
1086  int i;
1087 
1088 #define RETURN_DIRECT_MV(mv) \
1089  do { \
1090  uint32_t m = AV_RN32A(&mv); \
1091  if (!idx) { \
1092  AV_WN32A(pmv, m); \
1093  return; \
1094  } else if (mem == INVALID_MV) { \
1095  mem = m; \
1096  } else if (m != mem) { \
1097  AV_WN32A(pmv, m); \
1098  return; \
1099  } \
1100  } while (0)
1101 
1102  if (sb >= 0) {
1103  if (sb == 2 || sb == 1) {
1104  RETURN_DIRECT_MV(b->mv[0][z]);
1105  } else if (sb == 3) {
1106  RETURN_DIRECT_MV(b->mv[2][z]);
1107  RETURN_DIRECT_MV(b->mv[1][z]);
1108  RETURN_DIRECT_MV(b->mv[0][z]);
1109  }
1110 
1111 #define RETURN_MV(mv) \
1112  do { \
1113  if (sb > 0) { \
1114  VP56mv tmp; \
1115  uint32_t m; \
1116  av_assert2(idx == 1); \
1117  av_assert2(mem != INVALID_MV); \
1118  if (mem_sub8x8 == INVALID_MV) { \
1119  clamp_mv(&tmp, &mv, s); \
1120  m = AV_RN32A(&tmp); \
1121  if (m != mem) { \
1122  AV_WN32A(pmv, m); \
1123  return; \
1124  } \
1125  mem_sub8x8 = AV_RN32A(&mv); \
1126  } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1127  clamp_mv(&tmp, &mv, s); \
1128  m = AV_RN32A(&tmp); \
1129  if (m != mem) { \
1130  AV_WN32A(pmv, m); \
1131  } else { \
1132  /* BUG I'm pretty sure this isn't the intention */ \
1133  AV_WN32A(pmv, 0); \
1134  } \
1135  return; \
1136  } \
1137  } else { \
1138  uint32_t m = AV_RN32A(&mv); \
1139  if (!idx) { \
1140  clamp_mv(pmv, &mv, s); \
1141  return; \
1142  } else if (mem == INVALID_MV) { \
1143  mem = m; \
1144  } else if (m != mem) { \
1145  clamp_mv(pmv, &mv, s); \
1146  return; \
1147  } \
1148  } \
1149  } while (0)
1150 
1151  if (row > 0) {
1152  struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1153  if (mv->ref[0] == ref) {
1154  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1155  } else if (mv->ref[1] == ref) {
1156  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1157  }
1158  }
1159  if (col > s->tile_col_start) {
1160  struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1161  if (mv->ref[0] == ref) {
1162  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1163  } else if (mv->ref[1] == ref) {
1164  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1165  }
1166  }
1167  i = 2;
1168  } else {
1169  i = 0;
1170  }
1171 
1172  // previously coded MVs in this neighbourhood, using same reference frame
1173  for (; i < 8; i++) {
1174  int c = p[i][0] + col, r = p[i][1] + row;
1175 
1176  if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1177  struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1178 
1179  if (mv->ref[0] == ref) {
1180  RETURN_MV(mv->mv[0]);
1181  } else if (mv->ref[1] == ref) {
1182  RETURN_MV(mv->mv[1]);
1183  }
1184  }
1185  }
1186 
1187  // MV at this position in previous frame, using same reference frame
1188  if (s->s.h.use_last_frame_mvs) {
1189  struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1190 
1193  if (mv->ref[0] == ref) {
1194  RETURN_MV(mv->mv[0]);
1195  } else if (mv->ref[1] == ref) {
1196  RETURN_MV(mv->mv[1]);
1197  }
1198  }
1199 
1200 #define RETURN_SCALE_MV(mv, scale) \
1201  do { \
1202  if (scale) { \
1203  VP56mv mv_temp = { -mv.x, -mv.y }; \
1204  RETURN_MV(mv_temp); \
1205  } else { \
1206  RETURN_MV(mv); \
1207  } \
1208  } while (0)
1209 
1210  // previously coded MVs in this neighbourhood, using different reference frame
1211  for (i = 0; i < 8; i++) {
1212  int c = p[i][0] + col, r = p[i][1] + row;
1213 
1214  if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1215  struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1216 
1217  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1218  RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1219  }
1220  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1221  // BUG - libvpx has this condition regardless of whether
1222  // we used the first ref MV and pre-scaling
1223  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1224  RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1225  }
1226  }
1227  }
1228 
1229  // MV at this position in previous frame, using different reference frame
1230  if (s->s.h.use_last_frame_mvs) {
1231  struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1232 
1233  // no need to await_progress, because we already did that above
1234  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1235  RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1236  }
1237  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1238  // BUG - libvpx has this condition regardless of whether
1239  // we used the first ref MV and pre-scaling
1240  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1241  RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1242  }
1243  }
1244 
1245  AV_ZERO32(pmv);
1246  clamp_mv(pmv, pmv, s);
1247 #undef INVALID_MV
1248 #undef RETURN_MV
1249 #undef RETURN_SCALE_MV
1250 }
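/* Candidate search order implemented above, summarized for reference
 * (derived from the code and its comments, not part of the original
 * source):
 *   1. for sub-8x8 blocks (sb >= 0), MVs of already-decoded sub-blocks of
 *      the same block;
 *   2. up to 8 spatial neighbours from mv_ref_blk_off[] that used the same
 *      reference frame;
 *   3. the co-located MV from the previous frame (if use_last_frame_mvs);
 *   4. the same spatial neighbours and co-located position again, now
 *      accepting a different reference frame, with the MV sign-flipped when
 *      the two references have opposite sign bias (RETURN_SCALE_MV);
 *   5. otherwise a zero MV, clamped to the allowed range. */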
1251 
1252 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1253 {
1254  int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1255  int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1256  s->prob.p.mv_comp[idx].classes);
1257 
1258  s->counts.mv_comp[idx].sign[sign]++;
1259  s->counts.mv_comp[idx].classes[c]++;
1260  if (c) {
1261  int m;
1262 
1263  for (n = 0, m = 0; m < c; m++) {
1264  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1265  n |= bit << m;
1266  s->counts.mv_comp[idx].bits[m][bit]++;
1267  }
1268  n <<= 3;
1269  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1270  n |= bit << 1;
1271  s->counts.mv_comp[idx].fp[bit]++;
1272  if (hp) {
1273  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1274  s->counts.mv_comp[idx].hp[bit]++;
1275  n |= bit;
1276  } else {
1277  n |= 1;
1278  // bug in libvpx - we count for bw entropy purposes even if the
1279  // bit wasn't coded
1280  s->counts.mv_comp[idx].hp[1]++;
1281  }
1282  n += 8 << c;
1283  } else {
1284  n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1285  s->counts.mv_comp[idx].class0[n]++;
1286  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1287  s->prob.p.mv_comp[idx].class0_fp[n]);
1288  s->counts.mv_comp[idx].class0_fp[n][bit]++;
1289  n = (n << 3) | (bit << 1);
1290  if (hp) {
1291  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1292  s->counts.mv_comp[idx].class0_hp[bit]++;
1293  n |= bit;
1294  } else {
1295  n |= 1;
1296  // bug in libvpx - we count for bw entropy purposes even if the
1297  // bit wasn't coded
1298  s->counts.mv_comp[idx].class0_hp[1]++;
1299  }
1300  }
1301 
1302  return sign ? -(n + 1) : (n + 1);
1303 }
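/* Worked example of the magnitude composition above (illustrative, not from
 * the original source), for a component with class c = 2, integer bits
 * decoding to n = 2, fractional (fp) value 3 and hp bit 1:
 *
 *   n = 2            after the per-class integer bits
 *   n <<= 3          -> 16
 *   n |= 3 << 1      -> 22   (quarter-pel fraction)
 *   n |= 1           -> 23   (eighth-pel / hp bit)
 *   n += 8 << 2      -> 55   (class offset)
 *
 * and the function returns sign ? -(55 + 1) : 55 + 1, i.e. +/-56 in
 * 1/8-pel units. */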
1304 
1305 static void fill_mv(VP9Context *s,
1306  VP56mv *mv, int mode, int sb)
1307 {
1308  VP9Block *b = s->b;
1309 
1310  if (mode == ZEROMV) {
1311  AV_ZERO64(mv);
1312  } else {
1313  int hp;
1314 
1315  // FIXME cache this value and reuse for other subblocks
1316  find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1317  mode == NEWMV ? -1 : sb);
1318  // FIXME maybe move this code into find_ref_mvs()
1319  if ((mode == NEWMV || sb == -1) &&
1320  !(hp = s->s.h.highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1321  if (mv[0].y & 1) {
1322  if (mv[0].y < 0)
1323  mv[0].y++;
1324  else
1325  mv[0].y--;
1326  }
1327  if (mv[0].x & 1) {
1328  if (mv[0].x < 0)
1329  mv[0].x++;
1330  else
1331  mv[0].x--;
1332  }
1333  }
1334  if (mode == NEWMV) {
1336  s->prob.p.mv_joint);
1337 
1338  s->counts.mv_joint[j]++;
1339  if (j >= MV_JOINT_V)
1340  mv[0].y += read_mv_component(s, 0, hp);
1341  if (j & 1)
1342  mv[0].x += read_mv_component(s, 1, hp);
1343  }
1344 
1345  if (b->comp) {
1346  // FIXME cache this value and reuse for other subblocks
1347  find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1348  mode == NEWMV ? -1 : sb);
1349  if ((mode == NEWMV || sb == -1) &&
1350  !(hp = s->s.h.highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1351  if (mv[1].y & 1) {
1352  if (mv[1].y < 0)
1353  mv[1].y++;
1354  else
1355  mv[1].y--;
1356  }
1357  if (mv[1].x & 1) {
1358  if (mv[1].x < 0)
1359  mv[1].x++;
1360  else
1361  mv[1].x--;
1362  }
1363  }
1364  if (mode == NEWMV) {
1366  s->prob.p.mv_joint);
1367 
1368  s->counts.mv_joint[j]++;
1369  if (j >= MV_JOINT_V)
1370  mv[1].y += read_mv_component(s, 0, hp);
1371  if (j & 1)
1372  mv[1].x += read_mv_component(s, 1, hp);
1373  }
1374  }
1375  }
1376 }
1377 
1378 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1379  ptrdiff_t stride, int v)
1380 {
1381  switch (w) {
1382  case 1:
1383  do {
1384  *ptr = v;
1385  ptr += stride;
1386  } while (--h);
1387  break;
1388  case 2: {
1389  int v16 = v * 0x0101;
1390  do {
1391  AV_WN16A(ptr, v16);
1392  ptr += stride;
1393  } while (--h);
1394  break;
1395  }
1396  case 4: {
1397  uint32_t v32 = v * 0x01010101;
1398  do {
1399  AV_WN32A(ptr, v32);
1400  ptr += stride;
1401  } while (--h);
1402  break;
1403  }
1404  case 8: {
1405 #if HAVE_FAST_64BIT
1406  uint64_t v64 = v * 0x0101010101010101ULL;
1407  do {
1408  AV_WN64A(ptr, v64);
1409  ptr += stride;
1410  } while (--h);
1411 #else
1412  uint32_t v32 = v * 0x01010101;
1413  do {
1414  AV_WN32A(ptr, v32);
1415  AV_WN32A(ptr + 4, v32);
1416  ptr += stride;
1417  } while (--h);
1418 #endif
1419  break;
1420  }
1421  }
1422 }
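/* Usage sketch: setctx_2d() splats a single context value over a w x h
 * rectangle of a per-frame context plane, using aligned 16/32/64-bit stores
 * for the common widths. decode_mode() below uses it, for example, to write
 * the block's segment id into the frame-wide segmentation map:
 *
 *   setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
 *             bw4, bh4, 8 * s->sb_cols, b->seg_id);
 */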
1423 
 1424 static void decode_mode(AVCodecContext *ctx)
 1425 {
1426  static const uint8_t left_ctx[N_BS_SIZES] = {
1427  0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1428  };
1429  static const uint8_t above_ctx[N_BS_SIZES] = {
1430  0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1431  };
1432  static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1434  TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1435  };
1436  VP9Context *s = ctx->priv_data;
1437  VP9Block *b = s->b;
1438  int row = s->row, col = s->col, row7 = s->row7;
1439  enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1440  int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1441  int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1442  int have_a = row > 0, have_l = col > s->tile_col_start;
1443  int vref, filter_id;
1444 
1445  if (!s->s.h.segmentation.enabled) {
1446  b->seg_id = 0;
1447  } else if (s->s.h.keyframe || s->s.h.intraonly) {
1448  b->seg_id = !s->s.h.segmentation.update_map ? 0 :
1450  } else if (!s->s.h.segmentation.update_map ||
1451  (s->s.h.segmentation.temporal &&
1454  s->left_segpred_ctx[row7]]))) {
1456  int pred = 8, x;
1458 
1461  for (y = 0; y < h4; y++) {
1462  int idx_base = (y + row) * 8 * s->sb_cols + col;
1463  for (x = 0; x < w4; x++)
1464  pred = FFMIN(pred, refsegmap[idx_base + x]);
1465  }
1466  av_assert1(pred < 8);
1467  b->seg_id = pred;
1468  } else {
1469  b->seg_id = 0;
1470  }
1471 
1472  memset(&s->above_segpred_ctx[col], 1, w4);
1473  memset(&s->left_segpred_ctx[row7], 1, h4);
1474  } else {
1476  s->s.h.segmentation.prob);
1477 
1478  memset(&s->above_segpred_ctx[col], 0, w4);
1479  memset(&s->left_segpred_ctx[row7], 0, h4);
1480  }
1481  if (s->s.h.segmentation.enabled &&
1482  (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
1483  setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1484  bw4, bh4, 8 * s->sb_cols, b->seg_id);
1485  }
1486 
1487  b->skip = s->s.h.segmentation.enabled &&
1488  s->s.h.segmentation.feat[b->seg_id].skip_enabled;
1489  if (!b->skip) {
1490  int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1491  b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1492  s->counts.skip[c][b->skip]++;
1493  }
1494 
1495  if (s->s.h.keyframe || s->s.h.intraonly) {
1496  b->intra = 1;
1497  } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1498  b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
1499  } else {
1500  int c, bit;
1501 
1502  if (have_a && have_l) {
1503  c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1504  c += (c == 2);
1505  } else {
1506  c = have_a ? 2 * s->above_intra_ctx[col] :
1507  have_l ? 2 * s->left_intra_ctx[row7] : 0;
1508  }
1509  bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1510  s->counts.intra[c][bit]++;
1511  b->intra = !bit;
1512  }
1513 
1514  if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
1515  int c;
1516  if (have_a) {
1517  if (have_l) {
1518  c = (s->above_skip_ctx[col] ? max_tx :
1519  s->above_txfm_ctx[col]) +
1520  (s->left_skip_ctx[row7] ? max_tx :
1521  s->left_txfm_ctx[row7]) > max_tx;
1522  } else {
1523  c = s->above_skip_ctx[col] ? 1 :
1524  (s->above_txfm_ctx[col] * 2 > max_tx);
1525  }
1526  } else if (have_l) {
1527  c = s->left_skip_ctx[row7] ? 1 :
1528  (s->left_txfm_ctx[row7] * 2 > max_tx);
1529  } else {
1530  c = 1;
1531  }
1532  switch (max_tx) {
1533  case TX_32X32:
1534  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1535  if (b->tx) {
1536  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1537  if (b->tx == 2)
1538  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1539  }
1540  s->counts.tx32p[c][b->tx]++;
1541  break;
1542  case TX_16X16:
1543  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1544  if (b->tx)
1545  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1546  s->counts.tx16p[c][b->tx]++;
1547  break;
1548  case TX_8X8:
1549  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1550  s->counts.tx8p[c][b->tx]++;
1551  break;
1552  case TX_4X4:
1553  b->tx = TX_4X4;
1554  break;
1555  }
1556  } else {
1557  b->tx = FFMIN(max_tx, s->s.h.txfmmode);
1558  }
1559 
1560  if (s->s.h.keyframe || s->s.h.intraonly) {
1561  uint8_t *a = &s->above_mode_ctx[col * 2];
1562  uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1563 
1564  b->comp = 0;
1565  if (b->bs > BS_8x8) {
1566  // FIXME the memory storage intermediates here aren't really
1567  // necessary, they're just there to make the code slightly
1568  // simpler for now
1569  b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1570  vp9_default_kf_ymode_probs[a[0]][l[0]]);
1571  if (b->bs != BS_8x4) {
1573  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1574  l[0] = a[1] = b->mode[1];
1575  } else {
1576  l[0] = a[1] = b->mode[1] = b->mode[0];
1577  }
1578  if (b->bs != BS_4x8) {
1579  b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1580  vp9_default_kf_ymode_probs[a[0]][l[1]]);
1581  if (b->bs != BS_8x4) {
1583  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1584  l[1] = a[1] = b->mode[3];
1585  } else {
1586  l[1] = a[1] = b->mode[3] = b->mode[2];
1587  }
1588  } else {
1589  b->mode[2] = b->mode[0];
1590  l[1] = a[1] = b->mode[3] = b->mode[1];
1591  }
1592  } else {
1594  vp9_default_kf_ymode_probs[*a][*l]);
1595  b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1596  // FIXME this can probably be optimized
1597  memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1598  memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1599  }
1602  } else if (b->intra) {
1603  b->comp = 0;
1604  if (b->bs > BS_8x8) {
1606  s->prob.p.y_mode[0]);
1607  s->counts.y_mode[0][b->mode[0]]++;
1608  if (b->bs != BS_8x4) {
1610  s->prob.p.y_mode[0]);
1611  s->counts.y_mode[0][b->mode[1]]++;
1612  } else {
1613  b->mode[1] = b->mode[0];
1614  }
1615  if (b->bs != BS_4x8) {
1617  s->prob.p.y_mode[0]);
1618  s->counts.y_mode[0][b->mode[2]]++;
1619  if (b->bs != BS_8x4) {
1621  s->prob.p.y_mode[0]);
1622  s->counts.y_mode[0][b->mode[3]]++;
1623  } else {
1624  b->mode[3] = b->mode[2];
1625  }
1626  } else {
1627  b->mode[2] = b->mode[0];
1628  b->mode[3] = b->mode[1];
1629  }
1630  } else {
1631  static const uint8_t size_group[10] = {
1632  3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1633  };
1634  int sz = size_group[b->bs];
1635 
1637  s->prob.p.y_mode[sz]);
1638  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1639  s->counts.y_mode[sz][b->mode[3]]++;
1640  }
1642  s->prob.p.uv_mode[b->mode[3]]);
1643  s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1644  } else {
1645  static const uint8_t inter_mode_ctx_lut[14][14] = {
1646  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1647  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1648  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1649  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1650  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1651  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1652  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1653  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1654  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1655  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1656  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1657  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1658  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1659  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1660  };
1661 
1662  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1663  av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
1664  b->comp = 0;
1665  b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
1666  } else {
1667  // read comp_pred flag
1668  if (s->s.h.comppredmode != PRED_SWITCHABLE) {
1669  b->comp = s->s.h.comppredmode == PRED_COMPREF;
1670  } else {
1671  int c;
1672 
1673  // FIXME add intra as ref=0xff (or -1) to make these easier?
1674  if (have_a) {
1675  if (have_l) {
1676  if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1677  c = 4;
1678  } else if (s->above_comp_ctx[col]) {
1679  c = 2 + (s->left_intra_ctx[row7] ||
1680  s->left_ref_ctx[row7] == s->s.h.fixcompref);
1681  } else if (s->left_comp_ctx[row7]) {
1682  c = 2 + (s->above_intra_ctx[col] ||
1683  s->above_ref_ctx[col] == s->s.h.fixcompref);
1684  } else {
1685  c = (!s->above_intra_ctx[col] &&
1686  s->above_ref_ctx[col] == s->s.h.fixcompref) ^
1687  (!s->left_intra_ctx[row7] &&
1688  s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
1689  }
1690  } else {
1691  c = s->above_comp_ctx[col] ? 3 :
1692  (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
1693  }
1694  } else if (have_l) {
1695  c = s->left_comp_ctx[row7] ? 3 :
1696  (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
1697  } else {
1698  c = 1;
1699  }
1700  b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1701  s->counts.comp[c][b->comp]++;
1702  }
1703 
1704  // read actual references
1705  // FIXME probably cache a few variables here to prevent repetitive
1706  // memory accesses below
1707  if (b->comp) /* two references */ {
1708  int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
1709 
1710  b->ref[fix_idx] = s->s.h.fixcompref;
1711  // FIXME can this codeblob be replaced by some sort of LUT?
1712  if (have_a) {
1713  if (have_l) {
1714  if (s->above_intra_ctx[col]) {
1715  if (s->left_intra_ctx[row7]) {
1716  c = 2;
1717  } else {
1718  c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1719  }
1720  } else if (s->left_intra_ctx[row7]) {
1721  c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1722  } else {
1723  int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1724 
1725  if (refl == refa && refa == s->s.h.varcompref[1]) {
1726  c = 0;
1727  } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1728  if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
1729  (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
1730  c = 4;
1731  } else {
1732  c = (refa == refl) ? 3 : 1;
1733  }
1734  } else if (!s->left_comp_ctx[row7]) {
1735  if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
1736  c = 1;
1737  } else {
1738  c = (refl == s->s.h.varcompref[1] &&
1739  refa != s->s.h.varcompref[1]) ? 2 : 4;
1740  }
1741  } else if (!s->above_comp_ctx[col]) {
1742  if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
1743  c = 1;
1744  } else {
1745  c = (refa == s->s.h.varcompref[1] &&
1746  refl != s->s.h.varcompref[1]) ? 2 : 4;
1747  }
1748  } else {
1749  c = (refl == refa) ? 4 : 2;
1750  }
1751  }
1752  } else {
1753  if (s->above_intra_ctx[col]) {
1754  c = 2;
1755  } else if (s->above_comp_ctx[col]) {
1756  c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1757  } else {
1758  c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1759  }
1760  }
1761  } else if (have_l) {
1762  if (s->left_intra_ctx[row7]) {
1763  c = 2;
1764  } else if (s->left_comp_ctx[row7]) {
1765  c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1766  } else {
1767  c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1768  }
1769  } else {
1770  c = 2;
1771  }
1772  bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1773  b->ref[var_idx] = s->s.h.varcompref[bit];
1774  s->counts.comp_ref[c][bit]++;
1775  } else /* single reference */ {
1776  int bit, c;
1777 
1778  if (have_a && !s->above_intra_ctx[col]) {
1779  if (have_l && !s->left_intra_ctx[row7]) {
1780  if (s->left_comp_ctx[row7]) {
1781  if (s->above_comp_ctx[col]) {
1782  c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
1783  !s->above_ref_ctx[col]);
1784  } else {
1785  c = (3 * !s->above_ref_ctx[col]) +
1786  (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1787  }
1788  } else if (s->above_comp_ctx[col]) {
1789  c = (3 * !s->left_ref_ctx[row7]) +
1790  (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1791  } else {
1792  c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1793  }
1794  } else if (s->above_intra_ctx[col]) {
1795  c = 2;
1796  } else if (s->above_comp_ctx[col]) {
1797  c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1798  } else {
1799  c = 4 * (!s->above_ref_ctx[col]);
1800  }
1801  } else if (have_l && !s->left_intra_ctx[row7]) {
1802  if (s->left_intra_ctx[row7]) {
1803  c = 2;
1804  } else if (s->left_comp_ctx[row7]) {
1805  c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1806  } else {
1807  c = 4 * (!s->left_ref_ctx[row7]);
1808  }
1809  } else {
1810  c = 2;
1811  }
1812  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1813  s->counts.single_ref[c][0][bit]++;
1814  if (!bit) {
1815  b->ref[0] = 0;
1816  } else {
1817  // FIXME can this codeblob be replaced by some sort of LUT?
1818  if (have_a) {
1819  if (have_l) {
1820  if (s->left_intra_ctx[row7]) {
1821  if (s->above_intra_ctx[col]) {
1822  c = 2;
1823  } else if (s->above_comp_ctx[col]) {
1824  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1825  s->above_ref_ctx[col] == 1);
1826  } else if (!s->above_ref_ctx[col]) {
1827  c = 3;
1828  } else {
1829  c = 4 * (s->above_ref_ctx[col] == 1);
1830  }
1831  } else if (s->above_intra_ctx[col]) {
1832  if (s->left_intra_ctx[row7]) {
1833  c = 2;
1834  } else if (s->left_comp_ctx[row7]) {
1835  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1836  s->left_ref_ctx[row7] == 1);
1837  } else if (!s->left_ref_ctx[row7]) {
1838  c = 3;
1839  } else {
1840  c = 4 * (s->left_ref_ctx[row7] == 1);
1841  }
1842  } else if (s->above_comp_ctx[col]) {
1843  if (s->left_comp_ctx[row7]) {
1844  if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1845  c = 3 * (s->s.h.fixcompref == 1 ||
1846  s->left_ref_ctx[row7] == 1);
1847  } else {
1848  c = 2;
1849  }
1850  } else if (!s->left_ref_ctx[row7]) {
1851  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1852  s->above_ref_ctx[col] == 1);
1853  } else {
1854  c = 3 * (s->left_ref_ctx[row7] == 1) +
1855  (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1856  }
1857  } else if (s->left_comp_ctx[row7]) {
1858  if (!s->above_ref_ctx[col]) {
1859  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1860  s->left_ref_ctx[row7] == 1);
1861  } else {
1862  c = 3 * (s->above_ref_ctx[col] == 1) +
1863  (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1864  }
1865  } else if (!s->above_ref_ctx[col]) {
1866  if (!s->left_ref_ctx[row7]) {
1867  c = 3;
1868  } else {
1869  c = 4 * (s->left_ref_ctx[row7] == 1);
1870  }
1871  } else if (!s->left_ref_ctx[row7]) {
1872  c = 4 * (s->above_ref_ctx[col] == 1);
1873  } else {
1874  c = 2 * (s->left_ref_ctx[row7] == 1) +
1875  2 * (s->above_ref_ctx[col] == 1);
1876  }
1877  } else {
1878  if (s->above_intra_ctx[col] ||
1879  (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1880  c = 2;
1881  } else if (s->above_comp_ctx[col]) {
1882  c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1883  } else {
1884  c = 4 * (s->above_ref_ctx[col] == 1);
1885  }
1886  }
1887  } else if (have_l) {
1888  if (s->left_intra_ctx[row7] ||
1889  (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1890  c = 2;
1891  } else if (s->left_comp_ctx[row7]) {
1892  c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1893  } else {
1894  c = 4 * (s->left_ref_ctx[row7] == 1);
1895  }
1896  } else {
1897  c = 2;
1898  }
1899  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1900  s->counts.single_ref[c][1][bit]++;
1901  b->ref[0] = 1 + bit;
1902  }
1903  }
1904  }
1905 
1906  if (b->bs <= BS_8x8) {
1907  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
1908  b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1909  } else {
1910  static const uint8_t off[10] = {
1911  3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1912  };
1913 
1914  // FIXME this needs to use the LUT tables from find_ref_mvs
1915  // because not all are -1,0/0,-1
1916  int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1917  [s->left_mode_ctx[row7 + off[b->bs]]];
1918 
1919  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1920  s->prob.p.mv_mode[c]);
1921  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1922  s->counts.mv_mode[c][b->mode[0] - 10]++;
1923  }
1924  }
1925 
1926  if (s->s.h.filtermode == FILTER_SWITCHABLE) {
1927  int c;
1928 
1929  if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1930  if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1931  c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1932  s->left_filter_ctx[row7] : 3;
1933  } else {
1934  c = s->above_filter_ctx[col];
1935  }
1936  } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1937  c = s->left_filter_ctx[row7];
1938  } else {
1939  c = 3;
1940  }
1941 
1942  filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1943  s->prob.p.filter[c]);
1944  s->counts.filter[c][filter_id]++;
1945  b->filter = vp9_filter_lut[filter_id];
1946  } else {
1947  b->filter = s->s.h.filtermode;
1948  }
1949 
1950  if (b->bs > BS_8x8) {
1951  int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1952 
1953  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1954  s->prob.p.mv_mode[c]);
1955  s->counts.mv_mode[c][b->mode[0] - 10]++;
1956  fill_mv(s, b->mv[0], b->mode[0], 0);
1957 
1958  if (b->bs != BS_8x4) {
1959  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1960  s->prob.p.mv_mode[c]);
1961  s->counts.mv_mode[c][b->mode[1] - 10]++;
1962  fill_mv(s, b->mv[1], b->mode[1], 1);
1963  } else {
1964  b->mode[1] = b->mode[0];
1965  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1966  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1967  }
1968 
1969  if (b->bs != BS_4x8) {
1970  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1971  s->prob.p.mv_mode[c]);
1972  s->counts.mv_mode[c][b->mode[2] - 10]++;
1973  fill_mv(s, b->mv[2], b->mode[2], 2);
1974 
1975  if (b->bs != BS_8x4) {
1976  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1977  s->prob.p.mv_mode[c]);
1978  s->counts.mv_mode[c][b->mode[3] - 10]++;
1979  fill_mv(s, b->mv[3], b->mode[3], 3);
1980  } else {
1981  b->mode[3] = b->mode[2];
1982  AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1983  AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1984  }
1985  } else {
1986  b->mode[2] = b->mode[0];
1987  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1988  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1989  b->mode[3] = b->mode[1];
1990  AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1991  AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1992  }
1993  } else {
1994  fill_mv(s, b->mv[0], b->mode[0], -1);
1995  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1996  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1997  AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1998  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1999  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2000  AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
2001  }
2002 
2003  vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
2004  }
2005 
2006 #if HAVE_FAST_64BIT
2007 #define SPLAT_CTX(var, val, n) \
2008  switch (n) { \
2009  case 1: var = val; break; \
2010  case 2: AV_WN16A(&var, val * 0x0101); break; \
2011  case 4: AV_WN32A(&var, val * 0x01010101); break; \
2012  case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
2013  case 16: { \
2014  uint64_t v64 = val * 0x0101010101010101ULL; \
2015  AV_WN64A( &var, v64); \
2016  AV_WN64A(&((uint8_t *) &var)[8], v64); \
2017  break; \
2018  } \
2019  }
2020 #else
2021 #define SPLAT_CTX(var, val, n) \
2022  switch (n) { \
2023  case 1: var = val; break; \
2024  case 2: AV_WN16A(&var, val * 0x0101); break; \
2025  case 4: AV_WN32A(&var, val * 0x01010101); break; \
2026  case 8: { \
2027  uint32_t v32 = val * 0x01010101; \
2028  AV_WN32A( &var, v32); \
2029  AV_WN32A(&((uint8_t *) &var)[4], v32); \
2030  break; \
2031  } \
2032  case 16: { \
2033  uint32_t v32 = val * 0x01010101; \
2034  AV_WN32A( &var, v32); \
2035  AV_WN32A(&((uint8_t *) &var)[4], v32); \
2036  AV_WN32A(&((uint8_t *) &var)[8], v32); \
2037  AV_WN32A(&((uint8_t *) &var)[12], v32); \
2038  break; \
2039  } \
2040  }
2041 #endif
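      // SPLAT_CTX broadcasts a single context byte across n consecutive entries:
      // multiplying the byte by 0x0101... replicates it into every lane of a
      // 16/32/64-bit word, so the whole run can be written with one aligned store.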
2042 
2043  switch (bwh_tab[1][b->bs][0]) {
2044 #define SET_CTXS(dir, off, n) \
2045  do { \
2046  SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
2047  SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
2048  SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2049  if (!s->s.h.keyframe && !s->s.h.intraonly) { \
2050  SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
2051  SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
2052  SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
2053  if (!b->intra) { \
2054  SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2055  if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
2056  SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2057  } \
2058  } \
2059  } \
2060  } while (0)
2061  case 1: SET_CTXS(above, col, 1); break;
2062  case 2: SET_CTXS(above, col, 2); break;
2063  case 4: SET_CTXS(above, col, 4); break;
2064  case 8: SET_CTXS(above, col, 8); break;
2065  }
2066  switch (bwh_tab[1][b->bs][1]) {
2067  case 1: SET_CTXS(left, row7, 1); break;
2068  case 2: SET_CTXS(left, row7, 2); break;
2069  case 4: SET_CTXS(left, row7, 4); break;
2070  case 8: SET_CTXS(left, row7, 8); break;
2071  }
2072 #undef SPLAT_CTX
2073 #undef SET_CTXS
2074 
2075  if (!s->s.h.keyframe && !s->s.h.intraonly) {
2076  if (b->bs > BS_8x8) {
2077  int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2078 
2079  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2080  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2081  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2082  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2083  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2084  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2085  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2086  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2087  } else {
2088  int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2089 
2090  for (n = 0; n < w4 * 2; n++) {
2091  AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2092  AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2093  }
2094  for (n = 0; n < h4 * 2; n++) {
2095  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2096  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
2097  }
2098  }
2099  }
2100 
2101  // FIXME kinda ugly
2102  for (y = 0; y < h4; y++) {
2103  int x, o = (row + y) * s->sb_cols * 8 + col;
2104  struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
2105 
2106  if (b->intra) {
2107  for (x = 0; x < w4; x++) {
2108  mv[x].ref[0] =
2109  mv[x].ref[1] = -1;
2110  }
2111  } else if (b->comp) {
2112  for (x = 0; x < w4; x++) {
2113  mv[x].ref[0] = b->ref[0];
2114  mv[x].ref[1] = b->ref[1];
2115  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2116  AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2117  }
2118  } else {
2119  for (x = 0; x < w4; x++) {
2120  mv[x].ref[0] = b->ref[0];
2121  mv[x].ref[1] = -1;
2122  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2123  }
2124  }
2125  }
2126 }
2127 
2128 // FIXME merge cnt/eob arguments?
2129 static av_always_inline int
2130 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2131  int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2132  unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2133  int nnz, const int16_t *scan, const int16_t (*nb)[2],
2134  const int16_t *band_counts, const int16_t *qmul)
2135 {
2136  int i = 0, band = 0, band_left = band_counts[band];
2137  uint8_t *tp = p[0][nnz];
2138  uint8_t cache[1024];
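      // cache[] holds a small magnitude class (0..5) for each coefficient position
      // decoded so far; the token context (nnz) for the next position is the rounded
      // average of its two neighbours' cached values (see nb[]).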
2139 
2140  do {
2141  int val, rc;
2142 
2143  val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2144  eob[band][nnz][val]++;
2145  if (!val)
2146  break;
2147 
2148  skip_eob:
2149  if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2150  cnt[band][nnz][0]++;
2151  if (!--band_left)
2152  band_left = band_counts[++band];
2153  cache[scan[i]] = 0;
2154  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2155  tp = p[band][nnz];
2156  if (++i == n_coeffs)
2157  break; // invalid input; blocks should end with an EOB token
2158  goto skip_eob;
2159  }
2160 
2161  rc = scan[i];
2162  if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2163  cnt[band][nnz][1]++;
2164  val = 1;
2165  cache[rc] = 1;
2166  } else {
2167  // fill in p[3-10] (model fill) - only once per frame for each pos
2168  if (!tp[3])
2169  memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2170 
2171  cnt[band][nnz][2]++;
2172  if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2173  if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2174  cache[rc] = val = 2;
2175  } else {
2176  val = 3 + vp56_rac_get_prob(c, tp[5]);
2177  cache[rc] = 3;
2178  }
2179  } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2180  cache[rc] = 4;
2181  if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2182  val = 5 + vp56_rac_get_prob(c, 159);
2183  } else {
2184  val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2185  val += vp56_rac_get_prob(c, 145);
2186  }
2187  } else { // cat 3-6
2188  cache[rc] = 5;
2189  if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2190  if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2191  val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2192  val += (vp56_rac_get_prob(c, 148) << 1);
2193  val += vp56_rac_get_prob(c, 140);
2194  } else {
2195  val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2196  val += (vp56_rac_get_prob(c, 155) << 2);
2197  val += (vp56_rac_get_prob(c, 140) << 1);
2198  val += vp56_rac_get_prob(c, 135);
2199  }
2200  } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2201  val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2202  val += (vp56_rac_get_prob(c, 157) << 3);
2203  val += (vp56_rac_get_prob(c, 141) << 2);
2204  val += (vp56_rac_get_prob(c, 134) << 1);
2205  val += vp56_rac_get_prob(c, 130);
2206  } else {
2207  val = 67;
2208  if (!is8bitsperpixel) {
2209  if (bpp == 12) {
2210  val += vp56_rac_get_prob(c, 255) << 17;
2211  val += vp56_rac_get_prob(c, 255) << 16;
2212  }
2213  val += (vp56_rac_get_prob(c, 255) << 15);
2214  val += (vp56_rac_get_prob(c, 255) << 14);
2215  }
2216  val += (vp56_rac_get_prob(c, 254) << 13);
2217  val += (vp56_rac_get_prob(c, 254) << 12);
2218  val += (vp56_rac_get_prob(c, 254) << 11);
2219  val += (vp56_rac_get_prob(c, 252) << 10);
2220  val += (vp56_rac_get_prob(c, 249) << 9);
2221  val += (vp56_rac_get_prob(c, 243) << 8);
2222  val += (vp56_rac_get_prob(c, 230) << 7);
2223  val += (vp56_rac_get_prob(c, 196) << 6);
2224  val += (vp56_rac_get_prob(c, 177) << 5);
2225  val += (vp56_rac_get_prob(c, 153) << 4);
2226  val += (vp56_rac_get_prob(c, 140) << 3);
2227  val += (vp56_rac_get_prob(c, 133) << 2);
2228  val += (vp56_rac_get_prob(c, 130) << 1);
2229  val += vp56_rac_get_prob(c, 129);
2230  }
2231  }
2232  }
2233 #define STORE_COEF(c, i, v) do { \
2234  if (is8bitsperpixel) { \
2235  c[i] = v; \
2236  } else { \
2237  AV_WN32A(&c[i * 2], v); \
2238  } \
2239 } while (0)
2240  if (!--band_left)
2241  band_left = band_counts[++band];
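      // 32x32 dequantized values are halved, which appears to compensate for the
      // larger scaling of the 32x32 inverse transform relative to the smaller sizes.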
2242  if (is_tx32x32)
2243  STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2244  else
2245  STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2246  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2247  tp = p[band][nnz];
2248  } while (++i < n_coeffs);
2249 
2250  return i;
2251 }
2252 
2253 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2254  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2255  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2256  const int16_t (*nb)[2], const int16_t *band_counts,
2257  const int16_t *qmul)
2258 {
2259  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2260  nnz, scan, nb, band_counts, qmul);
2261 }
2262 
2263 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2264  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2265  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2266  const int16_t (*nb)[2], const int16_t *band_counts,
2267  const int16_t *qmul)
2268 {
2269  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2270  nnz, scan, nb, band_counts, qmul);
2271 }
2272 
2273 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2274  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2275  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2276  const int16_t (*nb)[2], const int16_t *band_counts,
2277  const int16_t *qmul)
2278 {
2279  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2280  nnz, scan, nb, band_counts, qmul);
2281 }
2282 
2283 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2284  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2285  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2286  const int16_t (*nb)[2], const int16_t *band_counts,
2287  const int16_t *qmul)
2288 {
2289  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2290  nnz, scan, nb, band_counts, qmul);
2291 }
2292 
2293 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2294 {
2295  VP9Context *s = ctx->priv_data;
2296  VP9Block *b = s->b;
2297  int row = s->row, col = s->col;
2298  uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2299  unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2300  unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2301  int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2302  int end_x = FFMIN(2 * (s->cols - col), w4);
2303  int end_y = FFMIN(2 * (s->rows - row), h4);
2304  int n, pl, x, y, res;
2305  int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
2306  int tx = 4 * s->s.h.lossless + b->tx;
2307  const int16_t * const *yscans = vp9_scans[tx];
2308  const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2309  const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2310  const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2311  uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2312  uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2313  static const int16_t band_counts[4][8] = {
2314  { 1, 2, 3, 4, 3, 16 - 13 },
2315  { 1, 2, 3, 4, 11, 64 - 21 },
2316  { 1, 2, 3, 4, 11, 256 - 21 },
2317  { 1, 2, 3, 4, 11, 1024 - 21 },
2318  };
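      // band_counts[tx] lists how many coefficients fall into each of the six
      // coefficient bands for that transform size; the last entry is simply the
      // remainder up to 16/64/256/1024 coefficients.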
2319  const int16_t *y_band_counts = band_counts[b->tx];
2320  const int16_t *uv_band_counts = band_counts[b->uvtx];
2321  int bytesperpixel = is8bitsperpixel ? 1 : 2;
2322  int total_coeff = 0;
2323 
2324 #define MERGE(la, end, step, rd) \
2325  for (n = 0; n < end; n += step) \
2326  la[n] = !!rd(&la[n])
2327 #define MERGE_CTX(step, rd) \
2328  do { \
2329  MERGE(l, end_y, step, rd); \
2330  MERGE(a, end_x, step, rd); \
2331  } while (0)
2332 
2333 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2334  for (n = 0, y = 0; y < end_y; y += step) { \
2335  for (x = 0; x < end_x; x += step, n += step * step) { \
2336  enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2337  res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2338  (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2339  c, e, p, a[x] + l[y], yscans[txtp], \
2340  ynbs[txtp], y_band_counts, qmul[0]); \
2341  a[x] = l[y] = !!res; \
2342  total_coeff |= !!res; \
2343  if (step >= 4) { \
2344  AV_WN16A(&s->eob[n], res); \
2345  } else { \
2346  s->eob[n] = res; \
2347  } \
2348  } \
2349  }
2350 
2351 #define SPLAT(la, end, step, cond) \
2352  if (step == 2) { \
2353  for (n = 1; n < end; n += step) \
2354  la[n] = la[n - 1]; \
2355  } else if (step == 4) { \
2356  if (cond) { \
2357  for (n = 0; n < end; n += step) \
2358  AV_WN32A(&la[n], la[n] * 0x01010101); \
2359  } else { \
2360  for (n = 0; n < end; n += step) \
2361  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2362  } \
2363  } else /* step == 8 */ { \
2364  if (cond) { \
2365  if (HAVE_FAST_64BIT) { \
2366  for (n = 0; n < end; n += step) \
2367  AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2368  } else { \
2369  for (n = 0; n < end; n += step) { \
2370  uint32_t v32 = la[n] * 0x01010101; \
2371  AV_WN32A(&la[n], v32); \
2372  AV_WN32A(&la[n + 4], v32); \
2373  } \
2374  } \
2375  } else { \
2376  for (n = 0; n < end; n += step) \
2377  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2378  } \
2379  }
2380 #define SPLAT_CTX(step) \
2381  do { \
2382  SPLAT(a, end_x, step, end_x == w4); \
2383  SPLAT(l, end_y, step, end_y == h4); \
2384  } while (0)
2385 
2386  /* y tokens */
2387  switch (b->tx) {
2388  case TX_4X4:
2389  DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2390  break;
2391  case TX_8X8:
2392  MERGE_CTX(2, AV_RN16A);
2393  DECODE_Y_COEF_LOOP(2, 0,);
2394  SPLAT_CTX(2);
2395  break;
2396  case TX_16X16:
2397  MERGE_CTX(4, AV_RN32A);
2398  DECODE_Y_COEF_LOOP(4, 0,);
2399  SPLAT_CTX(4);
2400  break;
2401  case TX_32X32:
2402  MERGE_CTX(8, AV_RN64A);
2403  DECODE_Y_COEF_LOOP(8, 0, 32);
2404  SPLAT_CTX(8);
2405  break;
2406  }
2407 
2408 #define DECODE_UV_COEF_LOOP(step, v) \
2409  for (n = 0, y = 0; y < end_y; y += step) { \
2410  for (x = 0; x < end_x; x += step, n += step * step) { \
2411  res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2412  (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2413  16 * step * step, c, e, p, a[x] + l[y], \
2414  uvscan, uvnb, uv_band_counts, qmul[1]); \
2415  a[x] = l[y] = !!res; \
2416  total_coeff |= !!res; \
2417  if (step >= 4) { \
2418  AV_WN16A(&s->uveob[pl][n], res); \
2419  } else { \
2420  s->uveob[pl][n] = res; \
2421  } \
2422  } \
2423  }
2424 
2425  p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2426  c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2427  e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2428  w4 >>= s->ss_h;
2429  end_x >>= s->ss_h;
2430  h4 >>= s->ss_v;
2431  end_y >>= s->ss_v;
2432  for (pl = 0; pl < 2; pl++) {
2433  a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2434  l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2435  switch (b->uvtx) {
2436  case TX_4X4:
2437  DECODE_UV_COEF_LOOP(1,);
2438  break;
2439  case TX_8X8:
2440  MERGE_CTX(2, AV_RN16A);
2441  DECODE_UV_COEF_LOOP(2,);
2442  SPLAT_CTX(2);
2443  break;
2444  case TX_16X16:
2445  MERGE_CTX(4, AV_RN32A);
2446  DECODE_UV_COEF_LOOP(4,);
2447  SPLAT_CTX(4);
2448  break;
2449  case TX_32X32:
2450  MERGE_CTX(8, AV_RN64A);
2451  DECODE_UV_COEF_LOOP(8, 32);
2452  SPLAT_CTX(8);
2453  break;
2454  }
2455  }
2456 
2457  return total_coeff;
2458 }
2459 
2460 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2461 {
2462  return decode_coeffs(ctx, 1);
2463 }
2464 
2465 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2466 {
2467  return decode_coeffs(ctx, 0);
2468 }
2469 
2470 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2471  uint8_t *dst_edge, ptrdiff_t stride_edge,
2472  uint8_t *dst_inner, ptrdiff_t stride_inner,
2473  uint8_t *l, int col, int x, int w,
2474  int row, int y, enum TxfmMode tx,
2475  int p, int ss_h, int ss_v, int bytesperpixel)
2476 {
2477  int have_top = row > 0 || y > 0;
2478  int have_left = col > s->tile_col_start || x > 0;
2479  int have_right = x < w - 1;
2480  int bpp = s->bpp;
2481  static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2482  [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2483  { DC_127_PRED, VERT_PRED } },
2484  [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2485  { HOR_PRED, HOR_PRED } },
2486  [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2487  { LEFT_DC_PRED, DC_PRED } },
2488  [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2489  { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2490  [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2491  { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2492  [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2493  { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2494  [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2495  { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2496  [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2497  { DC_127_PRED, VERT_LEFT_PRED } },
2498  [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2499  { HOR_UP_PRED, HOR_UP_PRED } },
2500  [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2501  { HOR_PRED, TM_VP8_PRED } },
2502  };
2503  static const struct {
2504  uint8_t needs_left:1;
2505  uint8_t needs_top:1;
2506  uint8_t needs_topleft:1;
2507  uint8_t needs_topright:1;
2508  uint8_t invert_left:1;
2509  } edges[N_INTRA_PRED_MODES] = {
2510  [VERT_PRED] = { .needs_top = 1 },
2511  [HOR_PRED] = { .needs_left = 1 },
2512  [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2513  [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2514  [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2515  [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2516  [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2517  [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2518  [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2519  [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2520  [LEFT_DC_PRED] = { .needs_left = 1 },
2521  [TOP_DC_PRED] = { .needs_top = 1 },
2522  [DC_128_PRED] = { 0 },
2523  [DC_127_PRED] = { 0 },
2524  [DC_129_PRED] = { 0 }
2525  };
2526 
2527  av_assert2(mode >= 0 && mode < 10);
2528  mode = mode_conv[mode][have_left][have_top];
2529  if (edges[mode].needs_top) {
2530  uint8_t *top, *topleft;
2531  int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2532  int n_px_need_tr = 0;
2533 
2534  if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2535  n_px_need_tr = 4;
2536 
2537  // if top of sb64-row, use s->intra_pred_data[] instead of
2538  // dst[-stride] for intra prediction (it contains pre- instead of
2539  // post-loopfilter data)
2540  if (have_top) {
2541  top = !(row & 7) && !y ?
2542  s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2543  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2544  if (have_left)
2545  topleft = !(row & 7) && !y ?
2546  s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2547  y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2548  &dst_inner[-stride_inner];
2549  }
2550 
2551  if (have_top &&
2552  (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2553  (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2554  n_px_need + n_px_need_tr <= n_px_have) {
2555  *a = top;
2556  } else {
2557  if (have_top) {
2558  if (n_px_need <= n_px_have) {
2559  memcpy(*a, top, n_px_need * bytesperpixel);
2560  } else {
2561 #define memset_bpp(c, i1, v, i2, num) do { \
2562  if (bytesperpixel == 1) { \
2563  memset(&(c)[(i1)], (v)[(i2)], (num)); \
2564  } else { \
2565  int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2566  for (n = 0; n < (num); n++) { \
2567  AV_WN16A(&(c)[((i1) + n) * 2], val); \
2568  } \
2569  } \
2570 } while (0)
2571  memcpy(*a, top, n_px_have * bytesperpixel);
2572  memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2573  }
2574  } else {
2575 #define memset_val(c, val, num) do { \
2576  if (bytesperpixel == 1) { \
2577  memset((c), (val), (num)); \
2578  } else { \
2579  int n; \
2580  for (n = 0; n < (num); n++) { \
2581  AV_WN16A(&(c)[n * 2], (val)); \
2582  } \
2583  } \
2584 } while (0)
2585  memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2586  }
2587  if (edges[mode].needs_topleft) {
2588  if (have_left && have_top) {
2589 #define assign_bpp(c, i1, v, i2) do { \
2590  if (bytesperpixel == 1) { \
2591  (c)[(i1)] = (v)[(i2)]; \
2592  } else { \
2593  AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2594  } \
2595 } while (0)
2596  assign_bpp(*a, -1, topleft, -1);
2597  } else {
2598 #define assign_val(c, i, v) do { \
2599  if (bytesperpixel == 1) { \
2600  (c)[(i)] = (v); \
2601  } else { \
2602  AV_WN16A(&(c)[(i) * 2], (v)); \
2603  } \
2604 } while (0)
2605  assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2606  }
2607  }
2608  if (tx == TX_4X4 && edges[mode].needs_topright) {
2609  if (have_top && have_right &&
2610  n_px_need + n_px_need_tr <= n_px_have) {
2611  memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2612  } else {
2613  memset_bpp(*a, 4, *a, 3, 4);
2614  }
2615  }
2616  }
2617  }
2618  if (edges[mode].needs_left) {
2619  if (have_left) {
2620  int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2621  uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2622  ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2623 
2624  if (edges[mode].invert_left) {
2625  if (n_px_need <= n_px_have) {
2626  for (i = 0; i < n_px_need; i++)
2627  assign_bpp(l, i, &dst[i * stride], -1);
2628  } else {
2629  for (i = 0; i < n_px_have; i++)
2630  assign_bpp(l, i, &dst[i * stride], -1);
2631  memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2632  }
2633  } else {
2634  if (n_px_need <= n_px_have) {
2635  for (i = 0; i < n_px_need; i++)
2636  assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2637  } else {
2638  for (i = 0; i < n_px_have; i++)
2639  assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2640  memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
2641  }
2642  }
2643  } else {
2644  memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
2645  }
2646  }
2647 
2648  return mode;
2649 }
2650 
2651 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2652  ptrdiff_t uv_off, int bytesperpixel)
2653 {
2654  VP9Context *s = ctx->priv_data;
2655  VP9Block *b = s->b;
2656  int row = s->row, col = s->col;
2657  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2658  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2659  int end_x = FFMIN(2 * (s->cols - col), w4);
2660  int end_y = FFMIN(2 * (s->rows - row), h4);
2661  int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2662  int uvstep1d = 1 << b->uvtx, p;
2663  uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
2664  LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2665  LOCAL_ALIGNED_32(uint8_t, l, [64]);
2666 
2667  for (n = 0, y = 0; y < end_y; y += step1d) {
2668  uint8_t *ptr = dst, *ptr_r = dst_r;
2669  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2670  ptr_r += 4 * step1d * bytesperpixel, n += step) {
2671  int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2672  y * 2 + x : 0];
2673  uint8_t *a = &a_buf[32];
2674  enum TxfmType txtp = vp9_intra_txfm_type[mode];
2675  int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2676 
2677  mode = check_intra_mode(s, mode, &a, ptr_r,
2678  s->s.frames[CUR_FRAME].tf.f->linesize[0],
2679  ptr, s->y_stride, l,
2680  col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2681  s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2682  if (eob)
2683  s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2684  s->block + 16 * n * bytesperpixel, eob);
2685  }
2686  dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
2687  dst += 4 * step1d * s->y_stride;
2688  }
2689 
2690  // U/V
2691  w4 >>= s->ss_h;
2692  end_x >>= s->ss_h;
2693  end_y >>= s->ss_v;
2694  step = 1 << (b->uvtx * 2);
2695  for (p = 0; p < 2; p++) {
2696  dst = s->dst[1 + p];
2697  dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2698  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2699  uint8_t *ptr = dst, *ptr_r = dst_r;
2700  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2701  ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2702  int mode = b->uvmode;
2703  uint8_t *a = &a_buf[32];
2704  int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2705 
2706  mode = check_intra_mode(s, mode, &a, ptr_r,
2707  s->s.frames[CUR_FRAME].tf.f->linesize[1],
2708  ptr, s->uv_stride, l, col, x, w4, row, y,
2709  b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2710  s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2711  if (eob)
2712  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2713  s->uvblock[p] + 16 * n * bytesperpixel, eob);
2714  }
2715  dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
2716  dst += 4 * uvstep1d * s->uv_stride;
2717  }
2718  }
2719 }
2720 
2721 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2722 {
2723  intra_recon(ctx, y_off, uv_off, 1);
2724 }
2725 
2726 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2727 {
2728  intra_recon(ctx, y_off, uv_off, 2);
2729 }
2730 
2731 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2732  uint8_t *dst, ptrdiff_t dst_stride,
2733  const uint8_t *ref, ptrdiff_t ref_stride,
2734  ThreadFrame *ref_frame,
2735  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2736  int bw, int bh, int w, int h, int bytesperpixel)
2737 {
2738  int mx = mv->x, my = mv->y, th;
2739 
2740  y += my >> 3;
2741  x += mx >> 3;
2742  ref += y * ref_stride + x * bytesperpixel;
2743  mx &= 7;
2744  my &= 7;
2745  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2746  // we use +7 because the last 7 pixels of each sbrow can be changed in
2747  // the longest loopfilter of the next sbrow
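      // th is the last 64-pixel superblock row of the reference frame that this
      // prediction may read (block height, subpel taps and the 7-pixel loopfilter
      // margin included), so wait until the reference thread has decoded that far.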
2748  th = (y + bh + 4 * !!my + 7) >> 6;
2749  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2750  if (x < !!mx * 3 || y < !!my * 3 ||
2751  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2752  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2753  ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2754  160, ref_stride,
2755  bw + !!mx * 7, bh + !!my * 7,
2756  x - !!mx * 3, y - !!my * 3, w, h);
2757  ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2758  ref_stride = 160;
2759  }
2760  mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2761 }
2762 
2763 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2764  uint8_t *dst_u, uint8_t *dst_v,
2765  ptrdiff_t dst_stride,
2766  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2767  const uint8_t *ref_v, ptrdiff_t src_stride_v,
2768  ThreadFrame *ref_frame,
2769  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2770  int bw, int bh, int w, int h, int bytesperpixel)
2771 {
2772  int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
2773 
2774  y += my >> 4;
2775  x += mx >> 4;
2776  ref_u += y * src_stride_u + x * bytesperpixel;
2777  ref_v += y * src_stride_v + x * bytesperpixel;
2778  mx &= 15;
2779  my &= 15;
2780  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2781  // we use +7 because the last 7 pixels of each sbrow can be changed in
2782  // the longest loopfilter of the next sbrow
2783  th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2784  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2785  if (x < !!mx * 3 || y < !!my * 3 ||
2786  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2787  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2788  ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2789  160, src_stride_u,
2790  bw + !!mx * 7, bh + !!my * 7,
2791  x - !!mx * 3, y - !!my * 3, w, h);
2792  ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2793  mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
2794 
2795  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2796  ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2797  160, src_stride_v,
2798  bw + !!mx * 7, bh + !!my * 7,
2799  x - !!mx * 3, y - !!my * 3, w, h);
2800  ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2801  mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2802  } else {
2803  mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2804  mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2805  }
2806 }
2807 
2808 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2809  px, py, pw, ph, bw, bh, w, h, i) \
2810  mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2811  mv, bw, bh, w, h, bytesperpixel)
2812 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2813  row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2814  mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2815  row, col, mv, bw, bh, w, h, bytesperpixel)
2816 #define SCALED 0
2817 #define FN(x) x##_8bpp
2818 #define BYTES_PER_PIXEL 1
2819 #include "vp9_mc_template.c"
2820 #undef FN
2821 #undef BYTES_PER_PIXEL
2822 #define FN(x) x##_16bpp
2823 #define BYTES_PER_PIXEL 2
2824 #include "vp9_mc_template.c"
2825 #undef mc_luma_dir
2826 #undef mc_chroma_dir
2827 #undef FN
2828 #undef BYTES_PER_PIXEL
2829 #undef SCALED
2830 
2831 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2832  vp9_mc_func (*mc)[2],
2833  uint8_t *dst, ptrdiff_t dst_stride,
2834  const uint8_t *ref, ptrdiff_t ref_stride,
2835  ThreadFrame *ref_frame,
2836  ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2837  int px, int py, int pw, int ph,
2838  int bw, int bh, int w, int h, int bytesperpixel,
2839  const uint16_t *scale, const uint8_t *step)
2840 {
2841  if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2842  s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2843  mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2844  y, x, in_mv, bw, bh, w, h, bytesperpixel);
2845  } else {
2846 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
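      // scale[dim] is presumably a Q14 fixed-point ratio between reference and
      // current frame size in that dimension; scale_mv() maps positions and motion
      // vectors from current-frame into reference-frame coordinates.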
2847  int mx, my;
2848  int refbw_m1, refbh_m1;
2849  int th;
2850  VP56mv mv;
2851 
2852  mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2853  mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2854  // BUG libvpx seems to scale the two components separately. This introduces
2855  // rounding errors but we have to reproduce them to be exactly compatible
2856  // with the output from libvpx...
2857  mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2858  my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2859 
2860  y = my >> 4;
2861  x = mx >> 4;
2862  ref += y * ref_stride + x * bytesperpixel;
2863  mx &= 15;
2864  my &= 15;
2865  refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2866  refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2867  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2868  // we use +7 because the last 7 pixels of each sbrow can be changed in
2869  // the longest loopfilter of the next sbrow
2870  th = (y + refbh_m1 + 4 + 7) >> 6;
2871  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2872  if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2873  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2874  ref - 3 * ref_stride - 3 * bytesperpixel,
2875  288, ref_stride,
2876  refbw_m1 + 8, refbh_m1 + 8,
2877  x - 3, y - 3, w, h);
2878  ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2879  ref_stride = 288;
2880  }
2881  smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
2882  }
2883 }
2884 
2885 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2886  vp9_mc_func (*mc)[2],
2887  uint8_t *dst_u, uint8_t *dst_v,
2888  ptrdiff_t dst_stride,
2889  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2890  const uint8_t *ref_v, ptrdiff_t src_stride_v,
2891  ThreadFrame *ref_frame,
2892  ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2893  int px, int py, int pw, int ph,
2894  int bw, int bh, int w, int h, int bytesperpixel,
2895  const uint16_t *scale, const uint8_t *step)
2896 {
2897  if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2898  s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2899  mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2900  ref_v, src_stride_v, ref_frame,
2901  y, x, in_mv, bw, bh, w, h, bytesperpixel);
2902  } else {
2903  int mx, my;
2904  int refbw_m1, refbh_m1;
2905  int th;
2906  VP56mv mv;
2907 
2908  if (s->ss_h) {
2909  // BUG https://code.google.com/p/webm/issues/detail?id=820
2910  mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
2911  mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2912  } else {
2913  mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2914  mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
2915  }
2916  if (s->ss_v) {
2917  // BUG https://code.google.com/p/webm/issues/detail?id=820
2918  mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
2919  my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2920  } else {
2921  mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2922  my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
2923  }
2924 #undef scale_mv
2925  y = my >> 4;
2926  x = mx >> 4;
2927  ref_u += y * src_stride_u + x * bytesperpixel;
2928  ref_v += y * src_stride_v + x * bytesperpixel;
2929  mx &= 15;
2930  my &= 15;
2931  refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2932  refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2933  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2934  // we use +7 because the last 7 pixels of each sbrow can be changed in
2935  // the longest loopfilter of the next sbrow
2936  th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2937  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2938  if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2939  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2940  ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2941  288, src_stride_u,
2942  refbw_m1 + 8, refbh_m1 + 8,
2943  x - 3, y - 3, w, h);
2944  ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2945  smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2946 
2947  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2948  ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2949  288, src_stride_v,
2950  refbw_m1 + 8, refbh_m1 + 8,
2951  x - 3, y - 3, w, h);
2952  ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2953  smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2954  } else {
2955  smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2956  smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
2957  }
2958  }
2959 }
2960 
2961 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2962  px, py, pw, ph, bw, bh, w, h, i) \
2963  mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2964  mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2965  s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2966 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2967  row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2968  mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2969  row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2970  s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2971 #define SCALED 1
2972 #define FN(x) x##_scaled_8bpp
2973 #define BYTES_PER_PIXEL 1
2974 #include "vp9_mc_template.c"
2975 #undef FN
2976 #undef BYTES_PER_PIXEL
2977 #define FN(x) x##_scaled_16bpp
2978 #define BYTES_PER_PIXEL 2
2979 #include "vp9_mc_template.c"
2980 #undef mc_luma_dir
2981 #undef mc_chroma_dir
2982 #undef FN
2983 #undef BYTES_PER_PIXEL
2984 #undef SCALED
2985 
2986 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
2987 {
2988  VP9Context *s = ctx->priv_data;
2989  VP9Block *b = s->b;
2990  int row = s->row, col = s->col;
2991 
2992  if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
2993  if (bytesperpixel == 1) {
2994  inter_pred_scaled_8bpp(ctx);
2995  } else {
2996  inter_pred_scaled_16bpp(ctx);
2997  }
2998  } else {
2999  if (bytesperpixel == 1) {
3000  inter_pred_8bpp(ctx);
3001  } else {
3002  inter_pred_16bpp(ctx);
3003  }
3004  }
3005  if (!b->skip) {
3006  /* mostly copied from intra_recon() */
3007 
3008  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3009  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3010  int end_x = FFMIN(2 * (s->cols - col), w4);
3011  int end_y = FFMIN(2 * (s->rows - row), h4);
3012  int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
3013  int uvstep1d = 1 << b->uvtx, p;
3014  uint8_t *dst = s->dst[0];
3015 
3016  // y itxfm add
3017  for (n = 0, y = 0; y < end_y; y += step1d) {
3018  uint8_t *ptr = dst;
3019  for (x = 0; x < end_x; x += step1d,
3020  ptr += 4 * step1d * bytesperpixel, n += step) {
3021  int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3022 
3023  if (eob)
3024  s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3025  s->block + 16 * n * bytesperpixel, eob);
3026  }
3027  dst += 4 * s->y_stride * step1d;
3028  }
3029 
3030  // uv itxfm add
3031  end_x >>= s->ss_h;
3032  end_y >>= s->ss_v;
3033  step = 1 << (b->uvtx * 2);
3034  for (p = 0; p < 2; p++) {
3035  dst = s->dst[p + 1];
3036  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3037  uint8_t *ptr = dst;
3038  for (x = 0; x < end_x; x += uvstep1d,
3039  ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3040  int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3041 
3042  if (eob)
3043  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3044  s->uvblock[p] + 16 * n * bytesperpixel, eob);
3045  }
3046  dst += 4 * uvstep1d * s->uv_stride;
3047  }
3048  }
3049  }
3050 }
3051 
3052 static void inter_recon_8bpp(AVCodecContext *ctx)
3053 {
3054  inter_recon(ctx, 1);
3055 }
3056 
3057 static void inter_recon_16bpp(AVCodecContext *ctx)
3058 {
3059  inter_recon(ctx, 2);
3060 }
3061 
3062 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3063  int row_and_7, int col_and_7,
3064  int w, int h, int col_end, int row_end,
3065  enum TxfmMode tx, int skip_inter)
3066 {
3067  static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3068  static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3069 
3070  // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3071  // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3072  // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3073  // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3074 
3075  // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3076  // edges. This means that for UV, we work on two subsampled blocks at
3077  // a time, and we only use the topleft block's mode information to set
3078  // things like block strength. Thus, for any block size smaller than
3079  // 16x16, ignore the odd portion of the block.
3080  if (tx == TX_4X4 && (ss_v | ss_h)) {
3081  if (h == ss_v) {
3082  if (row_and_7 & 1)
3083  return;
3084  if (!row_end)
3085  h += 1;
3086  }
3087  if (w == ss_h) {
3088  if (col_and_7 & 1)
3089  return;
3090  if (!col_end)
3091  w += 1;
3092  }
3093  }
3094 
3095  if (tx == TX_4X4 && !skip_inter) {
3096  int t = 1 << col_and_7, m_col = (t << w) - t, y;
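      // t is the bit for this block's first 8x8 column within the superblock;
      // m_col has w consecutive bits set from there, one per 8-pixel column
      // edge covered by the block.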
3097  // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3098  int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3099 
3100  for (y = row_and_7; y < h + row_and_7; y++) {
3101  int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3102 
3103  mask[0][y][1] |= m_row_8;
3104  mask[0][y][2] |= m_row_4;
3105  // for odd lines, if the odd col is not being filtered,
3106  // skip odd row also:
3107  // .---. <-- a
3108  // | |
3109  // |___| <-- b
3110  // ^ ^
3111  // c d
3112  //
3113  // if a/c are even row/col and b/d are odd, and d is skipped,
3114  // e.g. right edge of size-66x66.webm, then skip b also (bug)
3115  if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3116  mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3117  } else {
3118  mask[1][y][col_mask_id] |= m_col;
3119  }
3120  if (!ss_h)
3121  mask[0][y][3] |= m_col;
3122  if (!ss_v) {
3123  if (ss_h && (col_end & 1))
3124  mask[1][y][3] |= (t << (w - 1)) - t;
3125  else
3126  mask[1][y][3] |= m_col;
3127  }
3128  }
3129  } else {
3130  int y, t = 1 << col_and_7, m_col = (t << w) - t;
3131 
3132  if (!skip_inter) {
3133  int mask_id = (tx == TX_8X8);
3134  static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3135  int l2 = tx + ss_h - 1, step1d;
3136  int m_row = m_col & masks[l2];
3137 
3138  // at odd UV col/row tx16/tx32 edges, force the 8-pixel-wide loopfilter
3139  // to prevent it from running off the visible edge.
3140  if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3141  int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3142  int m_row_8 = m_row - m_row_16;
3143 
3144  for (y = row_and_7; y < h + row_and_7; y++) {
3145  mask[0][y][0] |= m_row_16;
3146  mask[0][y][1] |= m_row_8;
3147  }
3148  } else {
3149  for (y = row_and_7; y < h + row_and_7; y++)
3150  mask[0][y][mask_id] |= m_row;
3151  }
3152 
3153  l2 = tx + ss_v - 1;
3154  step1d = 1 << l2;
3155  if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3156  for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3157  mask[1][y][0] |= m_col;
3158  if (y - row_and_7 == h - 1)
3159  mask[1][y][1] |= m_col;
3160  } else {
3161  for (y = row_and_7; y < h + row_and_7; y += step1d)
3162  mask[1][y][mask_id] |= m_col;
3163  }
3164  } else if (tx != TX_4X4) {
3165  int mask_id;
3166 
3167  mask_id = (tx == TX_8X8) || (h == ss_v);
3168  mask[1][row_and_7][mask_id] |= m_col;
3169  mask_id = (tx == TX_8X8) || (w == ss_h);
3170  for (y = row_and_7; y < h + row_and_7; y++)
3171  mask[0][y][mask_id] |= t;
3172  } else {
3173  int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3174 
3175  for (y = row_and_7; y < h + row_and_7; y++) {
3176  mask[0][y][2] |= t4;
3177  mask[0][y][1] |= t8;
3178  }
3179  mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
3180  }
3181  }
3182 }
3183 
3184 static void decode_b(AVCodecContext *ctx, int row, int col,
3185  struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3186  enum BlockLevel bl, enum BlockPartition bp)
3187 {
3188  VP9Context *s = ctx->priv_data;
3189  VP9Block *b = s->b;
3190  enum BlockSize bs = bl * 3 + bp;
3191  int bytesperpixel = s->bytesperpixel;
3192  int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3193  int emu[2];
3194  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3195 
3196  s->row = row;
3197  s->row7 = row & 7;
3198  s->col = col;
3199  s->col7 = col & 7;
3200  s->min_mv.x = -(128 + col * 64);
3201  s->min_mv.y = -(128 + row * 64);
3202  s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3203  s->max_mv.y = 128 + (s->rows - row - h4) * 64;
3204  if (s->pass < 2) {
3205  b->bs = bs;
3206  b->bl = bl;
3207  b->bp = bp;
3208  decode_mode(ctx);
3209  b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3210  (s->ss_v && h4 * 2 == (1 << b->tx)));
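      // chroma uses one transform size smaller than luma when, after subsampling,
      // the luma transform size would no longer fit inside the block's chroma plane.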
3211 
3212  if (!b->skip) {
3213  int has_coeffs;
3214 
3215  if (bytesperpixel == 1) {
3216  has_coeffs = decode_coeffs_8bpp(ctx);
3217  } else {
3218  has_coeffs = decode_coeffs_16bpp(ctx);
3219  }
3220  if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3221  b->skip = 1;
3222  memset(&s->above_skip_ctx[col], 1, w4);
3223  memset(&s->left_skip_ctx[s->row7], 1, h4);
3224  }
3225  } else {
3226  int row7 = s->row7;
3227 
3228 #define SPLAT_ZERO_CTX(v, n) \
3229  switch (n) { \
3230  case 1: v = 0; break; \
3231  case 2: AV_ZERO16(&v); break; \
3232  case 4: AV_ZERO32(&v); break; \
3233  case 8: AV_ZERO64(&v); break; \
3234  case 16: AV_ZERO128(&v); break; \
3235  }
3236 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3237  do { \
3238  SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3239  if (s->ss_##dir2) { \
3240  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3241  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3242  } else { \
3243  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3244  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3245  } \
3246  } while (0)
3247 
3248  switch (w4) {
3249  case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3250  case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3251  case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3252  case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3253  }
3254  switch (h4) {
3255  case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3256  case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3257  case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3258  case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
3259  }
3260  }
3261 
3262  if (s->pass == 1) {
3263  s->b++;
3264  s->block += w4 * h4 * 64 * bytesperpixel;
3265  s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3266  s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3267  s->eob += 4 * w4 * h4;
3268  s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3269  s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3270 
3271  return;
3272  }
3273  }
3274 
3275  // use emulated overhangs if the stride of the target buffer can't hold the
3276  // block. This makes it possible to support emu-edge and so on even if we
3277  // have large block overhangs
3278  emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
3279  (row + h4) > s->rows;
3280  emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
3281  (row + h4) > s->rows;
3282  if (emu[0]) {
3283  s->dst[0] = s->tmp_y;
3284  s->y_stride = 128;
3285  } else {
3286  s->dst[0] = f->data[0] + yoff;
3287  s->y_stride = f->linesize[0];
3288  }
3289  if (emu[1]) {
3290  s->dst[1] = s->tmp_uv[0];
3291  s->dst[2] = s->tmp_uv[1];
3292  s->uv_stride = 128;
3293  } else {
3294  s->dst[1] = f->data[1] + uvoff;
3295  s->dst[2] = f->data[2] + uvoff;
3296  s->uv_stride = f->linesize[1];
3297  }
3298  if (b->intra) {
3299  if (s->bpp > 8) {
3300  intra_recon_16bpp(ctx, yoff, uvoff);
3301  } else {
3302  intra_recon_8bpp(ctx, yoff, uvoff);
3303  }
3304  } else {
3305  if (s->bpp > 8) {
3306  inter_recon_16bpp(ctx);
3307  } else {
3308  inter_recon_8bpp(ctx);
3309  }
3310  }
3311  if (emu[0]) {
3312  int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3313 
3314  for (n = 0; o < w; n++) {
3315  int bw = 64 >> n;
3316 
3317  av_assert2(n <= 4);
3318  if (w & bw) {
3319  s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3320  s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
3321  o += bw;
3322  }
3323  }
3324  }
3325  if (emu[1]) {
3326  int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3327  int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3328 
3329  for (n = s->ss_h; o < w; n++) {
3330  int bw = 64 >> n;
3331 
3332  av_assert2(n <= 4);
3333  if (w & bw) {
3334  s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3335  s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3336  s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3337  s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3338  o += bw;
3339  }
3340  }
3341  }
3342 
3343  // pick filter level and find edges to apply filter to
3344  if (s->s.h.filter.level &&
3345  (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3346  [b->mode[3] != ZEROMV]) > 0) {
3347  int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3348  int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3349 
3350  setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3351  mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3352  if (s->ss_h || s->ss_v)
3353  mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3354  s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3355  s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3356  b->uvtx, skip_inter);
3357 
3358  if (!s->filter_lut.lim_lut[lvl]) {
3359  int sharp = s->s.h.filter.sharpness;
3360  int limit = lvl;
3361 
3362  if (sharp > 0) {
3363  limit >>= (sharp + 3) >> 2;
3364  limit = FFMIN(limit, 9 - sharp);
3365  }
3366  limit = FFMAX(limit, 1);
3367 
3368  s->filter_lut.lim_lut[lvl] = limit;
3369  s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3370  }
3371  }
3372 
3373  if (s->pass == 2) {
3374  s->b++;
3375  s->block += w4 * h4 * 64 * bytesperpixel;
3376  s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3377  s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3378  s->eob += 4 * w4 * h4;
3379  s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3380  s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3381  }
3382 }
3383 
3384 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3385  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3386 {
3387  VP9Context *s = ctx->priv_data;
3388  int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3389  (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3390  const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
3391  s->prob.p.partition[bl][c];
3392  enum BlockPartition bp;
3393  ptrdiff_t hbs = 4 >> bl;
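      // hbs is half the block size at this level, in 8x8-block units
      // (4 at BL_64X64, 2 at BL_32X32, 1 at BL_16X16).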
3394  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3395  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3396  int bytesperpixel = s->bytesperpixel;
3397 
3398  if (bl == BL_8X8) {
3399  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3400  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3401  } else if (col + hbs < s->cols) { // FIXME why not <=?
3402  if (row + hbs < s->rows) { // FIXME why not <=?
3403  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3404  switch (bp) {
3405  case PARTITION_NONE:
3406  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3407  break;
3408  case PARTITION_H:
3409  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3410  yoff += hbs * 8 * y_stride;
3411  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3412  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3413  break;
3414  case PARTITION_V:
3415  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3416  yoff += hbs * 8 * bytesperpixel;
3417  uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3418  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3419  break;
3420  case PARTITION_SPLIT:
3421  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3422  decode_sb(ctx, row, col + hbs, lflvl,
3423  yoff + 8 * hbs * bytesperpixel,
3424  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3425  yoff += hbs * 8 * y_stride;
3426  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3427  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3428  decode_sb(ctx, row + hbs, col + hbs, lflvl,
3429  yoff + 8 * hbs * bytesperpixel,
3430  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3431  break;
3432  default:
3433  av_assert0(0);
3434  }
3435  } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3436  bp = PARTITION_SPLIT;
3437  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3438  decode_sb(ctx, row, col + hbs, lflvl,
3439  yoff + 8 * hbs * bytesperpixel,
3440  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3441  } else {
3442  bp = PARTITION_H;
3443  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3444  }
3445  } else if (row + hbs < s->rows) { // FIXME why not <=?
3446  if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3447  bp = PARTITION_SPLIT;
3448  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3449  yoff += hbs * 8 * y_stride;
3450  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3451  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3452  } else {
3453  bp = PARTITION_V;
3454  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3455  }
3456  } else {
3457  bp = PARTITION_SPLIT;
3458  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3459  }
3460  s->counts.partition[bl][c][bp]++;
3461 }
3462 
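The two-bit partition context computed at the top of decode_sb() selects which probability set is used for the partition symbol at this block level. A hedged standalone restatement (hypothetical helper, not part of vp9.c):

    /* Illustrative only: bit 0 comes from the above-partition context, bit 1
     * from the left-partition context, each sampled at the current block level. */
    static int partition_ctx(uint8_t above_ctx, uint8_t left_ctx, enum BlockLevel bl)
    {
        return ((above_ctx >> (3 - bl)) & 1) |
              (((left_ctx  >> (3 - bl)) & 1) << 1);
    }
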
3463 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3464  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3465 {
3466  VP9Context *s = ctx->priv_data;
3467  VP9Block *b = s->b;
3468  ptrdiff_t hbs = 4 >> bl;
3469  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3470  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3471  int bytesperpixel = s->bytesperpixel;
3472 
3473  if (bl == BL_8X8) {
3474  av_assert2(b->bl == BL_8X8);
3475  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3476  } else if (s->b->bl == bl) {
3477  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3478  if (b->bp == PARTITION_H && row + hbs < s->rows) {
3479  yoff += hbs * 8 * y_stride;
3480  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3481  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3482  } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3483  yoff += hbs * 8 * bytesperpixel;
3484  uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3485  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3486  }
3487  } else {
3488  decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3489  if (col + hbs < s->cols) { // FIXME why not <=?
3490  if (row + hbs < s->rows) {
3491  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3492  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3493  yoff += hbs * 8 * y_stride;
3494  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3495  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3496  decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3497  yoff + 8 * hbs * bytesperpixel,
3498  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3499  } else {
3500  yoff += hbs * 8 * bytesperpixel;
3501  uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3502  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3503  }
3504  } else if (row + hbs < s->rows) {
3505  yoff += hbs * 8 * y_stride;
3506  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3507  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3508  }
3509  }
3510 }
3511 
3512 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3513  uint8_t *lvl, uint8_t (*mask)[4],
3514  uint8_t *dst, ptrdiff_t ls)
3515 {
3516  int y, x, bytesperpixel = s->bytesperpixel;
3517 
3518  // filter edges between columns (e.g. block1 | block2)
3519  for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3520  uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3521  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3522  unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3523  unsigned hm = hm1 | hm2 | hm13 | hm23;
3524 
3525  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
3526  if (col || x > 1) {
3527  if (hm1 & x) {
3528  int L = *l, H = L >> 4;
3529  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3530 
3531  if (hmask1[0] & x) {
3532  if (hmask2[0] & x) {
3533  av_assert2(l[8 << ss_v] == L);
3534  s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3535  } else {
3536  s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3537  }
3538  } else if (hm2 & x) {
3539  L = l[8 << ss_v];
3540  H |= (L >> 4) << 8;
3541  E |= s->filter_lut.mblim_lut[L] << 8;
3542  I |= s->filter_lut.lim_lut[L] << 8;
3543  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3544  [!!(hmask2[1] & x)]
3545  [0](ptr, ls, E, I, H);
3546  } else {
3547  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3548  [0](ptr, ls, E, I, H);
3549  }
3550  } else if (hm2 & x) {
3551  int L = l[8 << ss_v], H = L >> 4;
3552  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3553 
3554  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3555  [0](ptr + 8 * ls, ls, E, I, H);
3556  }
3557  }
3558  if (ss_h) {
3559  if (x & 0xAA)
3560  l += 2;
3561  } else {
3562  if (hm13 & x) {
3563  int L = *l, H = L >> 4;
3564  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3565 
3566  if (hm23 & x) {
3567  L = l[8 << ss_v];
3568  H |= (L >> 4) << 8;
3569  E |= s->filter_lut.mblim_lut[L] << 8;
3570  I |= s->filter_lut.lim_lut[L] << 8;
3571  s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3572  } else {
3573  s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3574  }
3575  } else if (hm23 & x) {
3576  int L = l[8 << ss_v], H = L >> 4;
3577  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3578 
3579  s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
3580  }
3581  l++;
3582  }
3583  }
3584  }
3585 }
3586 
3587 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3588  uint8_t *lvl, uint8_t (*mask)[4],
3589  uint8_t *dst, ptrdiff_t ls)
3590 {
3591  int y, x, bytesperpixel = s->bytesperpixel;
3592 
3593  // block1
3594  // filter edges between rows (e.g. ------)
3595  // block2
3596  for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3597  uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3598  unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3599 
3600  for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3601  if (row || y) {
3602  if (vm & x) {
3603  int L = *l, H = L >> 4;
3604  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3605 
3606  if (vmask[0] & x) {
3607  if (vmask[0] & (x << (1 + ss_h))) {
3608  av_assert2(l[1 + ss_h] == L);
3609  s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3610  } else {
3611  s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3612  }
3613  } else if (vm & (x << (1 + ss_h))) {
3614  L = l[1 + ss_h];
3615  H |= (L >> 4) << 8;
3616  E |= s->filter_lut.mblim_lut[L] << 8;
3617  I |= s->filter_lut.lim_lut[L] << 8;
3618  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3619  [!!(vmask[1] & (x << (1 + ss_h)))]
3620  [1](ptr, ls, E, I, H);
3621  } else {
3622  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3623  [1](ptr, ls, E, I, H);
3624  }
3625  } else if (vm & (x << (1 + ss_h))) {
3626  int L = l[1 + ss_h], H = L >> 4;
3627  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3628 
3629  s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3630  [1](ptr + 8 * bytesperpixel, ls, E, I, H);
3631  }
3632  }
3633  if (!ss_v) {
3634  if (vm3 & x) {
3635  int L = *l, H = L >> 4;
3636  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3637 
3638  if (vm3 & (x << (1 + ss_h))) {
3639  L = l[1 + ss_h];
3640  H |= (L >> 4) << 8;
3641  E |= s->filter_lut.mblim_lut[L] << 8;
3642  I |= s->filter_lut.lim_lut[L] << 8;
3643  s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3644  } else {
3645  s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3646  }
3647  } else if (vm3 & (x << (1 + ss_h))) {
3648  int L = l[1 + ss_h], H = L >> 4;
3649  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3650 
3651  s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
3652  }
3653  }
3654  }
3655  if (ss_v) {
3656  if (y & 1)
3657  lvl += 16;
3658  } else {
3659  lvl += 8;
3660  }
3661  }
3662 }
3663 
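Both filter_plane_cols() and filter_plane_rows() filter two adjacent 8-pixel edges with a single loop_filter_mix2 call where the masks allow it, by packing the second edge's thresholds into bits 8..15 of E, I and H. A minimal sketch of that packing (hypothetical helper, assuming the same lim/mblim lookup tables as above):

    /* Illustrative only: build one (E, I, H) triple for two adjacent edges with
     * filter levels L0 and L1, as consumed by the loop_filter_mix2 entry points. */
    static void pack_mix2_thresholds(const VP9Context *s, int L0, int L1,
                                     int *E, int *I, int *H)
    {
        *E = s->filter_lut.mblim_lut[L0] | (s->filter_lut.mblim_lut[L1] << 8);
        *I = s->filter_lut.lim_lut[L0]   | (s->filter_lut.lim_lut[L1]   << 8);
        *H = (L0 >> 4)                   | ((L1 >> 4) << 8);
    }
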
3664 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3665  int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3666 {
3667  VP9Context *s = ctx->priv_data;
3668  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3669  uint8_t *dst = f->data[0] + yoff;
3670  ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3671  uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3672  int p;
3673 
3674  // FIXME to what extent can we interleave the v/h loopfilter calls? E.g.
3675  // if you think of them as acting on an 8x8 block max, we can interleave
3676  // each v/h within the single x loop, but that only works if we work on
3677  // 8 pixel blocks, and we won't always do that (we want at least 16px
3678  // to use SSE2 optimizations, perhaps 32 for AVX2)
3679 
3680  filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3681  filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
3682 
3683  for (p = 0; p < 2; p++) {
3684  dst = f->data[1 + p] + uvoff;
3685  filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3686  filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
3687  }
3688 }
3689 
3690 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3691 {
3692  int sb_start = ( idx * n) >> log2_n;
3693  int sb_end = ((idx + 1) * n) >> log2_n;
3694  *start = FFMIN(sb_start, n) << 3;
3695  *end = FFMIN(sb_end, n) << 3;
3696 }
3697 
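set_tile_offset() splits the superblock grid evenly among 1 << log2_n tiles and converts the boundaries to 8x8-block units (<< 3). A usage example with made-up numbers, purely to illustrate the arithmetic:

    /* 11 superblock columns split into 4 tile columns (log2_n = 2) gives the
     * superblock boundaries 0, 2, 5, 8, 11, i.e. 0, 16, 40, 64, 88 in 8x8 units. */
    int start, end;
    set_tile_offset(&start, &end, 1, 2, 11); /* second tile column */
    /* start == (1 * 11 >> 2) << 3 == 16, end == (2 * 11 >> 2) << 3 == 40 */
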
3698 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3699  int max_count, int update_factor)
3700 {
3701  unsigned ct = ct0 + ct1, p2, p1;
3702 
3703  if (!ct)
3704  return;
3705 
3706  p1 = *p;
3707  p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3708  p2 = av_clip(p2, 1, 255);
3709  ct = FFMIN(ct, max_count);
3710  update_factor = FASTDIV(update_factor * ct, max_count);
3711 
3712  // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3713  *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3714 }
3715 
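adapt_prob() moves the stored probability p1 towards the empirical branch-0 probability p2, with a weight that saturates once max_count observations have been seen. A worked example with arbitrary numbers:

    /* Illustrative only: ct0 = 30, ct1 = 10, *p = 128, max_count = 24, update_factor = 128.
     *   ct = 40
     *   p2 = (30 * 256 + 20) / 40 = 192   (already within 1..255)
     *   ct is clamped to 24, so update_factor stays 128 * 24 / 24 = 128
     *   *p = 128 + (((192 - 128) * 128 + 128) >> 8) = 128 + 32 = 160
     */
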
3716 static void adapt_probs(VP9Context *s)
3717 {
3718  int i, j, k, l, m;
3719  prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
3720  int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
3721 
3722  // coefficients
3723  for (i = 0; i < 4; i++)
3724  for (j = 0; j < 2; j++)
3725  for (k = 0; k < 2; k++)
3726  for (l = 0; l < 6; l++)
3727  for (m = 0; m < 6; m++) {
3728  uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
3729  unsigned *e = s->counts.eob[i][j][k][l][m];
3730  unsigned *c = s->counts.coef[i][j][k][l][m];
3731 
3732  if (l == 0 && m >= 3) // dc only has 3 pt
3733  break;
3734 
3735  adapt_prob(&pp[0], e[0], e[1], 24, uf);
3736  adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3737  adapt_prob(&pp[2], c[1], c[2], 24, uf);
3738  }
3739 
3740  if (s->s.h.keyframe || s->s.h.intraonly) {
3741  memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3742  memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3743  memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3744  memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3745  return;
3746  }
3747 
3748  // skip flag
3749  for (i = 0; i < 3; i++)
3750  adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3751 
3752  // intra/inter flag
3753  for (i = 0; i < 4; i++)
3754  adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3755 
3756  // comppred flag
3757  if (s->s.h.comppredmode == PRED_SWITCHABLE) {
3758  for (i = 0; i < 5; i++)
3759  adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3760  }
3761 
3762  // reference frames
3763  if (s->s.h.comppredmode != PRED_SINGLEREF) {
3764  for (i = 0; i < 5; i++)
3765  adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3766  s->counts.comp_ref[i][1], 20, 128);
3767  }
3768 
3769  if (s->s.h.comppredmode != PRED_COMPREF) {
3770  for (i = 0; i < 5; i++) {
3771  uint8_t *pp = p->single_ref[i];
3772  unsigned (*c)[2] = s->counts.single_ref[i];
3773 
3774  adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3775  adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3776  }
3777  }
3778 
3779  // block partitioning
3780  for (i = 0; i < 4; i++)
3781  for (j = 0; j < 4; j++) {
3782  uint8_t *pp = p->partition[i][j];
3783  unsigned *c = s->counts.partition[i][j];
3784 
3785  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3786  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3787  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3788  }
3789 
3790  // tx size
3791  if (s->s.h.txfmmode == TX_SWITCHABLE) {
3792  for (i = 0; i < 2; i++) {
3793  unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3794 
3795  adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3796  adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3797  adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3798  adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3799  adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3800  adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3801  }
3802  }
3803 
3804  // interpolation filter
3805  if (s->s.h.filtermode == FILTER_SWITCHABLE) {
3806  for (i = 0; i < 4; i++) {
3807  uint8_t *pp = p->filter[i];
3808  unsigned *c = s->counts.filter[i];
3809 
3810  adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3811  adapt_prob(&pp[1], c[1], c[2], 20, 128);
3812  }
3813  }
3814 
3815  // inter modes
3816  for (i = 0; i < 7; i++) {
3817  uint8_t *pp = p->mv_mode[i];
3818  unsigned *c = s->counts.mv_mode[i];
3819 
3820  adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3821  adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3822  adapt_prob(&pp[2], c[1], c[3], 20, 128);
3823  }
3824 
3825  // mv joints
3826  {
3827  uint8_t *pp = p->mv_joint;
3828  unsigned *c = s->counts.mv_joint;
3829 
3830  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3831  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3832  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3833  }
3834 
3835  // mv components
3836  for (i = 0; i < 2; i++) {
3837  uint8_t *pp;
3838  unsigned *c, (*c2)[2], sum;
3839 
3840  adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3841  s->counts.mv_comp[i].sign[1], 20, 128);
3842 
3843  pp = p->mv_comp[i].classes;
3844  c = s->counts.mv_comp[i].classes;
3845  sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3846  adapt_prob(&pp[0], c[0], sum, 20, 128);
3847  sum -= c[1];
3848  adapt_prob(&pp[1], c[1], sum, 20, 128);
3849  sum -= c[2] + c[3];
3850  adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3851  adapt_prob(&pp[3], c[2], c[3], 20, 128);
3852  sum -= c[4] + c[5];
3853  adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3854  adapt_prob(&pp[5], c[4], c[5], 20, 128);
3855  sum -= c[6];
3856  adapt_prob(&pp[6], c[6], sum, 20, 128);
3857  adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3858  adapt_prob(&pp[8], c[7], c[8], 20, 128);
3859  adapt_prob(&pp[9], c[9], c[10], 20, 128);
3860 
3861  adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3862  s->counts.mv_comp[i].class0[1], 20, 128);
3863  pp = p->mv_comp[i].bits;
3864  c2 = s->counts.mv_comp[i].bits;
3865  for (j = 0; j < 10; j++)
3866  adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3867 
3868  for (j = 0; j < 2; j++) {
3869  pp = p->mv_comp[i].class0_fp[j];
3870  c = s->counts.mv_comp[i].class0_fp[j];
3871  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3872  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3873  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3874  }
3875  pp = p->mv_comp[i].fp;
3876  c = s->counts.mv_comp[i].fp;
3877  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3878  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3879  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3880 
3881  if (s->s.h.highprecisionmvs) {
3882  adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3883  s->counts.mv_comp[i].class0_hp[1], 20, 128);
3884  adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3885  s->counts.mv_comp[i].hp[1], 20, 128);
3886  }
3887  }
3888 
3889  // y intra modes
3890  for (i = 0; i < 4; i++) {
3891  uint8_t *pp = p->y_mode[i];
3892  unsigned *c = s->counts.y_mode[i], sum, s2;
3893 
3894  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3895  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3896  sum -= c[TM_VP8_PRED];
3897  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3898  sum -= c[VERT_PRED];
3899  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3900  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3901  sum -= s2;
3902  adapt_prob(&pp[3], s2, sum, 20, 128);
3903  s2 -= c[HOR_PRED];
3904  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3905  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3906  sum -= c[DIAG_DOWN_LEFT_PRED];
3907  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3908  sum -= c[VERT_LEFT_PRED];
3909  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3910  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3911  }
3912 
3913  // uv intra modes
3914  for (i = 0; i < 10; i++) {
3915  uint8_t *pp = p->uv_mode[i];
3916  unsigned *c = s->counts.uv_mode[i], sum, s2;
3917 
3918  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3919  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3920  sum -= c[TM_VP8_PRED];
3921  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3922  sum -= c[VERT_PRED];
3923  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3924  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3925  sum -= s2;
3926  adapt_prob(&pp[3], s2, sum, 20, 128);
3927  s2 -= c[HOR_PRED];
3928  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3929  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3930  sum -= c[DIAG_DOWN_LEFT_PRED];
3931  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3932  sum -= c[VERT_LEFT_PRED];
3933  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3934  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3935  }
3936 }
3937 
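Every block in adapt_probs() follows the same pattern: walk a binary probability tree and, at each internal node, pass adapt_prob() the count of the left subtree against the sum of the remaining leaves. For the 4-leaf partition tree this reduces to (sketch mirroring the loop above, with the leaf meanings spelled out):

    /* c[] counts PARTITION_NONE/H/V/SPLIT decisions for one context,
     * pp[] holds the three node probabilities of the partition tree. */
    adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128); /* NONE vs any split  */
    adapt_prob(&pp[1], c[1], c[2] + c[3],        20, 128); /* H    vs V or SPLIT */
    adapt_prob(&pp[2], c[2], c[3],               20, 128); /* V    vs SPLIT      */
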
3938 static void free_buffers(VP9Context *s)
3939 {
3940  av_freep(&s->intra_pred_data[0]);
3941  av_freep(&s->b_base);
3942  av_freep(&s->block_base);
3943 }
3944 
3945 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3946 {
3947  VP9Context *s = ctx->priv_data;
3948  int i;
3949 
3950  for (i = 0; i < 3; i++) {
3951  if (s->s.frames[i].tf.f->buf[0])
3952  vp9_unref_frame(ctx, &s->s.frames[i]);
3953  av_frame_free(&s->s.frames[i].tf.f);
3954  }
3955  for (i = 0; i < 8; i++) {
3956  if (s->s.refs[i].f->buf[0])
3957  ff_thread_release_buffer(ctx, &s->s.refs[i]);
3958  av_frame_free(&s->s.refs[i].f);
3959  if (s->next_refs[i].f->buf[0])
3960  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3961  av_frame_free(&s->next_refs[i].f);
3962  }
3963  free_buffers(s);
3964  av_freep(&s->c_b);
3965  s->c_b_size = 0;
3966 
3967  return 0;
3968 }
3969 
3970 
3971 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3972  int *got_frame, AVPacket *pkt)
3973 {
3974  const uint8_t *data = pkt->data;
3975  int size = pkt->size;
3976  VP9Context *s = ctx->priv_data;
3977  int res, tile_row, tile_col, i, ref, row, col;
3978  int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
3979  (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
3980  ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3981  AVFrame *f;
3982  int bytesperpixel;
3983 
3984  if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3985  return res;
3986  } else if (res == 0) {
3987  if (!s->s.refs[ref].f->buf[0]) {
3988  av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3989  return AVERROR_INVALIDDATA;
3990  }
3991  if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
3992  return res;
3993  ((AVFrame *)frame)->pkt_pts = pkt->pts;
3994  ((AVFrame *)frame)->pkt_dts = pkt->dts;
3995  for (i = 0; i < 8; i++) {
3996  if (s->next_refs[i].f->buf[0])
3997  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3998  if (s->s.refs[i].f->buf[0] &&
3999  (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
4000  return res;
4001  }
4002  *got_frame = 1;
4003  return pkt->size;
4004  }
4005  data += res;
4006  size -= res;
4007 
4008  if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
4009  if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
4010  vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
4011  if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4012  (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
4013  return res;
4014  }
4015  if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
4016  vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR]);
4017  if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4018  (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
4019  return res;
4020  if (s->s.frames[CUR_FRAME].tf.f->buf[0])
4021  vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME]);
4022  if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
4023  return res;
4024  f = s->s.frames[CUR_FRAME].tf.f;
4025  f->key_frame = s->s.h.keyframe;
4026  f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4027  ls_y = f->linesize[0];
4028  ls_uv = f->linesize[1];
4029 
4030  if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
4031  (s->s.frames[REF_FRAME_MVPAIR].tf.f->width != s->s.frames[CUR_FRAME].tf.f->width ||
4032  s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
4033  vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
4034  }
4035 
4036  // ref frame setup
4037  for (i = 0; i < 8; i++) {
4038  if (s->next_refs[i].f->buf[0])
4039  ff_thread_release_buffer(ctx, &s->next_refs[i]);
4040  if (s->s.h.refreshrefmask & (1 << i)) {
4041  res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
4042  } else if (s->s.refs[i].f->buf[0]) {
4043  res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
4044  }
4045  if (res < 0)
4046  return res;
4047  }
4048 
4049  if (ctx->hwaccel) {
4050  res = ctx->hwaccel->start_frame(ctx, NULL, 0);
4051  if (res < 0)
4052  return res;
4053  res = ctx->hwaccel->decode_slice(ctx, pkt->data, pkt->size);
4054  if (res < 0)
4055  return res;
4056  res = ctx->hwaccel->end_frame(ctx);
4057  if (res < 0)
4058  return res;
4059  goto finish;
4060  }
4061 
4062  // main tile decode loop
4063  bytesperpixel = s->bytesperpixel;
4064  memset(s->above_partition_ctx, 0, s->cols);
4065  memset(s->above_skip_ctx, 0, s->cols);
4066  if (s->s.h.keyframe || s->s.h.intraonly) {
4067  memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4068  } else {
4069  memset(s->above_mode_ctx, NEARESTMV, s->cols);
4070  }
4071  memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4072  memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4073  memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4074  memset(s->above_segpred_ctx, 0, s->cols);
4075  s->pass = s->s.frames[CUR_FRAME].uses_2pass =
4076  ctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
4077  if ((res = update_block_buffers(ctx)) < 0) {
4078  av_log(ctx, AV_LOG_ERROR,
4079  "Failed to allocate block buffers\n");
4080  return res;
4081  }
4082  if (s->s.h.refreshctx && s->s.h.parallelmode) {
4083  int j, k, l, m;
4084 
4085  for (i = 0; i < 4; i++) {
4086  for (j = 0; j < 2; j++)
4087  for (k = 0; k < 2; k++)
4088  for (l = 0; l < 6; l++)
4089  for (m = 0; m < 6; m++)
4090  memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
4091  s->prob.coef[i][j][k][l][m], 3);
4092  if (s->s.h.txfmmode == i)
4093  break;
4094  }
4095  s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
4096  ff_thread_finish_setup(ctx);
4097  } else if (!s->s.h.refreshctx) {
4098  ff_thread_finish_setup(ctx);
4099  }
4100 
4101  do {
4102  yoff = uvoff = 0;
4103  s->b = s->b_base;
4104  s->block = s->block_base;
4105  s->uvblock[0] = s->uvblock_base[0];
4106  s->uvblock[1] = s->uvblock_base[1];
4107  s->eob = s->eob_base;
4108  s->uveob[0] = s->uveob_base[0];
4109  s->uveob[1] = s->uveob_base[1];
4110 
4111  for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
4112