FFmpeg — Doxygen source listing of vp9.c (VP9 compatible video decoder). The
HTML export's navigation bar ("All Data Structures … Pages" / "Go to the
documentation of this file") has been condensed into this provenance note.
1 /*
2  * VP9 compatible video decoder
3  *
4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "avcodec.h"
25 #include "get_bits.h"
26 #include "internal.h"
27 #include "profiles.h"
28 #include "thread.h"
29 #include "videodsp.h"
30 #include "vp56.h"
31 #include "vp9.h"
32 #include "vp9data.h"
33 #include "vp9dsp.h"
34 #include "libavutil/avassert.h"
35 #include "libavutil/pixdesc.h"
36 
37 #define VP9_SYNCCODE 0x498342
38 
// Per-superblock loopfilter state: one filter-strength byte per 8x8 block,
// plus bitmasks describing which edges must be filtered.
struct VP9Filter {
    // filter strength for each of the 8x8 blocks inside a 64x64 superblock
    uint8_t level[8 * 8];
    // edge-filter bitmasks; each bit selects a column within the row.
    // Dimensions: [plane: 0=y, 1=uv][edge direction: 0=col, 1=row]
    //             [row within sb][edge size: 0=16, 1=8, 2=4, 3=inner4]
    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
};
44 
// Per-block decode state (motion vectors, block/transform sizes).
// NOTE(review): several fields are elided in this extract (original listing
// lines 46-47 and 52 are missing) — consult the full file before relying on
// this struct's layout.
typedef struct VP9Block {
    VP56mv mv[4 /* b_idx */][2 /* ref */];  // up to 4 sub-block MVs, 2 references each
    enum BlockSize bs;                      // coded block size
    enum TxfmMode tx, uvtx;                 // luma and chroma transform sizes
    enum BlockLevel bl;                     // partition-tree level of this block
} VP9Block;
54 
// Main VP9 decoder context.
// NOTE(review): many fields are elided in this extract (the embedded
// numbering of the original listing jumps repeatedly); only the fields
// visible here are documented — consult the full file.
typedef struct VP9Context {

    unsigned c_b_size;              // allocated size of the per-tile range coder array

    int pass;                       // current pass when frame-level 2-pass decoding is used
    int row, row7, col, col7;       // current block position; row7/col7 presumably the
                                    // low 3 bits of row/col — TODO confirm against full file
    uint8_t *dst[3];                // output plane pointers for the current block
    ptrdiff_t y_stride, uv_stride;  // strides matching dst[]

    // sb_cols/rows, rows/cols and last_fmt are used for allocating all internal
    // arrays, and are thus per-thread. w/h and gf_fmt are synced between threads
    // and are therefore per-stream. pix_fmt represents the value in the header
    // of the currently processed frame.
    int w, h;
    enum AVPixelFormat pix_fmt, last_fmt, gf_fmt;
    unsigned sb_cols, sb_rows, rows, cols;  // frame size in 64x64 superblocks / 8x8 blocks

    struct {
        // NOTE(review): loopfilter LUT members (e.g. lim_lut, referenced
        // elsewhere in this file) are elided in this extract
    } filter_lut;
    struct {
        // saved per-frame-context coefficient probabilities (3 model probs)
        uint8_t coef[4][2][2][6][6][3];
    } prob_ctx[4];
    struct {
        // active coefficient probabilities, expanded to the full 11-node tree
        uint8_t coef[4][2][2][6][6][11];
    } prob;
    struct {
        // per-frame symbol statistics; presumably fed back for probability
        // adaptation — confirm against the adapt code in the full file
        unsigned y_mode[4][10];
        unsigned uv_mode[10][10];
        unsigned filter[4][3];
        unsigned mv_mode[7][4];
        unsigned intra[4][2];
        unsigned comp[5][2];
        unsigned single_ref[5][2][2];
        unsigned comp_ref[5][2];
        unsigned tx32p[2][4];
        unsigned tx16p[2][3];
        unsigned tx8p[2][2];
        unsigned skip[3][2];
        unsigned mv_joint[4];
        struct {
            unsigned sign[2];
            unsigned classes[11];
            unsigned class0[2];
            unsigned bits[10][2];
            unsigned class0_fp[2][4];
            unsigned fp[4];
            unsigned class0_hp[2];
            unsigned hp[2];
        } mv_comp[2];
        unsigned partition[4][4][4];
        unsigned coef[4][2][2][6][6][3];
        unsigned eob[4][2][2][6][6][2];
    } counts;

    // contextual (left/above) cache
    // FIXME maybe merge some of the below in a flags field?

    // whole-frame cache
    struct VP9Filter *lflvl;        // per-superblock-column loopfilter state

    // block reconstruction intermediates
    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
    struct { int x, y; } min_mv, max_mv;  // MV clamping bounds for the current block
    // aligned scratch buffers; purpose not visible in this extract —
    // presumably temporaries for scaled/edge-emulated prediction, verify
    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
    // per-reference MV scaling factors, 14-bit fixed point (x, y);
    // computed as (ref_dim << 14) / cur_dim in the header parser below
    uint16_t mvscale[3][2];
} VP9Context;
166 
// Block width/height lookup per BlockSize.
// NOTE(review): row [0] appears to give dimensions in 4x4-pixel units and
// row [1] in 8x8-pixel units (values are consistent with block sizes
// 64x64 down to 4x4, indexed by enum BlockSize) — confirm against vp9.h.
static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
    {
        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
        { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
    }, {
        { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
        { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
    }
};
176 
// (function signature elided in this extract — releases the per-frame
// buffer references held by frame f)
{
    // return the frame buffer to the frame-threading pool
    ff_thread_release_buffer(ctx, &f->tf);
    // NOTE(review): av_buffer_unref() calls for the frame's side buffers
    // (original listing lines 180-183) are elided in this extract
    f->segmentation_map = NULL;
}
185 
// (function signature elided in this extract — allocates the picture buffer
// and per-frame side data for frame f; returns 0 or a negative AVERROR)
{
    VP9Context *s = ctx->priv_data;
    int ret, sz;

    if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
        return ret;
    // sz = number of 8x8 blocks in the frame (64 per 64x64 superblock);
    // extradata packs sz segmentation-map bytes followed by sz mv-ref pairs
    sz = 64 * s->sb_cols * s->sb_rows;
    if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
        goto fail;
    }

    // NOTE(review): the segmentation_map assignment (original line 198) is
    // elided in this extract; f->mv points past the sz map bytes
    f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);

    if (ctx->hwaccel) {
        const AVHWAccel *hwaccel = ctx->hwaccel;
        if (hwaccel->frame_priv_data_size) {
            // NOTE(review): the allocation of f->hwaccel_priv_buf (original
            // lines 205-208) is elided in this extract; only the failure
            // check remains visible
            if (!f->hwaccel_priv_buf)
                goto fail;
        }
    }

    return 0;

fail:
    // drop whatever was acquired before the failure; ENOMEM is the only
    // error reachable through this path
    vp9_unref_frame(ctx, f);
    return AVERROR(ENOMEM);
}
218 
// (function signature elided in this extract — makes dst a new reference to
// src's frame and side buffers; returns 0 or a negative error code)
{
    int res;

    if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
        return res;
    } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
        goto fail;
    }

    // NOTE(review): the dst->segmentation_map assignment (original line 229)
    // is elided in this extract
    dst->mv = src->mv;
    dst->uses_2pass = src->uses_2pass;

    if (src->hwaccel_picture_private) {
        // NOTE(review): the av_buffer_ref() of the hwaccel private buffer
        // (original lines 234, 237) is elided; only the failure check remains
        if (!dst->hwaccel_priv_buf)
            goto fail;
    }

    return 0;

fail:
    // safe on a partially-referenced dst; releases whatever was taken
    vp9_unref_frame(ctx, dst);
    return AVERROR(ENOMEM);
}
246 
// (Re)negotiate the output pixel format (including hwaccel formats) and
// (re)allocate all frame-size-dependent internal arrays when the header's
// dimensions or format changed. Returns 0 on success, negative AVERROR
// otherwise.
// NOTE(review): two lines are elided in this extract (original listing
// lines 314 and 330) — see inline notes.
static int update_size(AVCodecContext *ctx, int w, int h)
{
#define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL + CONFIG_VP9_VAAPI_HWACCEL)
    enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
    VP9Context *s = ctx->priv_data;
    uint8_t *p;
    int bytesperpixel = s->bytesperpixel, res, cols, rows;

    av_assert0(w > 0 && h > 0);

    // format/size negotiation only when something actually changed
    if (!(s->pix_fmt == s->gf_fmt && w == s->w && h == s->h)) {
        if ((res = ff_set_dimensions(ctx, w, h)) < 0)
            return res;

        if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
            // hwaccel formats are only offered for 8-bit 4:2:0 content;
            // they are listed before the software format so
            // ff_thread_get_format() can prefer them
#if CONFIG_VP9_DXVA2_HWACCEL
            *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
#endif
#if CONFIG_VP9_D3D11VA_HWACCEL
            *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
#endif
#if CONFIG_VP9_VAAPI_HWACCEL
            *fmtp++ = AV_PIX_FMT_VAAPI;
#endif
        }

        *fmtp++ = s->pix_fmt;
        *fmtp = AV_PIX_FMT_NONE;

        res = ff_thread_get_format(ctx, pix_fmts);
        if (res < 0)
            return res;

        ctx->pix_fmt = res;
        s->gf_fmt = s->pix_fmt;
        s->w = w;
        s->h = h;
    }

    cols = (w + 7) >> 3;  // frame width in 8x8 blocks
    rows = (h + 7) >> 3;  // frame height in 8x8 blocks

    // internal arrays already match this size and format — nothing to do
    if (s->intra_pred_data[0] && cols == s->cols && rows == s->rows && s->pix_fmt == s->last_fmt)
        return 0;

    s->last_fmt = s->pix_fmt;
    s->sb_cols = (w + 63) >> 6;
    s->sb_rows = (h + 63) >> 6;
    s->cols = (w + 7) >> 3;
    s->rows = (h + 7) >> 3;

    // carve a single allocation into the various per-superblock-column arrays
#define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
    av_freep(&s->intra_pred_data[0]);
    // FIXME we slightly over-allocate here for subsampled chroma, but a little
    // bit of padding shouldn't affect performance...
    p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
                                sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
    if (!p)
        return AVERROR(ENOMEM);
    assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
    assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
    assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
    assign(s->above_y_nnz_ctx, uint8_t *, 16);
    assign(s->above_mode_ctx, uint8_t *, 16);
    assign(s->above_mv_ctx, VP56mv(*)[2], 16);
    assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
    assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
    // NOTE(review): one assign() line (original listing line 314) is elided
    // in this extract
    assign(s->above_skip_ctx, uint8_t *, 8);
    assign(s->above_txfm_ctx, uint8_t *, 8);
    assign(s->above_segpred_ctx, uint8_t *, 8);
    assign(s->above_intra_ctx, uint8_t *, 8);
    assign(s->above_comp_ctx, uint8_t *, 8);
    assign(s->above_ref_ctx, uint8_t *, 8);
    assign(s->above_filter_ctx, uint8_t *, 8);
    assign(s->lflvl, struct VP9Filter *, 1);
#undef assign

    // these will be re-allocated a little later
    av_freep(&s->b_base);
    av_freep(&s->block_base);

    // dsp tables depend on the bit depth — reinitialize on change
    if (s->bpp != s->last_bpp) {
        // NOTE(review): a dsp-init call (original listing line 330) is
        // elided in this extract
        ff_videodsp_init(&s->vdsp, s->bpp);
        s->last_bpp = s->bpp;
    }

    return 0;
}
337 
// (function signature elided in this extract — (re)allocates the per-block
// and coefficient buffers, sized for either 1-pass or 2-pass decoding)
{
    VP9Context *s = ctx->priv_data;
    int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;

    // NOTE(review): the guarding if-condition for this early return
    // (original listing lines 342-343) is elided in this extract
        return 0;

    av_free(s->b_base);
    av_free(s->block_base);
    // chroma sizes shrink with each subsampling direction
    chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
    chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
    if (s->s.frames[CUR_FRAME].uses_2pass) {
        // 2-pass: keep per-block state and coefficients for the whole frame
        int sbs = s->sb_cols * s->sb_rows;

        s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
        // one contiguous allocation: luma coefs, 2x chroma coefs, then
        // the luma and chroma EOB arrays, all times the superblock count
        s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
                                    16 * 16 + 2 * chroma_eobs) * sbs);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
        s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
        s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
        s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
    } else {
        // 1-pass: a single superblock's worth of scratch is enough
        s->b_base = av_malloc(sizeof(VP9Block));
        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
                                   16 * 16 + 2 * chroma_eobs);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
        s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
        s->uveob_base[0] = s->eob_base + 16 * 16;
        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
    }

    return 0;
}
379 
// for some reason the sign bit is at the end, not the start, of a bit sequence
// (function signature elided in this extract — reads an n-bit magnitude from
// gb, then one sign bit: 1 means negative)
{
    int v = get_bits(gb, n);
    return get_bits1(gb) ? -v : v;
}
386 
/**
 * Undo the "recentering" of a nonnegative delta code around m.
 *
 * Codes larger than 2*m pass through unchanged; smaller codes alternate
 * around m, odd codes mapping below it and even codes above it.
 */
static inline int inv_recenter_nonneg(int v, int m)
{
    if (v > 2 * m)
        return v;
    if (v & 1)
        return m - ((v + 1) >> 1);
    return m + (v >> 1);
}
391 
392 // differential forward probability updates
393 static int update_prob(VP56RangeCoder *c, int p)
394 {
395  static const int inv_map_table[255] = {
396  7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
397  189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
398  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
399  25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
400  40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
401  55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
402  70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
403  86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
404  101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
405  116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
406  131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
407  146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
408  161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
409  177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
410  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
411  207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
412  222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
413  237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
414  252, 253, 253,
415  };
416  int d;
417 
418  /* This code is trying to do a differential probability update. For a
419  * current probability A in the range [1, 255], the difference to a new
420  * probability of any value can be expressed differentially as 1-A,255-A
421  * where some part of this (absolute range) exists both in positive as
422  * well as the negative part, whereas another part only exists in one
423  * half. We're trying to code this shared part differentially, i.e.
424  * times two where the value of the lowest bit specifies the sign, and
425  * the single part is then coded on top of this. This absolute difference
426  * then again has a value of [0,254], but a bigger value in this range
427  * indicates that we're further away from the original value A, so we
428  * can code this as a VLC code, since higher values are increasingly
429  * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
430  * updates vs. the 'fine, exact' updates further down the range, which
431  * adds one extra dimension to this differential update model. */
432 
433  if (!vp8_rac_get(c)) {
434  d = vp8_rac_get_uint(c, 4) + 0;
435  } else if (!vp8_rac_get(c)) {
436  d = vp8_rac_get_uint(c, 4) + 16;
437  } else if (!vp8_rac_get(c)) {
438  d = vp8_rac_get_uint(c, 5) + 32;
439  } else {
440  d = vp8_rac_get_uint(c, 7);
441  if (d >= 65)
442  d = (d << 1) - 65 + vp8_rac_get(c);
443  d += 64;
444  av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
445  }
446 
447  return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
448  255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
449 }
450 
452 {
453  static const enum AVColorSpace colorspaces[8] = {
456  };
457  VP9Context *s = ctx->priv_data;
458  int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
459 
460  s->bpp_index = bits;
461  s->bpp = 8 + bits * 2;
462  s->bytesperpixel = (7 + s->bpp) >> 3;
463  ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
464  if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
465  static const enum AVPixelFormat pix_fmt_rgb[3] = {
467  };
468  s->ss_h = s->ss_v = 0;
470  s->pix_fmt = pix_fmt_rgb[bits];
471  if (ctx->profile & 1) {
472  if (get_bits1(&s->gb)) {
473  av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
474  return AVERROR_INVALIDDATA;
475  }
476  } else {
477  av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
478  ctx->profile);
479  return AVERROR_INVALIDDATA;
480  }
481  } else {
482  static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
489  };
491  if (ctx->profile & 1) {
492  s->ss_h = get_bits1(&s->gb);
493  s->ss_v = get_bits1(&s->gb);
494  s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
495  if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
496  av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
497  ctx->profile);
498  return AVERROR_INVALIDDATA;
499  } else if (get_bits1(&s->gb)) {
500  av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
501  ctx->profile);
502  return AVERROR_INVALIDDATA;
503  }
504  } else {
505  s->ss_h = s->ss_v = 1;
506  s->pix_fmt = pix_fmt_for_ss[bits][1][1];
507  }
508  }
509 
510  return 0;
511 }
512 
514  const uint8_t *data, int size, int *ref)
515 {
516  VP9Context *s = ctx->priv_data;
517  int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
518  int last_invisible;
519  const uint8_t *data2;
520 
521  /* general header */
522  if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
523  av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
524  return res;
525  }
526  if (get_bits(&s->gb, 2) != 0x2) { // frame marker
527  av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
528  return AVERROR_INVALIDDATA;
529  }
530  ctx->profile = get_bits1(&s->gb);
531  ctx->profile |= get_bits1(&s->gb) << 1;
532  if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
533  if (ctx->profile > 3) {
534  av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
535  return AVERROR_INVALIDDATA;
536  }
537  s->s.h.profile = ctx->profile;
538  if (get_bits1(&s->gb)) {
539  *ref = get_bits(&s->gb, 3);
540  return 0;
541  }
542  s->last_keyframe = s->s.h.keyframe;
543  s->s.h.keyframe = !get_bits1(&s->gb);
544  last_invisible = s->s.h.invisible;
545  s->s.h.invisible = !get_bits1(&s->gb);
546  s->s.h.errorres = get_bits1(&s->gb);
547  s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
548  if (s->s.h.keyframe) {
549  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
550  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
551  return AVERROR_INVALIDDATA;
552  }
553  if ((res = read_colorspace_details(ctx)) < 0)
554  return res;
555  // for profile 1, here follows the subsampling bits
556  s->s.h.refreshrefmask = 0xff;
557  w = get_bits(&s->gb, 16) + 1;
558  h = get_bits(&s->gb, 16) + 1;
559  if (get_bits1(&s->gb)) // display size
560  skip_bits(&s->gb, 32);
561  } else {
562  s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0;
563  s->s.h.resetctx = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
564  if (s->s.h.intraonly) {
565  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
566  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
567  return AVERROR_INVALIDDATA;
568  }
569  if (ctx->profile >= 1) {
570  if ((res = read_colorspace_details(ctx)) < 0)
571  return res;
572  } else {
573  s->ss_h = s->ss_v = 1;
574  s->bpp = 8;
575  s->bpp_index = 0;
576  s->bytesperpixel = 1;
577  s->pix_fmt = AV_PIX_FMT_YUV420P;
580  }
581  s->s.h.refreshrefmask = get_bits(&s->gb, 8);
582  w = get_bits(&s->gb, 16) + 1;
583  h = get_bits(&s->gb, 16) + 1;
584  if (get_bits1(&s->gb)) // display size
585  skip_bits(&s->gb, 32);
586  } else {
587  s->s.h.refreshrefmask = get_bits(&s->gb, 8);
588  s->s.h.refidx[0] = get_bits(&s->gb, 3);
589  s->s.h.signbias[0] = get_bits1(&s->gb) && !s->s.h.errorres;
590  s->s.h.refidx[1] = get_bits(&s->gb, 3);
591  s->s.h.signbias[1] = get_bits1(&s->gb) && !s->s.h.errorres;
592  s->s.h.refidx[2] = get_bits(&s->gb, 3);
593  s->s.h.signbias[2] = get_bits1(&s->gb) && !s->s.h.errorres;
594  if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
595  !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
596  !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
597  av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
598  return AVERROR_INVALIDDATA;
599  }
600  if (get_bits1(&s->gb)) {
601  w = s->s.refs[s->s.h.refidx[0]].f->width;
602  h = s->s.refs[s->s.h.refidx[0]].f->height;
603  } else if (get_bits1(&s->gb)) {
604  w = s->s.refs[s->s.h.refidx[1]].f->width;
605  h = s->s.refs[s->s.h.refidx[1]].f->height;
606  } else if (get_bits1(&s->gb)) {
607  w = s->s.refs[s->s.h.refidx[2]].f->width;
608  h = s->s.refs[s->s.h.refidx[2]].f->height;
609  } else {
610  w = get_bits(&s->gb, 16) + 1;
611  h = get_bits(&s->gb, 16) + 1;
612  }
613  // Note that in this code, "CUR_FRAME" is actually before we
614  // have formally allocated a frame, and thus actually represents
615  // the _last_ frame
616  s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
617  s->s.frames[CUR_FRAME].tf.f->height == h;
618  if (get_bits1(&s->gb)) // display size
619  skip_bits(&s->gb, 32);
620  s->s.h.highprecisionmvs = get_bits1(&s->gb);
622  get_bits(&s->gb, 2);
623  s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
624  s->s.h.signbias[0] != s->s.h.signbias[2];
625  if (s->s.h.allowcompinter) {
626  if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
627  s->s.h.fixcompref = 2;
628  s->s.h.varcompref[0] = 0;
629  s->s.h.varcompref[1] = 1;
630  } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
631  s->s.h.fixcompref = 1;
632  s->s.h.varcompref[0] = 0;
633  s->s.h.varcompref[1] = 2;
634  } else {
635  s->s.h.fixcompref = 0;
636  s->s.h.varcompref[0] = 1;
637  s->s.h.varcompref[1] = 2;
638  }
639  }
640  }
641  }
642  s->s.h.refreshctx = s->s.h.errorres ? 0 : get_bits1(&s->gb);
643  s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
644  s->s.h.framectxid = c = get_bits(&s->gb, 2);
645  if (s->s.h.keyframe || s->s.h.intraonly)
646  s->s.h.framectxid = 0; // BUG: libvpx ignores this field in keyframes
647 
648  /* loopfilter header data */
649  if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
650  // reset loopfilter defaults
651  s->s.h.lf_delta.ref[0] = 1;
652  s->s.h.lf_delta.ref[1] = 0;
653  s->s.h.lf_delta.ref[2] = -1;
654  s->s.h.lf_delta.ref[3] = -1;
655  s->s.h.lf_delta.mode[0] = 0;
656  s->s.h.lf_delta.mode[1] = 0;
657  memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
658  }
659  s->s.h.filter.level = get_bits(&s->gb, 6);
660  sharp = get_bits(&s->gb, 3);
661  // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
662  // the old cache values since they are still valid
663  if (s->s.h.filter.sharpness != sharp)
664  memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
665  s->s.h.filter.sharpness = sharp;
666  if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
667  if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
668  for (i = 0; i < 4; i++)
669  if (get_bits1(&s->gb))
670  s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
671  for (i = 0; i < 2; i++)
672  if (get_bits1(&s->gb))
673  s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
674  }
675  }
676 
677  /* quantization header data */
678  s->s.h.yac_qi = get_bits(&s->gb, 8);
679  s->s.h.ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
680  s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
681  s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
682  s->s.h.lossless = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
683  s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
684  if (s->s.h.lossless)
686 
687  /* segmentation header info */
688  if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
689  if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
690  for (i = 0; i < 7; i++)
691  s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
692  get_bits(&s->gb, 8) : 255;
693  if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
694  for (i = 0; i < 3; i++)
695  s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
696  get_bits(&s->gb, 8) : 255;
697  }
698  }
699 
700  if (get_bits1(&s->gb)) {
702  for (i = 0; i < 8; i++) {
703  if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
704  s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
705  if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
706  s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
707  if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
708  s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
709  s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
710  }
711  }
712  }
713 
714  // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
715  for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
716  int qyac, qydc, quvac, quvdc, lflvl, sh;
717 
718  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
719  if (s->s.h.segmentation.absolute_vals)
720  qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
721  else
722  qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
723  } else {
724  qyac = s->s.h.yac_qi;
725  }
726  qydc = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
727  quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
728  quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
729  qyac = av_clip_uintp2(qyac, 8);
730 
731  s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
732  s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
733  s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
734  s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
735 
736  sh = s->s.h.filter.level >= 32;
737  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
738  if (s->s.h.segmentation.absolute_vals)
739  lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
740  else
741  lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
742  } else {
743  lflvl = s->s.h.filter.level;
744  }
745  if (s->s.h.lf_delta.enabled) {
746  s->s.h.segmentation.feat[i].lflvl[0][0] =
747  s->s.h.segmentation.feat[i].lflvl[0][1] =
748  av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] * (1 << sh)), 6);
749  for (j = 1; j < 4; j++) {
750  s->s.h.segmentation.feat[i].lflvl[j][0] =
751  av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
752  s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
753  s->s.h.segmentation.feat[i].lflvl[j][1] =
754  av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
755  s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
756  }
757  } else {
758  memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
759  sizeof(s->s.h.segmentation.feat[i].lflvl));
760  }
761  }
762 
763  /* tiling info */
764  if ((res = update_size(ctx, w, h)) < 0) {
765  av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
766  w, h, s->pix_fmt);
767  return res;
768  }
769  for (s->s.h.tiling.log2_tile_cols = 0;
770  s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
771  s->s.h.tiling.log2_tile_cols++) ;
772  for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
773  max = FFMAX(0, max - 1);
774  while (max > s->s.h.tiling.log2_tile_cols) {
775  if (get_bits1(&s->gb))
776  s->s.h.tiling.log2_tile_cols++;
777  else
778  break;
779  }
780  s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
781  s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
782  if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
783  s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
784  s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
785  sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
786  if (!s->c_b) {
787  av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
788  return AVERROR(ENOMEM);
789  }
790  }
791 
792  /* check reference frames */
793  if (!s->s.h.keyframe && !s->s.h.intraonly) {
794  for (i = 0; i < 3; i++) {
795  AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
796  int refw = ref->width, refh = ref->height;
797 
798  if (ref->format != ctx->pix_fmt) {
799  av_log(ctx, AV_LOG_ERROR,
800  "Ref pixfmt (%s) did not match current frame (%s)",
803  return AVERROR_INVALIDDATA;
804  } else if (refw == w && refh == h) {
805  s->mvscale[i][0] = s->mvscale[i][1] = 0;
806  } else {
807  if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
808  av_log(ctx, AV_LOG_ERROR,
809  "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
810  refw, refh, w, h);
811  return AVERROR_INVALIDDATA;
812  }
813  s->mvscale[i][0] = (refw << 14) / w;
814  s->mvscale[i][1] = (refh << 14) / h;
815  s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
816  s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
817  }
818  }
819  }
820 
821  if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
822  s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
823  s->prob_ctx[3].p = vp9_default_probs;
824  memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
825  sizeof(vp9_default_coef_probs));
826  memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
827  sizeof(vp9_default_coef_probs));
828  memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
829  sizeof(vp9_default_coef_probs));
830  memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
831  sizeof(vp9_default_coef_probs));
832  } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
834  memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
835  sizeof(vp9_default_coef_probs));
836  }
837 
838  // next 16 bits is size of the rest of the header (arith-coded)
839  s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
840  s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;
841 
842  data2 = align_get_bits(&s->gb);
843  if (size2 > size - (data2 - data)) {
844  av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
845  return AVERROR_INVALIDDATA;
846  }
847  ff_vp56_init_range_decoder(&s->c, data2, size2);
848  if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
849  av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
850  return AVERROR_INVALIDDATA;
851  }
852 
853  if (s->s.h.keyframe || s->s.h.intraonly) {
854  memset(s->counts.coef, 0, sizeof(s->counts.coef));
855  memset(s->counts.eob, 0, sizeof(s->counts.eob));
856  } else {
857  memset(&s->counts, 0, sizeof(s->counts));
858  }
859  // FIXME is it faster to not copy here, but do it down in the fw updates
860  // as explicit copies if the fw update is missing (and skip the copy upon
861  // fw update)?
862  s->prob.p = s->prob_ctx[c].p;
863 
864  // txfm updates
865  if (s->s.h.lossless) {
866  s->s.h.txfmmode = TX_4X4;
867  } else {
868  s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
869  if (s->s.h.txfmmode == 3)
870  s->s.h.txfmmode += vp8_rac_get(&s->c);
871 
872  if (s->s.h.txfmmode == TX_SWITCHABLE) {
873  for (i = 0; i < 2; i++)
874  if (vp56_rac_get_prob_branchy(&s->c, 252))
875  s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
876  for (i = 0; i < 2; i++)
877  for (j = 0; j < 2; j++)
878  if (vp56_rac_get_prob_branchy(&s->c, 252))
879  s->prob.p.tx16p[i][j] =
880  update_prob(&s->c, s->prob.p.tx16p[i][j]);
881  for (i = 0; i < 2; i++)
882  for (j = 0; j < 3; j++)
883  if (vp56_rac_get_prob_branchy(&s->c, 252))
884  s->prob.p.tx32p[i][j] =
885  update_prob(&s->c, s->prob.p.tx32p[i][j]);
886  }
887  }
888 
889  // coef updates
890  for (i = 0; i < 4; i++) {
891  uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
892  if (vp8_rac_get(&s->c)) {
893  for (j = 0; j < 2; j++)
894  for (k = 0; k < 2; k++)
895  for (l = 0; l < 6; l++)
896  for (m = 0; m < 6; m++) {
897  uint8_t *p = s->prob.coef[i][j][k][l][m];
898  uint8_t *r = ref[j][k][l][m];
899  if (m >= 3 && l == 0) // dc only has 3 pt
900  break;
901  for (n = 0; n < 3; n++) {
902  if (vp56_rac_get_prob_branchy(&s->c, 252)) {
903  p[n] = update_prob(&s->c, r[n]);
904  } else {
905  p[n] = r[n];
906  }
907  }
908  p[3] = 0;
909  }
910  } else {
911  for (j = 0; j < 2; j++)
912  for (k = 0; k < 2; k++)
913  for (l = 0; l < 6; l++)
914  for (m = 0; m < 6; m++) {
915  uint8_t *p = s->prob.coef[i][j][k][l][m];
916  uint8_t *r = ref[j][k][l][m];
917  if (m > 3 && l == 0) // dc only has 3 pt
918  break;
919  memcpy(p, r, 3);
920  p[3] = 0;
921  }
922  }
923  if (s->s.h.txfmmode == i)
924  break;
925  }
926 
927  // mode updates
928  for (i = 0; i < 3; i++)
929  if (vp56_rac_get_prob_branchy(&s->c, 252))
930  s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
931  if (!s->s.h.keyframe && !s->s.h.intraonly) {
932  for (i = 0; i < 7; i++)
933  for (j = 0; j < 3; j++)
934  if (vp56_rac_get_prob_branchy(&s->c, 252))
935  s->prob.p.mv_mode[i][j] =
936  update_prob(&s->c, s->prob.p.mv_mode[i][j]);
937 
938  if (s->s.h.filtermode == FILTER_SWITCHABLE)
939  for (i = 0; i < 4; i++)
940  for (j = 0; j < 2; j++)
941  if (vp56_rac_get_prob_branchy(&s->c, 252))
942  s->prob.p.filter[i][j] =
943  update_prob(&s->c, s->prob.p.filter[i][j]);
944 
945  for (i = 0; i < 4; i++)
946  if (vp56_rac_get_prob_branchy(&s->c, 252))
947  s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
948 
949  if (s->s.h.allowcompinter) {
950  s->s.h.comppredmode = vp8_rac_get(&s->c);
951  if (s->s.h.comppredmode)
952  s->s.h.comppredmode += vp8_rac_get(&s->c);
953  if (s->s.h.comppredmode == PRED_SWITCHABLE)
954  for (i = 0; i < 5; i++)
955  if (vp56_rac_get_prob_branchy(&s->c, 252))
956  s->prob.p.comp[i] =
957  update_prob(&s->c, s->prob.p.comp[i]);
958  } else {
960  }
961 
962  if (s->s.h.comppredmode != PRED_COMPREF) {
963  for (i = 0; i < 5; i++) {
964  if (vp56_rac_get_prob_branchy(&s->c, 252))
965  s->prob.p.single_ref[i][0] =
966  update_prob(&s->c, s->prob.p.single_ref[i][0]);
967  if (vp56_rac_get_prob_branchy(&s->c, 252))
968  s->prob.p.single_ref[i][1] =
969  update_prob(&s->c, s->prob.p.single_ref[i][1]);
970  }
971  }
972 
973  if (s->s.h.comppredmode != PRED_SINGLEREF) {
974  for (i = 0; i < 5; i++)
975  if (vp56_rac_get_prob_branchy(&s->c, 252))
976  s->prob.p.comp_ref[i] =
977  update_prob(&s->c, s->prob.p.comp_ref[i]);
978  }
979 
980  for (i = 0; i < 4; i++)
981  for (j = 0; j < 9; j++)
982  if (vp56_rac_get_prob_branchy(&s->c, 252))
983  s->prob.p.y_mode[i][j] =
984  update_prob(&s->c, s->prob.p.y_mode[i][j]);
985 
986  for (i = 0; i < 4; i++)
987  for (j = 0; j < 4; j++)
988  for (k = 0; k < 3; k++)
989  if (vp56_rac_get_prob_branchy(&s->c, 252))
990  s->prob.p.partition[3 - i][j][k] =
991  update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
992 
993  // mv fields don't use the update_prob subexp model for some reason
994  for (i = 0; i < 3; i++)
995  if (vp56_rac_get_prob_branchy(&s->c, 252))
996  s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
997 
998  for (i = 0; i < 2; i++) {
999  if (vp56_rac_get_prob_branchy(&s->c, 252))
1000  s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1001 
1002  for (j = 0; j < 10; j++)
1003  if (vp56_rac_get_prob_branchy(&s->c, 252))
1004  s->prob.p.mv_comp[i].classes[j] =
1005  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1006 
1007  if (vp56_rac_get_prob_branchy(&s->c, 252))
1008  s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1009 
1010  for (j = 0; j < 10; j++)
1011  if (vp56_rac_get_prob_branchy(&s->c, 252))
1012  s->prob.p.mv_comp[i].bits[j] =
1013  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1014  }
1015 
1016  for (i = 0; i < 2; i++) {
1017  for (j = 0; j < 2; j++)
1018  for (k = 0; k < 3; k++)
1019  if (vp56_rac_get_prob_branchy(&s->c, 252))
1020  s->prob.p.mv_comp[i].class0_fp[j][k] =
1021  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1022 
1023  for (j = 0; j < 3; j++)
1024  if (vp56_rac_get_prob_branchy(&s->c, 252))
1025  s->prob.p.mv_comp[i].fp[j] =
1026  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1027  }
1028 
1029  if (s->s.h.highprecisionmvs) {
1030  for (i = 0; i < 2; i++) {
1031  if (vp56_rac_get_prob_branchy(&s->c, 252))
1032  s->prob.p.mv_comp[i].class0_hp =
1033  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1034 
1035  if (vp56_rac_get_prob_branchy(&s->c, 252))
1036  s->prob.p.mv_comp[i].hp =
1037  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1038  }
1039  }
1040  }
1041 
1042  return (data2 - data) + size2;
1043 }
1044 
1045 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1046  VP9Context *s)
1047 {
1048  dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1049  dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
1050 }
1051 
/* NOTE(review): the opening line of this definition is missing from this
 * excerpt (the doxygen scrape dropped it); it should read something like
 * "static void find_ref_mvs(VP9Context *s," — confirm against the full
 * source. The routine predicts a motion vector (written to *pmv) for
 * reference frame 'ref': it probes spatial neighbours, then the co-located
 * MV of the previous frame, first restricted to the same reference, then
 * any reference (sign-flipped when the references have opposite sign
 * bias). 'idx' selects the first (0) or second (1) distinct candidate;
 * 'sb' is the sub-block index for <8x8 partitions, or negative outside
 * that path. */
                                     VP56mv *pmv, int ref, int z, int idx, int sb)
{
    /* per-block-size list of neighbour (col,row) offsets to probe,
     * in priority order */
    static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
        [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
                      { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
        [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
                      { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
        [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
                      { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
        [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
                      { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
        [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
                      { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
        [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
                      { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
        [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
                      { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
        [BS_16x8]  = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
                      { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
        [BS_8x16]  = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
                      { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
        [BS_8x8]   = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
                      { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_8x4]   = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
                      { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_4x8]   = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
                      { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_4x4]   = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
                      { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
    };
    VP9Block *b = s->b;
    int row = s->row, col = s->col, row7 = s->row7;
    const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
#define INVALID_MV 0x80008000U
    /* mem holds the first distinct candidate (as a packed 32-bit MV);
     * mem_sub8x8 holds the first unclamped sub-8x8 candidate */
    uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
    int i;

/* accept a within-block candidate without clamping: return it directly for
 * idx == 0, or remember it and return the next different one for idx == 1 */
#define RETURN_DIRECT_MV(mv) \
    do { \
        uint32_t m = AV_RN32A(&mv); \
        if (!idx) { \
            AV_WN32A(pmv, m); \
            return; \
        } else if (mem == INVALID_MV) { \
            mem = m; \
        } else if (m != mem) { \
            AV_WN32A(pmv, m); \
            return; \
        } \
    } while (0)

    if (sb >= 0) {
        /* sub-8x8 partitions: earlier sub-block MVs of this very block are
         * the highest-priority candidates */
        if (sb == 2 || sb == 1) {
            RETURN_DIRECT_MV(b->mv[0][z]);
        } else if (sb == 3) {
            RETURN_DIRECT_MV(b->mv[2][z]);
            RETURN_DIRECT_MV(b->mv[1][z]);
            RETURN_DIRECT_MV(b->mv[0][z]);
        }

/* accept a neighbour candidate: clamp before comparing/returning; for
 * sb > 0 the comparison against the remembered unclamped MV (mem_sub8x8)
 * replicates libvpx behaviour, including the flagged AV_WN32A(pmv, 0) case */
#define RETURN_MV(mv) \
    do { \
        if (sb > 0) { \
            VP56mv tmp; \
            uint32_t m; \
            av_assert2(idx == 1); \
            av_assert2(mem != INVALID_MV); \
            if (mem_sub8x8 == INVALID_MV) { \
                clamp_mv(&tmp, &mv, s); \
                m = AV_RN32A(&tmp); \
                if (m != mem) { \
                    AV_WN32A(pmv, m); \
                    return; \
                } \
                mem_sub8x8 = AV_RN32A(&mv); \
            } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
                clamp_mv(&tmp, &mv, s); \
                m = AV_RN32A(&tmp); \
                if (m != mem) { \
                    AV_WN32A(pmv, m); \
                } else { \
                    /* BUG I'm pretty sure this isn't the intention */ \
                    AV_WN32A(pmv, 0); \
                } \
                return; \
            } \
        } else { \
            uint32_t m = AV_RN32A(&mv); \
            if (!idx) { \
                clamp_mv(pmv, &mv, s); \
                return; \
            } else if (mem == INVALID_MV) { \
                mem = m; \
            } else if (m != mem) { \
                clamp_mv(pmv, &mv, s); \
                return; \
            } \
        } \
    } while (0)

        if (row > 0) {
            struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
            if (mv->ref[0] == ref) {
                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
            }
        }
        if (col > s->tile_col_start) {
            struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
            if (mv->ref[0] == ref) {
                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
            }
        }
        /* the first two neighbour offsets were already handled above */
        i = 2;
    } else {
        i = 0;
    }

    // previously coded MVs in this neighbourhood, using same reference frame
    for (; i < 8; i++) {
        int c = p[i][0] + col, r = p[i][1] + row;

        if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
            struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];

            if (mv->ref[0] == ref) {
                RETURN_MV(mv->mv[0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(mv->mv[1]);
            }
        }
    }

    // MV at this position in previous frame, using same reference frame
    if (s->s.h.use_last_frame_mvs) {
        struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];

        /* NOTE(review): one or two lines are missing from this excerpt here
         * (the later "we already did that above" comment implies a
         * thread-progress wait on the reference frame) — restore from the
         * full source. */
        if (mv->ref[0] == ref) {
            RETURN_MV(mv->mv[0]);
        } else if (mv->ref[1] == ref) {
            RETURN_MV(mv->mv[1]);
        }
    }

/* accept a candidate that uses a different reference: negate it first when
 * the two references have opposite sign bias */
#define RETURN_SCALE_MV(mv, scale) \
    do { \
        if (scale) { \
            VP56mv mv_temp = { -mv.x, -mv.y }; \
            RETURN_MV(mv_temp); \
        } else { \
            RETURN_MV(mv); \
        } \
    } while (0)

    // previously coded MVs in this neighbourhood, using different reference frame
    for (i = 0; i < 8; i++) {
        int c = p[i][0] + col, r = p[i][1] + row;

        if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
            struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];

            if (mv->ref[0] != ref && mv->ref[0] >= 0) {
                RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
            }
            if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
                // BUG - libvpx has this condition regardless of whether
                // we used the first ref MV and pre-scaling
                AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
                RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
            }
        }
    }

    // MV at this position in previous frame, using different reference frame
    if (s->s.h.use_last_frame_mvs) {
        struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];

        // no need to await_progress, because we already did that above
        if (mv->ref[0] != ref && mv->ref[0] >= 0) {
            RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
        }
        if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
            // BUG - libvpx has this condition regardless of whether
            // we used the first ref MV and pre-scaling
            AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
            RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
        }
    }

    /* nothing usable found: fall back to a (clamped) zero MV */
    AV_ZERO32(pmv);
    clamp_mv(pmv, pmv, s);
#undef INVALID_MV
#undef RETURN_MV
#undef RETURN_SCALE_MV
}
1253 
1254 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1255 {
1256  int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1257  int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1258  s->prob.p.mv_comp[idx].classes);
1259 
1260  s->counts.mv_comp[idx].sign[sign]++;
1261  s->counts.mv_comp[idx].classes[c]++;
1262  if (c) {
1263  int m;
1264 
1265  for (n = 0, m = 0; m < c; m++) {
1266  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1267  n |= bit << m;
1268  s->counts.mv_comp[idx].bits[m][bit]++;
1269  }
1270  n <<= 3;
1271  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1272  n |= bit << 1;
1273  s->counts.mv_comp[idx].fp[bit]++;
1274  if (hp) {
1275  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1276  s->counts.mv_comp[idx].hp[bit]++;
1277  n |= bit;
1278  } else {
1279  n |= 1;
1280  // bug in libvpx - we count for bw entropy purposes even if the
1281  // bit wasn't coded
1282  s->counts.mv_comp[idx].hp[1]++;
1283  }
1284  n += 8 << c;
1285  } else {
1286  n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1287  s->counts.mv_comp[idx].class0[n]++;
1288  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1289  s->prob.p.mv_comp[idx].class0_fp[n]);
1290  s->counts.mv_comp[idx].class0_fp[n][bit]++;
1291  n = (n << 3) | (bit << 1);
1292  if (hp) {
1293  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1294  s->counts.mv_comp[idx].class0_hp[bit]++;
1295  n |= bit;
1296  } else {
1297  n |= 1;
1298  // bug in libvpx - we count for bw entropy purposes even if the
1299  // bit wasn't coded
1300  s->counts.mv_comp[idx].class0_hp[1]++;
1301  }
1302  }
1303 
1304  return sign ? -(n + 1) : (n + 1);
1305 }
1306 
/**
 * Derive the motion vector(s) for one (sub-)block.
 *
 * mode is ZEROMV/NEARESTMV/NEARMV/NEWMV; sb is the sub-block index for
 * <8x8 partitions, or -1 for the whole block. Fills mv[0] (first ref)
 * and, for compound prediction, mv[1] (second ref). For NEWMV a coded
 * delta is added on top of the predicted vector.
 */
static void fill_mv(VP9Context *s,
                    VP56mv *mv, int mode, int sb)
{
    VP9Block *b = s->b;

    if (mode == ZEROMV) {
        /* clears both mv[0] and mv[1] with one 8-byte store */
        AV_ZERO64(mv);
    } else {
        int hp;

        // FIXME cache this value and reuse for other subblocks
        find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
                     mode == NEWMV ? -1 : sb);
        // FIXME maybe move this code into find_ref_mvs()
        /* without usable high precision, round the prediction to even */
        if ((mode == NEWMV || sb == -1) &&
            !(hp = s->s.h.highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
            if (mv[0].y & 1) {
                if (mv[0].y < 0)
                    mv[0].y++;
                else
                    mv[0].y--;
            }
            if (mv[0].x & 1) {
                if (mv[0].x < 0)
                    mv[0].x++;
                else
                    mv[0].x--;
            }
        }
        if (mode == NEWMV) {
            /* NOTE(review): the line declaring j and reading the MV joint
             * code ("int j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,")
             * is missing from this excerpt — restore from the full source. */
                                     s->prob.p.mv_joint);

            s->counts.mv_joint[j]++;
            if (j >= MV_JOINT_V)
                mv[0].y += read_mv_component(s, 0, hp);
            if (j & 1)
                mv[0].x += read_mv_component(s, 1, hp);
        }

        if (b->comp) {
            /* same procedure for the second reference of a compound block */
            // FIXME cache this value and reuse for other subblocks
            find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
                         mode == NEWMV ? -1 : sb);
            if ((mode == NEWMV || sb == -1) &&
                !(hp = s->s.h.highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
                if (mv[1].y & 1) {
                    if (mv[1].y < 0)
                        mv[1].y++;
                    else
                        mv[1].y--;
                }
                if (mv[1].x & 1) {
                    if (mv[1].x < 0)
                        mv[1].x++;
                    else
                        mv[1].x--;
                }
            }
            if (mode == NEWMV) {
                /* NOTE(review): same missing "int j = vp8_rac_get_tree(...,"
                 * line here as above — restore from the full source. */
                                     s->prob.p.mv_joint);

                s->counts.mv_joint[j]++;
                if (j >= MV_JOINT_V)
                    mv[1].y += read_mv_component(s, 0, hp);
                if (j & 1)
                    mv[1].x += read_mv_component(s, 1, hp);
            }
        }
    }
}
1379 
1380 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1381  ptrdiff_t stride, int v)
1382 {
1383  switch (w) {
1384  case 1:
1385  do {
1386  *ptr = v;
1387  ptr += stride;
1388  } while (--h);
1389  break;
1390  case 2: {
1391  int v16 = v * 0x0101;
1392  do {
1393  AV_WN16A(ptr, v16);
1394  ptr += stride;
1395  } while (--h);
1396  break;
1397  }
1398  case 4: {
1399  uint32_t v32 = v * 0x01010101;
1400  do {
1401  AV_WN32A(ptr, v32);
1402  ptr += stride;
1403  } while (--h);
1404  break;
1405  }
1406  case 8: {
1407 #if HAVE_FAST_64BIT
1408  uint64_t v64 = v * 0x0101010101010101ULL;
1409  do {
1410  AV_WN64A(ptr, v64);
1411  ptr += stride;
1412  } while (--h);
1413 #else
1414  uint32_t v32 = v * 0x01010101;
1415  do {
1416  AV_WN32A(ptr, v32);
1417  AV_WN32A(ptr + 4, v32);
1418  ptr += stride;
1419  } while (--h);
1420 #endif
1421  break;
1422  }
1423  }
1424 }
1425 
1427 {
1428  static const uint8_t left_ctx[N_BS_SIZES] = {
1429  0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1430  };
1431  static const uint8_t above_ctx[N_BS_SIZES] = {
1432  0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1433  };
1434  static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1436  TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1437  };
1438  VP9Context *s = ctx->priv_data;
1439  VP9Block *b = s->b;
1440  int row = s->row, col = s->col, row7 = s->row7;
1441  enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1442  int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1443  int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1444  int have_a = row > 0, have_l = col > s->tile_col_start;
1445  int vref, filter_id;
1446 
1447  if (!s->s.h.segmentation.enabled) {
1448  b->seg_id = 0;
1449  } else if (s->s.h.keyframe || s->s.h.intraonly) {
1450  b->seg_id = !s->s.h.segmentation.update_map ? 0 :
1452  } else if (!s->s.h.segmentation.update_map ||
1453  (s->s.h.segmentation.temporal &&
1456  s->left_segpred_ctx[row7]]))) {
1458  int pred = 8, x;
1460 
1463  for (y = 0; y < h4; y++) {
1464  int idx_base = (y + row) * 8 * s->sb_cols + col;
1465  for (x = 0; x < w4; x++)
1466  pred = FFMIN(pred, refsegmap[idx_base + x]);
1467  }
1468  av_assert1(pred < 8);
1469  b->seg_id = pred;
1470  } else {
1471  b->seg_id = 0;
1472  }
1473 
1474  memset(&s->above_segpred_ctx[col], 1, w4);
1475  memset(&s->left_segpred_ctx[row7], 1, h4);
1476  } else {
1478  s->s.h.segmentation.prob);
1479 
1480  memset(&s->above_segpred_ctx[col], 0, w4);
1481  memset(&s->left_segpred_ctx[row7], 0, h4);
1482  }
1483  if (s->s.h.segmentation.enabled &&
1484  (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
1485  setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1486  bw4, bh4, 8 * s->sb_cols, b->seg_id);
1487  }
1488 
1489  b->skip = s->s.h.segmentation.enabled &&
1490  s->s.h.segmentation.feat[b->seg_id].skip_enabled;
1491  if (!b->skip) {
1492  int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1493  b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1494  s->counts.skip[c][b->skip]++;
1495  }
1496 
1497  if (s->s.h.keyframe || s->s.h.intraonly) {
1498  b->intra = 1;
1499  } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1500  b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
1501  } else {
1502  int c, bit;
1503 
1504  if (have_a && have_l) {
1505  c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1506  c += (c == 2);
1507  } else {
1508  c = have_a ? 2 * s->above_intra_ctx[col] :
1509  have_l ? 2 * s->left_intra_ctx[row7] : 0;
1510  }
1511  bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1512  s->counts.intra[c][bit]++;
1513  b->intra = !bit;
1514  }
1515 
1516  if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
1517  int c;
1518  if (have_a) {
1519  if (have_l) {
1520  c = (s->above_skip_ctx[col] ? max_tx :
1521  s->above_txfm_ctx[col]) +
1522  (s->left_skip_ctx[row7] ? max_tx :
1523  s->left_txfm_ctx[row7]) > max_tx;
1524  } else {
1525  c = s->above_skip_ctx[col] ? 1 :
1526  (s->above_txfm_ctx[col] * 2 > max_tx);
1527  }
1528  } else if (have_l) {
1529  c = s->left_skip_ctx[row7] ? 1 :
1530  (s->left_txfm_ctx[row7] * 2 > max_tx);
1531  } else {
1532  c = 1;
1533  }
1534  switch (max_tx) {
1535  case TX_32X32:
1536  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1537  if (b->tx) {
1538  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1539  if (b->tx == 2)
1540  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1541  }
1542  s->counts.tx32p[c][b->tx]++;
1543  break;
1544  case TX_16X16:
1545  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1546  if (b->tx)
1547  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1548  s->counts.tx16p[c][b->tx]++;
1549  break;
1550  case TX_8X8:
1551  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1552  s->counts.tx8p[c][b->tx]++;
1553  break;
1554  case TX_4X4:
1555  b->tx = TX_4X4;
1556  break;
1557  }
1558  } else {
1559  b->tx = FFMIN(max_tx, s->s.h.txfmmode);
1560  }
1561 
1562  if (s->s.h.keyframe || s->s.h.intraonly) {
1563  uint8_t *a = &s->above_mode_ctx[col * 2];
1564  uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1565 
1566  b->comp = 0;
1567  if (b->bs > BS_8x8) {
1568  // FIXME the memory storage intermediates here aren't really
1569  // necessary, they're just there to make the code slightly
1570  // simpler for now
1571  b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1572  vp9_default_kf_ymode_probs[a[0]][l[0]]);
1573  if (b->bs != BS_8x4) {
1575  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1576  l[0] = a[1] = b->mode[1];
1577  } else {
1578  l[0] = a[1] = b->mode[1] = b->mode[0];
1579  }
1580  if (b->bs != BS_4x8) {
1581  b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1582  vp9_default_kf_ymode_probs[a[0]][l[1]]);
1583  if (b->bs != BS_8x4) {
1585  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1586  l[1] = a[1] = b->mode[3];
1587  } else {
1588  l[1] = a[1] = b->mode[3] = b->mode[2];
1589  }
1590  } else {
1591  b->mode[2] = b->mode[0];
1592  l[1] = a[1] = b->mode[3] = b->mode[1];
1593  }
1594  } else {
1596  vp9_default_kf_ymode_probs[*a][*l]);
1597  b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1598  // FIXME this can probably be optimized
1599  memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1600  memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1601  }
1604  } else if (b->intra) {
1605  b->comp = 0;
1606  if (b->bs > BS_8x8) {
1608  s->prob.p.y_mode[0]);
1609  s->counts.y_mode[0][b->mode[0]]++;
1610  if (b->bs != BS_8x4) {
1612  s->prob.p.y_mode[0]);
1613  s->counts.y_mode[0][b->mode[1]]++;
1614  } else {
1615  b->mode[1] = b->mode[0];
1616  }
1617  if (b->bs != BS_4x8) {
1619  s->prob.p.y_mode[0]);
1620  s->counts.y_mode[0][b->mode[2]]++;
1621  if (b->bs != BS_8x4) {
1623  s->prob.p.y_mode[0]);
1624  s->counts.y_mode[0][b->mode[3]]++;
1625  } else {
1626  b->mode[3] = b->mode[2];
1627  }
1628  } else {
1629  b->mode[2] = b->mode[0];
1630  b->mode[3] = b->mode[1];
1631  }
1632  } else {
1633  static const uint8_t size_group[10] = {
1634  3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1635  };
1636  int sz = size_group[b->bs];
1637 
1639  s->prob.p.y_mode[sz]);
1640  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1641  s->counts.y_mode[sz][b->mode[3]]++;
1642  }
1644  s->prob.p.uv_mode[b->mode[3]]);
1645  s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1646  } else {
1647  static const uint8_t inter_mode_ctx_lut[14][14] = {
1648  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1649  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1650  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1651  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1652  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1653  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1654  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1655  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1656  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1657  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1658  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1659  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1660  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1661  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1662  };
1663 
1664  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1665  av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
1666  b->comp = 0;
1667  b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
1668  } else {
1669  // read comp_pred flag
1670  if (s->s.h.comppredmode != PRED_SWITCHABLE) {
1671  b->comp = s->s.h.comppredmode == PRED_COMPREF;
1672  } else {
1673  int c;
1674 
1675  // FIXME add intra as ref=0xff (or -1) to make these easier?
1676  if (have_a) {
1677  if (have_l) {
1678  if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1679  c = 4;
1680  } else if (s->above_comp_ctx[col]) {
1681  c = 2 + (s->left_intra_ctx[row7] ||
1682  s->left_ref_ctx[row7] == s->s.h.fixcompref);
1683  } else if (s->left_comp_ctx[row7]) {
1684  c = 2 + (s->above_intra_ctx[col] ||
1685  s->above_ref_ctx[col] == s->s.h.fixcompref);
1686  } else {
1687  c = (!s->above_intra_ctx[col] &&
1688  s->above_ref_ctx[col] == s->s.h.fixcompref) ^
1689  (!s->left_intra_ctx[row7] &&
1690  s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
1691  }
1692  } else {
1693  c = s->above_comp_ctx[col] ? 3 :
1694  (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
1695  }
1696  } else if (have_l) {
1697  c = s->left_comp_ctx[row7] ? 3 :
1698  (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
1699  } else {
1700  c = 1;
1701  }
1702  b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1703  s->counts.comp[c][b->comp]++;
1704  }
1705 
1706  // read actual references
1707  // FIXME probably cache a few variables here to prevent repetitive
1708  // memory accesses below
1709  if (b->comp) /* two references */ {
1710  int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
1711 
1712  b->ref[fix_idx] = s->s.h.fixcompref;
1713  // FIXME can this codeblob be replaced by some sort of LUT?
1714  if (have_a) {
1715  if (have_l) {
1716  if (s->above_intra_ctx[col]) {
1717  if (s->left_intra_ctx[row7]) {
1718  c = 2;
1719  } else {
1720  c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1721  }
1722  } else if (s->left_intra_ctx[row7]) {
1723  c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1724  } else {
1725  int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1726 
1727  if (refl == refa && refa == s->s.h.varcompref[1]) {
1728  c = 0;
1729  } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1730  if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
1731  (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
1732  c = 4;
1733  } else {
1734  c = (refa == refl) ? 3 : 1;
1735  }
1736  } else if (!s->left_comp_ctx[row7]) {
1737  if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
1738  c = 1;
1739  } else {
1740  c = (refl == s->s.h.varcompref[1] &&
1741  refa != s->s.h.varcompref[1]) ? 2 : 4;
1742  }
1743  } else if (!s->above_comp_ctx[col]) {
1744  if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
1745  c = 1;
1746  } else {
1747  c = (refa == s->s.h.varcompref[1] &&
1748  refl != s->s.h.varcompref[1]) ? 2 : 4;
1749  }
1750  } else {
1751  c = (refl == refa) ? 4 : 2;
1752  }
1753  }
1754  } else {
1755  if (s->above_intra_ctx[col]) {
1756  c = 2;
1757  } else if (s->above_comp_ctx[col]) {
1758  c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1759  } else {
1760  c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1761  }
1762  }
1763  } else if (have_l) {
1764  if (s->left_intra_ctx[row7]) {
1765  c = 2;
1766  } else if (s->left_comp_ctx[row7]) {
1767  c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1768  } else {
1769  c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1770  }
1771  } else {
1772  c = 2;
1773  }
1774  bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1775  b->ref[var_idx] = s->s.h.varcompref[bit];
1776  s->counts.comp_ref[c][bit]++;
1777  } else /* single reference */ {
1778  int bit, c;
1779 
1780  if (have_a && !s->above_intra_ctx[col]) {
1781  if (have_l && !s->left_intra_ctx[row7]) {
1782  if (s->left_comp_ctx[row7]) {
1783  if (s->above_comp_ctx[col]) {
1784  c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
1785  !s->above_ref_ctx[col]);
1786  } else {
1787  c = (3 * !s->above_ref_ctx[col]) +
1788  (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1789  }
1790  } else if (s->above_comp_ctx[col]) {
1791  c = (3 * !s->left_ref_ctx[row7]) +
1792  (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1793  } else {
1794  c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1795  }
1796  } else if (s->above_intra_ctx[col]) {
1797  c = 2;
1798  } else if (s->above_comp_ctx[col]) {
1799  c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1800  } else {
1801  c = 4 * (!s->above_ref_ctx[col]);
1802  }
1803  } else if (have_l && !s->left_intra_ctx[row7]) {
1804  if (s->left_intra_ctx[row7]) {
1805  c = 2;
1806  } else if (s->left_comp_ctx[row7]) {
1807  c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1808  } else {
1809  c = 4 * (!s->left_ref_ctx[row7]);
1810  }
1811  } else {
1812  c = 2;
1813  }
1814  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1815  s->counts.single_ref[c][0][bit]++;
1816  if (!bit) {
1817  b->ref[0] = 0;
1818  } else {
1819  // FIXME can this codeblob be replaced by some sort of LUT?
1820  if (have_a) {
1821  if (have_l) {
1822  if (s->left_intra_ctx[row7]) {
1823  if (s->above_intra_ctx[col]) {
1824  c = 2;
1825  } else if (s->above_comp_ctx[col]) {
1826  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1827  s->above_ref_ctx[col] == 1);
1828  } else if (!s->above_ref_ctx[col]) {
1829  c = 3;
1830  } else {
1831  c = 4 * (s->above_ref_ctx[col] == 1);
1832  }
1833  } else if (s->above_intra_ctx[col]) {
1834  if (s->left_intra_ctx[row7]) {
1835  c = 2;
1836  } else if (s->left_comp_ctx[row7]) {
1837  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1838  s->left_ref_ctx[row7] == 1);
1839  } else if (!s->left_ref_ctx[row7]) {
1840  c = 3;
1841  } else {
1842  c = 4 * (s->left_ref_ctx[row7] == 1);
1843  }
1844  } else if (s->above_comp_ctx[col]) {
1845  if (s->left_comp_ctx[row7]) {
1846  if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1847  c = 3 * (s->s.h.fixcompref == 1 ||
1848  s->left_ref_ctx[row7] == 1);
1849  } else {
1850  c = 2;
1851  }
1852  } else if (!s->left_ref_ctx[row7]) {
1853  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1854  s->above_ref_ctx[col] == 1);
1855  } else {
1856  c = 3 * (s->left_ref_ctx[row7] == 1) +
1857  (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1858  }
1859  } else if (s->left_comp_ctx[row7]) {
1860  if (!s->above_ref_ctx[col]) {
1861  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1862  s->left_ref_ctx[row7] == 1);
1863  } else {
1864  c = 3 * (s->above_ref_ctx[col] == 1) +
1865  (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1866  }
1867  } else if (!s->above_ref_ctx[col]) {
1868  if (!s->left_ref_ctx[row7]) {
1869  c = 3;
1870  } else {
1871  c = 4 * (s->left_ref_ctx[row7] == 1);
1872  }
1873  } else if (!s->left_ref_ctx[row7]) {
1874  c = 4 * (s->above_ref_ctx[col] == 1);
1875  } else {
1876  c = 2 * (s->left_ref_ctx[row7] == 1) +
1877  2 * (s->above_ref_ctx[col] == 1);
1878  }
1879  } else {
1880  if (s->above_intra_ctx[col] ||
1881  (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1882  c = 2;
1883  } else if (s->above_comp_ctx[col]) {
1884  c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1885  } else {
1886  c = 4 * (s->above_ref_ctx[col] == 1);
1887  }
1888  }
1889  } else if (have_l) {
1890  if (s->left_intra_ctx[row7] ||
1891  (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1892  c = 2;
1893  } else if (s->left_comp_ctx[row7]) {
1894  c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1895  } else {
1896  c = 4 * (s->left_ref_ctx[row7] == 1);
1897  }
1898  } else {
1899  c = 2;
1900  }
1901  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1902  s->counts.single_ref[c][1][bit]++;
1903  b->ref[0] = 1 + bit;
1904  }
1905  }
1906  }
1907 
1908  if (b->bs <= BS_8x8) {
1909  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
1910  b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1911  } else {
1912  static const uint8_t off[10] = {
1913  3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1914  };
1915 
1916  // FIXME this needs to use the LUT tables from find_ref_mvs
1917  // because not all are -1,0/0,-1
1918  int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1919  [s->left_mode_ctx[row7 + off[b->bs]]];
1920 
1922  s->prob.p.mv_mode[c]);
1923  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1924  s->counts.mv_mode[c][b->mode[0] - 10]++;
1925  }
1926  }
1927 
1928  if (s->s.h.filtermode == FILTER_SWITCHABLE) {
1929  int c;
1930 
1931  if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1932  if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1933  c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1934  s->left_filter_ctx[row7] : 3;
1935  } else {
1936  c = s->above_filter_ctx[col];
1937  }
1938  } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1939  c = s->left_filter_ctx[row7];
1940  } else {
1941  c = 3;
1942  }
1943 
1944  filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1945  s->prob.p.filter[c]);
1946  s->counts.filter[c][filter_id]++;
1947  b->filter = vp9_filter_lut[filter_id];
1948  } else {
1949  b->filter = s->s.h.filtermode;
1950  }
1951 
1952  if (b->bs > BS_8x8) {
1953  int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1954 
1956  s->prob.p.mv_mode[c]);
1957  s->counts.mv_mode[c][b->mode[0] - 10]++;
1958  fill_mv(s, b->mv[0], b->mode[0], 0);
1959 
1960  if (b->bs != BS_8x4) {
1962  s->prob.p.mv_mode[c]);
1963  s->counts.mv_mode[c][b->mode[1] - 10]++;
1964  fill_mv(s, b->mv[1], b->mode[1], 1);
1965  } else {
1966  b->mode[1] = b->mode[0];
1967  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1968  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1969  }
1970 
1971  if (b->bs != BS_4x8) {
1973  s->prob.p.mv_mode[c]);
1974  s->counts.mv_mode[c][b->mode[2] - 10]++;
1975  fill_mv(s, b->mv[2], b->mode[2], 2);
1976 
1977  if (b->bs != BS_8x4) {
1979  s->prob.p.mv_mode[c]);
1980  s->counts.mv_mode[c][b->mode[3] - 10]++;
1981  fill_mv(s, b->mv[3], b->mode[3], 3);
1982  } else {
1983  b->mode[3] = b->mode[2];
1984  AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1985  AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1986  }
1987  } else {
1988  b->mode[2] = b->mode[0];
1989  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1990  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1991  b->mode[3] = b->mode[1];
1992  AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1993  AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1994  }
1995  } else {
1996  fill_mv(s, b->mv[0], b->mode[0], -1);
1997  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1998  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1999  AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
2000  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2001  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2002  AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
2003  }
2004 
2005  vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
2006  }
2007 
2008 #if HAVE_FAST_64BIT
2009 #define SPLAT_CTX(var, val, n) \
2010  switch (n) { \
2011  case 1: var = val; break; \
2012  case 2: AV_WN16A(&var, val * 0x0101); break; \
2013  case 4: AV_WN32A(&var, val * 0x01010101); break; \
2014  case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
2015  case 16: { \
2016  uint64_t v64 = val * 0x0101010101010101ULL; \
2017  AV_WN64A( &var, v64); \
2018  AV_WN64A(&((uint8_t *) &var)[8], v64); \
2019  break; \
2020  } \
2021  }
2022 #else
2023 #define SPLAT_CTX(var, val, n) \
2024  switch (n) { \
2025  case 1: var = val; break; \
2026  case 2: AV_WN16A(&var, val * 0x0101); break; \
2027  case 4: AV_WN32A(&var, val * 0x01010101); break; \
2028  case 8: { \
2029  uint32_t v32 = val * 0x01010101; \
2030  AV_WN32A( &var, v32); \
2031  AV_WN32A(&((uint8_t *) &var)[4], v32); \
2032  break; \
2033  } \
2034  case 16: { \
2035  uint32_t v32 = val * 0x01010101; \
2036  AV_WN32A( &var, v32); \
2037  AV_WN32A(&((uint8_t *) &var)[4], v32); \
2038  AV_WN32A(&((uint8_t *) &var)[8], v32); \
2039  AV_WN32A(&((uint8_t *) &var)[12], v32); \
2040  break; \
2041  } \
2042  }
2043 #endif
2044 
2045  switch (bwh_tab[1][b->bs][0]) {
2046 #define SET_CTXS(dir, off, n) \
2047  do { \
2048  SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
2049  SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
2050  SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2051  if (!s->s.h.keyframe && !s->s.h.intraonly) { \
2052  SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
2053  SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
2054  SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
2055  if (!b->intra) { \
2056  SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2057  if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
2058  SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2059  } \
2060  } \
2061  } \
2062  } while (0)
2063  case 1: SET_CTXS(above, col, 1); break;
2064  case 2: SET_CTXS(above, col, 2); break;
2065  case 4: SET_CTXS(above, col, 4); break;
2066  case 8: SET_CTXS(above, col, 8); break;
2067  }
2068  switch (bwh_tab[1][b->bs][1]) {
2069  case 1: SET_CTXS(left, row7, 1); break;
2070  case 2: SET_CTXS(left, row7, 2); break;
2071  case 4: SET_CTXS(left, row7, 4); break;
2072  case 8: SET_CTXS(left, row7, 8); break;
2073  }
2074 #undef SPLAT_CTX
2075 #undef SET_CTXS
2076 
2077  if (!s->s.h.keyframe && !s->s.h.intraonly) {
2078  if (b->bs > BS_8x8) {
2079  int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2080 
2081  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2082  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2083  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2084  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2085  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2086  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2087  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2088  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2089  } else {
2090  int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2091 
2092  for (n = 0; n < w4 * 2; n++) {
2093  AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2094  AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2095  }
2096  for (n = 0; n < h4 * 2; n++) {
2097  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2098  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
2099  }
2100  }
2101  }
2102 
2103  // FIXME kinda ugly
2104  for (y = 0; y < h4; y++) {
2105  int x, o = (row + y) * s->sb_cols * 8 + col;
2106  struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
2107 
2108  if (b->intra) {
2109  for (x = 0; x < w4; x++) {
2110  mv[x].ref[0] =
2111  mv[x].ref[1] = -1;
2112  }
2113  } else if (b->comp) {
2114  for (x = 0; x < w4; x++) {
2115  mv[x].ref[0] = b->ref[0];
2116  mv[x].ref[1] = b->ref[1];
2117  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2118  AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2119  }
2120  } else {
2121  for (x = 0; x < w4; x++) {
2122  mv[x].ref[0] = b->ref[0];
2123  mv[x].ref[1] = -1;
2124  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2125  }
2126  }
2127  }
2128 }
2129 
// FIXME merge cnt/eob arguments?
/**
 * Decode the coefficient tokens of one transform block from the range coder.
 *
 * Shared template for the four thin wrappers below; since they pass
 * is_tx32x32/is8bitsperpixel/bpp as constants and this function is
 * av_always_inline, the branches on those parameters fold away.
 *
 * @param c            range coder to read tokens from
 * @param coef         output coefficient buffer; written as int16_t when
 *                     is8bitsperpixel, as 32-bit slots otherwise (see STORE_COEF)
 * @param n_coeffs     number of coefficients for this transform size
 * @param is_tx32x32   1 for the 32x32 transform; dequantized values are halved
 * @param is8bitsperpixel 1 for 8 bpp content, 0 for high bit depth
 * @param bpp          bit depth; only read when !is8bitsperpixel (extra
 *                     high-magnitude bits for 10/12 bit content)
 * @param cnt          per-band/context token counters (for probability adaptation)
 * @param eob          per-band/context end-of-block counters
 * @param p            token probabilities per band/context; entries [3..10] are
 *                     filled lazily from vp9_model_pareto8 on first use
 * @param nnz          initial nonzero context derived from above/left neighbours
 * @param scan         scan order: position of the i-th coded coefficient
 * @param nb           per-position pair of previously coded neighbour positions,
 *                     used to derive the context for the next token
 * @param band_counts  number of coefficients in each probability band
 * @param qmul         dequant factors: qmul[0] for DC (i == 0), qmul[1] for AC
 * @return number of decoded coefficients (the end-of-block position)
 */
static av_always_inline int
decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
                        int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
                        unsigned (*eob)[6][2], uint8_t (*p)[6][11],
                        int nnz, const int16_t *scan, const int16_t (*nb)[2],
                        const int16_t *band_counts, const int16_t *qmul)
{
    int i = 0, band = 0, band_left = band_counts[band];
    uint8_t *tp = p[0][nnz];
    // cache[] records the magnitude class (0-5) of each position already
    // decoded in this block; read back through nb[] to form the next context
    uint8_t cache[1024];

    do {
        int val, rc;

        val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
        eob[band][nnz][val]++;
        if (!val)
            break;

    skip_eob:
        if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
            cnt[band][nnz][0]++;
            if (!--band_left)
                band_left = band_counts[++band];
            cache[scan[i]] = 0;
            // context for the next token: rounded average of the two neighbours
            nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
            tp = p[band][nnz];
            if (++i == n_coeffs)
                break; //invalid input; blocks should end with EOB
            // a zero token is never followed by another EOB check
            goto skip_eob;
        }

        rc = scan[i];
        if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
            cnt[band][nnz][1]++;
            val = 1;
            cache[rc] = 1;
        } else {
            // fill in p[3-10] (model fill) - only once per frame for each pos
            if (!tp[3])
                memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);

            cnt[band][nnz][2]++;
            if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
                if (!vp56_rac_get_prob_branchy(c, tp[4])) {
                    cache[rc] = val = 2;
                } else {
                    val = 3 + vp56_rac_get_prob(c, tp[5]);
                    cache[rc] = 3;
                }
            } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
                cache[rc] = 4;
                if (!vp56_rac_get_prob_branchy(c, tp[7])) {
                    val = 5 + vp56_rac_get_prob(c, 159);        // cat1: 5..6
                } else {
                    val = 7 + (vp56_rac_get_prob(c, 165) << 1); // cat2: 7..10
                    val += vp56_rac_get_prob(c, 145);
                }
            } else { // cat 3-6
                cache[rc] = 5;
                if (!vp56_rac_get_prob_branchy(c, tp[8])) {
                    if (!vp56_rac_get_prob_branchy(c, tp[9])) {
                        val = 11 + (vp56_rac_get_prob(c, 173) << 2); // cat3: 11..18
                        val += (vp56_rac_get_prob(c, 148) << 1);
                        val += vp56_rac_get_prob(c, 140);
                    } else {
                        val = 19 + (vp56_rac_get_prob(c, 176) << 3); // cat4: 19..34
                        val += (vp56_rac_get_prob(c, 155) << 2);
                        val += (vp56_rac_get_prob(c, 140) << 1);
                        val += vp56_rac_get_prob(c, 135);
                    }
                } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
                    val = 35 + (vp56_rac_get_prob(c, 180) << 4); // cat5: 35..66
                    val += (vp56_rac_get_prob(c, 157) << 3);
                    val += (vp56_rac_get_prob(c, 141) << 2);
                    val += (vp56_rac_get_prob(c, 134) << 1);
                    val += vp56_rac_get_prob(c, 130);
                } else {
                    // cat6: 67 + up to 14 literal bits (more at 10/12 bpp)
                    val = 67;
                    if (!is8bitsperpixel) {
                        if (bpp == 12) {
                            val += vp56_rac_get_prob(c, 255) << 17;
                            val += vp56_rac_get_prob(c, 255) << 16;
                        }
                        val += (vp56_rac_get_prob(c, 255) << 15);
                        val += (vp56_rac_get_prob(c, 255) << 14);
                    }
                    val += (vp56_rac_get_prob(c, 254) << 13);
                    val += (vp56_rac_get_prob(c, 254) << 12);
                    val += (vp56_rac_get_prob(c, 254) << 11);
                    val += (vp56_rac_get_prob(c, 252) << 10);
                    val += (vp56_rac_get_prob(c, 249) << 9);
                    val += (vp56_rac_get_prob(c, 243) << 8);
                    val += (vp56_rac_get_prob(c, 230) << 7);
                    val += (vp56_rac_get_prob(c, 196) << 6);
                    val += (vp56_rac_get_prob(c, 177) << 5);
                    val += (vp56_rac_get_prob(c, 153) << 4);
                    val += (vp56_rac_get_prob(c, 140) << 3);
                    val += (vp56_rac_get_prob(c, 133) << 2);
                    val += (vp56_rac_get_prob(c, 130) << 1);
                    val += vp56_rac_get_prob(c, 129);
                }
            }
        }
// 8 bpp coefficients are 16-bit; high bit depth needs 32-bit storage
#define STORE_COEF(c, i, v) do { \
    if (is8bitsperpixel) { \
        c[i] = v; \
    } else { \
        AV_WN32A(&c[i * 2], v); \
    } \
} while (0)
        if (!--band_left)
            band_left = band_counts[++band];
        // sign bit is read last; 32x32 dequant output is halved
        if (is_tx32x32)
            STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
        else
            STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
        tp = p[band][nnz];
    } while (++i < n_coeffs);

    return i;
}
2254 
2255 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2256  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2257  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2258  const int16_t (*nb)[2], const int16_t *band_counts,
2259  const int16_t *qmul)
2260 {
2261  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2262  nnz, scan, nb, band_counts, qmul);
2263 }
2264 
2265 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2266  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2267  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2268  const int16_t (*nb)[2], const int16_t *band_counts,
2269  const int16_t *qmul)
2270 {
2271  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2272  nnz, scan, nb, band_counts, qmul);
2273 }
2274 
2275 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2276  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2277  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2278  const int16_t (*nb)[2], const int16_t *band_counts,
2279  const int16_t *qmul)
2280 {
2281  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2282  nnz, scan, nb, band_counts, qmul);
2283 }
2284 
2285 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2286  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2287  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2288  const int16_t (*nb)[2], const int16_t *band_counts,
2289  const int16_t *qmul)
2290 {
2291  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2292  nnz, scan, nb, band_counts, qmul);
2293 }
2294 
/**
 * Decode all coefficient tokens for the current block (luma, then both
 * chroma planes), updating the above/left nonzero-context lines and the
 * per-sub-block end-of-block counts in s->eob / s->uveob.
 *
 * @param ctx               codec context; current block position is taken
 *                          from s->row / s->col and block data from s->b
 * @param is8bitsperpixel   compile-time constant in the two callers; selects
 *                          the 8bpp vs 16bpp decode_coeffs_b* wrappers
 * @return nonzero if any sub-block in this block produced coefficients
 */
static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;
    uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
    unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
    unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
    // block dimensions in units of 4x4 sub-blocks
    int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
    // clip the decode area to the visible frame edge
    int end_x = FFMIN(2 * (s->cols - col), w4);
    int end_y = FFMIN(2 * (s->rows - row), h4);
    int n, pl, x, y, res;
    int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
    int tx = 4 * s->s.h.lossless + b->tx;
    const int16_t * const *yscans = vp9_scans[tx];
    const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
    // chroma always uses the DCT_DCT scan
    const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
    const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
    uint8_t *a = &s->above_y_nnz_ctx[col * 2];
    uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
    // per-tx-size coefficient counts for each probability band; the last
    // entry is the remainder after the first bands are exhausted
    static const int16_t band_counts[4][8] = {
        { 1, 2, 3, 4,  3,   16 - 13 },
        { 1, 2, 3, 4, 11,   64 - 21 },
        { 1, 2, 3, 4, 11,  256 - 21 },
        { 1, 2, 3, 4, 11, 1024 - 21 },
    };
    const int16_t *y_band_counts = band_counts[b->tx];
    const int16_t *uv_band_counts = band_counts[b->uvtx];
    int bytesperpixel = is8bitsperpixel ? 1 : 2;
    int total_coeff = 0;

// collapse 'step' per-4x4 context entries into one flag at la[n]
#define MERGE(la, end, step, rd) \
    for (n = 0; n < end; n += step) \
        la[n] = !!rd(&la[n])
#define MERGE_CTX(step, rd) \
    do { \
        MERGE(l, end_y, step, rd); \
        MERGE(a, end_x, step, rd); \
    } while (0)

// decode one luma sub-block per step x step area; a[x]+l[y] is the nnz
// context, and the resulting eob is stored per sub-block (16-bit for the
// larger transforms, whose eob can exceed 255)
#define DECODE_Y_COEF_LOOP(step, mode_index, v) \
    for (n = 0, y = 0; y < end_y; y += step) { \
        for (x = 0; x < end_x; x += step, n += step * step) { \
            enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
            res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
                  (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
                   c, e, p, a[x] + l[y], yscans[txtp], \
                   ynbs[txtp], y_band_counts, qmul[0]); \
            a[x] = l[y] = !!res; \
            total_coeff |= !!res; \
            if (step >= 4) { \
                AV_WN16A(&s->eob[n], res); \
            } else { \
                s->eob[n] = res; \
            } \
        } \
    }

// broadcast the merged flag back across the 'step' context entries;
// 'cond' is true when the block is not clipped by the frame edge, allowing
// an aligned full-width splat instead of a bounded memset
#define SPLAT(la, end, step, cond) \
    if (step == 2) { \
        for (n = 1; n < end; n += step) \
            la[n] = la[n - 1]; \
    } else if (step == 4) { \
        if (cond) { \
            for (n = 0; n < end; n += step) \
                AV_WN32A(&la[n], la[n] * 0x01010101); \
        } else { \
            for (n = 0; n < end; n += step) \
                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
        } \
    } else /* step == 8 */ { \
        if (cond) { \
            if (HAVE_FAST_64BIT) { \
                for (n = 0; n < end; n += step) \
                    AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
            } else { \
                for (n = 0; n < end; n += step) { \
                    uint32_t v32 = la[n] * 0x01010101; \
                    AV_WN32A(&la[n], v32); \
                    AV_WN32A(&la[n + 4], v32); \
                } \
            } \
        } else { \
            for (n = 0; n < end; n += step) \
                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
        } \
    }
#define SPLAT_CTX(step) \
    do { \
        SPLAT(a, end_x, step, end_x == w4); \
        SPLAT(l, end_y, step, end_y == h4); \
    } while (0)

    /* y tokens */
    switch (b->tx) {
    case TX_4X4:
        // for sub-8x8 blocks each 4x4 has its own intra mode, hence 'n'
        DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
        break;
    case TX_8X8:
        MERGE_CTX(2, AV_RN16A);
        DECODE_Y_COEF_LOOP(2, 0,);
        SPLAT_CTX(2);
        break;
    case TX_16X16:
        MERGE_CTX(4, AV_RN32A);
        DECODE_Y_COEF_LOOP(4, 0,);
        SPLAT_CTX(4);
        break;
    case TX_32X32:
        MERGE_CTX(8, AV_RN64A);
        // '32' selects the decode_coeffs_b32_* wrappers (halved dequant)
        DECODE_Y_COEF_LOOP(8, 0, 32);
        SPLAT_CTX(8);
        break;
    }

// same as DECODE_Y_COEF_LOOP, but for plane 'pl' with the fixed chroma
// scan and the AC/chroma dequant factors
#define DECODE_UV_COEF_LOOP(step, v) \
    for (n = 0, y = 0; y < end_y; y += step) { \
        for (x = 0; x < end_x; x += step, n += step * step) { \
            res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
                  (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
                   16 * step * step, c, e, p, a[x] + l[y], \
                   uvscan, uvnb, uv_band_counts, qmul[1]); \
            a[x] = l[y] = !!res; \
            total_coeff |= !!res; \
            if (step >= 4) { \
                AV_WN16A(&s->uveob[pl][n], res); \
            } else { \
                s->uveob[pl][n] = res; \
            } \
        } \
    }

    // switch probability/count tables and dimensions over to chroma
    p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
    c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
    e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
    w4 >>= s->ss_h;
    end_x >>= s->ss_h;
    h4 >>= s->ss_v;
    end_y >>= s->ss_v;
    for (pl = 0; pl < 2; pl++) {
        a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
        l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
        switch (b->uvtx) {
        case TX_4X4:
            DECODE_UV_COEF_LOOP(1,);
            break;
        case TX_8X8:
            MERGE_CTX(2, AV_RN16A);
            DECODE_UV_COEF_LOOP(2,);
            SPLAT_CTX(2);
            break;
        case TX_16X16:
            MERGE_CTX(4, AV_RN32A);
            DECODE_UV_COEF_LOOP(4,);
            SPLAT_CTX(4);
            break;
        case TX_32X32:
            MERGE_CTX(8, AV_RN64A);
            DECODE_UV_COEF_LOOP(8, 32);
            SPLAT_CTX(8);
            break;
        }
    }

    return total_coeff;
}
2461 
2463 {
2464  return decode_coeffs(ctx, 1);
2465 }
2466 
2468 {
2469  return decode_coeffs(ctx, 0);
2470 }
2471 
2473  uint8_t *dst_edge, ptrdiff_t stride_edge,
2474  uint8_t *dst_inner, ptrdiff_t stride_inner,
2475  uint8_t *l, int col, int x, int w,
2476  int row, int y, enum TxfmMode tx,
2477  int p, int ss_h, int ss_v, int bytesperpixel)
2478 {
2479  int have_top = row > 0 || y > 0;
2480  int have_left = col > s->tile_col_start || x > 0;
2481  int have_right = x < w - 1;
2482  int bpp = s->bpp;
2483  static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2484  [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2485  { DC_127_PRED, VERT_PRED } },
2486  [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2487  { HOR_PRED, HOR_PRED } },
2488  [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2489  { LEFT_DC_PRED, DC_PRED } },
2499  { DC_127_PRED, VERT_LEFT_PRED } },
2500  [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2501  { HOR_UP_PRED, HOR_UP_PRED } },
2502  [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2503  { HOR_PRED, TM_VP8_PRED } },
2504  };
2505  static const struct {
2506  uint8_t needs_left:1;
2507  uint8_t needs_top:1;
2508  uint8_t needs_topleft:1;
2509  uint8_t needs_topright:1;
2510  uint8_t invert_left:1;
2511  } edges[N_INTRA_PRED_MODES] = {
2512  [VERT_PRED] = { .needs_top = 1 },
2513  [HOR_PRED] = { .needs_left = 1 },
2514  [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2515  [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2516  [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2517  [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2518  [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2519  [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2520  [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2521  [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2522  [LEFT_DC_PRED] = { .needs_left = 1 },
2523  [TOP_DC_PRED] = { .needs_top = 1 },
2524  [DC_128_PRED] = { 0 },
2525  [DC_127_PRED] = { 0 },
2526  [DC_129_PRED] = { 0 }
2527  };
2528 
2529  av_assert2(mode >= 0 && mode < 10);
2530  mode = mode_conv[mode][have_left][have_top];
2531  if (edges[mode].needs_top) {
2532  uint8_t *top, *topleft;
2533  int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2534  int n_px_need_tr = 0;
2535 
2536  if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2537  n_px_need_tr = 4;
2538 
2539  // if top of sb64-row, use s->intra_pred_data[] instead of
2540  // dst[-stride] for intra prediction (it contains pre- instead of
2541  // post-loopfilter data)
2542  if (have_top) {
2543  top = !(row & 7) && !y ?
2544  s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2545  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2546  if (have_left)
2547  topleft = !(row & 7) && !y ?
2548  s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2549  y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2550  &dst_inner[-stride_inner];
2551  }
2552 
2553  if (have_top &&
2554  (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2555  (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2556  n_px_need + n_px_need_tr <= n_px_have) {
2557  *a = top;
2558  } else {
2559  if (have_top) {
2560  if (n_px_need <= n_px_have) {
2561  memcpy(*a, top, n_px_need * bytesperpixel);
2562  } else {
2563 #define memset_bpp(c, i1, v, i2, num) do { \
2564  if (bytesperpixel == 1) { \
2565  memset(&(c)[(i1)], (v)[(i2)], (num)); \
2566  } else { \
2567  int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2568  for (n = 0; n < (num); n++) { \
2569  AV_WN16A(&(c)[((i1) + n) * 2], val); \
2570  } \
2571  } \
2572 } while (0)
2573  memcpy(*a, top, n_px_have * bytesperpixel);
2574  memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2575  }
2576  } else {
2577 #define memset_val(c, val, num) do { \
2578  if (bytesperpixel == 1) { \
2579  memset((c), (val), (num)); \
2580  } else { \
2581  int n; \
2582  for (n = 0; n < (num); n++) { \
2583  AV_WN16A(&(c)[n * 2], (val)); \
2584  } \
2585  } \
2586 } while (0)
2587  memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2588  }
2589  if (edges[mode].needs_topleft) {
2590  if (have_left && have_top) {
2591 #define assign_bpp(c, i1, v, i2) do { \
2592  if (bytesperpixel == 1) { \
2593  (c)[(i1)] = (v)[(i2)]; \
2594  } else { \
2595  AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2596  } \
2597 } while (0)
2598  assign_bpp(*a, -1, topleft, -1);
2599  } else {
2600 #define assign_val(c, i, v) do { \
2601  if (bytesperpixel == 1) { \
2602  (c)[(i)] = (v); \
2603  } else { \
2604  AV_WN16A(&(c)[(i) * 2], (v)); \
2605  } \
2606 } while (0)
2607  assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2608  }
2609  }
2610  if (tx == TX_4X4 && edges[mode].needs_topright) {
2611  if (have_top && have_right &&
2612  n_px_need + n_px_need_tr <= n_px_have) {
2613  memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2614  } else {
2615  memset_bpp(*a, 4, *a, 3, 4);
2616  }
2617  }
2618  }
2619  }
2620  if (edges[mode].needs_left) {
2621  if (have_left) {
2622  int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2623  uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2624  ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2625 
2626  if (edges[mode].invert_left) {
2627  if (n_px_need <= n_px_have) {
2628  for (i = 0; i < n_px_need; i++)
2629  assign_bpp(l, i, &dst[i * stride], -1);
2630  } else {
2631  for (i = 0; i < n_px_have; i++)
2632  assign_bpp(l, i, &dst[i * stride], -1);
2633  memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2634  }
2635  } else {
2636  if (n_px_need <= n_px_have) {
2637  for (i = 0; i < n_px_need; i++)
2638  assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2639  } else {
2640  for (i = 0; i < n_px_have; i++)
2641  assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2642  memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
2643  }
2644  }
2645  } else {
2646  memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
2647  }
2648  }
2649 
2650  return mode;
2651 }
2652 
/**
 * Intra-reconstruct the current block: for each transform-sized sub-block,
 * build the prediction edges, run the intra predictor, and add the inverse
 * transform of the decoded residual (skipped when eob == 0).
 *
 * Prediction edges are fixed up by check_intra_mode(), which reads from the
 * pre-loopfilter reference rows (ptr_r / s->intra_pred_data) rather than the
 * already-filtered s->dst pixels.
 *
 * @param ctx            codec context; block position/data via s->row/col/b
 * @param y_off          byte offset of this block in the luma frame plane
 * @param uv_off         byte offset of this block in the chroma frame planes
 * @param bytesperpixel  1 for 8 bpp, 2 for high bit depth (constant in callers)
 */
static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
                                         ptrdiff_t uv_off, int bytesperpixel)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;
    // w4/h4: block size in 4x4 units; step1d: tx size in 4x4 units;
    // step: coefficients-offset stride per sub-block (in 16-coef units)
    int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
    int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
    int end_x = FFMIN(2 * (s->cols - col), w4);
    int end_y = FFMIN(2 * (s->rows - row), h4);
    int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
    int uvstep1d = 1 << b->uvtx, p;
    // dst: working buffer; dst_r: same position in the reference frame plane
    uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
    LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
    LOCAL_ALIGNED_32(uint8_t, l, [64]);

    for (n = 0, y = 0; y < end_y; y += step1d) {
        uint8_t *ptr = dst, *ptr_r = dst_r;
        for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
                               ptr_r += 4 * step1d * bytesperpixel, n += step) {
            // sub-8x8 blocks carry one intra mode per 4x4
            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
                               y * 2 + x : 0];
            // a points into the middle of a_buf so the topleft pixel at
            // a[-1] stays inside the buffer
            uint8_t *a = &a_buf[32];
            enum TxfmType txtp = vp9_intra_txfm_type[mode];
            // eob per sub-block is 16-bit for tx > 8x8 (can exceed 255)
            int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];

            // may replace 'mode' with an edge-aware fallback and rewrite 'a'
            mode = check_intra_mode(s, mode, &a, ptr_r,
                                    s->s.frames[CUR_FRAME].tf.f->linesize[0],
                                    ptr, s->y_stride, l,
                                    col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
            s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
            if (eob)
                s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
                                           s->block + 16 * n * bytesperpixel, eob);
        }
        dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
        dst += 4 * step1d * s->y_stride;
    }

    // U/V
    w4 >>= s->ss_h;
    end_x >>= s->ss_h;
    end_y >>= s->ss_v;
    step = 1 << (b->uvtx * 2);
    for (p = 0; p < 2; p++) {
        dst = s->dst[1 + p];
        dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
            uint8_t *ptr = dst, *ptr_r = dst_r;
            for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
                                   ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
                // chroma uses a single mode for the whole block
                int mode = b->uvmode;
                uint8_t *a = &a_buf[32];
                int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];

                mode = check_intra_mode(s, mode, &a, ptr_r,
                                        s->s.frames[CUR_FRAME].tf.f->linesize[1],
                                        ptr, s->uv_stride, l, col, x, w4, row, y,
                                        b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
                s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
                if (eob)
                    // chroma residual always uses DCT_DCT
                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
                                                    s->uvblock[p] + 16 * n * bytesperpixel, eob);
            }
            dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
            dst += 4 * uvstep1d * s->uv_stride;
        }
    }
}
2722 
2723 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2724 {
2725  intra_recon(ctx, y_off, uv_off, 1);
2726 }
2727 
2728 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2729 {
2730  intra_recon(ctx, y_off, uv_off, 2);
2731 }
2732 
2734  uint8_t *dst, ptrdiff_t dst_stride,
2735  const uint8_t *ref, ptrdiff_t ref_stride,
2737  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2738  int bw, int bh, int w, int h, int bytesperpixel)
2739 {
2740  int mx = mv->x, my = mv->y, th;
2741 
2742  y += my >> 3;
2743  x += mx >> 3;
2744  ref += y * ref_stride + x * bytesperpixel;
2745  mx &= 7;
2746  my &= 7;
2747  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2748  // we use +7 because the last 7 pixels of each sbrow can be changed in
2749  // the longest loopfilter of the next sbrow
2750  th = (y + bh + 4 * !!my + 7) >> 6;
2751  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2752  if (x < !!mx * 3 || y < !!my * 3 ||
2753  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2755  ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2756  160, ref_stride,
2757  bw + !!mx * 7, bh + !!my * 7,
2758  x - !!mx * 3, y - !!my * 3, w, h);
2759  ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2760  ref_stride = 160;
2761  }
2762  mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2763 }
2764 
2766  uint8_t *dst_u, uint8_t *dst_v,
2767  ptrdiff_t dst_stride,
2768  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2769  const uint8_t *ref_v, ptrdiff_t src_stride_v,
2771  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2772  int bw, int bh, int w, int h, int bytesperpixel)
2773 {
2774  int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th;
2775 
2776  y += my >> 4;
2777  x += mx >> 4;
2778  ref_u += y * src_stride_u + x * bytesperpixel;
2779  ref_v += y * src_stride_v + x * bytesperpixel;
2780  mx &= 15;
2781  my &= 15;
2782  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2783  // we use +7 because the last 7 pixels of each sbrow can be changed in
2784  // the longest loopfilter of the next sbrow
2785  th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2786  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2787  if (x < !!mx * 3 || y < !!my * 3 ||
2788  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2790  ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2791  160, src_stride_u,
2792  bw + !!mx * 7, bh + !!my * 7,
2793  x - !!mx * 3, y - !!my * 3, w, h);
2794  ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2795  mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
2796 
2798  ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2799  160, src_stride_v,
2800  bw + !!mx * 7, bh + !!my * 7,
2801  x - !!mx * 3, y - !!my * 3, w, h);
2802  ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2803  mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2804  } else {
2805  mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2806  mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2807  }
2808 }
2809 
2810 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2811  px, py, pw, ph, bw, bh, w, h, i) \
2812  mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2813  mv, bw, bh, w, h, bytesperpixel)
2814 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2815  row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2816  mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2817  row, col, mv, bw, bh, w, h, bytesperpixel)
2818 #define SCALED 0
2819 #define FN(x) x##_8bpp
2820 #define BYTES_PER_PIXEL 1
2821 #include "vp9_mc_template.c"
2822 #undef FN
2823 #undef BYTES_PER_PIXEL
2824 #define FN(x) x##_16bpp
2825 #define BYTES_PER_PIXEL 2
2826 #include "vp9_mc_template.c"
2827 #undef mc_luma_dir
2828 #undef mc_chroma_dir
2829 #undef FN
2830 #undef BYTES_PER_PIXEL
2831 #undef SCALED
2832 
2834  vp9_mc_func (*mc)[2],
2835  uint8_t *dst, ptrdiff_t dst_stride,
2836  const uint8_t *ref, ptrdiff_t ref_stride,
2838  ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2839  int px, int py, int pw, int ph,
2840  int bw, int bh, int w, int h, int bytesperpixel,
2841  const uint16_t *scale, const uint8_t *step)
2842 {
2843  if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2844  s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2845  mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2846  y, x, in_mv, bw, bh, w, h, bytesperpixel);
2847  } else {
2848 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2849  int mx, my;
2850  int refbw_m1, refbh_m1;
2851  int th;
2852  VP56mv mv;
2853 
2854  mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2855  mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2856  // BUG libvpx seems to scale the two components separately. This introduces
2857  // rounding errors but we have to reproduce them to be exactly compatible
2858  // with the output from libvpx...
2859  mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2860  my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2861 
2862  y = my >> 4;
2863  x = mx >> 4;
2864  ref += y * ref_stride + x * bytesperpixel;
2865  mx &= 15;
2866  my &= 15;
2867  refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2868  refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2869  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2870  // we use +7 because the last 7 pixels of each sbrow can be changed in
2871  // the longest loopfilter of the next sbrow
2872  th = (y + refbh_m1 + 4 + 7) >> 6;
2873  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2874  if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2876  ref - 3 * ref_stride - 3 * bytesperpixel,
2877  288, ref_stride,
2878  refbw_m1 + 8, refbh_m1 + 8,
2879  x - 3, y - 3, w, h);
2880  ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2881  ref_stride = 288;
2882  }
2883  smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
2884  }
2885 }
2886 
2888  vp9_mc_func (*mc)[2],
2889  uint8_t *dst_u, uint8_t *dst_v,
2890  ptrdiff_t dst_stride,
2891  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2892  const uint8_t *ref_v, ptrdiff_t src_stride_v,
2894  ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2895  int px, int py, int pw, int ph,
2896  int bw, int bh, int w, int h, int bytesperpixel,
2897  const uint16_t *scale, const uint8_t *step)
2898 {
2899  if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2900  s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2901  mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2902  ref_v, src_stride_v, ref_frame,
2903  y, x, in_mv, bw, bh, w, h, bytesperpixel);
2904  } else {
2905  int mx, my;
2906  int refbw_m1, refbh_m1;
2907  int th;
2908  VP56mv mv;
2909 
2910  if (s->ss_h) {
2911  // BUG https://code.google.com/p/webm/issues/detail?id=820
2912  mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16);
2913  mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2914  } else {
2915  mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2916  mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2917  }
2918  if (s->ss_v) {
2919  // BUG https://code.google.com/p/webm/issues/detail?id=820
2920  mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16);
2921  my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2922  } else {
2923  mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2924  my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2925  }
2926 #undef scale_mv
2927  y = my >> 4;
2928  x = mx >> 4;
2929  ref_u += y * src_stride_u + x * bytesperpixel;
2930  ref_v += y * src_stride_v + x * bytesperpixel;
2931  mx &= 15;
2932  my &= 15;
2933  refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2934  refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2935  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2936  // we use +7 because the last 7 pixels of each sbrow can be changed in
2937  // the longest loopfilter of the next sbrow
2938  th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2939  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2940  if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2942  ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2943  288, src_stride_u,
2944  refbw_m1 + 8, refbh_m1 + 8,
2945  x - 3, y - 3, w, h);
2946  ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2947  smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2948 
2950  ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2951  288, src_stride_v,
2952  refbw_m1 + 8, refbh_m1 + 8,
2953  x - 3, y - 3, w, h);
2954  ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2955  smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2956  } else {
2957  smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2958  smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
2959  }
2960  }
2961 }
2962 
2963 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2964  px, py, pw, ph, bw, bh, w, h, i) \
2965  mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2966  mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2967  s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2968 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2969  row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2970  mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2971  row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2972  s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2973 #define SCALED 1
2974 #define FN(x) x##_scaled_8bpp
2975 #define BYTES_PER_PIXEL 1
2976 #include "vp9_mc_template.c"
2977 #undef FN
2978 #undef BYTES_PER_PIXEL
2979 #define FN(x) x##_scaled_16bpp
2980 #define BYTES_PER_PIXEL 2
2981 #include "vp9_mc_template.c"
2982 #undef mc_luma_dir
2983 #undef mc_chroma_dir
2984 #undef FN
2985 #undef BYTES_PER_PIXEL
2986 #undef SCALED
2987 
2988 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
2989 {
2990  VP9Context *s = ctx->priv_data;
2991  VP9Block *b = s->b;
2992  int row = s->row, col = s->col;
2993 
2994  if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
2995  if (bytesperpixel == 1) {
2996  inter_pred_scaled_8bpp(ctx);
2997  } else {
2998  inter_pred_scaled_16bpp(ctx);
2999  }
3000  } else {
3001  if (bytesperpixel == 1) {
3002  inter_pred_8bpp(ctx);
3003  } else {
3004  inter_pred_16bpp(ctx);
3005  }
3006  }
3007  if (!b->skip) {
3008  /* mostly copied intra_recon() */
3009 
3010  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3011  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3012  int end_x = FFMIN(2 * (s->cols - col), w4);
3013  int end_y = FFMIN(2 * (s->rows - row), h4);
3014  int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
3015  int uvstep1d = 1 << b->uvtx, p;
3016  uint8_t *dst = s->dst[0];
3017 
3018  // y itxfm add
3019  for (n = 0, y = 0; y < end_y; y += step1d) {
3020  uint8_t *ptr = dst;
3021  for (x = 0; x < end_x; x += step1d,
3022  ptr += 4 * step1d * bytesperpixel, n += step) {
3023  int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3024 
3025  if (eob)
3026  s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3027  s->block + 16 * n * bytesperpixel, eob);
3028  }
3029  dst += 4 * s->y_stride * step1d;
3030  }
3031 
3032  // uv itxfm add
3033  end_x >>= s->ss_h;
3034  end_y >>= s->ss_v;
3035  step = 1 << (b->uvtx * 2);
3036  for (p = 0; p < 2; p++) {
3037  dst = s->dst[p + 1];
3038  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3039  uint8_t *ptr = dst;
3040  for (x = 0; x < end_x; x += uvstep1d,
3041  ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3042  int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3043 
3044  if (eob)
3045  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3046  s->uvblock[p] + 16 * n * bytesperpixel, eob);
3047  }
3048  dst += 4 * uvstep1d * s->uv_stride;
3049  }
3050  }
3051  }
3052 }
3053 
3055 {
3056  inter_recon(ctx, 1);
3057 }
3058 
3060 {
3061  inter_recon(ctx, 2);
3062 }
3063 
3064 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3065  int row_and_7, int col_and_7,
3066  int w, int h, int col_end, int row_end,
3067  enum TxfmMode tx, int skip_inter)
3068 {
3069  static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3070  static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3071 
3072  // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3073  // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3074  // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3075  // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3076 
3077  // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3078  // edges. This means that for UV, we work on two subsampled blocks at
3079  // a time, and we only use the topleft block's mode information to set
3080  // things like block strength. Thus, for any block size smaller than
3081  // 16x16, ignore the odd portion of the block.
3082  if (tx == TX_4X4 && (ss_v | ss_h)) {
3083  if (h == ss_v) {
3084  if (row_and_7 & 1)
3085  return;
3086  if (!row_end)
3087  h += 1;
3088  }
3089  if (w == ss_h) {
3090  if (col_and_7 & 1)
3091  return;
3092  if (!col_end)
3093  w += 1;
3094  }
3095  }
3096 
3097  if (tx == TX_4X4 && !skip_inter) {
3098  int t = 1 << col_and_7, m_col = (t << w) - t, y;
3099  // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3100  int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3101 
3102  for (y = row_and_7; y < h + row_and_7; y++) {
3103  int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3104 
3105  mask[0][y][1] |= m_row_8;
3106  mask[0][y][2] |= m_row_4;
3107  // for odd lines, if the odd col is not being filtered,
3108  // skip odd row also:
3109  // .---. <-- a
3110  // | |
3111  // |___| <-- b
3112  // ^ ^
3113  // c d
3114  //
3115  // if a/c are even row/col and b/d are odd, and d is skipped,
3116  // e.g. right edge of size-66x66.webm, then skip b also (bug)
3117  if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3118  mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3119  } else {
3120  mask[1][y][col_mask_id] |= m_col;
3121  }
3122  if (!ss_h)
3123  mask[0][y][3] |= m_col;
3124  if (!ss_v) {
3125  if (ss_h && (col_end & 1))
3126  mask[1][y][3] |= (t << (w - 1)) - t;
3127  else
3128  mask[1][y][3] |= m_col;
3129  }
3130  }
3131  } else {
3132  int y, t = 1 << col_and_7, m_col = (t << w) - t;
3133 
3134  if (!skip_inter) {
3135  int mask_id = (tx == TX_8X8);
3136  static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3137  int l2 = tx + ss_h - 1, step1d;
3138  int m_row = m_col & masks[l2];
3139 
3140  // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3141  // 8wd loopfilter to prevent going off the visible edge.
3142  if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3143  int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3144  int m_row_8 = m_row - m_row_16;
3145 
3146  for (y = row_and_7; y < h + row_and_7; y++) {
3147  mask[0][y][0] |= m_row_16;
3148  mask[0][y][1] |= m_row_8;
3149  }
3150  } else {
3151  for (y = row_and_7; y < h + row_and_7; y++)
3152  mask[0][y][mask_id] |= m_row;
3153  }
3154 
3155  l2 = tx + ss_v - 1;
3156  step1d = 1 << l2;
3157  if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3158  for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3159  mask[1][y][0] |= m_col;
3160  if (y - row_and_7 == h - 1)
3161  mask[1][y][1] |= m_col;
3162  } else {
3163  for (y = row_and_7; y < h + row_and_7; y += step1d)
3164  mask[1][y][mask_id] |= m_col;
3165  }
3166  } else if (tx != TX_4X4) {
3167  int mask_id;
3168 
3169  mask_id = (tx == TX_8X8) || (h == ss_v);
3170  mask[1][row_and_7][mask_id] |= m_col;
3171  mask_id = (tx == TX_8X8) || (w == ss_h);
3172  for (y = row_and_7; y < h + row_and_7; y++)
3173  mask[0][y][mask_id] |= t;
3174  } else {
3175  int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3176 
3177  for (y = row_and_7; y < h + row_and_7; y++) {
3178  mask[0][y][2] |= t4;
3179  mask[0][y][1] |= t8;
3180  }
3181  mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
3182  }
3183  }
3184 }
3185 
3186 static void decode_b(AVCodecContext *ctx, int row, int col,
3187  struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3188  enum BlockLevel bl, enum BlockPartition bp)
3189 {
3190  VP9Context *s = ctx->priv_data;
3191  VP9Block *b = s->b;
3192  enum BlockSize bs = bl * 3 + bp;
3193  int bytesperpixel = s->bytesperpixel;
3194  int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3195  int emu[2];
3196  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3197 
3198  s->row = row;
3199  s->row7 = row & 7;
3200  s->col = col;
3201  s->col7 = col & 7;
3202  s->min_mv.x = -(128 + col * 64);
3203  s->min_mv.y = -(128 + row * 64);
3204  s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3205  s->max_mv.y = 128 + (s->rows - row - h4) * 64;
3206  if (s->pass < 2) {
3207  b->bs = bs;
3208  b->bl = bl;
3209  b->bp = bp;
3210  decode_mode(ctx);
3211  b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3212  (s->ss_v && h4 * 2 == (1 << b->tx)));
3213 
3214  if (!b->skip) {
3215  int has_coeffs;
3216 
3217  if (bytesperpixel == 1) {
3218  has_coeffs = decode_coeffs_8bpp(ctx);
3219  } else {
3220  has_coeffs = decode_coeffs_16bpp(ctx);
3221  }
3222  if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3223  b->skip = 1;
3224  memset(&s->above_skip_ctx[col], 1, w4);
3225  memset(&s->left_skip_ctx[s->row7], 1, h4);
3226  }
3227  } else {
3228  int row7 = s->row7;
3229 
3230 #define SPLAT_ZERO_CTX(v, n) \
3231  switch (n) { \
3232  case 1: v = 0; break; \
3233  case 2: AV_ZERO16(&v); break; \
3234  case 4: AV_ZERO32(&v); break; \
3235  case 8: AV_ZERO64(&v); break; \
3236  case 16: AV_ZERO128(&v); break; \
3237  }
3238 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3239  do { \
3240  SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3241  if (s->ss_##dir2) { \
3242  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3243  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3244  } else { \
3245  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3246  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3247  } \
3248  } while (0)
3249 
3250  switch (w4) {
3251  case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3252  case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3253  case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3254  case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3255  }
3256  switch (h4) {
3257  case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3258  case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3259  case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3260  case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
3261  }
3262  }
3263 
3264  if (s->pass == 1) {
3265  s->b++;
3266  s->block += w4 * h4 * 64 * bytesperpixel;
3267  s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3268  s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3269  s->eob += 4 * w4 * h4;
3270  s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3271  s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3272 
3273  return;
3274  }
3275  }
3276 
3277  // emulated overhangs if the stride of the target buffer can't hold. This
3278  // makes it possible to support emu-edge and so on even if we have large block
3279  // overhangs
3280  emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
3281  (row + h4) > s->rows;
3282  emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
3283  (row + h4) > s->rows;
3284  if (emu[0]) {
3285  s->dst[0] = s->tmp_y;
3286  s->y_stride = 128;
3287  } else {
3288  s->dst[0] = f->data[0] + yoff;
3289  s->y_stride = f->linesize[0];
3290  }
3291  if (emu[1]) {
3292  s->dst[1] = s->tmp_uv[0];
3293  s->dst[2] = s->tmp_uv[1];
3294  s->uv_stride = 128;
3295  } else {
3296  s->dst[1] = f->data[1] + uvoff;
3297  s->dst[2] = f->data[2] + uvoff;
3298  s->uv_stride = f->linesize[1];
3299  }
3300  if (b->intra) {
3301  if (s->bpp > 8) {
3302  intra_recon_16bpp(ctx, yoff, uvoff);
3303  } else {
3304  intra_recon_8bpp(ctx, yoff, uvoff);
3305  }
3306  } else {
3307  if (s->bpp > 8) {
3308  inter_recon_16bpp(ctx);
3309  } else {
3310  inter_recon_8bpp(ctx);
3311  }
3312  }
3313  if (emu[0]) {
3314  int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3315 
3316  for (n = 0; o < w; n++) {
3317  int bw = 64 >> n;
3318 
3319  av_assert2(n <= 4);
3320  if (w & bw) {
3321  s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3322  s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
3323  o += bw;
3324  }
3325  }
3326  }
3327  if (emu[1]) {
3328  int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3329  int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3330 
3331  for (n = s->ss_h; o < w; n++) {
3332  int bw = 64 >> n;
3333 
3334  av_assert2(n <= 4);
3335  if (w & bw) {
3336  s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3337  s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3338  s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3339  s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3340  o += bw;
3341  }
3342  }
3343  }
3344 
3345  // pick filter level and find edges to apply filter to
3346  if (s->s.h.filter.level &&
3347  (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3348  [b->mode[3] != ZEROMV]) > 0) {
3349  int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3350  int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3351 
3352  setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3353  mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3354  if (s->ss_h || s->ss_v)
3355  mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3356  s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3357  s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3358  b->uvtx, skip_inter);
3359 
3360  if (!s->filter_lut.lim_lut[lvl]) {
3361  int sharp = s->s.h.filter.sharpness;
3362  int limit = lvl;
3363 
3364  if (sharp > 0) {
3365  limit >>= (sharp + 3) >> 2;
3366  limit = FFMIN(limit, 9 - sharp);
3367  }
3368  limit = FFMAX(limit, 1);
3369 
3370  s->filter_lut.lim_lut[lvl] = limit;
3371  s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3372  }
3373  }
3374 
3375  if (s->pass == 2) {
3376  s->b++;
3377  s->block += w4 * h4 * 64 * bytesperpixel;
3378  s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3379  s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3380  s->eob += 4 * w4 * h4;
3381  s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3382  s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3383  }
3384 }
3385 
3386 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3387  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3388 {
3389  VP9Context *s = ctx->priv_data;
3390  int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3391  (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3392  const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
3393  s->prob.p.partition[bl][c];
3394  enum BlockPartition bp;
3395  ptrdiff_t hbs = 4 >> bl;
3396  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3397  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3398  int bytesperpixel = s->bytesperpixel;
3399 
3400  if (bl == BL_8X8) {
3401  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3402  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3403  } else if (col + hbs < s->cols) { // FIXME why not <=?
3404  if (row + hbs < s->rows) { // FIXME why not <=?
3405  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3406  switch (bp) {
3407  case PARTITION_NONE:
3408  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3409  break;
3410  case PARTITION_H:
3411  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3412  yoff += hbs * 8 * y_stride;
3413  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3414  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3415  break;
3416  case PARTITION_V:
3417  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3418  yoff += hbs * 8 * bytesperpixel;
3419  uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3420  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3421  break;
3422  case PARTITION_SPLIT:
3423  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3424  decode_sb(ctx, row, col + hbs, lflvl,
3425  yoff + 8 * hbs * bytesperpixel,
3426  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3427  yoff += hbs * 8 * y_stride;
3428  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3429  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3430  decode_sb(ctx, row + hbs, col + hbs, lflvl,
3431  yoff + 8 * hbs * bytesperpixel,
3432  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3433  break;
3434  default:
3435  av_assert0(0);
3436  }
3437  } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3438  bp = PARTITION_SPLIT;
3439  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3440  decode_sb(ctx, row, col + hbs, lflvl,
3441  yoff + 8 * hbs * bytesperpixel,
3442  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3443  } else {
3444  bp = PARTITION_H;
3445  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3446  }
3447  } else if (row + hbs < s->rows) { // FIXME why not <=?
3448  if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3449  bp = PARTITION_SPLIT;
3450  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3451  yoff += hbs * 8 * y_stride;
3452  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3453  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3454  } else {
3455  bp = PARTITION_V;
3456  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3457  }
3458  } else {
3459  bp = PARTITION_SPLIT;
3460  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3461  }
3462  s->counts.partition[bl][c][bp]++;
3463 }
3464 
3465 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3466  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3467 {
3468  VP9Context *s = ctx->priv_data;
3469  VP9Block *b = s->b;
3470  ptrdiff_t hbs = 4 >> bl;
3471  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3472  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3473  int bytesperpixel = s->bytesperpixel;
3474 
3475  if (bl == BL_8X8) {
3476  av_assert2(b->bl == BL_8X8);
3477  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3478  } else if (s->b->bl == bl) {
3479  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3480  if (b->bp == PARTITION_H && row + hbs < s->rows) {
3481  yoff += hbs * 8 * y_stride;
3482  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3483  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3484  } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3485  yoff += hbs * 8 * bytesperpixel;
3486  uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3487  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3488  }
3489  } else {
3490  decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3491  if (col + hbs < s->cols) { // FIXME why not <=?
3492  if (row + hbs < s->rows) {
3493  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3494  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3495  yoff += hbs * 8 * y_stride;
3496  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3497  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3498  decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3499  yoff + 8 * hbs * bytesperpixel,
3500  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3501  } else {
3502  yoff += hbs * 8 * bytesperpixel;
3503  uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3504  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3505  }
3506  } else if (row + hbs < s->rows) {
3507  yoff += hbs * 8 * y_stride;
3508  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3509  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3510  }
3511  }
3512 }
3513 
3514 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3515  uint8_t *lvl, uint8_t (*mask)[4],
3516  uint8_t *dst, ptrdiff_t ls)
3517 {
3518  int y, x, bytesperpixel = s->bytesperpixel;
3519 
3520  // filter edges between columns (e.g. block1 | block2)
3521  for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3522  uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3523  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3524  unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3525  unsigned hm = hm1 | hm2 | hm13 | hm23;
3526 
3527  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
3528  if (col || x > 1) {
3529  if (hm1 & x) {
3530  int L = *l, H = L >> 4;
3531  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3532 
3533  if (hmask1[0] & x) {
3534  if (hmask2[0] & x) {
3535  av_assert2(l[8 << ss_v] == L);
3536  s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3537  } else {
3538  s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3539  }
3540  } else if (hm2 & x) {
3541  L = l[8 << ss_v];
3542  H |= (L >> 4) << 8;
3543  E |= s->filter_lut.mblim_lut[L] << 8;
3544  I |= s->filter_lut.lim_lut[L] << 8;
3545  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3546  [!!(hmask2[1] & x)]
3547  [0](ptr, ls, E, I, H);
3548  } else {
3549  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3550  [0](ptr, ls, E, I, H);
3551  }
3552  } else if (hm2 & x) {
3553  int L = l[8 << ss_v], H = L >> 4;
3554  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3555 
3556  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3557  [0](ptr + 8 * ls, ls, E, I, H);
3558  }
3559  }
3560  if (ss_h) {
3561  if (x & 0xAA)
3562  l += 2;
3563  } else {
3564  if (hm13 & x) {
3565  int L = *l, H = L >> 4;
3566  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3567 
3568  if (hm23 & x) {
3569  L = l[8 << ss_v];
3570  H |= (L >> 4) << 8;
3571  E |= s->filter_lut.mblim_lut[L] << 8;
3572  I |= s->filter_lut.lim_lut[L] << 8;
3573  s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3574  } else {
3575  s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3576  }
3577  } else if (hm23 & x) {
3578  int L = l[8 << ss_v], H = L >> 4;
3579  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3580 
3581  s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
3582  }
3583  l++;
3584  }
3585  }
3586  }
3587 }
3588 
3589 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3590  uint8_t *lvl, uint8_t (*mask)[4],
3591  uint8_t *dst, ptrdiff_t ls)
3592 {
3593  int y, x, bytesperpixel = s->bytesperpixel;
3594 
3595  // block1
3596  // filter edges between rows (e.g. ------)
3597  // block2
3598  for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3599  uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3600  unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3601 
3602  for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3603  if (row || y) {
3604  if (vm & x) {
3605  int L = *l, H = L >> 4;
3606  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3607 
3608  if (vmask[0] & x) {
3609  if (vmask[0] & (x << (1 + ss_h))) {
3610  av_assert2(l[1 + ss_h] == L);
3611  s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3612  } else {
3613  s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3614  }
3615  } else if (vm & (x << (1 + ss_h))) {
3616  L = l[1 + ss_h];
3617  H |= (L >> 4) << 8;
3618  E |= s->filter_lut.mblim_lut[L] << 8;
3619  I |= s->filter_lut.lim_lut[L] << 8;
3620  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3621  [!!(vmask[1] & (x << (1 + ss_h)))]
3622  [1](ptr, ls, E, I, H);
3623  } else {
3624  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3625  [1](ptr, ls, E, I, H);
3626  }
3627  } else if (vm & (x << (1 + ss_h))) {
3628  int L = l[1 + ss_h], H = L >> 4;
3629  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3630 
3631  s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3632  [1](ptr + 8 * bytesperpixel, ls, E, I, H);
3633  }
3634  }
3635  if (!ss_v) {
3636  if (vm3 & x) {
3637  int L = *l, H = L >> 4;
3638  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3639 
3640  if (vm3 & (x << (1 + ss_h))) {
3641  L = l[1 + ss_h];
3642  H |= (L >> 4) << 8;
3643  E |= s->filter_lut.mblim_lut[L] << 8;
3644  I |= s->filter_lut.lim_lut[L] << 8;
3645  s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3646  } else {
3647  s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3648  }
3649  } else if (vm3 & (x << (1 + ss_h))) {
3650  int L = l[1 + ss_h], H = L >> 4;
3651  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3652 
3653  s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
3654  }
3655  }
3656  }
3657  if (ss_v) {
3658  if (y & 1)
3659  lvl += 16;
3660  } else {
3661  lvl += 8;
3662  }
3663  }
3664 }
3665 
3666 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3667  int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3668 {
3669  VP9Context *s = ctx->priv_data;
3670  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3671  uint8_t *dst = f->data[0] + yoff;
3672  ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3673  uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3674  int p;
3675 
3676  // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3677  // if you think of them as acting on a 8x8 block max, we can interleave
3678  // each v/h within the single x loop, but that only works if we work on
3679  // 8 pixel blocks, and we won't always do that (we want at least 16px
3680  // to use SSE2 optimizations, perhaps 32 for AVX2)
3681 
3682  filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3683  filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
3684 
3685  for (p = 0; p < 2; p++) {
3686  dst = f->data[1 + p] + uvoff;
3687  filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3688  filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
3689  }
3690 }
3691 
/**
 * Compute the [start, end) extent of tile number idx when n superblocks are
 * split into 2^log2_n tiles, converted from superblock units to 8x8-block
 * units (<< 3). Both bounds are clamped to the total superblock count.
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int sb_start = (idx * n) >> log2_n;
    int sb_end   = ((idx + 1) * n) >> log2_n;

    if (sb_start > n)
        sb_start = n;
    if (sb_end > n)
        sb_end = n;
    *start = sb_start << 3;
    *end   = sb_end << 3;
}
3699 
3700 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3701  int max_count, int update_factor)
3702 {
3703  unsigned ct = ct0 + ct1, p2, p1;
3704 
3705  if (!ct)
3706  return;
3707 
3708  p1 = *p;
3709  p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3710  p2 = av_clip(p2, 1, 255);
3711  ct = FFMIN(ct, max_count);
3712  update_factor = FASTDIV(update_factor * ct, max_count);
3713 
3714  // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3715  *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3716 }
3717 
3718 static void adapt_probs(VP9Context *s)
3719 {
3720  int i, j, k, l, m;
3721  prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
3722  int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
3723 
3724  // coefficients
3725  for (i = 0; i < 4; i++)
3726  for (j = 0; j < 2; j++)
3727  for (k = 0; k < 2; k++)
3728  for (l = 0; l < 6; l++)
3729  for (m = 0; m < 6; m++) {
3730  uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
3731  unsigned *e = s->counts.eob[i][j][k][l][m];
3732  unsigned *c = s->counts.coef[i][j][k][l][m];
3733 
3734  if (l == 0 && m >= 3) // dc only has 3 pt
3735  break;
3736 
3737  adapt_prob(&pp[0], e[0], e[1], 24, uf);
3738  adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3739  adapt_prob(&pp[2], c[1], c[2], 24, uf);
3740  }
3741 
3742  if (s->s.h.keyframe || s->s.h.intraonly) {
3743  memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3744  memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3745  memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3746  memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3747  return;
3748  }
3749 
3750  // skip flag
3751  for (i = 0; i < 3; i++)
3752  adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3753 
3754  // intra/inter flag
3755  for (i = 0; i < 4; i++)
3756  adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3757 
3758  // comppred flag
3759  if (s->s.h.comppredmode == PRED_SWITCHABLE) {
3760  for (i = 0; i < 5; i++)
3761  adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3762  }
3763 
3764  // reference frames
3765  if (s->s.h.comppredmode != PRED_SINGLEREF) {
3766  for (i = 0; i < 5; i++)
3767  adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3768  s->counts.comp_ref[i][1], 20, 128);
3769  }
3770 
3771  if (s->s.h.comppredmode != PRED_COMPREF) {
3772  for (i = 0; i < 5; i++) {
3773  uint8_t *pp = p->single_ref[i];
3774  unsigned (*c)[2] = s->counts.single_ref[i];
3775 
3776  adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3777  adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3778  }
3779  }
3780 
3781  // block partitioning
3782  for (i = 0; i < 4; i++)
3783  for (j = 0; j < 4; j++) {
3784  uint8_t *pp = p->partition[i][j];
3785  unsigned *c = s->counts.partition[i][j];
3786 
3787  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3788  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3789  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3790  }
3791 
3792  // tx size
3793  if (s->s.h.txfmmode == TX_SWITCHABLE) {
3794  for (i = 0; i < 2; i++) {
3795  unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3796 
3797  adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3798  adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3799  adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3800  adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3801  adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3802  adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3803  }
3804  }
3805 
3806  // interpolation filter
3807  if (s->s.h.filtermode == FILTER_SWITCHABLE) {
3808  for (i = 0; i < 4; i++) {
3809  uint8_t *pp = p->filter[i];
3810  unsigned *c = s->counts.filter[i];
3811 
3812  adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3813  adapt_prob(&pp[1], c[1], c[2], 20, 128);
3814  }
3815  }
3816 
3817  // inter modes
3818  for (i = 0; i < 7; i++) {
3819  uint8_t *pp = p->mv_mode[i];
3820  unsigned *c = s->counts.mv_mode[i];
3821 
3822  adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3823  adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3824  adapt_prob(&pp[2], c[1], c[3], 20, 128);
3825  }
3826 
3827  // mv joints
3828  {
3829  uint8_t *pp = p->mv_joint;
3830  unsigned *c = s->counts.mv_joint;
3831 
3832  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3833  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3834  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3835  }
3836 
3837  // mv components
3838  for (i = 0; i < 2; i++) {
3839  uint8_t *pp;
3840  unsigned *c, (*c2)[2], sum;
3841 
3842  adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3843  s->counts.mv_comp[i].sign[1], 20, 128);
3844 
3845  pp = p->mv_comp[i].classes;
3846  c = s->counts.mv_comp[i].classes;
3847  sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3848  adapt_prob(&pp[0], c[0], sum, 20, 128);
3849  sum -= c[1];
3850  adapt_prob(&pp[1], c[1], sum, 20, 128);
3851  sum -= c[2] + c[3];
3852  adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3853  adapt_prob(&pp[3], c[2], c[3], 20, 128);
3854  sum -= c[4] + c[5];
3855  adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3856  adapt_prob(&pp[5], c[4], c[5], 20, 128);
3857  sum -= c[6];
3858  adapt_prob(&pp[6], c[6], sum, 20, 128);
3859  adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3860  adapt_prob(&pp[8], c[7], c[8], 20, 128);
3861  adapt_prob(&pp[9], c[9], c[10], 20, 128);
3862 
3863  adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3864  s->counts.mv_comp[i].class0[1], 20, 128);
3865  pp = p->mv_comp[i].bits;
3866  c2 = s->counts.mv_comp[i].bits;
3867  for (j = 0; j < 10; j++)
3868  adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3869 
3870  for (j = 0; j < 2; j++) {
3871  pp = p->mv_comp[i].class0_fp[j];
3872  c = s->counts.mv_comp[i].class0_fp[j];
3873  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3874  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3875  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3876  }
3877  pp = p->mv_comp[i].fp;
3878  c = s->counts.mv_comp[i].fp;
3879  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3880  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3881  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3882 
3883  if (s->s.h.highprecisionmvs) {
3884  adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3885  s->counts.mv_comp[i].class0_hp[1], 20, 128);
3886  adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3887  s->counts.mv_comp[i].hp[1], 20, 128);
3888  }
3889  }
3890 
3891  // y intra modes
3892  for (i = 0; i < 4; i++) {
3893  uint8_t *pp = p->y_mode[i];
3894  unsigned *c = s->counts.y_mode[i], sum, s2;
3895 
3896  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3897  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3898  sum -= c[TM_VP8_PRED];
3899  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3900  sum -= c[VERT_PRED];
3901  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3902  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3903  sum -= s2;
3904  adapt_prob(&pp[3], s2, sum, 20, 128);
3905  s2 -= c[HOR_PRED];
3906  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3907  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3908  sum -= c[DIAG_DOWN_LEFT_PRED];
3909  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3910  sum -= c[VERT_LEFT_PRED];
3911  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3912  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3913  }
3914 
3915  // uv intra modes
3916  for (i = 0; i < 10; i++) {
3917  uint8_t *pp = p->uv_mode[i];
3918  unsigned *c = s->counts.uv_mode[i], sum, s2;
3919 
3920  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3921  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3922  sum -= c[TM_VP8_PRED];
3923  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3924  sum -= c[VERT_PRED];
3925  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3926  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3927  sum -= s2;
3928  adapt_prob(&pp[3], s2, sum, 20, 128);
3929  s2 -= c[HOR_PRED];
3930  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3931  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3932  sum -= c[DIAG_DOWN_LEFT_PRED];
3933  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3934  sum -= c[VERT_LEFT_PRED];
3935  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3936  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3937  }
3938 }
3939 
3940 static void free_buffers(VP9Context *s)
3941 {
3942  av_freep(&s->intra_pred_data[0]);
3943  av_freep(&s->b_base);
3944  av_freep(&s->block_base);
3945 }
3946 
/* NOTE(review): the function signature line is missing from this extraction
 * (it was a doxygen cross-reference link); this is the decoder close/free
 * callback body — presumably vp9_decode_free(AVCodecContext *ctx), verify
 * against the upstream file. */
{
    VP9Context *s = ctx->priv_data;
    int i;

    /* Drop the three internal frames (current, MV-pair ref, segmap ref):
     * unref any still-held buffer, then free the AVFrame itself. */
    for (i = 0; i < 3; i++) {
        if (s->s.frames[i].tf.f->buf[0])
            vp9_unref_frame(ctx, &s->s.frames[i]);
        av_frame_free(&s->s.frames[i].tf.f);
    }
    /* Release the 8 reference-frame slots and their pending replacements
     * (next_refs), checking buf[0] to avoid releasing empty frames. */
    for (i = 0; i < 8; i++) {
        if (s->s.refs[i].f->buf[0])
            ff_thread_release_buffer(ctx, &s->s.refs[i]);
        av_frame_free(&s->s.refs[i].f);
        if (s->next_refs[i].f->buf[0])
            ff_thread_release_buffer(ctx, &s->next_refs[i]);
        av_frame_free(&s->next_refs[i].f);
    }
    /* Per-frame scratch buffers (block/eob storage, intra edge data). */
    free_buffers(s);
    /* Per-tile bitstream reader array; reset its recorded size so a
     * later reinit does not assume a stale allocation. */
    av_freep(&s->c_b);
    s->c_b_size = 0;

    return 0;
}
3971 
3972 
3974  int *got_frame, AVPacket *pkt)
3975 {
3976  const uint8_t *data = pkt->data;
3977  int size = pkt->size;
3978  VP9Context *s = ctx->priv_data;
3979  int res, tile_row, tile_col, i, ref, row, col;
3980  int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
3982  ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3983  AVFrame *f;
3984  int bytesperpixel;
3985 
3986  if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3987  return res;
3988  } else if (res == 0) {
3989  if (!s->s.refs[ref].f->buf[0]) {
3990  av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3991  return AVERROR_INVALIDDATA;
3992  }
3993  if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
3994  return res;
3995  ((AVFrame *)frame)->pkt_pts = pkt->pts;
3996  ((AVFrame *)frame)->pkt_dts = pkt->dts;
3997  for (i = 0; i < 8; i++) {
3998  if (s->next_refs[i].f->buf[0])
3999  ff_thread_release_buffer(ctx, &s->next_refs[i]);
4000  if (s->s.refs[i].f->buf[0] &&
4001  (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
4002  return res;
4003  }
4004  *got_frame = 1;
4005  return pkt->size;
4006  }
4007  data += res;
4008  size -= res;
4009 
4010  if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
4011  if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
4013  if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4014  (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
4015  return res;
4016  }
4017  if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
4019  if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4020  (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
4021  return res;
4022  if (s->s.frames[CUR_FRAME].tf.f->buf[0])
4023  vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME]);
4024  if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
4025  return res;
4026  f = s->s.frames[CUR_FRAME].tf.f;
4027  f->key_frame = s->s.h.keyframe;
4029  ls_y = f->linesize[0];
4030  ls_uv =f->linesize[1];
4031 
4032  if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
4036  }
4037 
4038  // ref frame setup
4039  for (i = 0; i < 8; i++) {
4040  if (s->next_refs[i].f->buf[0])
4041  ff_thread_release_buffer(ctx, &s->next_refs[i]);
4042  if (s->s.h.refreshrefmask & (1 << i)) {
4043  res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
4044  } else if (s->s.refs[i].f->buf[0]) {
4045  res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
4046  }
4047  if (res < 0)
4048  return res;
4049  }
4050 
4051  if (ctx->hwaccel) {
4052  res = ctx->hwaccel->start_frame(ctx, NULL, 0);
4053  if (res < 0)
4054  return res;
4055  res = ctx->hwaccel->decode_slice(ctx, pkt->data, pkt->size);
4056  if (res < 0)
4057  return res;
4058  res = ctx->hwaccel->end_frame(ctx);
4059  if (res < 0)
4060  return res;
4061  goto finish;
4062  }
4063 
4064  // main tile decode loop
4065  bytesperpixel = s->bytesperpixel;
4066  memset(s->above_partition_ctx, 0, s->cols);
4067  memset(s->above_skip_ctx, 0, s->cols);
4068  if (s->s.h.keyframe || s->s.h.intraonly) {
4069  memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4070  } else {
4071  memset(s->above_mode_ctx, NEARESTMV, s->cols);
4072  }
4073  memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4074  memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4075  memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4076  memset(s->above_segpred_ctx, 0, s->cols);
4077  s->pass = s->s.frames[CUR_FRAME].uses_2pass =
4079  if ((res = update_block_buffers(ctx)) < 0) {
4080  av_log(ctx, AV_LOG_ERROR,
4081  "Failed to allocate block buffers\n");
4082  return res;
4083  }
4084  if (s->s.h.refreshctx && s->s.h.parallelmode) {
4085  int j, k, l, m;
4086 
4087  for (i = 0; i < 4; i++) {
4088  for (j = 0; j < 2; j++)
4089  for (k = 0; k < 2; k++)
4090  for (l = 0; l < 6; l++)
4091  for (m = 0; m < 6; m++)
4092  memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
4093  s->prob.coef[i][j][k][l][m], 3);
4094  if (s->s.h.txfmmode == i)
4095  break;
4096  }
4097  s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
4099  } else if (!s->s.h.refreshctx) {
4101  }
4102 
4103  do {
4104  yoff = uvoff = 0;
4105  s->b = s->b_base;
4106  s->block = s->block_base;
4107  s->uvblock[0] = s->uvblock_base[0];
4108  s->uvblock[1] = s->uvblock_base[1];
4109  s->eob = s->eob_base;
4110  s->uveob[0] = s->uveob_base[0];
4111  s->uveob[1] = s->uveob_base[1];
4112 
4113  for (tile_row = 0; tile_row < s->s.h.